In [2]:
from transformers import Trainer, BertTokenizer, BertForSequenceClassification
from datasets import load_from_disk
import optuna
import torch
import math
import base

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and will be used:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU.")

GPU is available and will be used: NVIDIA A100 80GB PCIe MIG 1g.10gb


In [4]:
DATASET = "trec"

In [5]:
train_data = load_from_disk(f"~/data/{DATASET}/train-logits_coarse")
eval_data = load_from_disk(f"~/data/{DATASET}/eval-logits_coarse")
test_data = load_from_disk(f"~/data/{DATASET}/test-logits_coarse")

all_train_data = load_from_disk(f"~/data/{DATASET}/train-logits-augmented_coarse")
tokenizer = BertTokenizer.from_pretrained("ndavid/autotrain-trec-fine-bert-739422530")

In [6]:
train = train_data.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the train dataset")
eval = eval_data.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the eval dataset")
test = test_data.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the test dataset")

train_aug = all_train_data.map(lambda e: tokenizer(e["sentence"], truncation=True, padding="max_length", return_tensors="pt", max_length=300), batched=True, desc="Tokenizing the augmented dataset")

In [7]:
num_epochs = 15
batch_size = 128

In [8]:
#Nápočet epoch na steps
data_length = len(train_data)
min_r = math.ceil(data_length/batch_size)*5
max_r = math.ceil(data_length/batch_size)*num_epochs
warm_up = math.ceil(data_length/batch_size/10)

In [9]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up)
    }   
    print(f"Trial {trial.number} with params: {params}")
    return params

In [10]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [11]:
def get_Bert():
    return BertForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2", num_labels=6)

In [12]:
base.reset_seed()

In [12]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base_coarse_hp-search", logging_dir=f"~/logs/{DATASET}/bert-base_coarse_hp-search", epochs=num_epochs, batch_size=batch_size)

In [13]:
trainer = Trainer(
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_Bert(),
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Test-base",
    n_trials=150
)

[I 2025-03-27 19:01:11,849] A new study created in memory with name: Test-base


Trial 0 with params: {'learning_rate': 4.3284502212938785e-05, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7507,1.682168,0.289643,0.180136,0.215374,0.174118
2,1.6614,1.604126,0.413382,0.203241,0.312456,0.235986
3,1.5842,1.504832,0.451879,0.363096,0.348685,0.29192
4,1.4826,1.398842,0.547204,0.527174,0.444096,0.427751
5,1.3804,1.299083,0.626031,0.574708,0.525069,0.524653
6,1.2849,1.213912,0.681943,0.604434,0.580901,0.579261
7,1.2015,1.141451,0.705775,0.617201,0.602667,0.600785
8,1.1341,1.079239,0.721357,0.624162,0.61742,0.614636
9,1.0732,1.033059,0.732356,0.633858,0.626941,0.623792
10,1.0285,0.994233,0.744271,0.639393,0.637056,0.633078


[I 2025-03-27 19:02:01,862] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.00010401663679887307, 'weight_decay': 0.001, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6932,1.588349,0.428048,0.21529,0.321611,0.257904
2,1.4973,1.339411,0.610449,0.568526,0.504036,0.500063
3,1.2475,1.113223,0.703941,0.633796,0.599443,0.600553
4,1.0278,0.925281,0.756187,0.650666,0.647153,0.644288
5,0.8576,0.791263,0.802016,0.675802,0.688156,0.681036


[I 2025-03-27 19:02:26,927] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 1.2551115172973821e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7971,1.762378,0.232814,0.197897,0.208418,0.151198
2,1.7484,1.715631,0.270394,0.354262,0.201944,0.154271
3,1.7151,1.689168,0.306141,0.348082,0.228391,0.188557
4,1.6903,1.667396,0.343721,0.177081,0.258559,0.209691
5,1.6714,1.647971,0.378552,0.187486,0.285441,0.225798
6,1.6546,1.628233,0.40055,0.19632,0.302051,0.235047
7,1.641,1.609832,0.411549,0.20577,0.310843,0.23411
8,1.626,1.593057,0.417965,0.37389,0.316122,0.241766
9,1.6115,1.577995,0.421632,0.375558,0.318851,0.244296
10,1.6004,1.564592,0.428048,0.342203,0.324214,0.251841


[I 2025-03-27 19:03:17,062] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.00015958573588141273, 'weight_decay': 0.0, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.671,1.507241,0.445463,0.362478,0.340553,0.27868
2,1.353,1.136776,0.699358,0.616359,0.596553,0.594424
3,1.0039,0.857615,0.775435,0.666377,0.662519,0.661546
4,0.759,0.694952,0.812099,0.68948,0.692513,0.690045
5,0.5946,0.581507,0.847846,0.714099,0.72485,0.719171


[I 2025-03-27 19:03:41,855] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00025959425503112657, 'weight_decay': 0.002, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5924,1.316899,0.60495,0.57099,0.493022,0.487579
2,1.1026,0.841237,0.784601,0.672465,0.67075,0.670515
3,0.6848,0.588435,0.843263,0.713186,0.719973,0.716147
4,0.4648,0.49405,0.860678,0.728364,0.732217,0.729603
5,0.3466,0.456953,0.862511,0.726279,0.73781,0.731351
6,0.2749,0.441177,0.874427,0.869412,0.783153,0.799988
7,0.2209,0.422862,0.875344,0.882524,0.810024,0.831908
8,0.1791,0.42591,0.87901,0.886019,0.821395,0.842664
9,0.1578,0.417921,0.883593,0.890669,0.825956,0.84719
10,0.1355,0.420935,0.883593,0.88883,0.826273,0.846645


[I 2025-03-27 19:04:59,983] Trial 4 finished with value: 0.8440468911949951 and parameters: {'learning_rate': 0.00025959425503112657, 'weight_decay': 0.002, 'warmup_steps': 0}. Best is trial 4 with value: 0.8440468911949951.


Trial 5 with params: {'learning_rate': 2.049268011541735e-05, 'weight_decay': 0.003, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7816,1.729019,0.263061,0.234422,0.202012,0.165461
2,1.7163,1.683784,0.309808,0.188471,0.231,0.187396
3,1.6804,1.644429,0.384051,0.188576,0.289532,0.226168
4,1.6458,1.607213,0.410632,0.201187,0.310024,0.235392
5,1.6101,1.567363,0.423465,0.341728,0.320824,0.247272


[I 2025-03-27 19:05:30,210] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 5.4182823195332406e-05, 'weight_decay': 0.003, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7395,1.665097,0.308891,0.177313,0.231143,0.190252
2,1.6363,1.561136,0.421632,0.371409,0.318824,0.242928
3,1.5299,1.431357,0.516957,0.528092,0.412695,0.383426
4,1.3954,1.300258,0.618698,0.574069,0.516677,0.516812
5,1.27,1.181868,0.689276,0.61247,0.586098,0.587243
6,1.1568,1.08532,0.72044,0.62222,0.616618,0.612741
7,1.0635,1.006964,0.739688,0.637146,0.632954,0.630908
8,0.9891,0.944066,0.750687,0.642604,0.642469,0.638988
9,0.9235,0.898518,0.759853,0.65299,0.650312,0.647591
10,0.878,0.859458,0.768103,0.655821,0.65768,0.653614


[I 2025-03-27 19:06:57,077] Trial 6 finished with value: 0.6847452659408965 and parameters: {'learning_rate': 5.4182823195332406e-05, 'weight_decay': 0.003, 'warmup_steps': 3}. Best is trial 4 with value: 0.8440468911949951.


Trial 7 with params: {'learning_rate': 1.7258215396625005e-05, 'weight_decay': 0.003, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7865,1.740285,0.249313,0.175827,0.202254,0.162277
2,1.727,1.694637,0.28506,0.183234,0.211199,0.161653
3,1.6931,1.663747,0.342805,0.178602,0.257657,0.209979
4,1.6642,1.632715,0.40055,0.196821,0.302063,0.233561
5,1.6366,1.602218,0.415215,0.370883,0.313908,0.241908


[I 2025-03-27 19:07:26,693] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 5.954553793888986e-05, 'weight_decay': 0.008, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7275,1.658258,0.312557,0.192902,0.233043,0.187782
2,1.6199,1.534851,0.433547,0.376192,0.328262,0.256478
3,1.4949,1.389836,0.536205,0.535339,0.433807,0.411261
4,1.3457,1.246576,0.659945,0.605567,0.557241,0.561003
5,1.211,1.122166,0.708524,0.619138,0.605747,0.604238


[I 2025-03-27 19:07:54,755] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 7.475992999956501e-05, 'weight_decay': 0.006, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7143,1.631602,0.358387,0.192674,0.269247,0.221886
2,1.5812,1.469944,0.503208,0.491567,0.393548,0.35285
3,1.409,1.289908,0.615949,0.581605,0.51442,0.512084
4,1.2275,1.120943,0.704858,0.621233,0.600412,0.601767
5,1.0737,0.989765,0.744271,0.638551,0.638647,0.633736
6,0.9454,0.889868,0.762603,0.650474,0.652666,0.648718
7,0.8501,0.80981,0.796517,0.675224,0.681693,0.67779
8,0.7681,0.748371,0.816682,0.69088,0.698412,0.694232
9,0.7035,0.708204,0.820348,0.695434,0.701532,0.697816
10,0.6586,0.674665,0.828598,0.699001,0.709421,0.703863


[I 2025-03-27 19:09:19,637] Trial 9 finished with value: 0.7188441224183952 and parameters: {'learning_rate': 7.475992999956501e-05, 'weight_decay': 0.006, 'warmup_steps': 0}. Best is trial 4 with value: 0.8440468911949951.


Trial 10 with params: {'learning_rate': 0.0004587604755149822, 'weight_decay': 0.002, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4741,1.061512,0.734189,0.641556,0.625139,0.62845
2,0.7946,0.569879,0.836847,0.706814,0.714074,0.710071
3,0.4113,0.440312,0.868011,0.731259,0.739497,0.735191
4,0.254,0.430421,0.871677,0.883811,0.814863,0.838237
5,0.1853,0.433,0.878093,0.880477,0.824011,0.840781
6,0.1389,0.452823,0.87901,0.889098,0.832111,0.851865
7,0.098,0.475012,0.874427,0.884798,0.81889,0.839416
8,0.0752,0.487514,0.873511,0.851077,0.826018,0.836433
9,0.0632,0.495835,0.871677,0.866845,0.826572,0.841239
10,0.0511,0.505614,0.87626,0.873946,0.838081,0.852026


[I 2025-03-27 19:10:52,630] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.00023012528778943483, 'weight_decay': 0.006, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6099,1.359435,0.568286,0.548079,0.456613,0.437887
2,1.1652,0.911453,0.765353,0.660596,0.653968,0.654387
3,0.7605,0.643944,0.833181,0.703556,0.712724,0.7079
4,0.5286,0.529864,0.852429,0.722077,0.725643,0.723234
5,0.3985,0.479638,0.864345,0.728917,0.739143,0.733174
6,0.3184,0.448628,0.871677,0.899761,0.771199,0.787058
7,0.2558,0.422232,0.877177,0.878024,0.793826,0.813867
8,0.2125,0.419556,0.880843,0.886169,0.814235,0.83606
9,0.1868,0.412516,0.88451,0.890573,0.826747,0.847706
10,0.166,0.412409,0.879927,0.887195,0.82305,0.844151


[I 2025-03-27 19:12:19,823] Trial 11 finished with value: 0.847382425561633 and parameters: {'learning_rate': 0.00023012528778943483, 'weight_decay': 0.006, 'warmup_steps': 0}. Best is trial 11 with value: 0.847382425561633.


Trial 12 with params: {'learning_rate': 0.00035174585398257074, 'weight_decay': 0.007, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.535,1.178179,0.672777,0.61251,0.564097,0.572484
2,0.9284,0.677736,0.818515,0.692967,0.700151,0.696422
3,0.5189,0.493472,0.861595,0.724912,0.735465,0.730018
4,0.3392,0.438411,0.868928,0.735166,0.739105,0.73659
5,0.2481,0.425054,0.874427,0.871611,0.792397,0.810262
6,0.1852,0.420885,0.882676,0.888112,0.825563,0.846038
7,0.1544,0.422834,0.880843,0.888868,0.823728,0.84482
8,0.1193,0.430541,0.882676,0.889203,0.825231,0.845997
9,0.101,0.432265,0.888176,0.892447,0.829703,0.850103
10,0.0834,0.430838,0.890009,0.897681,0.830618,0.853044


[I 2025-03-27 19:13:15,638] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.00021976631986270965, 'weight_decay': 0.005, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6262,1.386829,0.551787,0.561145,0.440862,0.410699
2,1.1898,0.931067,0.758937,0.654479,0.648156,0.648399
3,0.7828,0.661505,0.833181,0.704378,0.712011,0.707797
4,0.5531,0.547112,0.845096,0.715631,0.71947,0.716726
5,0.423,0.482418,0.865261,0.727888,0.739428,0.733231
6,0.3352,0.451224,0.872594,0.899499,0.753834,0.7566
7,0.2723,0.425051,0.875344,0.871774,0.782944,0.801065
8,0.2266,0.415936,0.88176,0.884224,0.805792,0.827307
9,0.1975,0.409258,0.886343,0.888722,0.820015,0.840291
10,0.1787,0.409433,0.878093,0.882764,0.812512,0.833562


[I 2025-03-27 19:14:39,978] Trial 13 finished with value: 0.8447284223217645 and parameters: {'learning_rate': 0.00021976631986270965, 'weight_decay': 0.005, 'warmup_steps': 2}. Best is trial 11 with value: 0.847382425561633.


Trial 14 with params: {'learning_rate': 0.00024129748744731343, 'weight_decay': 0.004, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6135,1.351736,0.566453,0.542447,0.456088,0.435794
2,1.1436,0.877147,0.774519,0.664832,0.661377,0.661249
3,0.7252,0.616742,0.834097,0.706406,0.712219,0.708866
4,0.5022,0.516972,0.857929,0.725992,0.729519,0.726987
5,0.3813,0.462515,0.868928,0.730044,0.742523,0.735947
6,0.2992,0.439151,0.872594,0.899832,0.772246,0.787606
7,0.2424,0.416089,0.882676,0.89012,0.815916,0.838534
8,0.1997,0.412817,0.879927,0.885671,0.813819,0.835623
9,0.1731,0.40975,0.882676,0.888246,0.825929,0.846239
10,0.1531,0.411398,0.88176,0.887751,0.825274,0.845617


[I 2025-03-27 19:15:36,176] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.0003984445466132129, 'weight_decay': 0.008, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5424,1.16127,0.671861,0.603963,0.56635,0.571236
2,0.8899,0.634674,0.827681,0.698897,0.70728,0.702866
3,0.474,0.481855,0.851512,0.717917,0.725443,0.720855
4,0.3004,0.426754,0.865261,0.730997,0.736426,0.733135
5,0.2173,0.411137,0.883593,0.888487,0.826802,0.846774
6,0.159,0.415926,0.879927,0.869832,0.824954,0.839924
7,0.1253,0.419562,0.885426,0.891271,0.828369,0.848374
8,0.0974,0.423533,0.885426,0.891091,0.827785,0.848393
9,0.0792,0.441286,0.882676,0.873423,0.825986,0.842056
10,0.0677,0.429578,0.887259,0.879766,0.828077,0.846569


[I 2025-03-27 19:17:04,408] Trial 15 finished with value: 0.8497021629312894 and parameters: {'learning_rate': 0.0003984445466132129, 'weight_decay': 0.008, 'warmup_steps': 3}. Best is trial 15 with value: 0.8497021629312894.


Trial 16 with params: {'learning_rate': 0.00012760914244204434, 'weight_decay': 0.007, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6877,1.556505,0.44088,0.384324,0.332479,0.264812
2,1.438,1.258452,0.648029,0.597064,0.545248,0.545449
3,1.1455,1.006419,0.735105,0.646814,0.629218,0.628904
4,0.9075,0.816001,0.770852,0.658975,0.659093,0.656091
5,0.7314,0.681456,0.827681,0.696665,0.709253,0.702797


[I 2025-03-27 19:17:32,932] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 0.00033744255997700685, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5664,1.22435,0.658112,0.598972,0.552549,0.557218
2,0.9722,0.705806,0.813932,0.689326,0.695806,0.692352
3,0.5438,0.507694,0.853346,0.720615,0.727276,0.723478
4,0.3526,0.444498,0.865261,0.734086,0.734235,0.733113
5,0.2647,0.426253,0.882676,0.879243,0.798953,0.817407
6,0.1982,0.425277,0.879927,0.885515,0.824042,0.843722
7,0.1595,0.412908,0.88451,0.891161,0.827123,0.847868
8,0.1203,0.424183,0.883593,0.88925,0.826521,0.84699
9,0.1001,0.434816,0.880843,0.88634,0.824341,0.844373
10,0.0846,0.433405,0.886343,0.893275,0.827863,0.849586


[I 2025-03-27 19:18:59,993] Trial 17 finished with value: 0.8430105970289516 and parameters: {'learning_rate': 0.00033744255997700685, 'weight_decay': 0.01, 'warmup_steps': 3}. Best is trial 15 with value: 0.8497021629312894.


Trial 18 with params: {'learning_rate': 0.0002950137270531351, 'weight_decay': 0.01, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5824,1.274277,0.626948,0.575968,0.519731,0.521062
2,1.0449,0.775332,0.802016,0.682332,0.684687,0.68304
3,0.6129,0.542413,0.846929,0.716939,0.72161,0.718637
4,0.4056,0.466787,0.860678,0.730714,0.731418,0.729945
5,0.3039,0.439402,0.877177,0.875863,0.794383,0.81338
6,0.2324,0.433276,0.875344,0.87756,0.801676,0.82159
7,0.1908,0.413657,0.878093,0.885749,0.811553,0.834278
8,0.1493,0.422759,0.87901,0.886782,0.822088,0.84346
9,0.1267,0.423971,0.885426,0.890167,0.827881,0.848068
10,0.1093,0.421631,0.882676,0.887505,0.825849,0.845755


[I 2025-03-27 19:20:25,954] Trial 18 finished with value: 0.8461067000708867 and parameters: {'learning_rate': 0.0002950137270531351, 'weight_decay': 0.01, 'warmup_steps': 1}. Best is trial 15 with value: 0.8497021629312894.


Trial 19 with params: {'learning_rate': 0.00030738776631319915, 'weight_decay': 0.008, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5846,1.267248,0.607699,0.570689,0.499665,0.493897
2,1.0252,0.752363,0.808433,0.687009,0.690985,0.68845
3,0.5919,0.528764,0.854262,0.722458,0.727779,0.724619
4,0.3896,0.457729,0.862511,0.731226,0.732514,0.730898
5,0.2917,0.430801,0.873511,0.866798,0.781867,0.798469
6,0.2242,0.424255,0.880843,0.887122,0.824692,0.844679
7,0.1813,0.408568,0.87901,0.888572,0.821521,0.843984
8,0.1392,0.418612,0.879927,0.887993,0.823121,0.844627
9,0.1166,0.419283,0.883593,0.888294,0.827017,0.846593
10,0.0991,0.423574,0.883593,0.888315,0.826053,0.846382


[I 2025-03-27 19:21:22,725] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 0.00015132692292924965, 'weight_decay': 0.008, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6701,1.512888,0.455545,0.364476,0.349106,0.293202
2,1.3684,1.159361,0.694775,0.616341,0.591536,0.590916
3,1.0318,0.886899,0.76352,0.658884,0.653049,0.652
4,0.7894,0.717624,0.810266,0.68652,0.691546,0.68811
5,0.6227,0.600448,0.846013,0.712465,0.723314,0.717697
6,0.5016,0.533369,0.849679,0.715189,0.72574,0.720214
7,0.4186,0.492164,0.854262,0.720407,0.729349,0.724508
8,0.3571,0.46954,0.863428,0.727126,0.736389,0.731602
9,0.3186,0.449888,0.867094,0.730025,0.740418,0.735061
10,0.2903,0.444641,0.869844,0.730992,0.742525,0.736656


[I 2025-03-27 19:22:18,746] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 0.00047289634499031154, 'weight_decay': 0.01, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5027,1.079311,0.72319,0.637002,0.613743,0.618476
2,0.8081,0.577343,0.835014,0.705548,0.712848,0.708867
3,0.4206,0.457151,0.857012,0.72261,0.730699,0.726019
4,0.2567,0.421834,0.878093,0.906329,0.757455,0.761707
5,0.1837,0.426329,0.878093,0.883044,0.823305,0.84196
6,0.1395,0.430927,0.879927,0.885403,0.824299,0.843601
7,0.1011,0.429255,0.87626,0.870542,0.820404,0.837906
8,0.0733,0.464558,0.877177,0.863237,0.830074,0.842973
9,0.0582,0.480965,0.875344,0.856705,0.820379,0.833482
10,0.0483,0.472192,0.883593,0.871527,0.844676,0.855675


[I 2025-03-27 19:23:42,960] Trial 21 finished with value: 0.8300787364403353 and parameters: {'learning_rate': 0.00047289634499031154, 'weight_decay': 0.01, 'warmup_steps': 1}. Best is trial 15 with value: 0.8497021629312894.


Trial 22 with params: {'learning_rate': 0.0004809554823253182, 'weight_decay': 0.008, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5074,1.074366,0.722273,0.635681,0.614586,0.619265
2,0.8018,0.571176,0.842346,0.712396,0.718998,0.715384
3,0.4118,0.448533,0.855179,0.721142,0.728916,0.724461
4,0.2519,0.419583,0.877177,0.904586,0.79315,0.816448
5,0.1806,0.409558,0.883593,0.887156,0.827653,0.846088
6,0.1314,0.436211,0.882676,0.888324,0.826961,0.846391
7,0.0991,0.428668,0.87626,0.870266,0.821073,0.838183
8,0.0719,0.454021,0.885426,0.880923,0.836569,0.852864
9,0.0588,0.466621,0.877177,0.862922,0.830666,0.843298
10,0.0524,0.451754,0.885426,0.874172,0.844734,0.857227


[I 2025-03-27 19:25:09,095] Trial 22 finished with value: 0.854168263050791 and parameters: {'learning_rate': 0.0004809554823253182, 'weight_decay': 0.008, 'warmup_steps': 2}. Best is trial 22 with value: 0.854168263050791.


Trial 23 with params: {'learning_rate': 0.00034257541186872263, 'weight_decay': 0.007, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5645,1.219224,0.653529,0.595712,0.547405,0.551074
2,0.9658,0.699307,0.813932,0.689527,0.695827,0.692436
3,0.537,0.505476,0.852429,0.719624,0.726394,0.722503
4,0.3473,0.442919,0.864345,0.733473,0.733603,0.732483
5,0.2614,0.424419,0.88176,0.878778,0.798272,0.816789
6,0.1941,0.424117,0.880843,0.885873,0.825035,0.844453
7,0.1557,0.412908,0.886343,0.893424,0.828373,0.849464
8,0.118,0.421804,0.882676,0.888938,0.824994,0.8461
9,0.0976,0.432413,0.882676,0.887781,0.825723,0.845765
10,0.0832,0.429953,0.890009,0.896335,0.830341,0.852357


[I 2025-03-27 19:26:34,685] Trial 23 finished with value: 0.8454988068018388 and parameters: {'learning_rate': 0.00034257541186872263, 'weight_decay': 0.007, 'warmup_steps': 3}. Best is trial 22 with value: 0.854168263050791.


Trial 24 with params: {'learning_rate': 0.0004848441062547992, 'weight_decay': 0.007, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4969,1.066588,0.72594,0.639767,0.615978,0.620532
2,0.7965,0.571442,0.836847,0.706657,0.714464,0.710191
3,0.4131,0.454835,0.859762,0.724801,0.733228,0.728476
4,0.2506,0.420278,0.878093,0.905314,0.785033,0.805419
5,0.1773,0.433992,0.877177,0.881816,0.822969,0.84099
6,0.1328,0.435547,0.880843,0.886635,0.824885,0.844637
7,0.0977,0.441177,0.874427,0.867785,0.82036,0.836273
8,0.0705,0.467399,0.870761,0.854249,0.815841,0.829888
9,0.0574,0.488544,0.875344,0.858942,0.830407,0.841153
10,0.0462,0.47847,0.882676,0.865355,0.835609,0.847058


[I 2025-03-27 19:28:02,116] Trial 24 finished with value: 0.8436849772669791 and parameters: {'learning_rate': 0.0004848441062547992, 'weight_decay': 0.007, 'warmup_steps': 1}. Best is trial 22 with value: 0.854168263050791.


Trial 25 with params: {'learning_rate': 0.00039147085940282174, 'weight_decay': 0.006, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5483,1.17625,0.673694,0.605478,0.56745,0.573021
2,0.9111,0.645706,0.826764,0.699731,0.706483,0.702882
3,0.4863,0.478373,0.857929,0.721705,0.732122,0.726663
4,0.3079,0.431036,0.860678,0.729357,0.732146,0.729973
5,0.2237,0.41045,0.87901,0.882921,0.814266,0.834571
6,0.1617,0.41395,0.882676,0.886376,0.826919,0.845616
7,0.1273,0.407248,0.88451,0.89129,0.826739,0.847765
8,0.0977,0.430025,0.886343,0.892292,0.827936,0.848983
9,0.0803,0.440346,0.880843,0.861213,0.824611,0.838016
10,0.0675,0.436203,0.887259,0.895378,0.828058,0.850696


[I 2025-03-27 19:28:59,542] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.00021226717708304382, 'weight_decay': 0.006, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6209,1.38676,0.551787,0.565522,0.439848,0.41077
2,1.2025,0.954009,0.756187,0.653944,0.646706,0.646712
3,0.8084,0.683596,0.827681,0.69996,0.708745,0.703999
4,0.5738,0.559699,0.842346,0.713996,0.717223,0.714884
5,0.4375,0.494989,0.858845,0.724019,0.734516,0.728546
6,0.348,0.456089,0.868011,0.729849,0.740566,0.735002
7,0.2805,0.427889,0.872594,0.860476,0.771246,0.785719
8,0.2348,0.42199,0.878093,0.876799,0.793643,0.813531
9,0.2064,0.413832,0.882676,0.886705,0.816395,0.837375
10,0.1864,0.411596,0.87901,0.883752,0.813158,0.834414


[I 2025-03-27 19:29:56,510] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.0004960148652071169, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5011,1.060391,0.734189,0.645153,0.624389,0.628575
2,0.7881,0.565189,0.835014,0.706473,0.71291,0.709372
3,0.4025,0.444282,0.862511,0.7277,0.733902,0.730191
4,0.2453,0.422238,0.87626,0.903149,0.792899,0.81551
5,0.1742,0.41209,0.883593,0.887172,0.827353,0.845951
6,0.1278,0.447615,0.872594,0.88191,0.817991,0.838412
7,0.097,0.424677,0.885426,0.891468,0.828245,0.848707
8,0.0701,0.448473,0.885426,0.876575,0.827573,0.844626
9,0.0586,0.479812,0.87901,0.857631,0.841712,0.847915
10,0.045,0.477217,0.889093,0.882549,0.840065,0.855945


[I 2025-03-27 19:31:23,741] Trial 27 finished with value: 0.8577395866152763 and parameters: {'learning_rate': 0.0004960148652071169, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2}. Best is trial 27 with value: 0.8577395866152763.


Trial 28 with params: {'learning_rate': 0.00041323006630510317, 'weight_decay': 0.005, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5344,1.145207,0.684693,0.615991,0.578161,0.581099
2,0.8707,0.621601,0.824931,0.697642,0.704187,0.700683
3,0.4613,0.48074,0.855179,0.720024,0.730836,0.724445
4,0.2896,0.431471,0.871677,0.84842,0.759126,0.771007
5,0.2118,0.413111,0.880843,0.886235,0.824809,0.844535
6,0.1549,0.426882,0.87901,0.883663,0.824649,0.842995
7,0.1165,0.417485,0.885426,0.891884,0.828862,0.849097
8,0.0946,0.440597,0.877177,0.885382,0.821145,0.842045
9,0.0771,0.462166,0.878093,0.870118,0.822776,0.838733
10,0.0621,0.449662,0.880843,0.886274,0.824023,0.844324


[I 2025-03-27 19:32:20,054] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 0.00044698080382935565, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5225,1.113271,0.703941,0.618754,0.59851,0.603224
2,0.8381,0.594406,0.835014,0.707289,0.712843,0.709722
3,0.4359,0.462433,0.858845,0.724674,0.731756,0.727405
4,0.2685,0.419954,0.875344,0.904547,0.763712,0.775702
5,0.1931,0.399893,0.888176,0.892184,0.830871,0.85048
6,0.1378,0.416167,0.885426,0.889582,0.828794,0.848159
7,0.1061,0.436196,0.882676,0.890612,0.827006,0.846765
8,0.0816,0.441143,0.878093,0.886018,0.821816,0.84261
9,0.0655,0.460696,0.87626,0.874425,0.838787,0.852867
10,0.0566,0.442703,0.882676,0.873706,0.851837,0.861414


[I 2025-03-27 19:33:45,364] Trial 29 finished with value: 0.8478474776739985 and parameters: {'learning_rate': 0.00044698080382935565, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2}. Best is trial 27 with value: 0.8577395866152763.


Trial 30 with params: {'learning_rate': 0.0004679821820075427, 'weight_decay': 0.007, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5155,1.086986,0.712191,0.628159,0.605894,0.610655
2,0.8123,0.57636,0.840513,0.71244,0.716223,0.713872
3,0.4188,0.4558,0.855179,0.721241,0.728206,0.723938
4,0.2557,0.4236,0.877177,0.905147,0.784098,0.804781
5,0.183,0.403024,0.891842,0.895866,0.833057,0.853042
6,0.134,0.424523,0.88451,0.888032,0.828106,0.846864
7,0.0996,0.427171,0.883593,0.891087,0.826759,0.847516
8,0.073,0.45192,0.882676,0.880941,0.843341,0.858245
9,0.0573,0.482259,0.871677,0.87087,0.835092,0.849023
10,0.053,0.484766,0.886343,0.886921,0.865201,0.873996


[I 2025-03-27 19:35:10,322] Trial 30 finished with value: 0.8596597868624403 and parameters: {'learning_rate': 0.0004679821820075427, 'weight_decay': 0.007, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 31 with params: {'learning_rate': 0.0004765325084499721, 'weight_decay': 0.007, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5124,1.079928,0.716774,0.629339,0.610084,0.613439
2,0.8064,0.571418,0.837764,0.710451,0.71391,0.711724
3,0.4139,0.454587,0.857929,0.723399,0.730687,0.726438
4,0.2524,0.424751,0.875344,0.903459,0.782765,0.803249
5,0.1793,0.406122,0.887259,0.891452,0.829675,0.84925
6,0.1322,0.429158,0.882676,0.886434,0.82694,0.845352
7,0.099,0.420169,0.88451,0.889916,0.827863,0.847924
8,0.0712,0.448626,0.883593,0.879451,0.834721,0.851626
9,0.0548,0.473549,0.878093,0.873909,0.831263,0.846978
10,0.0501,0.467877,0.890926,0.873775,0.868893,0.870659


[I 2025-03-27 19:36:36,487] Trial 31 finished with value: 0.8526728931329014 and parameters: {'learning_rate': 0.0004765325084499721, 'weight_decay': 0.007, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 32 with params: {'learning_rate': 0.0004269457066379035, 'weight_decay': 0.007, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5344,1.142795,0.692942,0.605065,0.588481,0.590454
2,0.8581,0.606578,0.829514,0.700327,0.709311,0.704629
3,0.4476,0.468894,0.855179,0.720706,0.729383,0.724345
4,0.2784,0.422912,0.87626,0.904706,0.782761,0.803935
5,0.1987,0.401843,0.887259,0.891795,0.829584,0.849649
6,0.1466,0.418903,0.885426,0.875678,0.829239,0.844945
7,0.115,0.421184,0.887259,0.877331,0.830199,0.846231
8,0.0853,0.439088,0.879927,0.874289,0.822382,0.840462
9,0.0708,0.457413,0.878093,0.862463,0.831484,0.843332
10,0.0584,0.443837,0.888176,0.874634,0.847296,0.858675


[I 2025-03-27 19:38:04,299] Trial 32 finished with value: 0.8521291430243659 and parameters: {'learning_rate': 0.0004269457066379035, 'weight_decay': 0.007, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 33 with params: {'learning_rate': 0.0004980480029650571, 'weight_decay': 0.007, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5002,1.058659,0.736022,0.645491,0.626252,0.62997
2,0.7863,0.564298,0.835014,0.706225,0.712579,0.709063
3,0.4011,0.443563,0.864345,0.729519,0.735221,0.731808
4,0.2443,0.42217,0.87626,0.903084,0.792567,0.815366
5,0.1733,0.41326,0.88451,0.887957,0.828284,0.84677
6,0.1259,0.448093,0.870761,0.879486,0.817138,0.836787
7,0.0965,0.430059,0.882676,0.890311,0.82665,0.847012
8,0.0703,0.4476,0.88451,0.868613,0.836295,0.849013
9,0.0587,0.480685,0.87626,0.874274,0.839747,0.852813
10,0.0456,0.478104,0.888176,0.884505,0.848236,0.862587


[I 2025-03-27 19:39:33,613] Trial 33 finished with value: 0.8517816428966953 and parameters: {'learning_rate': 0.0004980480029650571, 'weight_decay': 0.007, 'warmup_steps': 2}. Best is trial 30 with value: 0.8596597868624403.


Trial 34 with params: {'learning_rate': 0.0004642129651010182, 'weight_decay': 0.008, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5146,1.092543,0.715857,0.628304,0.609261,0.613782
2,0.8186,0.581944,0.840513,0.711526,0.71732,0.714018
3,0.4235,0.457098,0.859762,0.724712,0.732555,0.728056
4,0.2596,0.420257,0.877177,0.905656,0.783686,0.804822
5,0.1877,0.401515,0.888176,0.892277,0.830703,0.850328
6,0.1332,0.421846,0.888176,0.892563,0.830727,0.850524
7,0.1029,0.42886,0.880843,0.887514,0.825737,0.845038
8,0.0787,0.44196,0.879927,0.888689,0.832764,0.852198
9,0.0602,0.477989,0.874427,0.85395,0.837613,0.844455
10,0.0584,0.457882,0.886343,0.872192,0.83673,0.850971


[I 2025-03-27 19:41:02,124] Trial 34 finished with value: 0.8500009819048704 and parameters: {'learning_rate': 0.0004642129651010182, 'weight_decay': 0.008, 'warmup_steps': 2}. Best is trial 30 with value: 0.8596597868624403.


Trial 35 with params: {'learning_rate': 0.000483446374017663, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5097,1.07338,0.715857,0.62775,0.609272,0.611956
2,0.8005,0.566785,0.83868,0.711413,0.715124,0.712832
3,0.409,0.45355,0.857012,0.722829,0.729658,0.72568
4,0.2499,0.425374,0.873511,0.901778,0.781431,0.801768
5,0.1766,0.408866,0.887259,0.890939,0.829653,0.849
6,0.1305,0.436958,0.877177,0.881907,0.822829,0.84091
7,0.1032,0.429098,0.87901,0.885976,0.823297,0.843635
8,0.0718,0.4538,0.882676,0.879278,0.834693,0.851256
9,0.0545,0.484059,0.875344,0.850657,0.830016,0.837993
10,0.0486,0.47455,0.890926,0.890563,0.868893,0.87789


[I 2025-03-27 19:42:26,589] Trial 35 finished with value: 0.8540741759588646 and parameters: {'learning_rate': 0.000483446374017663, 'weight_decay': 0.01, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 36 with params: {'learning_rate': 1.0625556226593494e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7995,1.770072,0.217232,0.168505,0.200401,0.137416
2,1.7572,1.726412,0.264895,0.358478,0.201388,0.159221
3,1.7258,1.699987,0.291476,0.361778,0.216728,0.173732
4,1.7016,1.681279,0.316224,0.174522,0.236816,0.195657
5,1.6849,1.66561,0.349221,0.180761,0.262785,0.2133
6,1.6712,1.649617,0.367553,0.183747,0.276986,0.220893
7,1.6614,1.634007,0.396884,0.195228,0.29941,0.230323
8,1.649,1.621049,0.404216,0.199809,0.305087,0.232067
9,1.6381,1.609698,0.407883,0.201454,0.307816,0.234521
10,1.6307,1.599583,0.415215,0.370152,0.31397,0.241306


[I 2025-03-27 19:43:21,808] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 0.00027695003215900754, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5965,1.301354,0.598533,0.560488,0.489794,0.484028
2,1.0747,0.802619,0.791934,0.67436,0.675529,0.674307
3,0.6459,0.560532,0.847846,0.717227,0.722805,0.719522
4,0.4346,0.479954,0.857929,0.726442,0.729343,0.727083
5,0.326,0.446102,0.869844,0.863534,0.77909,0.795362


[I 2025-03-27 19:43:52,105] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.0004828808002209901, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.51,1.073985,0.715857,0.62775,0.609272,0.611956
2,0.801,0.567138,0.83868,0.711413,0.715124,0.712832
3,0.4094,0.453786,0.857929,0.723498,0.730374,0.72636
4,0.2502,0.425234,0.873511,0.901886,0.781431,0.801812
5,0.1768,0.408611,0.888176,0.891706,0.83032,0.849694
6,0.1305,0.436652,0.877177,0.881907,0.822829,0.84091
7,0.1031,0.428762,0.87901,0.885976,0.823297,0.843635
8,0.0718,0.453594,0.883593,0.879715,0.835373,0.85184
9,0.055,0.481684,0.875344,0.850756,0.829685,0.837914
10,0.049,0.472433,0.889093,0.888871,0.867511,0.876367


[I 2025-03-27 19:45:17,849] Trial 38 finished with value: 0.8539261539863684 and parameters: {'learning_rate': 0.0004828808002209901, 'weight_decay': 0.01, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 39 with params: {'learning_rate': 1.1310667716871232e-05, 'weight_decay': 0.002, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8001,1.768926,0.219982,0.167077,0.201933,0.139653
2,1.7551,1.723187,0.268561,0.360849,0.20322,0.160277
3,1.7224,1.696245,0.302475,0.349943,0.224949,0.182203
4,1.6977,1.676206,0.32539,0.173891,0.244168,0.200585
5,1.6801,1.659214,0.355637,0.179573,0.267891,0.214997
6,1.6654,1.64175,0.386801,0.190153,0.291688,0.229057
7,1.6542,1.625337,0.404216,0.202102,0.305101,0.231737
8,1.6409,1.611143,0.412466,0.204631,0.311399,0.235926
9,1.6288,1.598468,0.417049,0.372913,0.315331,0.242318
10,1.6201,1.587268,0.417965,0.371456,0.316081,0.242765


[I 2025-03-27 19:46:14,910] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.0004890143053924719, 'weight_decay': 0.01, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5041,1.066741,0.730522,0.641764,0.621577,0.625779
2,0.7942,0.567216,0.837764,0.708376,0.71529,0.711535
3,0.4062,0.445423,0.860678,0.725683,0.733,0.72884
4,0.248,0.42178,0.87626,0.90322,0.792885,0.815549
5,0.1769,0.411455,0.886343,0.889199,0.829666,0.8482
6,0.1298,0.441341,0.878093,0.884863,0.82298,0.84251
7,0.0988,0.42689,0.880843,0.874174,0.824522,0.84172
8,0.0707,0.453276,0.883593,0.87531,0.827029,0.843512
9,0.0578,0.476518,0.87626,0.85633,0.839712,0.846394
10,0.0491,0.473177,0.887259,0.870295,0.837514,0.850565


[I 2025-03-27 19:47:15,598] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 1.2431112024586663e-05, 'weight_decay': 0.0, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7944,1.759834,0.233731,0.196667,0.206403,0.152788
2,1.7468,1.714814,0.263061,0.352184,0.19611,0.146528
3,1.7147,1.689681,0.304308,0.351405,0.22642,0.184349
4,1.6906,1.66876,0.340055,0.177543,0.255408,0.208212
5,1.6721,1.649952,0.370302,0.18516,0.279026,0.222566
6,1.6556,1.629885,0.398717,0.195192,0.300656,0.234245
7,1.6424,1.611491,0.410632,0.204269,0.310163,0.23364
8,1.6275,1.594998,0.417049,0.372876,0.315455,0.240957
9,1.6133,1.58024,0.422548,0.375557,0.319483,0.245054
10,1.6025,1.567077,0.426214,0.341126,0.322818,0.250895


[I 2025-03-27 19:48:14,330] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.0003534673833549885, 'weight_decay': 0.01, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5596,1.203792,0.650779,0.596149,0.545171,0.548442
2,0.9463,0.68592,0.818515,0.693073,0.699188,0.695895
3,0.5254,0.502695,0.854262,0.720668,0.728067,0.723922
4,0.339,0.440222,0.865261,0.90098,0.744043,0.751687
5,0.2531,0.422068,0.880843,0.88645,0.824969,0.844821
6,0.1884,0.420195,0.88451,0.889589,0.827514,0.847658
7,0.1474,0.411538,0.886343,0.89267,0.828651,0.849668
8,0.1111,0.425491,0.88451,0.890345,0.826544,0.847471
9,0.0929,0.42997,0.886343,0.890931,0.828742,0.848939
10,0.0777,0.431024,0.887259,0.892005,0.82887,0.849595


[I 2025-03-27 19:49:13,553] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 0.0002767995356836386, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5966,1.301487,0.598533,0.560488,0.489794,0.484028
2,1.075,0.802906,0.791934,0.674458,0.675314,0.674185
3,0.6462,0.560742,0.847846,0.717227,0.722805,0.719522
4,0.4349,0.480091,0.857929,0.726442,0.729343,0.727083
5,0.3262,0.446134,0.869844,0.863534,0.77909,0.795362
6,0.2538,0.43071,0.877177,0.879834,0.803403,0.823684
7,0.2061,0.413142,0.879927,0.888996,0.822673,0.844596
8,0.1637,0.421746,0.878093,0.885865,0.821041,0.842469
9,0.1413,0.415839,0.885426,0.889494,0.828402,0.847971
10,0.1211,0.41597,0.886343,0.891201,0.828724,0.849165


[I 2025-03-27 19:50:42,967] Trial 43 finished with value: 0.8473833443118536 and parameters: {'learning_rate': 0.0002767995356836386, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 44 with params: {'learning_rate': 0.0004615517048992085, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5181,1.093904,0.71494,0.628261,0.607828,0.612319
2,0.8171,0.578246,0.840513,0.712519,0.716837,0.714288
3,0.4212,0.452772,0.858845,0.724191,0.73223,0.727698
4,0.2573,0.421025,0.87901,0.906573,0.794488,0.818072
5,0.1853,0.399565,0.890009,0.895295,0.831661,0.852095
6,0.1354,0.423939,0.888176,0.892169,0.830613,0.850019
7,0.0995,0.434656,0.882676,0.891957,0.826638,0.846902
8,0.0765,0.443592,0.887259,0.885048,0.846701,0.861933
9,0.0589,0.484373,0.877177,0.875339,0.839527,0.853371
10,0.0539,0.46659,0.889093,0.887205,0.858067,0.869751


[I 2025-03-27 19:52:10,957] Trial 44 finished with value: 0.8570148969910045 and parameters: {'learning_rate': 0.0004615517048992085, 'weight_decay': 0.01, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 45 with params: {'learning_rate': 0.0004406076226602862, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5285,1.124786,0.706691,0.617999,0.601095,0.603896
2,0.8407,0.592253,0.835014,0.705265,0.713489,0.709177
3,0.4345,0.459044,0.858845,0.723702,0.732397,0.727665
4,0.2687,0.421005,0.878093,0.905721,0.793294,0.817131
5,0.1935,0.401771,0.887259,0.891777,0.829692,0.849549
6,0.1412,0.42236,0.88451,0.889468,0.827709,0.847404
7,0.1079,0.430338,0.878093,0.884851,0.823204,0.842329
8,0.0802,0.45198,0.879927,0.890948,0.832028,0.852698
9,0.0674,0.469855,0.875344,0.860062,0.829439,0.841256
10,0.056,0.448,0.887259,0.884451,0.846478,0.86159


[I 2025-03-27 19:53:38,687] Trial 45 finished with value: 0.8419135726157977 and parameters: {'learning_rate': 0.0004406076226602862, 'weight_decay': 0.01, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 46 with params: {'learning_rate': 0.00021984012625838158, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6261,1.386663,0.551787,0.561145,0.440862,0.410699
2,1.1896,0.930805,0.758937,0.654479,0.648156,0.648399
3,0.7826,0.661336,0.833181,0.704378,0.712011,0.707797
4,0.553,0.547008,0.845096,0.715631,0.71947,0.716726
5,0.4228,0.48232,0.865261,0.727888,0.739428,0.733231


[I 2025-03-27 19:54:08,512] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 8.617890542296699e-05, 'weight_decay': 0.001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7098,1.618954,0.371219,0.207436,0.278686,0.231816
2,1.5527,1.421697,0.549038,0.552331,0.440168,0.418192
3,1.3477,1.22085,0.669111,0.621153,0.565193,0.567465
4,1.1475,1.040013,0.729606,0.6323,0.624616,0.621832
5,0.9841,0.906808,0.758937,0.644563,0.652106,0.644795
6,0.8514,0.80591,0.793767,0.672933,0.678647,0.674416
7,0.7533,0.726895,0.822181,0.694071,0.703694,0.69866
8,0.6721,0.672216,0.830431,0.701196,0.709667,0.705283
9,0.612,0.636824,0.834097,0.705834,0.713199,0.709173
10,0.5684,0.608293,0.84143,0.708823,0.719474,0.713955


[I 2025-03-27 19:55:08,179] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 0.00041546298573903864, 'weight_decay': 0.01, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5371,1.148414,0.681943,0.604473,0.577071,0.581564
2,0.8782,0.622071,0.829514,0.702431,0.708521,0.705142
3,0.4632,0.470782,0.856095,0.720754,0.730419,0.725142
4,0.2896,0.425769,0.865261,0.732087,0.736405,0.733681
5,0.2073,0.405578,0.883593,0.88843,0.826903,0.846708
6,0.1482,0.421736,0.88176,0.886298,0.825828,0.845001
7,0.1173,0.408404,0.890926,0.896882,0.831885,0.852977
8,0.0882,0.435825,0.87901,0.885982,0.823092,0.84335
9,0.0713,0.462563,0.87626,0.85677,0.821432,0.834249
10,0.0587,0.453239,0.883593,0.879676,0.833973,0.851541


[I 2025-03-27 19:56:36,158] Trial 48 finished with value: 0.84858598412416 and parameters: {'learning_rate': 0.00041546298573903864, 'weight_decay': 0.01, 'warmup_steps': 2}. Best is trial 30 with value: 0.8596597868624403.


Trial 49 with params: {'learning_rate': 0.0004614888037298609, 'weight_decay': 0.009000000000000001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5124,1.091696,0.715857,0.634852,0.607057,0.610564
2,0.8141,0.584264,0.831347,0.703495,0.709202,0.705996
3,0.4233,0.45666,0.860678,0.72507,0.734299,0.729404
4,0.2584,0.433234,0.870761,0.899731,0.797334,0.822304
5,0.1846,0.41614,0.882676,0.887651,0.826989,0.846185
6,0.1333,0.440165,0.87626,0.881477,0.821848,0.840428
7,0.0994,0.430902,0.88176,0.88795,0.826453,0.845858
8,0.0755,0.456678,0.883593,0.868416,0.835202,0.848072
9,0.0665,0.487626,0.872594,0.857264,0.82829,0.839063
10,0.0528,0.472048,0.886343,0.881302,0.837364,0.853808


[I 2025-03-27 19:58:10,356] Trial 49 finished with value: 0.8377963135506409 and parameters: {'learning_rate': 0.0004614888037298609, 'weight_decay': 0.009000000000000001, 'warmup_steps': 4}. Best is trial 30 with value: 0.8596597868624403.


Trial 50 with params: {'learning_rate': 0.000482555368292722, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5101,1.074179,0.71769,0.628995,0.610619,0.613254
2,0.8013,0.567267,0.83868,0.711513,0.715124,0.71287
3,0.4096,0.453725,0.857929,0.723498,0.730374,0.72636
4,0.2503,0.425221,0.873511,0.901778,0.781431,0.801768
5,0.1769,0.408558,0.887259,0.890939,0.829653,0.849
6,0.1307,0.436131,0.877177,0.881907,0.822829,0.84091
7,0.1028,0.428725,0.878093,0.885608,0.82263,0.843118
8,0.0716,0.453305,0.883593,0.879715,0.835373,0.85184
9,0.0549,0.480934,0.875344,0.851054,0.82969,0.838179
10,0.0488,0.472032,0.890009,0.890258,0.86815,0.877405


[I 2025-03-27 19:59:39,319] Trial 50 finished with value: 0.8539261539863684 and parameters: {'learning_rate': 0.000482555368292722, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 51 with params: {'learning_rate': 4.412575130718341e-05, 'weight_decay': 0.007, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7481,1.680616,0.289643,0.181495,0.215361,0.174084
2,1.6589,1.600284,0.413382,0.203016,0.31247,0.235634
3,1.5794,1.498494,0.450962,0.358511,0.348186,0.292392
4,1.4751,1.390093,0.550871,0.532191,0.448219,0.433058
5,1.3709,1.289088,0.630614,0.57648,0.529581,0.529447
6,1.274,1.202596,0.684693,0.606202,0.583483,0.582015
7,1.1896,1.129689,0.713107,0.620148,0.609322,0.606841
8,1.1216,1.067158,0.722273,0.62499,0.618038,0.615595
9,1.0602,1.02106,0.738772,0.63842,0.632217,0.629272
10,1.0155,0.982328,0.745188,0.639675,0.637674,0.63381


[I 2025-03-27 20:00:39,330] Trial 51 pruned. 


Trial 52 with params: {'learning_rate': 0.00048322592072392375, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5098,1.07351,0.715857,0.62775,0.609272,0.611956
2,0.8006,0.566889,0.83868,0.711413,0.715124,0.712832
3,0.4092,0.453597,0.857929,0.723498,0.730374,0.72636
4,0.2501,0.425395,0.873511,0.901778,0.781431,0.801768
5,0.1767,0.408751,0.886343,0.890176,0.828987,0.848306
6,0.1303,0.436912,0.878093,0.882544,0.823544,0.841612
7,0.1031,0.428892,0.87901,0.885976,0.823297,0.843635
8,0.0717,0.453669,0.882676,0.879157,0.834693,0.85121
9,0.0548,0.482855,0.87626,0.851406,0.830683,0.838718
10,0.0486,0.47309,0.890926,0.890512,0.868893,0.877902


[I 2025-03-27 20:02:10,113] Trial 52 finished with value: 0.8540741759588646 and parameters: {'learning_rate': 0.00048322592072392375, 'weight_decay': 0.0, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 53 with params: {'learning_rate': 0.0003381988400743685, 'weight_decay': 0.001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5662,1.223833,0.655362,0.596873,0.549953,0.554535
2,0.9715,0.704912,0.813016,0.68882,0.695126,0.691742
3,0.5429,0.507465,0.852429,0.719713,0.726345,0.722503
4,0.3519,0.444213,0.865261,0.734086,0.734235,0.733113
5,0.2643,0.426161,0.88176,0.878667,0.798286,0.816761
6,0.1976,0.425387,0.88176,0.886831,0.825689,0.845208
7,0.1589,0.413245,0.88451,0.891161,0.827123,0.847868
8,0.12,0.424143,0.882676,0.888556,0.82559,0.846192
9,0.0998,0.434783,0.88176,0.887048,0.825007,0.845075
10,0.0844,0.432974,0.886343,0.893457,0.827912,0.849752


[I 2025-03-27 20:03:40,618] Trial 53 finished with value: 0.8428463232156987 and parameters: {'learning_rate': 0.0003381988400743685, 'weight_decay': 0.001, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 54 with params: {'learning_rate': 0.000487970459351888, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5079,1.06905,0.716774,0.628535,0.610337,0.612864
2,0.7964,0.564313,0.83868,0.71143,0.715406,0.712994
3,0.4058,0.453157,0.857929,0.723435,0.730589,0.726486
4,0.2481,0.426848,0.871677,0.900223,0.789235,0.812264
5,0.1756,0.409066,0.885426,0.889241,0.828417,0.847652
6,0.1274,0.43814,0.878093,0.882337,0.823475,0.841523
7,0.1026,0.430492,0.87901,0.87254,0.822965,0.8404
8,0.0712,0.457369,0.883593,0.87817,0.836303,0.851613
9,0.0543,0.49378,0.867094,0.837313,0.823272,0.828522
10,0.0488,0.476655,0.886343,0.878591,0.864341,0.870479


[I 2025-03-27 20:05:11,437] Trial 54 finished with value: 0.8541957567323281 and parameters: {'learning_rate': 0.000487970459351888, 'weight_decay': 0.0, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 55 with params: {'learning_rate': 0.00043597012940916317, 'weight_decay': 0.0, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5276,1.125617,0.702108,0.618453,0.59663,0.601615
2,0.8513,0.603124,0.831347,0.704876,0.709917,0.707042
3,0.4447,0.465705,0.858845,0.724719,0.731826,0.727505
4,0.2751,0.421611,0.870761,0.735653,0.740648,0.737649
5,0.1971,0.405155,0.886343,0.890604,0.828942,0.848742
6,0.1406,0.416685,0.887259,0.891209,0.83019,0.849665
7,0.1105,0.43006,0.87901,0.888021,0.822997,0.843516
8,0.0845,0.441189,0.883593,0.890739,0.826751,0.847376
9,0.068,0.467283,0.877177,0.862067,0.831041,0.843096
10,0.0583,0.458766,0.88176,0.880663,0.842116,0.857702


[I 2025-03-27 20:06:38,511] Trial 55 finished with value: 0.8477501901179441 and parameters: {'learning_rate': 0.00043597012940916317, 'weight_decay': 0.0, 'warmup_steps': 2}. Best is trial 30 with value: 0.8596597868624403.


Trial 56 with params: {'learning_rate': 0.00042673161469364333, 'weight_decay': 0.001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5303,1.135067,0.694775,0.621743,0.587132,0.590033
2,0.8572,0.609574,0.827681,0.700204,0.706867,0.703326
3,0.4501,0.477141,0.853346,0.719197,0.728594,0.722952
4,0.2799,0.430635,0.869844,0.901348,0.758192,0.77113
5,0.2052,0.411154,0.886343,0.890847,0.829039,0.848744
6,0.1499,0.430495,0.878093,0.883042,0.824052,0.842465
7,0.111,0.423149,0.879927,0.885925,0.824725,0.844161
8,0.0885,0.450896,0.875344,0.883025,0.819728,0.839985
9,0.0707,0.470942,0.877177,0.857945,0.82161,0.834567
10,0.0569,0.462042,0.88176,0.872063,0.825471,0.841356


[I 2025-03-27 20:07:36,470] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 0.0004743300687691856, 'weight_decay': 0.001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5133,1.082073,0.71494,0.629671,0.608322,0.612686
2,0.8082,0.572784,0.83868,0.711249,0.714576,0.712424
3,0.4153,0.454861,0.857929,0.723399,0.730687,0.726438
4,0.2532,0.425081,0.875344,0.903459,0.782765,0.803249
5,0.1801,0.405895,0.885426,0.890459,0.828279,0.848027
6,0.1328,0.425587,0.885426,0.888541,0.828835,0.847473
7,0.0994,0.418115,0.883593,0.889707,0.827099,0.847406
8,0.0715,0.44543,0.886343,0.893917,0.837075,0.857181
9,0.0557,0.472217,0.88176,0.87885,0.843107,0.857012
10,0.0508,0.475559,0.890926,0.890868,0.868733,0.87785


[I 2025-03-27 20:09:07,427] Trial 57 finished with value: 0.8504759255603335 and parameters: {'learning_rate': 0.0004743300687691856, 'weight_decay': 0.001, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 58 with params: {'learning_rate': 3.489344821287614e-05, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7608,1.694513,0.278643,0.259236,0.206841,0.164466
2,1.6799,1.632983,0.393217,0.192497,0.297358,0.226193
3,1.6225,1.558972,0.422548,0.371729,0.32029,0.243647
4,1.5482,1.478489,0.47846,0.49226,0.375587,0.333734
5,1.4689,1.395258,0.557287,0.533156,0.453053,0.438334
6,1.3906,1.324892,0.616865,0.567747,0.516611,0.51443
7,1.3212,1.262694,0.661778,0.593441,0.561898,0.560296
8,1.2639,1.206425,0.681027,0.604842,0.579414,0.579259
9,1.2112,1.162735,0.698442,0.614651,0.595566,0.594163
10,1.1686,1.124852,0.712191,0.620462,0.609937,0.607


[I 2025-03-27 20:10:06,236] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.000335244805249745, 'weight_decay': 0.008, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5673,1.228497,0.660862,0.598785,0.554424,0.558928
2,0.979,0.711827,0.816682,0.691991,0.69854,0.695064
3,0.5479,0.509581,0.857929,0.723799,0.731155,0.727133
4,0.3554,0.447359,0.862511,0.73213,0.732242,0.73106
5,0.2668,0.432574,0.879927,0.877799,0.796472,0.815437


[I 2025-03-27 20:10:35,005] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.00014559095280735742, 'weight_decay': 0.003, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6795,1.53007,0.441797,0.381827,0.33498,0.267957
2,1.3914,1.189623,0.68011,0.611055,0.578001,0.577073
3,1.0649,0.919633,0.75527,0.654849,0.645986,0.645356
4,0.8209,0.743801,0.797434,0.678049,0.680912,0.678053
5,0.6495,0.619322,0.842346,0.708371,0.720661,0.71436
6,0.5239,0.548494,0.846929,0.713467,0.72322,0.718164
7,0.4384,0.503711,0.855179,0.721515,0.729649,0.725199
8,0.3739,0.478929,0.860678,0.724895,0.734389,0.729523
9,0.334,0.45856,0.870761,0.733282,0.742843,0.737925
10,0.3043,0.451701,0.868011,0.729403,0.74128,0.735159


[I 2025-03-27 20:11:32,851] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.0001782132009936772, 'weight_decay': 0.001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6526,1.464059,0.494959,0.371278,0.386858,0.343902
2,1.2924,1.058737,0.730522,0.636941,0.624903,0.621869
3,0.9196,0.780459,0.797434,0.677907,0.682473,0.679145
4,0.6805,0.634299,0.826764,0.70193,0.704009,0.702076
5,0.5284,0.539355,0.858845,0.722875,0.733658,0.727826


[I 2025-03-27 20:12:01,820] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.000273536079777216, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5984,1.306228,0.593951,0.557626,0.485932,0.479993
2,1.0811,0.808989,0.790101,0.673516,0.67398,0.672988
3,0.6527,0.564788,0.846929,0.716774,0.722125,0.718926
4,0.4404,0.482973,0.857929,0.726555,0.729308,0.727066
5,0.3305,0.44719,0.871677,0.865035,0.780755,0.796889
6,0.2574,0.431131,0.877177,0.879834,0.803403,0.823684
7,0.209,0.413866,0.88176,0.890539,0.824351,0.846207
8,0.1668,0.421075,0.878093,0.885845,0.821292,0.842589
9,0.1441,0.415891,0.885426,0.889494,0.828402,0.847971
10,0.1238,0.416136,0.883593,0.888482,0.826317,0.84657


[I 2025-03-27 20:13:31,258] Trial 62 finished with value: 0.8494662371492816 and parameters: {'learning_rate': 0.000273536079777216, 'weight_decay': 0.0, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 63 with params: {'learning_rate': 0.0003182184380718577, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5747,1.245248,0.644363,0.588193,0.538375,0.541046
2,1.0,0.732349,0.812099,0.688265,0.694777,0.691248
3,0.572,0.520573,0.852429,0.72044,0.725626,0.722523
4,0.3751,0.451459,0.860678,0.729515,0.730968,0.729336
5,0.2798,0.430906,0.878093,0.876416,0.795093,0.814104


[I 2025-03-27 20:13:59,090] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 0.0004522935912221533, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5225,1.106845,0.712191,0.624475,0.605907,0.609811
2,0.8267,0.582346,0.839597,0.711309,0.716051,0.713344
3,0.4262,0.453053,0.857929,0.72305,0.731418,0.726888
4,0.262,0.420676,0.87901,0.90629,0.794292,0.817919
5,0.1891,0.398568,0.890926,0.896492,0.832328,0.852981
6,0.1359,0.422318,0.886343,0.890275,0.829087,0.848627
7,0.1016,0.426536,0.88451,0.891546,0.826994,0.847638
8,0.0776,0.445247,0.883593,0.895807,0.853419,0.870043
9,0.062,0.468862,0.87626,0.886909,0.838439,0.856682
10,0.0553,0.475237,0.887259,0.886175,0.855852,0.868093


[I 2025-03-27 20:15:28,961] Trial 64 finished with value: 0.8594262556603645 and parameters: {'learning_rate': 0.0004522935912221533, 'weight_decay': 0.0, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 65 with params: {'learning_rate': 0.0004608922352030835, 'weight_decay': 0.004, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5184,1.094777,0.714024,0.62754,0.607161,0.611583
2,0.8178,0.578544,0.840513,0.712519,0.716837,0.714288
3,0.4216,0.452818,0.858845,0.724191,0.73223,0.727698
4,0.2576,0.42094,0.87901,0.906573,0.794488,0.818072
5,0.1855,0.399406,0.890009,0.895295,0.831661,0.852095
6,0.1355,0.423788,0.889093,0.892766,0.831329,0.850718
7,0.0997,0.434004,0.88451,0.893185,0.828284,0.848402
8,0.0769,0.443096,0.886343,0.896676,0.846339,0.865352
9,0.0588,0.485845,0.879927,0.867192,0.842137,0.852069
10,0.0537,0.466941,0.888176,0.886369,0.8574,0.869046


[I 2025-03-27 20:16:58,772] Trial 65 finished with value: 0.8570148969910045 and parameters: {'learning_rate': 0.0004608922352030835, 'weight_decay': 0.004, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 66 with params: {'learning_rate': 0.0004866760772067772, 'weight_decay': 0.005, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5084,1.070154,0.716774,0.628372,0.61027,0.612851
2,0.7974,0.56498,0.83868,0.71143,0.715406,0.712994
3,0.4067,0.453318,0.857929,0.723435,0.730589,0.726486
4,0.2486,0.426344,0.873511,0.901589,0.790569,0.813642
5,0.1758,0.409239,0.885426,0.889508,0.828153,0.847693
6,0.1283,0.438214,0.87901,0.883099,0.824357,0.842318
7,0.1031,0.429323,0.880843,0.873218,0.82441,0.841513
8,0.0715,0.45363,0.882676,0.878121,0.835359,0.851141
9,0.0544,0.489702,0.871677,0.848021,0.826932,0.834856
10,0.0489,0.47587,0.888176,0.878968,0.86623,0.871553


[I 2025-03-27 20:18:25,871] Trial 66 finished with value: 0.8533627705732693 and parameters: {'learning_rate': 0.0004866760772067772, 'weight_decay': 0.005, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 67 with params: {'learning_rate': 0.00043113291972233294, 'weight_decay': 0.003, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5299,1.130795,0.699358,0.616531,0.593765,0.598589
2,0.8572,0.607275,0.831347,0.704876,0.709917,0.707042
3,0.4488,0.467343,0.860678,0.726071,0.733208,0.728906
4,0.2783,0.422629,0.868928,0.734357,0.739482,0.736451
5,0.199,0.406346,0.88451,0.889349,0.82756,0.847409
6,0.1418,0.420236,0.883593,0.888001,0.827245,0.846626
7,0.1117,0.42161,0.88176,0.890485,0.825129,0.845982
8,0.0847,0.439982,0.87901,0.886647,0.822633,0.843405
9,0.0695,0.464074,0.882676,0.868984,0.844672,0.854552
10,0.0581,0.454592,0.883593,0.882081,0.843512,0.85914


[I 2025-03-27 20:19:55,642] Trial 67 finished with value: 0.854264225108416 and parameters: {'learning_rate': 0.00043113291972233294, 'weight_decay': 0.003, 'warmup_steps': 2}. Best is trial 30 with value: 0.8596597868624403.


Trial 68 with params: {'learning_rate': 0.0002688562781172094, 'weight_decay': 0.003, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.596,1.311698,0.600367,0.556666,0.491275,0.486785
2,1.0931,0.824393,0.789184,0.675184,0.673864,0.673462
3,0.6648,0.575196,0.84418,0.714325,0.719793,0.716567
4,0.4487,0.487682,0.860678,0.729593,0.732141,0.729996
5,0.3368,0.449068,0.869844,0.897758,0.761182,0.771093


[I 2025-03-27 20:20:24,134] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.000459659836262423, 'weight_decay': 0.003, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5166,1.097906,0.710357,0.622593,0.604616,0.608525
2,0.8237,0.58522,0.840513,0.711505,0.717056,0.713887
3,0.4266,0.4587,0.858845,0.723979,0.732152,0.727433
4,0.262,0.421309,0.874427,0.90402,0.772522,0.789754
5,0.1899,0.401919,0.887259,0.891722,0.830023,0.849682
6,0.1353,0.421239,0.889093,0.893374,0.831394,0.851228
7,0.1055,0.429226,0.88176,0.889324,0.826571,0.846126
8,0.0821,0.43805,0.878093,0.886787,0.831444,0.85062
9,0.0618,0.475664,0.871677,0.848835,0.826462,0.835637
10,0.0591,0.452989,0.885426,0.868857,0.836553,0.849281


[I 2025-03-27 20:21:52,256] Trial 69 finished with value: 0.8512930350016283 and parameters: {'learning_rate': 0.000459659836262423, 'weight_decay': 0.003, 'warmup_steps': 2}. Best is trial 30 with value: 0.8596597868624403.


Trial 70 with params: {'learning_rate': 0.00022787988612239638, 'weight_decay': 0.004, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6245,1.379062,0.551787,0.56404,0.441251,0.410577
2,1.1757,0.912564,0.766269,0.659457,0.654036,0.654266
3,0.7625,0.643988,0.834097,0.705276,0.71258,0.708563
4,0.5343,0.536558,0.851512,0.720361,0.724464,0.721537
5,0.4079,0.47519,0.865261,0.727682,0.739477,0.733205
6,0.3216,0.447181,0.872594,0.899927,0.772462,0.78772
7,0.2613,0.422216,0.877177,0.873342,0.784075,0.802377
8,0.2162,0.413995,0.883593,0.888821,0.816562,0.838603
9,0.1886,0.408855,0.88451,0.889286,0.827492,0.847501
10,0.1698,0.410091,0.87901,0.8837,0.813276,0.834433


[I 2025-03-27 20:22:50,097] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.00039740924591087715, 'weight_decay': 0.003, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5427,1.16214,0.671861,0.604571,0.566614,0.571577
2,0.8912,0.635793,0.827681,0.698897,0.70728,0.702866
3,0.4751,0.482759,0.849679,0.71629,0.724047,0.719367
4,0.3013,0.426792,0.865261,0.730997,0.736426,0.733135
5,0.2182,0.411496,0.883593,0.888487,0.826802,0.846774
6,0.1595,0.416108,0.880843,0.870515,0.825885,0.840716
7,0.126,0.419096,0.885426,0.891521,0.828369,0.848521
8,0.098,0.423026,0.885426,0.89085,0.827785,0.84822
9,0.0797,0.441143,0.883593,0.874218,0.826666,0.842797
10,0.0681,0.429794,0.886343,0.878072,0.827375,0.845334


[I 2025-03-27 20:23:48,496] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 1.847006633877252e-05, 'weight_decay': 0.005, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7821,1.733807,0.253896,0.265593,0.196722,0.155888
2,1.7214,1.68998,0.285976,0.18495,0.21183,0.162184
3,1.6879,1.657654,0.35472,0.183188,0.266732,0.216556
4,1.6572,1.623441,0.40055,0.196031,0.302228,0.231778
5,1.6268,1.589363,0.419798,0.373615,0.317442,0.243681
6,1.5965,1.556041,0.430797,0.321365,0.326408,0.253247
7,1.5678,1.52319,0.444546,0.351515,0.341633,0.280597
8,1.5409,1.493405,0.469294,0.466262,0.364581,0.31398
9,1.5135,1.466506,0.503208,0.504674,0.397362,0.359989
10,1.4905,1.443146,0.535289,0.52663,0.429526,0.404754


[I 2025-03-27 20:24:47,573] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 0.00033457796707623996, 'weight_decay': 0.003, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5676,1.229246,0.659945,0.599562,0.553426,0.558141
2,0.9801,0.712782,0.816682,0.691991,0.69854,0.695064
3,0.5488,0.510014,0.857012,0.723184,0.73044,0.726454
4,0.3561,0.447571,0.862511,0.73213,0.732242,0.73106
5,0.2675,0.432785,0.879927,0.877799,0.796472,0.815437


[I 2025-03-27 20:25:17,688] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.0004867450074287666, 'weight_decay': 0.005, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5051,1.068989,0.729606,0.641209,0.620579,0.624903
2,0.7964,0.568397,0.839597,0.709663,0.716888,0.712982
3,0.4079,0.446117,0.860678,0.725936,0.733264,0.729118
4,0.2491,0.421286,0.87626,0.90322,0.792885,0.815549
5,0.178,0.411287,0.883593,0.887126,0.827618,0.846128
6,0.1312,0.439846,0.87901,0.885786,0.823682,0.843268
7,0.0989,0.428831,0.878093,0.872056,0.822509,0.839683
8,0.072,0.458399,0.88176,0.874761,0.825285,0.842114
9,0.0588,0.469431,0.877177,0.865374,0.840324,0.850367
10,0.0499,0.468882,0.886343,0.873596,0.845639,0.85742


[I 2025-03-27 20:26:46,361] Trial 74 finished with value: 0.8567957443460131 and parameters: {'learning_rate': 0.0004867450074287666, 'weight_decay': 0.005, 'warmup_steps': 2}. Best is trial 30 with value: 0.8596597868624403.


Trial 75 with params: {'learning_rate': 0.0004344344906830412, 'weight_decay': 0.004, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5317,1.135031,0.699358,0.610103,0.595002,0.596875
2,0.8496,0.599247,0.832264,0.702872,0.71144,0.706986
3,0.4404,0.463358,0.857929,0.722902,0.731396,0.726563
4,0.2728,0.421826,0.878093,0.905812,0.79303,0.81701
5,0.1948,0.401022,0.886343,0.891662,0.828841,0.849077
6,0.1426,0.420758,0.88451,0.874584,0.828106,0.843868
7,0.1108,0.42442,0.886343,0.890808,0.829378,0.84886
8,0.0814,0.453402,0.879927,0.877053,0.831679,0.848424
9,0.0689,0.470431,0.87626,0.852081,0.830199,0.838837
10,0.0576,0.449844,0.887259,0.884673,0.846435,0.86155


[I 2025-03-27 20:28:18,801] Trial 75 finished with value: 0.8509338152400187 and parameters: {'learning_rate': 0.0004344344906830412, 'weight_decay': 0.004, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 76 with params: {'learning_rate': 0.00021934666877100533, 'weight_decay': 0.005, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6264,1.387556,0.551787,0.56101,0.440862,0.410714
2,1.1908,0.932161,0.75802,0.65374,0.647489,0.647618
3,0.784,0.66248,0.833181,0.704247,0.712011,0.707738
4,0.5542,0.547816,0.845096,0.715631,0.71947,0.716726
5,0.4239,0.482794,0.865261,0.727888,0.739428,0.733231
6,0.3359,0.451506,0.871677,0.898731,0.753168,0.755916
7,0.273,0.425256,0.874427,0.871254,0.782264,0.800431
8,0.2272,0.416064,0.882676,0.884888,0.806723,0.828106
9,0.198,0.409302,0.886343,0.888722,0.820015,0.840291
10,0.1792,0.409452,0.878093,0.882764,0.812512,0.833562


[I 2025-03-27 20:29:17,451] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.0004116430630194213, 'weight_decay': 0.002, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.539,1.152996,0.68011,0.60327,0.575088,0.579577
2,0.8834,0.625673,0.830431,0.702765,0.709215,0.705723
3,0.4668,0.471397,0.857929,0.722091,0.731704,0.726522
4,0.2923,0.426523,0.866178,0.732607,0.737085,0.734309
5,0.2098,0.406241,0.88451,0.889484,0.827667,0.847684
6,0.1504,0.422106,0.878093,0.883188,0.823161,0.842139
7,0.1192,0.406884,0.890926,0.896769,0.83192,0.852984
8,0.0894,0.433857,0.879927,0.886105,0.823193,0.843539
9,0.0729,0.457637,0.875344,0.856088,0.820399,0.833387
10,0.0594,0.451522,0.885426,0.880843,0.835638,0.852913


[I 2025-03-27 20:30:14,562] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 4.2739403038429994e-05, 'weight_decay': 0.005, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.753,1.683478,0.289643,0.17914,0.215326,0.173568
2,1.6633,1.60677,0.410632,0.201974,0.310429,0.234224
3,1.5875,1.509025,0.446379,0.364085,0.342883,0.281463
4,1.4876,1.404523,0.542621,0.524981,0.439572,0.421763
5,1.3864,1.305447,0.621448,0.571984,0.520146,0.51969


[I 2025-03-27 20:30:46,706] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.00045481718142657905, 'weight_decay': 0.004, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5188,1.103761,0.713107,0.62607,0.606616,0.611167
2,0.829,0.588808,0.83593,0.708613,0.713612,0.710704
3,0.4301,0.460611,0.859762,0.724724,0.732868,0.728126
4,0.2644,0.421424,0.874427,0.904154,0.763385,0.775226
5,0.1913,0.401638,0.888176,0.892141,0.830787,0.850352
6,0.1369,0.418971,0.890009,0.89386,0.832489,0.852158
7,0.1068,0.427039,0.882676,0.889565,0.827299,0.846751
8,0.0818,0.443461,0.880843,0.890173,0.8328,0.852887
9,0.065,0.468618,0.877177,0.866205,0.839551,0.850684
10,0.0585,0.448216,0.887259,0.870185,0.83769,0.850545


[I 2025-03-27 20:32:14,749] Trial 79 finished with value: 0.8510260202845391 and parameters: {'learning_rate': 0.00045481718142657905, 'weight_decay': 0.004, 'warmup_steps': 2}. Best is trial 30 with value: 0.8596597868624403.


Trial 80 with params: {'learning_rate': 3.3764327559756856e-05, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7623,1.696444,0.275894,0.257376,0.20473,0.162135
2,1.6824,1.63662,0.393217,0.192662,0.297344,0.226463
3,1.6273,1.566042,0.422548,0.371741,0.31999,0.242447
4,1.5567,1.489,0.466544,0.486399,0.363748,0.316038
5,1.4808,1.409001,0.546288,0.522552,0.441525,0.422516


[I 2025-03-27 20:32:44,534] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.0004849311702819569, 'weight_decay': 0.008, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4969,1.066558,0.72594,0.639767,0.615978,0.620532
2,0.7965,0.571349,0.836847,0.706657,0.714464,0.710191
3,0.4131,0.454923,0.858845,0.724112,0.732513,0.7278
4,0.2505,0.420167,0.87626,0.903878,0.783652,0.804037
5,0.1772,0.433291,0.877177,0.881999,0.822969,0.841113
6,0.1326,0.435479,0.87901,0.885467,0.823489,0.84331
7,0.0975,0.439012,0.875344,0.868279,0.821075,0.836956
8,0.0706,0.469207,0.870761,0.854249,0.815841,0.829888
9,0.0574,0.488796,0.87626,0.859714,0.831074,0.841872
10,0.046,0.480458,0.880843,0.867075,0.843413,0.852917


[I 2025-03-27 20:34:13,682] Trial 81 finished with value: 0.8443310992877588 and parameters: {'learning_rate': 0.0004849311702819569, 'weight_decay': 0.008, 'warmup_steps': 1}. Best is trial 30 with value: 0.8596597868624403.


Trial 82 with params: {'learning_rate': 0.00041754113917147096, 'weight_decay': 0.006, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5361,1.145952,0.68286,0.605154,0.577953,0.582537
2,0.8751,0.619859,0.830431,0.703036,0.709452,0.705917
3,0.461,0.470644,0.855179,0.720215,0.729536,0.724328
4,0.288,0.425269,0.864345,0.731594,0.735474,0.732911
5,0.2058,0.4057,0.88176,0.88687,0.825306,0.845129
6,0.147,0.4217,0.88176,0.886298,0.825828,0.845001
7,0.1163,0.409954,0.890926,0.896882,0.831885,0.852977
8,0.0876,0.439085,0.877177,0.884679,0.821745,0.842026
9,0.0705,0.465175,0.875344,0.855962,0.820549,0.83338
10,0.0586,0.454425,0.882676,0.878812,0.833355,0.850837


[I 2025-03-27 20:35:12,142] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 0.00047272740661940155, 'weight_decay': 0.006, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5139,1.083508,0.714024,0.628277,0.607655,0.611732
2,0.8095,0.573711,0.83868,0.711249,0.714576,0.712424
3,0.4163,0.45529,0.858845,0.723986,0.731618,0.727246
4,0.2539,0.424952,0.875344,0.903459,0.782765,0.803249
5,0.1812,0.405372,0.888176,0.893171,0.830328,0.850406
6,0.1336,0.423843,0.886343,0.889285,0.829502,0.848201
7,0.0996,0.418134,0.886343,0.891989,0.829099,0.849508
8,0.0714,0.446621,0.883593,0.892238,0.834964,0.8553
9,0.0563,0.473495,0.880843,0.877984,0.842454,0.856232
10,0.0506,0.476037,0.887259,0.878419,0.866018,0.870998


[I 2025-03-27 20:36:40,948] Trial 83 finished with value: 0.8524029286397657 and parameters: {'learning_rate': 0.00047272740661940155, 'weight_decay': 0.006, 'warmup_steps': 3}. Best is trial 30 with value: 0.8596597868624403.


Trial 84 with params: {'learning_rate': 0.00037872229959881967, 'weight_decay': 0.005, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5475,1.183416,0.673694,0.605013,0.567966,0.572806
2,0.9233,0.658695,0.830431,0.703038,0.709826,0.706194
3,0.4994,0.485316,0.859762,0.724954,0.732769,0.728536
4,0.3196,0.430551,0.865261,0.899119,0.745278,0.75184
5,0.2329,0.409826,0.886343,0.889168,0.81985,0.840389


[I 2025-03-27 20:37:09,803] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.00039061363634075065, 'weight_decay': 0.007, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5487,1.177266,0.673694,0.606554,0.56745,0.573245
2,0.9123,0.646628,0.826764,0.700133,0.706448,0.703053
3,0.4871,0.478647,0.857929,0.721705,0.732122,0.726663
4,0.3084,0.431208,0.860678,0.729357,0.732146,0.729973
5,0.2244,0.410842,0.879927,0.883557,0.814981,0.835248
6,0.1621,0.413421,0.882676,0.886363,0.826919,0.845619
7,0.1275,0.407252,0.883593,0.890492,0.826072,0.847066
8,0.0981,0.430072,0.885426,0.891419,0.827318,0.848258
9,0.0807,0.439476,0.882676,0.863093,0.825662,0.839507
10,0.0676,0.435804,0.886343,0.894607,0.827391,0.850008


[I 2025-03-27 20:38:39,513] Trial 85 finished with value: 0.8519985040707212 and parameters: {'learning_rate': 0.00039061363634075065, 'weight_decay': 0.007, 'warmup_steps': 2}. Best is trial 30 with value: 0.8596597868624403.


Trial 86 with params: {'learning_rate': 4.0534446710776905e-05, 'weight_decay': 0.01, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7506,1.685413,0.284143,0.184803,0.210744,0.166989
2,1.6665,1.613001,0.408799,0.201439,0.309055,0.233599
3,1.5958,1.521325,0.430797,0.348399,0.328864,0.259779
4,1.5027,1.42319,0.528873,0.528282,0.425585,0.403362
5,1.4073,1.328311,0.612282,0.568714,0.51068,0.509733


[I 2025-03-27 20:39:09,988] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.00046314539247556066, 'weight_decay': 0.001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5173,1.091861,0.713107,0.627536,0.606543,0.611004
2,0.8158,0.577959,0.84143,0.713118,0.717552,0.714952
3,0.4207,0.453481,0.857929,0.723503,0.731564,0.726993
4,0.2569,0.421603,0.879927,0.907133,0.795168,0.818715
5,0.1845,0.400262,0.890009,0.895295,0.831661,0.852095
6,0.1351,0.423062,0.887259,0.891447,0.829982,0.849385
7,0.0993,0.434583,0.880843,0.889504,0.824273,0.845107
8,0.0753,0.447843,0.88176,0.880384,0.842652,0.85752
9,0.0588,0.4833,0.872594,0.872089,0.835768,0.850033
10,0.055,0.466732,0.890009,0.888689,0.858302,0.870884


[I 2025-03-27 20:40:38,485] Trial 87 finished with value: 0.8633510888244157 and parameters: {'learning_rate': 0.00046314539247556066, 'weight_decay': 0.001, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 88 with params: {'learning_rate': 0.0004404619115139071, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5286,1.124902,0.706691,0.617999,0.601095,0.603896
2,0.8409,0.592379,0.835014,0.705265,0.713489,0.709177
3,0.4346,0.459039,0.858845,0.723702,0.732397,0.727665
4,0.2688,0.42097,0.878093,0.905721,0.793294,0.817131
5,0.1936,0.401635,0.887259,0.891777,0.829692,0.849549
6,0.1412,0.422242,0.885426,0.890137,0.828641,0.8482
7,0.1078,0.430187,0.87901,0.885583,0.823871,0.84302
8,0.0803,0.450122,0.880843,0.891558,0.832743,0.853397
9,0.0671,0.470099,0.874427,0.859472,0.828575,0.84046
10,0.0561,0.450393,0.885426,0.883513,0.845117,0.86037


[I 2025-03-27 20:42:07,051] Trial 88 finished with value: 0.8404610412374662 and parameters: {'learning_rate': 0.0004404619115139071, 'weight_decay': 0.0, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 89 with params: {'learning_rate': 0.000462248313342956, 'weight_decay': 0.001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5178,1.092923,0.715857,0.629687,0.608543,0.613186
2,0.8166,0.578136,0.840513,0.712519,0.716837,0.714288
3,0.4211,0.453138,0.857929,0.723503,0.731564,0.726993
4,0.2571,0.421267,0.87901,0.906573,0.794488,0.818072
5,0.1849,0.400011,0.890009,0.895295,0.831661,0.852095
6,0.1352,0.423843,0.887259,0.891424,0.829947,0.849327
7,0.0994,0.434973,0.88176,0.891167,0.825707,0.846088
8,0.0758,0.446405,0.885426,0.883679,0.845319,0.860497
9,0.0589,0.487181,0.875344,0.86355,0.838162,0.848236
10,0.0551,0.464534,0.888176,0.885867,0.857184,0.868694


[I 2025-03-27 20:43:35,649] Trial 89 finished with value: 0.8619061442131133 and parameters: {'learning_rate': 0.000462248313342956, 'weight_decay': 0.001, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 90 with params: {'learning_rate': 0.00044058192010102565, 'weight_decay': 0.003, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5246,1.120954,0.703025,0.626622,0.59512,0.598124
2,0.8422,0.599248,0.829514,0.702365,0.707904,0.704817
3,0.4395,0.472571,0.853346,0.719348,0.727733,0.722675
4,0.2708,0.431323,0.866178,0.89786,0.756148,0.768311
5,0.1977,0.415063,0.88451,0.888834,0.827939,0.846985
6,0.145,0.426869,0.88176,0.88617,0.826123,0.845118
7,0.1077,0.430149,0.88176,0.88753,0.826421,0.845629
8,0.0851,0.445512,0.875344,0.882807,0.819784,0.840114
9,0.0679,0.472602,0.877177,0.869289,0.822416,0.838181
10,0.0535,0.46841,0.87901,0.865987,0.841327,0.851179


[I 2025-03-27 20:44:35,422] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.0002002041939696622, 'weight_decay': 0.0, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.646,1.43512,0.51879,0.51258,0.4096,0.372605
2,1.248,0.997961,0.747938,0.647435,0.639042,0.637286
3,0.851,0.717098,0.816682,0.689606,0.698579,0.693847
4,0.6126,0.588931,0.834097,0.708775,0.709904,0.708306
5,0.4716,0.507698,0.858845,0.723432,0.734575,0.728346
6,0.3759,0.46848,0.864345,0.725531,0.737986,0.731533
7,0.3059,0.438897,0.871677,0.901656,0.752615,0.756883
8,0.2555,0.427621,0.871677,0.900036,0.770965,0.787197
9,0.2261,0.416334,0.880843,0.905959,0.788527,0.807648
10,0.2054,0.414996,0.878093,0.876703,0.794767,0.814064


[I 2025-03-27 20:45:33,651] Trial 91 pruned. 


Trial 92 with params: {'learning_rate': 0.0003882821804924566, 'weight_decay': 0.002, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5458,1.170471,0.671861,0.606047,0.565868,0.570831
2,0.9028,0.646062,0.823098,0.696256,0.702638,0.699217
3,0.4844,0.489168,0.851512,0.717564,0.72568,0.720928
4,0.3086,0.429679,0.864345,0.730883,0.73528,0.732403
5,0.2257,0.414068,0.88451,0.887288,0.818644,0.838866


[I 2025-03-27 20:46:02,236] Trial 92 pruned. 


Trial 93 with params: {'learning_rate': 0.0003904215188391064, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5451,1.168406,0.672777,0.607402,0.566866,0.57205
2,0.9,0.64361,0.824931,0.697498,0.704236,0.70064
3,0.4822,0.488042,0.851512,0.717564,0.72568,0.720928
4,0.3071,0.429831,0.867094,0.73261,0.737822,0.734658
5,0.2243,0.414256,0.883593,0.888487,0.826802,0.846774
6,0.1629,0.417286,0.882676,0.886628,0.827051,0.845811
7,0.1301,0.421666,0.88451,0.890733,0.827472,0.847249
8,0.1023,0.421899,0.883593,0.889236,0.826298,0.84673
9,0.0834,0.439348,0.882676,0.874572,0.825261,0.842373
10,0.0711,0.430706,0.890926,0.896846,0.830876,0.852922


[I 2025-03-27 20:47:31,202] Trial 93 finished with value: 0.8466961578186907 and parameters: {'learning_rate': 0.0003904215188391064, 'weight_decay': 0.0, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 94 with params: {'learning_rate': 0.0004320321183455313, 'weight_decay': 0.0, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5282,1.129813,0.696609,0.622455,0.58946,0.592083
2,0.8515,0.605424,0.828598,0.701002,0.707568,0.704052
3,0.4459,0.475583,0.853346,0.719299,0.728329,0.722864
4,0.2762,0.430732,0.869844,0.901348,0.758192,0.77113
5,0.2022,0.412256,0.885426,0.890239,0.828323,0.847977
6,0.148,0.428032,0.88176,0.885975,0.826719,0.84525
7,0.1094,0.425697,0.880843,0.886819,0.825426,0.844905
8,0.087,0.4504,0.875344,0.883042,0.820006,0.840118
9,0.0699,0.464574,0.877177,0.869856,0.822117,0.838435
10,0.0563,0.458357,0.883593,0.87704,0.835192,0.850764


[I 2025-03-27 20:48:30,857] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 0.00029684930539492687, 'weight_decay': 0.002, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.586,1.274467,0.623281,0.573982,0.514977,0.513825
2,1.0382,0.766987,0.799267,0.678499,0.682764,0.680277
3,0.608,0.538761,0.847846,0.717943,0.721743,0.719253
4,0.4037,0.465377,0.859762,0.729144,0.7305,0.728799
5,0.3024,0.437114,0.874427,0.868034,0.782534,0.799393
6,0.2329,0.428515,0.87626,0.88097,0.811841,0.832126
7,0.1892,0.412072,0.878093,0.888283,0.820272,0.843193
8,0.1472,0.420352,0.87901,0.886979,0.822322,0.843598
9,0.1252,0.41788,0.880843,0.886309,0.824313,0.844393
10,0.1073,0.422738,0.883593,0.888839,0.826322,0.846785


[I 2025-03-27 20:49:31,781] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 0.000389648791707796, 'weight_decay': 0.002, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5453,1.169174,0.671861,0.606047,0.565868,0.570831
2,0.901,0.644405,0.824931,0.697498,0.704236,0.70064
3,0.4829,0.488392,0.851512,0.717564,0.72568,0.720928
4,0.3076,0.42959,0.866178,0.731913,0.736891,0.733813
5,0.2248,0.413955,0.883593,0.88872,0.827133,0.847031
6,0.1635,0.417246,0.882676,0.886348,0.8271,0.845614
7,0.1305,0.419769,0.885426,0.891542,0.828007,0.848091
8,0.1023,0.42276,0.883593,0.889099,0.826033,0.846493
9,0.0833,0.43978,0.883593,0.875246,0.826192,0.843183
10,0.0712,0.430265,0.890926,0.896846,0.830876,0.852922


[I 2025-03-27 20:50:28,384] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.0004824699420548387, 'weight_decay': 0.002, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5101,1.074326,0.716774,0.628304,0.609952,0.612515
2,0.8013,0.567402,0.83868,0.711413,0.715124,0.712832
3,0.4097,0.453726,0.857929,0.723498,0.730374,0.72636
4,0.2503,0.425126,0.873511,0.901886,0.781431,0.801812
5,0.177,0.408624,0.887259,0.891146,0.82964,0.84905
6,0.1307,0.436411,0.877177,0.881907,0.822829,0.84091
7,0.1029,0.428395,0.87901,0.885976,0.823297,0.843635
8,0.0717,0.453478,0.883593,0.879715,0.835373,0.85184
9,0.0551,0.480375,0.875344,0.851038,0.829372,0.838021
10,0.0489,0.472851,0.890009,0.889675,0.868164,0.877132


[I 2025-03-27 20:51:56,131] Trial 97 finished with value: 0.8539261539863684 and parameters: {'learning_rate': 0.0004824699420548387, 'weight_decay': 0.002, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 98 with params: {'learning_rate': 0.00016290656340871634, 'weight_decay': 0.006, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.669,1.501648,0.447296,0.349264,0.341949,0.279604
2,1.344,1.12469,0.702108,0.617651,0.598871,0.596923
3,0.9902,0.84433,0.780018,0.668056,0.666963,0.665108
4,0.7454,0.684566,0.816682,0.693581,0.695846,0.693799
5,0.5828,0.573623,0.847846,0.714099,0.72485,0.719171
6,0.4669,0.513531,0.855179,0.7191,0.729687,0.724156
7,0.3862,0.476212,0.858845,0.723913,0.733322,0.728344
8,0.3282,0.458376,0.867094,0.897228,0.748242,0.752835
9,0.2927,0.439933,0.872594,0.73358,0.745073,0.739196
10,0.2658,0.43671,0.875344,0.901994,0.764765,0.775239


[I 2025-03-27 20:52:55,415] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.0004556985266557911, 'weight_decay': 0.004, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5184,1.102727,0.711274,0.624639,0.605283,0.609718
2,0.8281,0.58819,0.83868,0.710429,0.715674,0.712672
3,0.4295,0.460325,0.859762,0.724724,0.732868,0.728126
4,0.2641,0.421577,0.873511,0.903473,0.762718,0.774552
5,0.1912,0.401545,0.887259,0.891538,0.830072,0.849669
6,0.1367,0.419694,0.890009,0.893953,0.832489,0.852197
7,0.1067,0.427351,0.882676,0.889565,0.827299,0.846751
8,0.0815,0.442314,0.88176,0.890357,0.834062,0.853628
9,0.0639,0.470301,0.874427,0.852206,0.828462,0.838316
10,0.059,0.449069,0.885426,0.868859,0.836343,0.849243


[I 2025-03-27 20:53:54,840] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.0002771884677853293, 'weight_decay': 0.0, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.594,1.301491,0.605866,0.560395,0.496731,0.492869
2,1.0757,0.803995,0.794684,0.678372,0.677842,0.677279
3,0.6449,0.561124,0.848763,0.718123,0.722938,0.720014
4,0.4333,0.478756,0.862511,0.731334,0.733259,0.731379
5,0.3251,0.445792,0.874427,0.867751,0.782817,0.799375


[I 2025-03-27 20:54:24,735] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 0.00021307550543530693, 'weight_decay': 0.007, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6336,1.404477,0.536205,0.532151,0.426777,0.393787
2,1.2101,0.953345,0.751604,0.650418,0.641911,0.641877
3,0.8052,0.678833,0.824015,0.695121,0.704902,0.699748
4,0.5722,0.560636,0.83868,0.711566,0.713846,0.711734
5,0.4385,0.490317,0.860678,0.724266,0.73597,0.729579
6,0.3481,0.456571,0.869844,0.897192,0.751517,0.75441
7,0.2831,0.430025,0.87626,0.904843,0.783589,0.804135
8,0.2358,0.419042,0.880843,0.879781,0.796301,0.81642
9,0.2067,0.411355,0.887259,0.889932,0.820682,0.841276
10,0.1875,0.411053,0.87626,0.880946,0.811179,0.832011


[I 2025-03-27 20:55:23,377] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 0.00037573399019263237, 'weight_decay': 0.008, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5537,1.192037,0.668194,0.60557,0.561462,0.567096
2,0.9305,0.662779,0.827681,0.70174,0.707481,0.704316
3,0.5008,0.484776,0.858845,0.722642,0.733004,0.727626
4,0.3183,0.434418,0.859762,0.896653,0.740258,0.747719
5,0.2342,0.414569,0.880843,0.878152,0.797484,0.816157


[I 2025-03-27 20:55:53,205] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 0.0003894244139606194, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5454,1.169369,0.671861,0.606047,0.565868,0.570831
2,0.9013,0.644591,0.824931,0.697498,0.704236,0.70064
3,0.4831,0.488597,0.851512,0.717564,0.72568,0.720928
4,0.3078,0.429535,0.866178,0.731913,0.736891,0.733813
5,0.2249,0.414048,0.88451,0.889417,0.828064,0.847829
6,0.1637,0.417567,0.88176,0.885382,0.826482,0.844724
7,0.1307,0.418699,0.885426,0.891542,0.828007,0.848091
8,0.1023,0.423269,0.882676,0.888444,0.825318,0.845798
9,0.0834,0.4403,0.883593,0.87521,0.826192,0.843154
10,0.0714,0.430226,0.890926,0.896361,0.83114,0.852836


[I 2025-03-27 20:57:22,052] Trial 103 finished with value: 0.8468788817433283 and parameters: {'learning_rate': 0.0003894244139606194, 'weight_decay': 0.0, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 104 with params: {'learning_rate': 1.1873161138364599e-05, 'weight_decay': 0.006, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7984,1.765528,0.226398,0.190838,0.205628,0.145907
2,1.7518,1.719423,0.266728,0.353711,0.200257,0.155983
3,1.7188,1.692924,0.311641,0.353852,0.232191,0.190089
4,1.6942,1.67222,0.329056,0.172432,0.247186,0.201857
5,1.6761,1.654223,0.36022,0.180254,0.271466,0.216609


[I 2025-03-27 20:57:51,689] Trial 104 pruned. 


Trial 105 with params: {'learning_rate': 0.0004359006533906339, 'weight_decay': 0.007, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5267,1.126219,0.699358,0.624168,0.591791,0.594606
2,0.8475,0.602614,0.827681,0.700471,0.70657,0.703262
3,0.4431,0.474347,0.854262,0.719968,0.728996,0.723575
4,0.2736,0.43098,0.868928,0.900828,0.758175,0.770813
5,0.2,0.413447,0.883593,0.888374,0.826941,0.846313
6,0.1466,0.426216,0.883593,0.887613,0.827783,0.846665
7,0.1086,0.429116,0.883593,0.889188,0.827803,0.847188
8,0.0868,0.446956,0.875344,0.882894,0.820041,0.840168
9,0.0698,0.46469,0.877177,0.870092,0.822014,0.838363
10,0.0553,0.46195,0.883593,0.877182,0.835359,0.850826


[I 2025-03-27 20:59:18,002] Trial 105 finished with value: 0.8491474798561294 and parameters: {'learning_rate': 0.0004359006533906339, 'weight_decay': 0.007, 'warmup_steps': 4}. Best is trial 87 with value: 0.8633510888244157.


Trial 106 with params: {'learning_rate': 0.00040405952414026, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5428,1.162068,0.681027,0.607914,0.57489,0.580385
2,0.8937,0.632944,0.829514,0.701474,0.708513,0.704786
3,0.4741,0.473203,0.857012,0.721108,0.73094,0.72568
4,0.2983,0.427664,0.864345,0.731695,0.735223,0.732793
5,0.2149,0.407641,0.882676,0.888156,0.82607,0.846229
6,0.1546,0.421263,0.880843,0.885258,0.825175,0.84419
7,0.1238,0.408892,0.887259,0.893033,0.82924,0.849873
8,0.0917,0.442045,0.87901,0.886511,0.822443,0.843202
9,0.0765,0.457642,0.87626,0.856958,0.821515,0.834227
10,0.063,0.446021,0.885426,0.878898,0.826121,0.84514


[I 2025-03-27 21:00:17,943] Trial 106 pruned. 


Trial 107 with params: {'learning_rate': 0.00044842638971372167, 'weight_decay': 0.0, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5218,1.111346,0.705775,0.619919,0.599892,0.60445
2,0.8363,0.593258,0.835014,0.70737,0.712878,0.709759
3,0.4347,0.461849,0.857929,0.723372,0.731521,0.726797
4,0.2677,0.420209,0.874427,0.903997,0.762749,0.774914
5,0.1928,0.400945,0.889093,0.892757,0.831551,0.851131
6,0.1374,0.415932,0.888176,0.89213,0.830794,0.850385
7,0.1058,0.43275,0.88451,0.892185,0.828091,0.848228
8,0.0823,0.443323,0.879927,0.88743,0.823216,0.843982
9,0.067,0.463154,0.877177,0.866064,0.839405,0.850438
10,0.0577,0.446121,0.887259,0.88499,0.846157,0.861855


[I 2025-03-27 21:01:49,747] Trial 107 finished with value: 0.8512715671172179 and parameters: {'learning_rate': 0.00044842638971372167, 'weight_decay': 0.0, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 108 with params: {'learning_rate': 6.45937609014475e-05, 'weight_decay': 0.003, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7281,1.647916,0.333639,0.186943,0.250493,0.206913
2,1.6107,1.517185,0.44088,0.377243,0.336197,0.272433
3,1.4725,1.361635,0.560037,0.534935,0.457557,0.442964
4,1.3122,1.209346,0.678277,0.610793,0.57508,0.5777
5,1.1697,1.080645,0.719523,0.622266,0.616744,0.612834
6,1.0456,0.979996,0.737855,0.633055,0.632356,0.627558
7,0.95,0.901148,0.762603,0.65218,0.652593,0.650135
8,0.8713,0.837905,0.782768,0.665157,0.670022,0.666331
9,0.8042,0.793221,0.791017,0.674147,0.67738,0.674185
10,0.7577,0.755557,0.804766,0.680764,0.689551,0.684496


[I 2025-03-27 21:02:47,136] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.0004960056561230302, 'weight_decay': 0.001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5043,1.061391,0.721357,0.633364,0.614165,0.616846
2,0.7892,0.560386,0.840513,0.712648,0.716753,0.714291
3,0.4001,0.45127,0.857929,0.723354,0.730805,0.726616
4,0.2442,0.428151,0.869844,0.898881,0.787638,0.810734
5,0.1737,0.408734,0.883593,0.886796,0.82723,0.845985
6,0.1241,0.440594,0.880843,0.884758,0.825307,0.843907
7,0.0988,0.433964,0.882676,0.890636,0.826108,0.847054
8,0.0698,0.463205,0.886343,0.882076,0.847719,0.860735
9,0.0556,0.491013,0.874427,0.851507,0.828309,0.837717
10,0.0477,0.492135,0.87901,0.872966,0.859093,0.864796


[I 2025-03-27 21:04:13,098] Trial 109 finished with value: 0.8498166636046607 and parameters: {'learning_rate': 0.0004960056561230302, 'weight_decay': 0.001, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 110 with params: {'learning_rate': 0.0003642276755634042, 'weight_decay': 0.007, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.556,1.196221,0.659028,0.598357,0.554202,0.558238
2,0.9362,0.673542,0.813016,0.689244,0.694417,0.691593
3,0.511,0.498236,0.854262,0.719808,0.728276,0.72349
4,0.3277,0.436404,0.861595,0.731561,0.731792,0.730584
5,0.2452,0.419222,0.88176,0.885901,0.81638,0.837058


[I 2025-03-27 21:04:42,377] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 0.0004230388731007733, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5355,1.145986,0.690192,0.605279,0.585536,0.588402
2,0.8623,0.610237,0.829514,0.700339,0.709262,0.7046
3,0.4512,0.471339,0.855179,0.720706,0.729383,0.724345
4,0.2815,0.423637,0.873511,0.90336,0.770831,0.788543
5,0.2015,0.403465,0.890009,0.894456,0.831597,0.852012
6,0.1489,0.417638,0.88451,0.875191,0.828308,0.844296
7,0.1163,0.420271,0.88451,0.889503,0.828101,0.847648
8,0.086,0.440455,0.87901,0.887564,0.821666,0.843082
9,0.0713,0.455061,0.879927,0.864351,0.832845,0.844962
10,0.059,0.439977,0.892759,0.900776,0.85063,0.869591


[I 2025-03-27 21:06:10,547] Trial 111 finished with value: 0.8518590808401999 and parameters: {'learning_rate': 0.0004230388731007733, 'weight_decay': 0.01, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 112 with params: {'learning_rate': 0.00036916231759675535, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5539,1.190663,0.661778,0.59957,0.557196,0.561324
2,0.9293,0.668027,0.813932,0.689915,0.694753,0.692055
3,0.5054,0.49668,0.853346,0.719206,0.727344,0.722652
4,0.3237,0.435022,0.865261,0.734072,0.735135,0.73355
5,0.2411,0.418813,0.882676,0.886657,0.817046,0.837767


[I 2025-03-27 21:06:41,161] Trial 112 pruned. 


Trial 113 with params: {'learning_rate': 0.00047904000496580123, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5082,1.076316,0.721357,0.635186,0.613655,0.618435
2,0.8037,0.572507,0.842346,0.71267,0.718985,0.715454
3,0.4134,0.449669,0.855179,0.721142,0.728916,0.724461
4,0.2529,0.419134,0.877177,0.904746,0.793217,0.816508
5,0.1815,0.408182,0.883593,0.887156,0.827653,0.846088
6,0.1306,0.436491,0.880843,0.887147,0.825566,0.845064
7,0.1,0.428245,0.874427,0.868292,0.819406,0.836429
8,0.0723,0.454982,0.88176,0.878121,0.833889,0.850177
9,0.0589,0.462804,0.877177,0.85717,0.839485,0.847083
10,0.0528,0.45354,0.886343,0.874875,0.845587,0.858


[I 2025-03-27 21:08:08,158] Trial 113 finished with value: 0.8525657627805111 and parameters: {'learning_rate': 0.00047904000496580123, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 114 with params: {'learning_rate': 0.00040779934341471456, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5396,1.155546,0.68011,0.605879,0.575301,0.580208
2,0.8789,0.625226,0.829514,0.700415,0.709028,0.704525
3,0.465,0.477241,0.853346,0.719234,0.72704,0.722396
4,0.2931,0.427038,0.867094,0.732542,0.737808,0.734548
5,0.2118,0.409006,0.885426,0.889736,0.828348,0.848126
6,0.1554,0.413924,0.885426,0.875508,0.828967,0.844907
7,0.1208,0.421078,0.887259,0.892697,0.829893,0.849987
8,0.0925,0.430338,0.886343,0.891596,0.828403,0.848845
9,0.0744,0.451855,0.88451,0.874802,0.828179,0.843888
10,0.0643,0.442371,0.88451,0.890678,0.826349,0.847561


[I 2025-03-27 21:09:05,046] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.00019413469526554405, 'weight_decay': 0.01, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6422,1.43366,0.516957,0.512303,0.407411,0.370404
2,1.251,1.00545,0.747021,0.648553,0.638327,0.637019
3,0.8626,0.729221,0.813016,0.687774,0.695767,0.691296
4,0.627,0.596515,0.833181,0.70672,0.709321,0.707203
5,0.4836,0.513475,0.860678,0.724139,0.736019,0.72958
6,0.3856,0.471469,0.862511,0.724235,0.73659,0.730162
7,0.3151,0.440878,0.872594,0.902411,0.75333,0.757623
8,0.2648,0.42935,0.872594,0.901064,0.771381,0.787904
9,0.2339,0.418226,0.87901,0.904711,0.768655,0.77849
10,0.2126,0.416706,0.878093,0.876917,0.7944,0.814013


[I 2025-03-27 21:10:04,392] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.00048693494077293005, 'weight_decay': 0.007, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5049,1.068724,0.730522,0.641798,0.621294,0.625494
2,0.7962,0.568332,0.839597,0.709663,0.716888,0.712982
3,0.4077,0.446192,0.858845,0.724423,0.731402,0.727356
4,0.2491,0.420974,0.875344,0.902555,0.792218,0.814862
5,0.178,0.411327,0.885426,0.888882,0.828735,0.847505
6,0.1314,0.439584,0.87626,0.883592,0.82162,0.841107
7,0.0988,0.427356,0.88176,0.874768,0.825074,0.842418
8,0.0726,0.457526,0.879927,0.8758,0.832723,0.848278
9,0.06,0.466808,0.87626,0.856124,0.839326,0.846272
10,0.05,0.465887,0.888176,0.876431,0.847409,0.859573


[I 2025-03-27 21:11:31,763] Trial 116 finished with value: 0.8584548813629858 and parameters: {'learning_rate': 0.00048693494077293005, 'weight_decay': 0.007, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 117 with params: {'learning_rate': 0.0004439046341805483, 'weight_decay': 0.006, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5239,1.116824,0.703941,0.618797,0.59851,0.603215
2,0.8416,0.596731,0.833181,0.705989,0.711312,0.708333
3,0.4382,0.463403,0.857929,0.724023,0.731076,0.726779
4,0.2702,0.420337,0.873511,0.903489,0.762047,0.77435
5,0.1941,0.400946,0.887259,0.891423,0.830204,0.849782
6,0.1385,0.414955,0.887259,0.891352,0.830079,0.849575
7,0.1068,0.434155,0.88176,0.889469,0.825757,0.845717
8,0.0829,0.441839,0.878093,0.886534,0.821786,0.842705
9,0.0662,0.461063,0.877177,0.865457,0.839865,0.850332
10,0.0582,0.440952,0.885426,0.882677,0.84504,0.860174


[I 2025-03-27 21:12:58,867] Trial 117 finished with value: 0.8503371710185962 and parameters: {'learning_rate': 0.0004439046341805483, 'weight_decay': 0.006, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 118 with params: {'learning_rate': 0.00016881903214673732, 'weight_decay': 0.002, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6621,1.486334,0.463795,0.359023,0.357046,0.304045
2,1.3236,1.098785,0.71494,0.625897,0.611186,0.608264
3,0.9622,0.818524,0.792851,0.676698,0.678166,0.675782
4,0.7193,0.664123,0.819432,0.695843,0.698129,0.696126
5,0.561,0.559487,0.852429,0.718144,0.728563,0.722956
6,0.4492,0.503576,0.857929,0.720661,0.732547,0.726389
7,0.3705,0.467214,0.860678,0.725028,0.735,0.729776
8,0.3143,0.451608,0.867094,0.897772,0.747862,0.752854
9,0.2798,0.434418,0.872594,0.73375,0.745038,0.739282
10,0.2542,0.431841,0.873511,0.900867,0.763404,0.773965


[I 2025-03-27 21:14:01,840] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.00020887867346149568, 'weight_decay': 0.007, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6329,1.406109,0.535289,0.531113,0.425805,0.393569
2,1.2149,0.96111,0.752521,0.652165,0.642675,0.642714
3,0.8149,0.68807,0.821265,0.693423,0.702937,0.697854
4,0.5826,0.566229,0.83868,0.712162,0.7139,0.711969
5,0.4471,0.494285,0.860678,0.724266,0.73597,0.729579


[I 2025-03-27 21:14:31,380] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 0.0004482853112786817, 'weight_decay': 0.005, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5218,1.111472,0.705775,0.619919,0.599892,0.60445
2,0.8364,0.59329,0.835014,0.70737,0.712878,0.709759
3,0.4348,0.461928,0.857929,0.723462,0.731305,0.72667
4,0.2678,0.420156,0.875344,0.904558,0.763747,0.7757
5,0.1928,0.40065,0.889093,0.892757,0.831551,0.851131
6,0.1375,0.415918,0.890009,0.893866,0.832176,0.851942
7,0.1058,0.433845,0.88451,0.892381,0.828374,0.848354
8,0.082,0.442856,0.877177,0.885814,0.821168,0.842108
9,0.0664,0.463602,0.875344,0.864193,0.837856,0.848761
10,0.0567,0.444454,0.88451,0.872491,0.84406,0.85605


[I 2025-03-27 21:15:57,628] Trial 120 finished with value: 0.8492651508983983 and parameters: {'learning_rate': 0.0004482853112786817, 'weight_decay': 0.005, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 121 with params: {'learning_rate': 0.00048809847248262005, 'weight_decay': 0.007, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5077,1.068736,0.716774,0.628535,0.610337,0.612864
2,0.7961,0.564197,0.83868,0.71143,0.715406,0.712994
3,0.4057,0.453139,0.857929,0.723435,0.730589,0.726486
4,0.2479,0.426705,0.872594,0.900904,0.789902,0.812954
5,0.1756,0.409362,0.885426,0.889241,0.828417,0.847652
6,0.1274,0.438823,0.878093,0.882337,0.823475,0.841523
7,0.1025,0.430159,0.879927,0.873108,0.823646,0.841041
8,0.071,0.457157,0.882676,0.877609,0.835623,0.850978
9,0.0543,0.492945,0.870761,0.847152,0.826561,0.834161
10,0.0491,0.477457,0.887259,0.87934,0.865021,0.871157


[I 2025-03-27 21:17:25,741] Trial 121 finished with value: 0.8535503733709922 and parameters: {'learning_rate': 0.00048809847248262005, 'weight_decay': 0.007, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 122 with params: {'learning_rate': 0.00011784240005609097, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6938,1.573318,0.439047,0.219459,0.330069,0.263296
2,1.4663,1.297251,0.632447,0.587034,0.529721,0.528199
3,1.1923,1.055907,0.719523,0.638261,0.616106,0.614223
4,0.959,0.861028,0.76077,0.652886,0.650702,0.647719
5,0.7824,0.723844,0.816682,0.687813,0.700683,0.693895
6,0.6455,0.634073,0.834097,0.703699,0.71265,0.707923
7,0.5508,0.572842,0.848763,0.716793,0.725018,0.720593
8,0.4787,0.537546,0.853346,0.720164,0.727591,0.723638
9,0.4311,0.514357,0.858845,0.725316,0.732039,0.728342
10,0.3945,0.49623,0.861595,0.723976,0.736199,0.729937


[I 2025-03-27 21:18:25,175] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.00047470780916174156, 'weight_decay': 0.004, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5132,1.081876,0.71494,0.629436,0.608653,0.612707
2,0.8081,0.572651,0.837764,0.710451,0.71391,0.711724
3,0.4152,0.454896,0.857929,0.723399,0.730687,0.726438
4,0.253,0.42506,0.875344,0.903459,0.782765,0.803249
5,0.18,0.405887,0.885426,0.890459,0.828279,0.848027
6,0.1328,0.426269,0.885426,0.888541,0.828835,0.847473
7,0.0991,0.418841,0.88451,0.890489,0.827801,0.848143
8,0.0715,0.446623,0.882676,0.878191,0.834082,0.850652
9,0.0557,0.471832,0.88176,0.87885,0.843107,0.857012
10,0.0505,0.473126,0.891842,0.89122,0.869497,0.878539


[I 2025-03-27 21:19:53,597] Trial 123 finished with value: 0.8512576676556266 and parameters: {'learning_rate': 0.00047470780916174156, 'weight_decay': 0.004, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 124 with params: {'learning_rate': 0.0004781939074722696, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5118,1.078467,0.715857,0.628971,0.609153,0.612493
2,0.8051,0.570412,0.837764,0.710451,0.71391,0.711724
3,0.4128,0.454577,0.858845,0.724072,0.731353,0.727135
4,0.2517,0.424754,0.874427,0.902682,0.782098,0.802543
5,0.1784,0.406941,0.885426,0.889821,0.828029,0.847601
6,0.1318,0.430949,0.883593,0.887185,0.827607,0.846079
7,0.0993,0.420748,0.882676,0.888848,0.826503,0.846739
8,0.071,0.451376,0.882676,0.878384,0.834222,0.850792
9,0.0551,0.472937,0.874427,0.871024,0.827968,0.843953
10,0.0497,0.46481,0.890926,0.881717,0.868866,0.87427


[I 2025-03-27 21:21:21,159] Trial 124 finished with value: 0.851774975846729 and parameters: {'learning_rate': 0.0004781939074722696, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 125 with params: {'learning_rate': 0.00046750866934690245, 'weight_decay': 0.007, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5131,1.08857,0.718607,0.631561,0.61108,0.616036
2,0.8149,0.579432,0.840513,0.711049,0.717285,0.7138
3,0.4212,0.455499,0.859762,0.724652,0.732806,0.728188
4,0.258,0.419396,0.877177,0.905656,0.783686,0.804822
5,0.1861,0.402386,0.887259,0.890921,0.830368,0.849438
6,0.1321,0.423403,0.885426,0.889681,0.828781,0.848056
7,0.1015,0.428569,0.883593,0.88939,0.827813,0.847138
8,0.0764,0.447194,0.87901,0.875657,0.831701,0.847781
9,0.0596,0.471939,0.873511,0.85308,0.836863,0.843698
10,0.055,0.45905,0.886343,0.87172,0.836981,0.850791


[I 2025-03-27 21:22:18,664] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.0004919540858634305, 'weight_decay': 0.001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5061,1.065142,0.72044,0.631506,0.613763,0.615986
2,0.7927,0.562272,0.839597,0.711938,0.716086,0.713618
3,0.4029,0.452496,0.857012,0.722773,0.729874,0.725804
4,0.2462,0.427488,0.869844,0.898785,0.787902,0.810864
5,0.1747,0.408744,0.88451,0.887791,0.827848,0.846751
6,0.125,0.439241,0.87901,0.883259,0.823939,0.842436
7,0.0992,0.428438,0.880843,0.874164,0.824002,0.841696
8,0.0698,0.46486,0.88451,0.877879,0.83783,0.852155
9,0.0553,0.497565,0.870761,0.848158,0.825946,0.834659
10,0.0487,0.484411,0.887259,0.888275,0.865853,0.874823


[I 2025-03-27 21:23:46,333] Trial 126 finished with value: 0.8525439379312719 and parameters: {'learning_rate': 0.0004919540858634305, 'weight_decay': 0.001, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 127 with params: {'learning_rate': 0.00046420145387501377, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5145,1.092594,0.715857,0.628304,0.609261,0.613782
2,0.8186,0.582004,0.839597,0.710929,0.716389,0.713243
3,0.4235,0.457188,0.859762,0.724712,0.732555,0.728056
4,0.2597,0.420413,0.877177,0.905656,0.783686,0.804822
5,0.1877,0.401652,0.888176,0.892277,0.830703,0.850328
6,0.1333,0.42205,0.888176,0.892563,0.830727,0.850524
7,0.103,0.428562,0.879927,0.887029,0.82507,0.844524
8,0.0788,0.441289,0.879927,0.888689,0.832764,0.852198
9,0.0601,0.477488,0.871677,0.852059,0.835572,0.84249
10,0.0584,0.458769,0.885426,0.871862,0.836001,0.85034


[I 2025-03-27 21:24:42,719] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 0.0002204901775755416, 'weight_decay': 0.008, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6223,1.381128,0.553621,0.56005,0.442931,0.414965
2,1.1865,0.928458,0.758937,0.655521,0.64794,0.648367
3,0.78,0.6607,0.829514,0.701791,0.708484,0.704615
4,0.5506,0.545699,0.845096,0.716485,0.719422,0.717094
5,0.4205,0.482528,0.864345,0.726989,0.738748,0.732413


[I 2025-03-27 21:25:12,427] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 0.00021713473000878515, 'weight_decay': 0.01, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6354,1.404943,0.537122,0.531315,0.427161,0.393286
2,1.208,0.948012,0.758937,0.654889,0.648055,0.64805
3,0.7976,0.671386,0.826764,0.697448,0.706902,0.70191
4,0.5632,0.555712,0.839597,0.711644,0.714826,0.712346
5,0.4312,0.48723,0.862511,0.725978,0.737366,0.731158
6,0.3417,0.455502,0.867094,0.895231,0.749441,0.7523
7,0.2774,0.429561,0.875344,0.871404,0.782874,0.800817
8,0.2302,0.418356,0.880843,0.88381,0.805425,0.826971
9,0.2021,0.411574,0.886343,0.88911,0.820001,0.840496
10,0.183,0.411016,0.878093,0.88245,0.812777,0.833508


[I 2025-03-27 21:26:09,274] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.00048250540295786704, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5102,1.074365,0.716774,0.628391,0.609903,0.612664
2,0.8014,0.567347,0.83868,0.711413,0.715124,0.712832
3,0.4097,0.453702,0.857929,0.723498,0.730374,0.72636
4,0.2503,0.425216,0.873511,0.901886,0.781431,0.801812
5,0.1769,0.408651,0.888176,0.891706,0.83032,0.849694
6,0.1307,0.436575,0.877177,0.881907,0.822829,0.84091
7,0.1029,0.429017,0.87901,0.88622,0.823345,0.843797
8,0.0718,0.453642,0.883593,0.879715,0.835373,0.85184
9,0.0552,0.481191,0.875344,0.851024,0.829623,0.838141
10,0.0491,0.471901,0.890009,0.890258,0.86815,0.877405


[I 2025-03-27 21:27:35,947] Trial 130 finished with value: 0.8539261539863684 and parameters: {'learning_rate': 0.00048250540295786704, 'weight_decay': 0.0, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 131 with params: {'learning_rate': 0.0001874958608811346, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.65,1.452349,0.500458,0.455786,0.392588,0.35076
2,1.2737,1.032805,0.734189,0.639083,0.628033,0.62536
3,0.8903,0.752563,0.804766,0.682044,0.688852,0.684846
4,0.6513,0.614609,0.828598,0.702751,0.705656,0.703455
5,0.5038,0.524867,0.860678,0.724462,0.735803,0.729585


[I 2025-03-27 21:28:06,188] Trial 131 pruned. 


Trial 132 with params: {'learning_rate': 0.0004413211432725232, 'weight_decay': 0.0, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5282,1.123883,0.706691,0.617999,0.601095,0.603896
2,0.8398,0.591577,0.83593,0.705886,0.714204,0.709849
3,0.4339,0.458663,0.859762,0.724382,0.733064,0.728364
4,0.2683,0.420976,0.878093,0.905721,0.793294,0.817131
5,0.1932,0.401821,0.887259,0.891777,0.829692,0.849549
6,0.1411,0.422612,0.885426,0.890137,0.828641,0.8482
7,0.1076,0.429856,0.878093,0.884851,0.823204,0.842329
8,0.0801,0.451533,0.879927,0.890948,0.832028,0.852698
9,0.0672,0.47047,0.874427,0.859415,0.828508,0.840451
10,0.0561,0.450522,0.885426,0.883513,0.845117,0.86037


[I 2025-03-27 21:29:35,942] Trial 132 finished with value: 0.8404610412374662 and parameters: {'learning_rate': 0.0004413211432725232, 'weight_decay': 0.0, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 133 with params: {'learning_rate': 0.00045921701846223396, 'weight_decay': 0.002, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5168,1.098418,0.711274,0.623957,0.605283,0.609464
2,0.8242,0.585631,0.840513,0.711505,0.717056,0.713887
3,0.427,0.458943,0.858845,0.723979,0.732152,0.727433
4,0.2623,0.421373,0.874427,0.90402,0.772522,0.789754
5,0.1901,0.402194,0.886343,0.890937,0.829356,0.848985
6,0.1356,0.421021,0.891842,0.895225,0.833491,0.853267
7,0.1059,0.428465,0.882676,0.889593,0.827334,0.846789
8,0.0819,0.439725,0.87901,0.887828,0.832062,0.851365
9,0.0623,0.473444,0.869844,0.847399,0.825129,0.834217
10,0.0586,0.45187,0.885426,0.869127,0.836553,0.849429


[I 2025-03-27 21:30:33,946] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 2.988130860570449e-05, 'weight_decay': 0.0, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7649,1.702643,0.278643,0.260121,0.206684,0.1617
2,1.6904,1.649349,0.347388,0.174128,0.26243,0.208281
3,1.6425,1.588962,0.415215,0.368725,0.314165,0.240151
4,1.5839,1.523669,0.445463,0.356653,0.341563,0.279831
5,1.5199,1.455642,0.506874,0.520822,0.402139,0.36799
6,1.4552,1.392951,0.558203,0.525202,0.45487,0.441037
7,1.3953,1.337973,0.613199,0.564146,0.51287,0.510418
8,1.3459,1.289312,0.636114,0.578694,0.534468,0.533932
9,1.3001,1.249933,0.668194,0.601387,0.56701,0.566546
10,1.2613,1.214238,0.681943,0.607235,0.580511,0.580429


[I 2025-03-27 21:31:33,520] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 0.00044697488981559994, 'weight_decay': 0.001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5211,1.112334,0.707608,0.63038,0.59973,0.602986
2,0.8341,0.595355,0.829514,0.702563,0.708221,0.705046
3,0.4349,0.469444,0.855179,0.721246,0.729331,0.724544
4,0.2671,0.433097,0.867094,0.898517,0.756814,0.768983
5,0.1941,0.416682,0.882676,0.887404,0.826323,0.845619
6,0.1413,0.431279,0.878093,0.88243,0.823143,0.841662
7,0.1049,0.43312,0.880843,0.887807,0.825093,0.844872
8,0.0819,0.446499,0.87626,0.884475,0.820553,0.841344
9,0.0656,0.469997,0.875344,0.867814,0.821765,0.837234
10,0.0531,0.471796,0.88451,0.876311,0.863682,0.8689


[I 2025-03-27 21:33:01,083] Trial 135 finished with value: 0.8490062517320506 and parameters: {'learning_rate': 0.00044697488981559994, 'weight_decay': 0.001, 'warmup_steps': 4}. Best is trial 87 with value: 0.8633510888244157.


Trial 136 with params: {'learning_rate': 0.0004854157775908582, 'weight_decay': 0.001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5089,1.07143,0.715857,0.627814,0.609272,0.611979
2,0.7987,0.565776,0.839597,0.712381,0.716122,0.713819
3,0.4077,0.453572,0.858845,0.724115,0.731256,0.727179
4,0.2492,0.425959,0.873511,0.901778,0.781431,0.801768
5,0.1761,0.409408,0.885426,0.889417,0.82832,0.847611
6,0.1294,0.438076,0.878093,0.882544,0.823544,0.841612
7,0.1034,0.42872,0.87901,0.871331,0.823063,0.839843
8,0.0719,0.451722,0.882676,0.878534,0.835289,0.851352
9,0.0544,0.486302,0.87626,0.851466,0.830794,0.838624
10,0.0488,0.475444,0.887259,0.877835,0.865599,0.870692


[I 2025-03-27 21:34:31,805] Trial 136 finished with value: 0.8526552750363648 and parameters: {'learning_rate': 0.0004854157775908582, 'weight_decay': 0.001, 'warmup_steps': 3}. Best is trial 87 with value: 0.8633510888244157.


Trial 137 with params: {'learning_rate': 0.0004972310557253398, 'weight_decay': 0.0, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5005,1.059185,0.736022,0.645439,0.626252,0.630003
2,0.7869,0.564531,0.83593,0.707156,0.713577,0.710043
3,0.4016,0.443805,0.862511,0.728121,0.733888,0.730419
4,0.2446,0.422206,0.87626,0.903084,0.792567,0.815366
5,0.1736,0.412606,0.88451,0.887957,0.828284,0.84677
6,0.126,0.446856,0.868928,0.878355,0.815276,0.835287
7,0.0966,0.430517,0.88451,0.891924,0.827984,0.8484
8,0.0705,0.446819,0.886343,0.893832,0.837677,0.857489
9,0.0583,0.481974,0.873511,0.871757,0.837733,0.850449
10,0.0457,0.477527,0.887259,0.88381,0.847251,0.861829


[I 2025-03-27 21:36:02,798] Trial 137 finished with value: 0.8517816428966953 and parameters: {'learning_rate': 0.0004972310557253398, 'weight_decay': 0.0, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 138 with params: {'learning_rate': 0.0004896375697959419, 'weight_decay': 0.006, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5038,1.066197,0.731439,0.642353,0.622292,0.62637
2,0.7937,0.567189,0.837764,0.708376,0.71529,0.711535
3,0.4059,0.445304,0.862511,0.727464,0.73443,0.730374
4,0.2479,0.421758,0.87626,0.903149,0.792899,0.81551
5,0.1768,0.411971,0.88451,0.887467,0.828333,0.846661
6,0.1299,0.441991,0.87901,0.885847,0.823696,0.843386
7,0.0986,0.427927,0.880843,0.888498,0.824805,0.845353
8,0.0704,0.453242,0.88451,0.875483,0.827758,0.844032
9,0.0578,0.479408,0.877177,0.856986,0.840392,0.847035
10,0.0491,0.474058,0.88451,0.871331,0.844276,0.855553


[I 2025-03-27 21:37:33,022] Trial 138 finished with value: 0.8587432987305128 and parameters: {'learning_rate': 0.0004896375697959419, 'weight_decay': 0.006, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 139 with params: {'learning_rate': 0.0004923239208543718, 'weight_decay': 0.005, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4935,1.059524,0.726856,0.642623,0.616795,0.621719
2,0.7897,0.568062,0.836847,0.706657,0.714464,0.710191
3,0.4085,0.453384,0.859762,0.724793,0.73318,0.728496
4,0.2469,0.420543,0.878093,0.905212,0.784818,0.805249
5,0.1739,0.435055,0.87901,0.882743,0.824295,0.842133
6,0.1313,0.435039,0.879927,0.886494,0.823689,0.844043
7,0.0955,0.447663,0.875344,0.883332,0.821253,0.840645
8,0.0708,0.472852,0.870761,0.858034,0.824728,0.837518
9,0.0596,0.480106,0.87901,0.85246,0.833278,0.840735
10,0.0469,0.476988,0.88451,0.871651,0.855182,0.86192


[I 2025-03-27 21:39:04,752] Trial 139 finished with value: 0.8503418993436772 and parameters: {'learning_rate': 0.0004923239208543718, 'weight_decay': 0.005, 'warmup_steps': 1}. Best is trial 87 with value: 0.8633510888244157.


Trial 140 with params: {'learning_rate': 0.0004962940086537099, 'weight_decay': 0.008, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.501,1.060102,0.736022,0.646038,0.626252,0.630199
2,0.7878,0.564952,0.83593,0.707156,0.713577,0.710043
3,0.4022,0.444074,0.863428,0.728733,0.734568,0.73105
4,0.2451,0.422059,0.875344,0.902421,0.791901,0.81468
5,0.174,0.412444,0.88451,0.887513,0.828284,0.846486
6,0.1279,0.448063,0.872594,0.882025,0.817942,0.838404
7,0.0972,0.425361,0.886343,0.89223,0.829243,0.849538
8,0.07,0.447799,0.887259,0.877873,0.829686,0.846312
9,0.0586,0.479793,0.878093,0.865506,0.841032,0.850594
10,0.0448,0.47794,0.890009,0.883122,0.840745,0.856585


[I 2025-03-27 21:40:34,798] Trial 140 finished with value: 0.8547844279499174 and parameters: {'learning_rate': 0.0004962940086537099, 'weight_decay': 0.008, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 141 with params: {'learning_rate': 0.0004413607194608039, 'weight_decay': 0.006, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5252,1.120025,0.702108,0.618309,0.596846,0.601804
2,0.8449,0.598867,0.831347,0.704876,0.709917,0.707042
3,0.4403,0.464369,0.859762,0.725802,0.732458,0.72834
4,0.2718,0.420654,0.873511,0.903886,0.752592,0.758163
5,0.1951,0.402178,0.887259,0.891316,0.830204,0.849741
6,0.1392,0.414682,0.888176,0.891935,0.830759,0.850216
7,0.1081,0.435281,0.879927,0.887832,0.824361,0.844084
8,0.0839,0.44178,0.87901,0.887199,0.82269,0.84347
9,0.0667,0.463975,0.874427,0.863073,0.837601,0.847965
10,0.0586,0.447418,0.887259,0.884497,0.846171,0.861635


[I 2025-03-27 21:42:05,687] Trial 141 finished with value: 0.8520244864678097 and parameters: {'learning_rate': 0.0004413607194608039, 'weight_decay': 0.006, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 142 with params: {'learning_rate': 0.0004452696544872675, 'weight_decay': 0.007, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5155,1.108407,0.709441,0.62889,0.601324,0.606843
2,0.8364,0.593948,0.83593,0.706838,0.71382,0.710087
3,0.4392,0.46296,0.857012,0.722193,0.730915,0.72594
4,0.2724,0.421359,0.873511,0.903502,0.753665,0.758434
5,0.1977,0.417442,0.882676,0.887102,0.826504,0.845803
6,0.1444,0.423732,0.879927,0.884713,0.82501,0.843719
7,0.1091,0.429733,0.879927,0.873007,0.823807,0.840805
8,0.0857,0.456188,0.87626,0.88463,0.820065,0.840881
9,0.0699,0.467623,0.878093,0.885375,0.82268,0.842789
10,0.0572,0.456789,0.877177,0.85755,0.838637,0.846722


[I 2025-03-27 21:43:06,240] Trial 142 pruned. 


Trial 143 with params: {'learning_rate': 0.0004393545779231083, 'weight_decay': 0.007, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5261,1.122015,0.702108,0.618309,0.596846,0.601804
2,0.8472,0.60039,0.831347,0.704876,0.709917,0.707042
3,0.442,0.464823,0.859762,0.725365,0.732506,0.728148
4,0.273,0.420928,0.873511,0.903985,0.752328,0.758037
5,0.1959,0.403298,0.887259,0.891373,0.82994,0.849645
6,0.1398,0.414985,0.889093,0.892684,0.831523,0.851061
7,0.1091,0.434057,0.878093,0.886498,0.822944,0.84262
8,0.0841,0.442163,0.87901,0.88708,0.82269,0.843456
9,0.0666,0.468153,0.877177,0.865037,0.840157,0.850283
10,0.0584,0.455577,0.886343,0.883747,0.845442,0.860844


[I 2025-03-27 21:44:38,238] Trial 143 finished with value: 0.8517046403787831 and parameters: {'learning_rate': 0.0004393545779231083, 'weight_decay': 0.007, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 144 with params: {'learning_rate': 0.00046207736392892576, 'weight_decay': 0.007, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5155,1.094928,0.71494,0.626143,0.608594,0.612497
2,0.8209,0.583485,0.840513,0.711413,0.71732,0.713988
3,0.425,0.457848,0.859762,0.72464,0.732819,0.728135
4,0.2607,0.420805,0.877177,0.905656,0.783686,0.804822
5,0.1888,0.40178,0.887259,0.891722,0.830023,0.849682
6,0.1344,0.421799,0.889093,0.893231,0.831408,0.851177
7,0.1041,0.428645,0.88176,0.888767,0.826668,0.846087
8,0.0807,0.437667,0.879927,0.888399,0.832715,0.851997
9,0.0604,0.479337,0.871677,0.851815,0.835648,0.842345
10,0.0598,0.456769,0.885426,0.871022,0.83653,0.85023


[I 2025-03-27 21:45:38,238] Trial 144 pruned. 


Trial 145 with params: {'learning_rate': 0.0004910165029038034, 'weight_decay': 0.01, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5033,1.064941,0.731439,0.642353,0.622292,0.62637
2,0.7924,0.566718,0.83593,0.706974,0.713625,0.710006
3,0.4051,0.444867,0.863428,0.728149,0.735097,0.731072
4,0.2473,0.421831,0.877177,0.903812,0.793565,0.816197
5,0.1762,0.411917,0.885426,0.888417,0.829,0.847498
6,0.1297,0.442587,0.87901,0.886026,0.823696,0.843439
7,0.0987,0.428459,0.880843,0.888621,0.825053,0.845456
8,0.07,0.452436,0.885426,0.87536,0.828791,0.844489
9,0.0576,0.481864,0.874427,0.854954,0.838047,0.844864
10,0.048,0.473615,0.886343,0.872956,0.84586,0.857114


[I 2025-03-27 21:47:09,041] Trial 145 finished with value: 0.8589079139018393 and parameters: {'learning_rate': 0.0004910165029038034, 'weight_decay': 0.01, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 146 with params: {'learning_rate': 0.00043842632877861496, 'weight_decay': 0.01, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5265,1.123089,0.702108,0.618309,0.596846,0.601804
2,0.8483,0.601084,0.832264,0.705391,0.710597,0.707668
3,0.4427,0.464967,0.859762,0.725396,0.732493,0.728201
4,0.2735,0.42108,0.870761,0.902418,0.749785,0.755859
5,0.1962,0.40391,0.887259,0.891373,0.82994,0.849645
6,0.1401,0.415365,0.889093,0.892739,0.831523,0.851068
7,0.1095,0.434291,0.87626,0.885714,0.821182,0.841216
8,0.0843,0.442057,0.880843,0.8884,0.824072,0.84482
9,0.0667,0.468562,0.874427,0.859511,0.828728,0.840624
10,0.0583,0.457243,0.885426,0.883255,0.844762,0.860248


[I 2025-03-27 21:48:40,575] Trial 146 finished with value: 0.8507614495666534 and parameters: {'learning_rate': 0.00043842632877861496, 'weight_decay': 0.01, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 147 with params: {'learning_rate': 0.00047860688825627373, 'weight_decay': 0.01, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5,1.073143,0.722273,0.636944,0.613098,0.618004
2,0.8025,0.574508,0.837764,0.707054,0.715193,0.710823
3,0.4171,0.456198,0.857929,0.723625,0.731366,0.726912
4,0.2538,0.421415,0.875344,0.903695,0.764495,0.775655
5,0.1807,0.430362,0.87626,0.881184,0.821971,0.84029
6,0.1363,0.432744,0.878093,0.884114,0.822903,0.842257
7,0.0995,0.433623,0.874427,0.867828,0.819732,0.83618
8,0.0721,0.468647,0.872594,0.85577,0.817223,0.83139
9,0.0572,0.486942,0.878093,0.861524,0.832393,0.843397
10,0.0464,0.468316,0.887259,0.874128,0.847279,0.858336


[I 2025-03-27 21:50:15,604] Trial 147 finished with value: 0.8377057777923339 and parameters: {'learning_rate': 0.00047860688825627373, 'weight_decay': 0.01, 'warmup_steps': 1}. Best is trial 87 with value: 0.8633510888244157.


Trial 148 with params: {'learning_rate': 0.000427837974552112, 'weight_decay': 0.008, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5314,1.13422,0.693859,0.612484,0.588868,0.593339
2,0.8613,0.610126,0.830431,0.70374,0.709501,0.706284
3,0.4515,0.468091,0.862511,0.726862,0.735022,0.730337
4,0.2805,0.423219,0.865261,0.732176,0.736189,0.733578
5,0.2002,0.406566,0.88451,0.889456,0.827476,0.847393
6,0.1428,0.422108,0.883593,0.888199,0.827232,0.846673
7,0.1125,0.418148,0.885426,0.893139,0.827963,0.848858
8,0.0853,0.444448,0.87901,0.88628,0.822911,0.843299
9,0.0703,0.467217,0.880843,0.864691,0.833924,0.845878
10,0.0578,0.454671,0.882676,0.883559,0.852063,0.865453


[I 2025-03-27 21:51:45,732] Trial 148 finished with value: 0.8491745793974798 and parameters: {'learning_rate': 0.000427837974552112, 'weight_decay': 0.008, 'warmup_steps': 2}. Best is trial 87 with value: 0.8633510888244157.


Trial 149 with params: {'learning_rate': 0.00023331250665160932, 'weight_decay': 0.003, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6181,1.363934,0.562786,0.564468,0.451293,0.424837
2,1.16,0.896022,0.768103,0.660638,0.656149,0.656161
3,0.7456,0.63211,0.834097,0.705496,0.712378,0.70852
4,0.5201,0.527115,0.852429,0.721305,0.725193,0.722462
5,0.3959,0.469241,0.867094,0.728754,0.74081,0.734452


[I 2025-03-27 21:52:16,015] Trial 149 pruned. 


In [15]:
print(best_trial)

BestRun(run_id='87', objective=0.8633510888244157, hyperparameters={'learning_rate': 0.00046314539247556066, 'weight_decay': 0.001, 'warmup_steps': 3}, run_summary=None)


In [16]:
base.reset_seed()

In [17]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distill_coarse_hp-search", logging_dir=f"~/logs/{DATASET}/bert-distill_coarse_hp-search", remove_unused_columns=False, epochs=num_epochs, batch_size=batch_size)

In [18]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
        "lambda_param": trial.suggest_float("lambda_param",0,1,step=.1),
        "temperature": trial.suggest_float("temperature", 2,7, step=.5)
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [19]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [20]:
trainer = base.DistilTrainer(
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_Bert(),
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
best_trial2 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Distilation",
    n_trials=150
)

[I 2025-03-27 21:52:16,565] A new study created in memory with name: Distilation


Trial 0 with params: {'learning_rate': 4.3284502212938785e-05, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9655,3.84083,0.280477,0.199898,0.207449,0.157816
2,3.8247,3.710131,0.408799,0.198401,0.308985,0.233811
3,3.703,3.549631,0.431714,0.373389,0.328452,0.256811
4,3.5361,3.372205,0.51879,0.534913,0.413358,0.378027
5,3.3642,3.200775,0.583868,0.556214,0.480675,0.475059
6,3.2032,3.055242,0.659945,0.604067,0.558012,0.559253
7,3.0624,2.929659,0.689276,0.615554,0.586682,0.585862
8,2.9452,2.818455,0.703025,0.621896,0.599454,0.599792
9,2.8324,2.732872,0.721357,0.63035,0.617804,0.61404
10,2.7562,2.661854,0.726856,0.630142,0.622087,0.618734


[I 2025-03-27 21:53:17,017] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 1.8408992080552506e-05, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0192,3.928426,0.259395,0.183613,0.207869,0.164584
2,3.9254,3.853934,0.275894,0.185277,0.203937,0.150338
3,3.8723,3.802667,0.345555,0.190112,0.259085,0.214089
4,3.8206,3.745611,0.398717,0.193474,0.300757,0.23207
5,3.7725,3.690166,0.420715,0.206324,0.317544,0.242117


[I 2025-03-27 21:53:46,635] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 1.0838581269344744e-05, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0385,3.975247,0.209899,0.162802,0.194444,0.131438
2,3.9751,3.909353,0.264895,0.192065,0.201635,0.155773
3,3.9299,3.867077,0.280477,0.209812,0.207675,0.158373
4,3.8889,3.837846,0.315307,0.19121,0.235083,0.195055
5,3.8624,3.812215,0.336389,0.190388,0.252158,0.209138
6,3.8386,3.785786,0.362053,0.191107,0.272319,0.222323
7,3.8224,3.758347,0.395967,0.194122,0.298504,0.233912
8,3.8018,3.735419,0.408799,0.199658,0.308358,0.236582
9,3.777,3.716271,0.416132,0.203411,0.31383,0.24117
10,3.7688,3.699129,0.422548,0.206878,0.318788,0.244774


[I 2025-03-27 21:54:48,740] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 2.049268011541735e-05, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0111,3.914221,0.262145,0.189359,0.200153,0.158151
2,3.9127,3.84182,0.285976,0.189324,0.211825,0.161171
3,3.8583,3.782957,0.364803,0.190223,0.274315,0.223609
4,3.801,3.718075,0.411549,0.199926,0.310545,0.237185
5,3.7443,3.653306,0.423465,0.207008,0.319739,0.243288


[I 2025-03-27 21:55:19,809] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00010952662748632558, 'weight_decay': 0.001, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.876,3.677129,0.427131,0.215636,0.321042,0.257947
2,3.54,3.237463,0.572869,0.551867,0.465801,0.452094
3,3.1008,2.829925,0.686526,0.627143,0.583556,0.584193
4,2.6965,2.467462,0.748854,0.646477,0.640997,0.637827
5,2.3591,2.179449,0.800183,0.675765,0.686309,0.679551
6,2.0708,1.956313,0.816682,0.689686,0.698643,0.693453
7,1.8498,1.77737,0.839597,0.709231,0.717413,0.713119
8,1.6673,1.649853,0.848763,0.715967,0.724075,0.719882
9,1.5231,1.549704,0.847846,0.716351,0.723942,0.71979
10,1.4112,1.47058,0.851512,0.717008,0.727264,0.722004


[I 2025-03-27 21:56:51,154] Trial 4 finished with value: 0.7299849713984009 and parameters: {'learning_rate': 0.00010952662748632558, 'weight_decay': 0.001, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 4.5}. Best is trial 4 with value: 0.7299849713984009.


Trial 5 with params: {'learning_rate': 0.0002157696745589684, 'weight_decay': 0.002, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7711,3.371079,0.511457,0.38787,0.400437,0.359612
2,3.056,2.57265,0.738772,0.646879,0.631374,0.629487
3,2.3008,1.97716,0.813932,0.688168,0.696616,0.691491
4,1.7522,1.609605,0.836847,0.711285,0.711885,0.710677
5,1.3726,1.33294,0.856095,0.721145,0.731494,0.725933
6,1.0758,1.151981,0.868928,0.729611,0.742397,0.735706
7,0.8545,1.004682,0.873511,0.734795,0.745434,0.739855
8,0.7077,0.932806,0.877177,0.736792,0.747352,0.741877
9,0.5951,0.875355,0.88176,0.740329,0.751874,0.745955
10,0.5242,0.866018,0.875344,0.735979,0.746231,0.740833


[I 2025-03-27 21:57:49,720] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.00010769622478263136, 'weight_decay': 0.001, 'warmup_steps': 0, 'lambda_param': 1.0, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8709,3.678075,0.428964,0.213966,0.322303,0.257015
2,3.542,3.243462,0.563703,0.546967,0.45614,0.438919
3,3.1098,2.837234,0.688359,0.627647,0.583497,0.585436
4,2.7109,2.482561,0.754354,0.651416,0.645593,0.643095
5,2.3778,2.198566,0.800183,0.675919,0.686358,0.679519
6,2.0924,1.976832,0.818515,0.691507,0.69976,0.694914
7,1.8734,1.797989,0.83868,0.708477,0.716531,0.712327
8,1.6916,1.670187,0.846929,0.715112,0.722612,0.71871
9,1.5486,1.570259,0.848763,0.71692,0.724891,0.720603
10,1.4366,1.490557,0.854262,0.719054,0.729595,0.724189


[I 2025-03-27 21:59:23,120] Trial 6 finished with value: 0.7316903138768874 and parameters: {'learning_rate': 0.00010769622478263136, 'weight_decay': 0.001, 'warmup_steps': 0, 'lambda_param': 1.0, 'temperature': 7.0}. Best is trial 6 with value: 0.7316903138768874.


Trial 7 with params: {'learning_rate': 0.000236288641842364, 'weight_decay': 0.003, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.732,3.300315,0.538955,0.567385,0.428297,0.393159
2,2.9619,2.464115,0.752521,0.652768,0.64363,0.642233
3,2.1772,1.852961,0.827681,0.697595,0.708574,0.702636
4,1.6091,1.480989,0.850596,0.721837,0.723519,0.721714
5,1.2318,1.229564,0.862511,0.726323,0.736919,0.731257
6,0.9581,1.076369,0.868928,0.729829,0.741783,0.735495
7,0.7569,0.9402,0.878093,0.737688,0.749126,0.743148
8,0.6117,0.885888,0.879927,0.738093,0.749504,0.743618
9,0.5113,0.826136,0.882676,0.740845,0.752161,0.746239
10,0.4424,0.821992,0.879927,0.738249,0.749406,0.743652


[I 2025-03-27 22:00:24,699] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 1.6119044727609182e-05, 'weight_decay': 0.005, 'warmup_steps': 0, 'lambda_param': 1.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0185,3.935097,0.254812,0.194037,0.209239,0.162582
2,3.9343,3.864756,0.256645,0.177372,0.188785,0.127912
3,3.8854,3.822073,0.315307,0.195297,0.235058,0.194401
4,3.8402,3.776559,0.378552,0.19246,0.284838,0.22955
5,3.8008,3.730102,0.415215,0.203093,0.312868,0.24432


[I 2025-03-27 22:00:55,879] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.00013353819088790598, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8559,3.6179,0.44363,0.218893,0.334178,0.262674
2,3.4292,3.07532,0.618698,0.582962,0.514047,0.511243
3,2.9005,2.604812,0.732356,0.646854,0.625661,0.62469
4,2.4486,2.22261,0.771769,0.660159,0.660553,0.657515
5,2.0824,1.916637,0.825848,0.696974,0.707971,0.702186
6,1.7756,1.692649,0.843263,0.710417,0.720805,0.715457
7,1.5461,1.52176,0.851512,0.72002,0.726303,0.72275
8,1.3593,1.40298,0.857012,0.722012,0.731228,0.726521
9,1.2157,1.306836,0.868011,0.730854,0.741068,0.735783
10,1.1082,1.236467,0.863428,0.725344,0.737448,0.731283


[I 2025-03-27 22:02:28,306] Trial 9 finished with value: 0.7370408149921506 and parameters: {'learning_rate': 0.00013353819088790598, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 3.0}. Best is trial 9 with value: 0.7370408149921506.


Trial 10 with params: {'learning_rate': 0.00022653365944687691, 'weight_decay': 0.004, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.772,3.358806,0.510541,0.389922,0.400918,0.359653
2,3.0273,2.529041,0.747938,0.650889,0.63849,0.637457
3,2.2462,1.920385,0.820348,0.692355,0.702243,0.696787
4,1.6875,1.556739,0.84418,0.716771,0.718957,0.716952
5,1.3086,1.289376,0.858845,0.723651,0.734071,0.728515
6,1.0202,1.115905,0.869844,0.731113,0.743063,0.736762
7,0.8034,0.97399,0.873511,0.735076,0.745517,0.739921
8,0.6612,0.906111,0.87901,0.738313,0.749675,0.743854
9,0.5511,0.85312,0.879927,0.73886,0.750492,0.74453
10,0.4839,0.84594,0.87626,0.736226,0.747391,0.741551


[I 2025-03-27 22:03:31,352] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 0.00017286627142386828, 'weight_decay': 0.001, 'warmup_steps': 0, 'lambda_param': 0.9, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8021,3.492017,0.468378,0.35403,0.360008,0.312227
2,3.2324,2.81535,0.699358,0.623088,0.594127,0.594355
3,2.5831,2.261769,0.773602,0.662806,0.661913,0.659292
4,2.0701,1.870891,0.823098,0.699071,0.700867,0.69936
5,1.689,1.574792,0.851512,0.717565,0.727487,0.722368
6,1.3772,1.36339,0.863428,0.726036,0.737338,0.731494
7,1.1485,1.214001,0.868011,0.732095,0.739879,0.735629
8,0.9829,1.119683,0.873511,0.734274,0.745038,0.7395
9,0.8505,1.037774,0.878093,0.739043,0.748808,0.743721
10,0.7587,0.993536,0.87626,0.738078,0.747364,0.742353


[I 2025-03-27 22:05:01,049] Trial 11 finished with value: 0.738432072657767 and parameters: {'learning_rate': 0.00017286627142386828, 'weight_decay': 0.001, 'warmup_steps': 0, 'lambda_param': 0.9, 'temperature': 6.5}. Best is trial 11 with value: 0.738432072657767.


Trial 12 with params: {'learning_rate': 0.00040776462389522527, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5558,2.871016,0.659945,0.605133,0.5495,0.554921
2,2.3485,1.766081,0.822181,0.693086,0.704205,0.698122
3,1.3866,1.18405,0.859762,0.723577,0.734725,0.728908
4,0.833,0.939063,0.87626,0.738611,0.745385,0.741603
5,0.5664,0.836595,0.873511,0.731907,0.746322,0.73875
6,0.3966,0.808174,0.877177,0.900408,0.767465,0.775568
7,0.2872,0.812352,0.882676,0.904321,0.817303,0.840159
8,0.2163,0.770049,0.887259,0.890793,0.820081,0.841358
9,0.1914,0.851199,0.878093,0.883955,0.822916,0.841568
10,0.167,0.769971,0.880843,0.875622,0.823604,0.8421


[I 2025-03-27 22:06:31,265] Trial 12 finished with value: 0.8457088496185348 and parameters: {'learning_rate': 0.00040776462389522527, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 6.5}. Best is trial 12 with value: 0.8457088496185348.


Trial 13 with params: {'learning_rate': 0.00032375556543311825, 'weight_decay': 0.001, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.67,3.105143,0.586618,0.568575,0.477696,0.465985
2,2.6571,2.083313,0.800183,0.676228,0.686337,0.680527
3,1.7231,1.452013,0.850596,0.717948,0.726209,0.721721
4,1.1417,1.148036,0.860678,0.730323,0.731836,0.72998
5,0.8219,0.971661,0.872594,0.7329,0.745337,0.738847
6,0.5952,0.89325,0.867094,0.72825,0.740193,0.733861
7,0.4547,0.819707,0.87626,0.738795,0.747145,0.742196
8,0.3327,0.796064,0.877177,0.90353,0.756376,0.760007
9,0.2756,0.781471,0.878093,0.86286,0.775874,0.789323
10,0.2383,0.7622,0.882676,0.880465,0.79727,0.817161


[I 2025-03-27 22:08:03,258] Trial 13 finished with value: 0.8508724570939009 and parameters: {'learning_rate': 0.00032375556543311825, 'weight_decay': 0.001, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 13 with value: 0.8508724570939009.


Trial 14 with params: {'learning_rate': 0.00028734156687283685, 'weight_decay': 0.001, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6809,3.17003,0.572869,0.582673,0.460621,0.436999
2,2.7576,2.21387,0.785518,0.671413,0.671851,0.669828
3,1.8827,1.586331,0.843263,0.710879,0.720614,0.715549
4,1.3092,1.251296,0.857012,0.725508,0.728663,0.726275
5,0.9503,1.054569,0.867094,0.731013,0.740315,0.735114
6,0.7335,0.947403,0.871677,0.731062,0.743912,0.737205
7,0.5652,0.85594,0.873511,0.734611,0.745358,0.739318
8,0.4365,0.820133,0.880843,0.739242,0.75059,0.744696
9,0.3584,0.782165,0.885426,0.742238,0.754193,0.747807
10,0.3053,0.778071,0.880843,0.736831,0.750704,0.743603


[I 2025-03-27 22:09:00,886] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.00024876082710187, 'weight_decay': 0.002, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7384,3.282205,0.538038,0.567319,0.427998,0.394675
2,2.9214,2.403187,0.761687,0.659346,0.651109,0.650187
3,2.1032,1.787524,0.829514,0.698859,0.710697,0.70451
4,1.5332,1.429337,0.849679,0.721109,0.723188,0.721113
5,1.1603,1.18955,0.864345,0.727057,0.738293,0.732429
6,0.8918,1.043655,0.868011,0.729368,0.741681,0.735166
7,0.6959,0.909964,0.88176,0.741059,0.751938,0.746072
8,0.5587,0.853418,0.877177,0.736486,0.747436,0.741813
9,0.4618,0.803133,0.88176,0.74046,0.751728,0.74579
10,0.3994,0.80774,0.87901,0.736737,0.749585,0.742848


[I 2025-03-27 22:10:30,790] Trial 15 finished with value: 0.7936022524340172 and parameters: {'learning_rate': 0.00024876082710187, 'weight_decay': 0.002, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 6.5}. Best is trial 13 with value: 0.8508724570939009.


Trial 16 with params: {'learning_rate': 0.00037511374111104536, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6374,2.999233,0.63978,0.587104,0.532591,0.533174
2,2.492,1.898657,0.817599,0.688504,0.701088,0.694297
3,1.5201,1.283722,0.863428,0.725099,0.738408,0.731484
4,0.9435,1.012431,0.863428,0.731219,0.734591,0.732189
5,0.6517,0.893951,0.874427,0.734565,0.746353,0.739978
6,0.4543,0.824334,0.874427,0.733868,0.746363,0.739983
7,0.3521,0.785943,0.874427,0.900945,0.764433,0.77434
8,0.2577,0.806401,0.882676,0.88053,0.797397,0.816822
9,0.2127,0.784354,0.882676,0.886183,0.817374,0.837622
10,0.1775,0.777322,0.886343,0.892173,0.827675,0.849009


[I 2025-03-27 22:11:59,714] Trial 16 finished with value: 0.8496717642123374 and parameters: {'learning_rate': 0.00037511374111104536, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 13 with value: 0.8508724570939009.


Trial 17 with params: {'learning_rate': 0.00033613562170626806, 'weight_decay': 0.001, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6689,3.077507,0.585701,0.5635,0.479476,0.466433
2,2.6109,2.035259,0.800183,0.675513,0.684568,0.67958
3,1.6709,1.412583,0.853346,0.719718,0.728287,0.723835
4,1.0979,1.125703,0.856095,0.7258,0.72699,0.725367
5,0.7721,0.940257,0.868928,0.731188,0.741239,0.735964
6,0.5636,0.90615,0.868928,0.727025,0.742602,0.734102
7,0.4376,0.849008,0.868928,0.735699,0.74063,0.736683
8,0.3216,0.784136,0.878093,0.903695,0.758351,0.761131
9,0.262,0.774154,0.88176,0.883474,0.806414,0.826874
10,0.2248,0.766606,0.887259,0.890403,0.819239,0.840702


[I 2025-03-27 22:13:31,446] Trial 17 finished with value: 0.8468938503879794 and parameters: {'learning_rate': 0.00033613562170626806, 'weight_decay': 0.001, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 7.0}. Best is trial 13 with value: 0.8508724570939009.


Trial 18 with params: {'learning_rate': 0.0004579825536123422, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5809,2.835388,0.692942,0.61978,0.58472,0.589174
2,2.2639,1.654475,0.831347,0.69944,0.711226,0.704868
3,1.2613,1.101483,0.870761,0.731991,0.743771,0.73753
4,0.7347,0.874533,0.873511,0.736903,0.744752,0.740499
5,0.4807,0.795197,0.87901,0.738105,0.749613,0.743636
6,0.3367,0.785893,0.878093,0.901039,0.760289,0.760513
7,0.2462,0.761074,0.888176,0.909266,0.803854,0.824265
8,0.1957,0.77204,0.88451,0.877069,0.827109,0.844573
9,0.1624,0.781885,0.882676,0.87226,0.827209,0.84207
10,0.1515,0.759107,0.887259,0.892209,0.829316,0.849851


[I 2025-03-27 22:15:01,868] Trial 18 finished with value: 0.8575094917390763 and parameters: {'learning_rate': 0.0004579825536123422, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 18 with value: 0.8575094917390763.


Trial 19 with params: {'learning_rate': 0.00043881111434962547, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5953,2.876968,0.675527,0.610102,0.567931,0.572855
2,2.3138,1.706432,0.829514,0.697921,0.709987,0.703356
3,1.3151,1.13496,0.864345,0.726339,0.73898,0.732335
4,0.7748,0.893499,0.873511,0.736097,0.744166,0.73976
5,0.5131,0.824527,0.87626,0.734186,0.748171,0.740689
6,0.3544,0.789797,0.878093,0.736451,0.750342,0.743023
7,0.2661,0.772219,0.88451,0.879625,0.801062,0.818619
8,0.2033,0.77756,0.87901,0.888609,0.821811,0.843712
9,0.1685,0.766,0.889093,0.878276,0.832327,0.847789
10,0.1552,0.737079,0.889093,0.879953,0.830558,0.847895


[I 2025-03-27 22:16:36,233] Trial 19 finished with value: 0.8627147902988012 and parameters: {'learning_rate': 0.00043881111434962547, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 20 with params: {'learning_rate': 0.00022983747014318628, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7614,3.336369,0.51879,0.392937,0.408226,0.368298
2,3.0007,2.499844,0.756187,0.656809,0.645812,0.644887
3,2.2167,1.893832,0.820348,0.692373,0.702494,0.696965
4,1.6571,1.531173,0.845096,0.717478,0.719144,0.717358
5,1.2788,1.268969,0.860678,0.725202,0.735467,0.730017
6,0.9947,1.099383,0.869844,0.730207,0.743112,0.73638
7,0.7825,0.960253,0.875344,0.736426,0.746899,0.741286
8,0.6418,0.895122,0.87901,0.737795,0.749958,0.74373
9,0.5341,0.844227,0.87901,0.738136,0.749777,0.743802
10,0.4685,0.838084,0.878093,0.737007,0.748738,0.742628


[I 2025-03-27 22:18:08,102] Trial 20 finished with value: 0.7463222334782379 and parameters: {'learning_rate': 0.00022983747014318628, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 21 with params: {'learning_rate': 0.0004957894043848183, 'weight_decay': 0.001, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5588,2.773301,0.67736,0.614387,0.568448,0.573849
2,2.177,1.574998,0.834097,0.70179,0.713808,0.707398
3,1.1755,1.049217,0.866178,0.729244,0.73957,0.733916
4,0.6625,0.868857,0.874427,0.736579,0.746028,0.740852
5,0.44,0.79967,0.878093,0.737032,0.75029,0.743182
6,0.3096,0.756773,0.888176,0.910544,0.794659,0.812889
7,0.2248,0.742018,0.892759,0.897398,0.833981,0.854724
8,0.1866,0.776622,0.887259,0.894008,0.829017,0.850206
9,0.1577,0.834865,0.87901,0.871136,0.82313,0.839097
10,0.146,0.74914,0.888176,0.879259,0.829961,0.847225


[I 2025-03-27 22:19:40,043] Trial 21 finished with value: 0.8542294945446761 and parameters: {'learning_rate': 0.0004957894043848183, 'weight_decay': 0.001, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 4.5}. Best is trial 19 with value: 0.8627147902988012.


Trial 22 with params: {'learning_rate': 0.0004480527040934682, 'weight_decay': 0.004, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5856,2.858916,0.667278,0.607376,0.560783,0.565151
2,2.3022,1.698003,0.827681,0.697072,0.708117,0.702237
3,1.3051,1.12508,0.867094,0.729349,0.740347,0.734282
4,0.7602,0.891067,0.873511,0.735962,0.744311,0.739954
5,0.4949,0.789187,0.883593,0.741081,0.753725,0.747053
6,0.341,0.750784,0.883593,0.906897,0.772901,0.781647
7,0.2554,0.762501,0.883593,0.874508,0.79038,0.806546
8,0.2004,0.774085,0.886343,0.889681,0.819663,0.840308
9,0.1671,0.787721,0.886343,0.891047,0.829104,0.848937
10,0.1547,0.754456,0.888176,0.879088,0.829759,0.847164


[I 2025-03-27 22:21:07,233] Trial 22 finished with value: 0.8526558232325722 and parameters: {'learning_rate': 0.0004480527040934682, 'weight_decay': 0.004, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 4.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 23 with params: {'learning_rate': 0.00013362810060060313, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8642,3.626755,0.445463,0.219091,0.335601,0.263165
2,3.4391,3.086449,0.615032,0.58509,0.50987,0.505644
3,2.9113,2.61686,0.730522,0.647086,0.624064,0.623273
4,2.4575,2.230713,0.767186,0.656654,0.656628,0.653862
5,2.089,1.922636,0.822181,0.693849,0.704776,0.698958


[I 2025-03-27 22:21:38,560] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 0.0003398036841328398, 'weight_decay': 0.004, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6659,3.069326,0.589368,0.562084,0.482469,0.469168
2,2.5989,2.021998,0.80385,0.678247,0.687898,0.682703
3,1.6558,1.400586,0.853346,0.719718,0.728287,0.723835
4,1.0831,1.11654,0.856095,0.7258,0.72699,0.725367
5,0.7597,0.934907,0.871677,0.733104,0.743385,0.737975
6,0.5534,0.899282,0.870761,0.72866,0.743949,0.735649
7,0.4272,0.839835,0.874427,0.738636,0.745617,0.741047
8,0.3163,0.786742,0.880843,0.873507,0.787965,0.804755
9,0.2567,0.778602,0.88176,0.883707,0.806149,0.826859
10,0.2185,0.767241,0.890009,0.894884,0.83046,0.851795


[I 2025-03-27 22:23:07,462] Trial 24 finished with value: 0.8467154333790182 and parameters: {'learning_rate': 0.0003398036841328398, 'weight_decay': 0.004, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 19 with value: 0.8627147902988012.


Trial 25 with params: {'learning_rate': 0.00044067625470672455, 'weight_decay': 0.002, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5964,2.882205,0.63703,0.597346,0.528046,0.527214
2,2.3222,1.718756,0.824931,0.695114,0.705281,0.699734
3,1.3196,1.136097,0.866178,0.728581,0.74052,0.73419
4,0.7763,0.913582,0.875344,0.737754,0.745238,0.741158
5,0.5156,0.83773,0.87626,0.736327,0.747829,0.741466
6,0.362,0.813576,0.875344,0.897792,0.757844,0.757566
7,0.2808,0.770514,0.879927,0.882063,0.804997,0.825705
8,0.2181,0.801755,0.879927,0.888626,0.81292,0.835144
9,0.1828,0.780883,0.886343,0.891073,0.828986,0.84881
10,0.1532,0.743704,0.890009,0.881666,0.830357,0.84859


[I 2025-03-27 22:24:35,945] Trial 25 finished with value: 0.8473764866720578 and parameters: {'learning_rate': 0.00044067625470672455, 'weight_decay': 0.002, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 19 with value: 0.8627147902988012.


Trial 26 with params: {'learning_rate': 0.0002035452684630682, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7895,3.417055,0.489459,0.373977,0.378439,0.329301
2,3.1147,2.646056,0.728689,0.640466,0.621903,0.620242
3,2.3843,2.059219,0.804766,0.681561,0.688828,0.683976
4,1.8452,1.687668,0.830431,0.705327,0.707103,0.70549
5,1.4627,1.398888,0.856095,0.721439,0.73123,0.725897
6,1.158,1.20523,0.864345,0.726543,0.738608,0.732153
7,0.9298,1.05689,0.868928,0.731486,0.742206,0.736566
8,0.777,0.981494,0.875344,0.736054,0.746274,0.740887
9,0.6607,0.913528,0.87901,0.738834,0.750146,0.744355
10,0.5833,0.898933,0.875344,0.736744,0.746182,0.741085


[I 2025-03-27 22:25:36,103] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 0.00032790652275992697, 'weight_decay': 0.0, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6662,3.094724,0.591201,0.571798,0.482686,0.473628
2,2.6421,2.067596,0.800183,0.676169,0.68635,0.680439
3,1.7055,1.437137,0.851512,0.718602,0.726876,0.722408
4,1.1241,1.136317,0.863428,0.731595,0.733947,0.731782
5,0.8065,0.962871,0.873511,0.733899,0.746004,0.739699
6,0.5816,0.885719,0.868011,0.729211,0.740908,0.73472
7,0.4431,0.817376,0.87626,0.738105,0.747428,0.741973
8,0.3235,0.79956,0.875344,0.901944,0.755056,0.758546
9,0.2702,0.786593,0.878093,0.871771,0.784632,0.802087
10,0.2324,0.764711,0.88451,0.882193,0.798652,0.818661


[I 2025-03-27 22:26:32,789] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 0.0003956469553490707, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6276,2.964516,0.632447,0.58862,0.524768,0.524806
2,2.4363,1.837323,0.817599,0.687778,0.700651,0.693753
3,1.4503,1.232843,0.863428,0.725383,0.738645,0.731736
4,0.8882,0.977967,0.868928,0.734658,0.739765,0.736601
5,0.6143,0.877092,0.874427,0.734818,0.746057,0.739876
6,0.4313,0.809163,0.880843,0.737785,0.751779,0.744548
7,0.3294,0.788481,0.875344,0.901472,0.765064,0.774904
8,0.2409,0.81061,0.878093,0.881123,0.80251,0.823679
9,0.1973,0.785252,0.87626,0.883905,0.820284,0.841044
10,0.1676,0.767307,0.887259,0.895586,0.828103,0.850703


[I 2025-03-27 22:28:01,147] Trial 28 finished with value: 0.8517200602864584 and parameters: {'learning_rate': 0.0003956469553490707, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 29 with params: {'learning_rate': 0.00047814122014260875, 'weight_decay': 0.002, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5606,2.792457,0.693859,0.619788,0.586591,0.590128
2,2.2245,1.620627,0.834097,0.702267,0.713097,0.707349
3,1.2196,1.067971,0.864345,0.72763,0.738208,0.731934
4,0.6953,0.855984,0.87626,0.737313,0.747556,0.742216
5,0.4465,0.770796,0.88176,0.739858,0.75282,0.745893
6,0.3117,0.754791,0.887259,0.910927,0.793246,0.812402
7,0.2343,0.777222,0.882676,0.884434,0.816928,0.836628
8,0.1876,0.7464,0.892759,0.898814,0.832556,0.854501
9,0.1565,0.779775,0.882676,0.888918,0.826009,0.846335
10,0.14,0.759231,0.886343,0.880838,0.827192,0.846331


[I 2025-03-27 22:29:27,723] Trial 29 finished with value: 0.8532198084886123 and parameters: {'learning_rate': 0.00047814122014260875, 'weight_decay': 0.002, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 2.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 30 with params: {'learning_rate': 4.731579131667026e-05, 'weight_decay': 0.008, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9484,3.827795,0.278643,0.198831,0.20595,0.153389
2,3.8061,3.681969,0.422548,0.205621,0.319137,0.244064
3,3.6651,3.498896,0.459212,0.376832,0.353746,0.297164
4,3.4751,3.298962,0.545371,0.557665,0.439796,0.418048
5,3.286,3.118749,0.622365,0.584824,0.519374,0.52053
6,3.1152,2.96161,0.68011,0.611083,0.578425,0.578706
7,2.9641,2.828676,0.701192,0.621968,0.597008,0.597591
8,2.8401,2.715572,0.725023,0.632289,0.619262,0.61788
9,2.7219,2.626958,0.733272,0.636162,0.627913,0.624894
10,2.6422,2.553071,0.743355,0.642872,0.636605,0.633591


[I 2025-03-27 22:30:25,986] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.0004440255005234728, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5892,2.867922,0.664528,0.605931,0.558201,0.562201
2,2.3128,1.708034,0.829514,0.698986,0.709782,0.704065
3,1.3164,1.131446,0.866178,0.728629,0.73995,0.733803
4,0.7697,0.895426,0.872594,0.735333,0.743595,0.739284
5,0.5026,0.790276,0.883593,0.741496,0.753676,0.747227
6,0.3455,0.751099,0.882676,0.906288,0.772252,0.780981
7,0.2606,0.751778,0.886343,0.877157,0.792596,0.808981
8,0.2054,0.770349,0.887259,0.893198,0.828853,0.849886
9,0.1704,0.785984,0.887259,0.89221,0.828585,0.849166
10,0.1577,0.761442,0.890009,0.895089,0.830457,0.851921


[I 2025-03-27 22:31:59,273] Trial 31 finished with value: 0.8540507436479811 and parameters: {'learning_rate': 0.0004440255005234728, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.5}. Best is trial 19 with value: 0.8627147902988012.


Trial 32 with params: {'learning_rate': 0.0003423817132871393, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6634,3.072578,0.593034,0.576928,0.483958,0.471078
2,2.5971,2.01358,0.806599,0.679603,0.69127,0.684761
3,1.6443,1.38746,0.850596,0.71676,0.726716,0.721536
4,1.0667,1.100631,0.863428,0.730238,0.734211,0.731388
5,0.7533,0.930079,0.877177,0.736409,0.748,0.741815
6,0.5448,0.876184,0.871677,0.730571,0.744504,0.737052
7,0.4164,0.834639,0.868011,0.7315,0.740805,0.735119
8,0.3112,0.785226,0.878093,0.90392,0.766973,0.777251
9,0.25,0.77997,0.88176,0.881787,0.806873,0.826497
10,0.2126,0.790382,0.877177,0.8846,0.810996,0.833402


[I 2025-03-27 22:33:27,400] Trial 32 finished with value: 0.8503808527971994 and parameters: {'learning_rate': 0.0003423817132871393, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 33 with params: {'learning_rate': 0.00035452422780417716, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6551,3.049209,0.59945,0.570669,0.490064,0.47914
2,2.5598,1.972461,0.809349,0.681227,0.693664,0.686697
3,1.5982,1.348336,0.851512,0.716518,0.728363,0.722125
4,1.0209,1.066496,0.861595,0.728261,0.733209,0.73006
5,0.7151,0.918108,0.875344,0.735553,0.746751,0.740462
6,0.5122,0.866184,0.870761,0.72924,0.744315,0.736318
7,0.391,0.80061,0.870761,0.732541,0.743013,0.737078
8,0.2922,0.792293,0.87626,0.861377,0.774164,0.787934
9,0.2381,0.795462,0.877177,0.879094,0.802371,0.82275
10,0.2044,0.793906,0.88176,0.887367,0.814424,0.836475


[I 2025-03-27 22:34:25,474] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 0.0004886143689992361, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5623,2.781072,0.701192,0.626486,0.593601,0.597775
2,2.1979,1.586881,0.83868,0.705862,0.717142,0.711224
3,1.191,1.044033,0.873511,0.734643,0.745835,0.739944
4,0.6787,0.84867,0.878093,0.739688,0.749126,0.744162
5,0.4399,0.750968,0.889093,0.74587,0.758397,0.751901
6,0.3064,0.790023,0.87901,0.873713,0.797351,0.813292
7,0.2336,0.766925,0.885426,0.888104,0.819834,0.839734
8,0.1885,0.753366,0.894592,0.899702,0.834274,0.855879
9,0.1538,0.806347,0.883593,0.887801,0.827431,0.846332
10,0.1349,0.761766,0.889093,0.89454,0.829914,0.851257


[I 2025-03-27 22:35:55,662] Trial 34 finished with value: 0.8576284276181694 and parameters: {'learning_rate': 0.0004886143689992361, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 19 with value: 0.8627147902988012.


Trial 35 with params: {'learning_rate': 1.1372143478447463e-05, 'weight_decay': 0.0, 'warmup_steps': 1, 'lambda_param': 0.8, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0344,3.968601,0.220898,0.169202,0.201459,0.140403
2,3.9689,3.902277,0.260312,0.197033,0.196358,0.150004
3,3.9235,3.861395,0.27956,0.18793,0.206615,0.155148
4,3.8831,3.831841,0.31714,0.190205,0.236722,0.196954
5,3.8561,3.804996,0.340972,0.189255,0.255789,0.211245


[I 2025-03-27 22:36:26,193] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.00033994908296327306, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6647,3.075731,0.590284,0.571126,0.481944,0.468714
2,2.6033,2.020903,0.805683,0.678929,0.690604,0.684041
3,1.6529,1.3952,0.851512,0.717718,0.727382,0.722376
4,1.0761,1.108671,0.863428,0.730843,0.73426,0.731737
5,0.7615,0.934624,0.878093,0.737173,0.748667,0.742526
6,0.552,0.87669,0.871677,0.730571,0.744504,0.737052
7,0.4216,0.837549,0.868928,0.732047,0.74152,0.735811
8,0.3155,0.788147,0.87626,0.902433,0.765375,0.775725
9,0.2538,0.778401,0.88176,0.882444,0.806555,0.826701
10,0.2166,0.787102,0.87626,0.881545,0.800976,0.823269


[I 2025-03-27 22:37:55,521] Trial 36 finished with value: 0.8485125454619671 and parameters: {'learning_rate': 0.00033994908296327306, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 19 with value: 0.8627147902988012.


Trial 37 with params: {'learning_rate': 0.00045842419540289544, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5854,2.849104,0.649863,0.601894,0.540577,0.54267
2,2.2762,1.671525,0.829514,0.698512,0.709143,0.703431
3,1.2699,1.105341,0.864345,0.727282,0.739161,0.732992
4,0.7363,0.892784,0.875344,0.738747,0.745273,0.741597
5,0.4894,0.80592,0.87901,0.737967,0.749937,0.743321
6,0.3384,0.791114,0.88176,0.905411,0.771146,0.779756
7,0.2573,0.753287,0.887259,0.888341,0.820302,0.84031
8,0.197,0.760046,0.890009,0.881514,0.830793,0.84844
9,0.1698,0.767167,0.882676,0.872138,0.827147,0.842105
10,0.1485,0.738558,0.890009,0.896638,0.831257,0.852828


[I 2025-03-27 22:39:24,578] Trial 37 finished with value: 0.8574519708615641 and parameters: {'learning_rate': 0.00045842419540289544, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 5.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 38 with params: {'learning_rate': 2.4269221144679105e-05, 'weight_decay': 0.005, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0036,3.897373,0.268561,0.192377,0.200837,0.156076
2,3.896,3.822723,0.309808,0.19257,0.231273,0.187614
3,3.8349,3.744156,0.40055,0.195003,0.302041,0.233672
4,3.7643,3.666391,0.422548,0.205207,0.319024,0.242966
5,3.6895,3.584227,0.426214,0.371294,0.323316,0.245254


[I 2025-03-27 22:39:54,256] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 3.7432529104406396e-05, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9736,3.853251,0.27681,0.199154,0.204448,0.1544
2,3.8443,3.743391,0.3978,0.195413,0.300291,0.235122
3,3.7451,3.610436,0.422548,0.203066,0.319594,0.240621
4,3.6103,3.466069,0.48121,0.460027,0.375812,0.328404
5,3.4677,3.316795,0.545371,0.55349,0.440418,0.419955
6,3.3275,3.185946,0.593951,0.564955,0.491717,0.487983
7,3.2029,3.073684,0.646196,0.595742,0.544601,0.545153
8,3.0984,2.972586,0.678277,0.609404,0.576485,0.577725
9,2.9973,2.890979,0.691109,0.614087,0.589542,0.587669
10,2.926,2.823042,0.701192,0.617885,0.598964,0.596693


[I 2025-03-27 22:40:53,075] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 2.1221899420191e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0043,3.906281,0.259395,0.193314,0.19525,0.146912
2,3.9064,3.836672,0.287809,0.192164,0.213109,0.161639
3,3.8527,3.777735,0.380385,0.200815,0.285667,0.233867
4,3.7935,3.708523,0.417965,0.203347,0.315469,0.240856
5,3.7333,3.639385,0.424381,0.372255,0.320781,0.244521


[I 2025-03-27 22:41:21,266] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.00029719141844976565, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7027,3.169238,0.558203,0.572077,0.448618,0.421937
2,2.7468,2.188659,0.786434,0.667924,0.673238,0.669282
3,1.8475,1.56097,0.843263,0.711466,0.720393,0.715627
4,1.2753,1.240192,0.855179,0.725319,0.727289,0.725313
5,0.9298,1.035082,0.868011,0.730016,0.740903,0.735243
6,0.6914,0.931881,0.869844,0.730971,0.743197,0.736535
7,0.5342,0.857532,0.872594,0.735207,0.744594,0.738972
8,0.4076,0.796929,0.879927,0.738188,0.750477,0.744204
9,0.335,0.775298,0.87626,0.734215,0.747836,0.740637
10,0.2879,0.781075,0.88176,0.872692,0.788035,0.804357


[I 2025-03-27 22:42:50,044] Trial 41 finished with value: 0.8398731872510434 and parameters: {'learning_rate': 0.00029719141844976565, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 5.5}. Best is trial 19 with value: 0.8627147902988012.


Trial 42 with params: {'learning_rate': 0.00037232504275850654, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.639,2.998619,0.60495,0.570257,0.499137,0.492433
2,2.4953,1.911271,0.813016,0.684806,0.696948,0.690581
3,1.5317,1.307765,0.852429,0.717776,0.728067,0.722692
4,0.9647,1.03922,0.861595,0.730392,0.731851,0.73013
5,0.6653,0.89397,0.877177,0.736838,0.747973,0.741908
6,0.4778,0.846414,0.873511,0.730412,0.746653,0.738004
7,0.3617,0.818491,0.874427,0.903896,0.755221,0.758686
8,0.2649,0.785586,0.887259,0.891719,0.819242,0.841005
9,0.217,0.802395,0.877177,0.884014,0.821765,0.841751
10,0.1855,0.771792,0.885426,0.891488,0.827558,0.848685


[I 2025-03-27 22:44:20,815] Trial 42 finished with value: 0.8462829545041356 and parameters: {'learning_rate': 0.00037232504275850654, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 4.5}. Best is trial 19 with value: 0.8627147902988012.


Trial 43 with params: {'learning_rate': 0.0004145660682858629, 'weight_decay': 0.008, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.549,2.855287,0.665445,0.607031,0.554561,0.560369
2,2.3294,1.745639,0.825848,0.69564,0.706927,0.700621
3,1.3637,1.165925,0.860678,0.724126,0.735737,0.729651
4,0.8138,0.926339,0.877177,0.73926,0.746683,0.742606
5,0.5512,0.828631,0.87626,0.734257,0.748419,0.740904
6,0.3867,0.808846,0.875344,0.899536,0.775485,0.789237
7,0.2839,0.809898,0.874427,0.87888,0.810175,0.830229
8,0.2101,0.764727,0.887259,0.892598,0.829267,0.850072
9,0.1887,0.853685,0.873511,0.881758,0.819194,0.838837
10,0.1613,0.779162,0.883593,0.877996,0.825372,0.84411


[I 2025-03-27 22:45:51,747] Trial 43 finished with value: 0.8446383533586742 and parameters: {'learning_rate': 0.0004145660682858629, 'weight_decay': 0.008, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 44 with params: {'learning_rate': 0.00017665299926535667, 'weight_decay': 0.009000000000000001, 'warmup_steps': 4, 'lambda_param': 0.9, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8233,3.511948,0.448213,0.355065,0.342184,0.278598
2,3.2426,2.816029,0.691109,0.618844,0.587861,0.587176
3,2.5783,2.253299,0.771769,0.66012,0.661598,0.657342
4,2.0599,1.86866,0.818515,0.696187,0.696433,0.695378
5,1.6744,1.563679,0.853346,0.719234,0.728705,0.723794


[I 2025-03-27 22:46:20,724] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 0.00043371026242218253, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5994,2.88747,0.670027,0.607779,0.562703,0.567164
2,2.3283,1.722049,0.828598,0.696977,0.70932,0.70247
3,1.3303,1.143368,0.865261,0.727368,0.739695,0.733185
4,0.7859,0.901579,0.875344,0.738117,0.745548,0.741473
5,0.5239,0.84151,0.87626,0.734365,0.748551,0.740797
6,0.3636,0.792544,0.877177,0.903051,0.758217,0.760579
7,0.2736,0.779381,0.880843,0.871091,0.788816,0.804025
8,0.2089,0.767992,0.882676,0.890391,0.824915,0.846424
9,0.1779,0.76577,0.883593,0.889887,0.827507,0.847786
10,0.1495,0.771418,0.888176,0.895053,0.829858,0.851119


[I 2025-03-27 22:47:50,793] Trial 45 finished with value: 0.8591521346363442 and parameters: {'learning_rate': 0.00043371026242218253, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 46 with params: {'learning_rate': 0.00025926062732275525, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7287,3.256266,0.544455,0.569312,0.433709,0.400888
2,2.8812,2.351751,0.765353,0.660411,0.65452,0.653284
3,2.0425,1.732056,0.829514,0.698859,0.710697,0.70451
4,1.4703,1.378691,0.851512,0.722233,0.724503,0.722402
5,1.102,1.153629,0.865261,0.727785,0.739389,0.733287


[I 2025-03-27 22:48:19,550] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.00026813863808438217, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7237,3.234266,0.546288,0.568746,0.435869,0.405076
2,2.8459,2.30913,0.766269,0.658083,0.655685,0.653639
3,1.9923,1.688592,0.832264,0.701607,0.712794,0.706933
4,1.4231,1.346807,0.851512,0.7223,0.724288,0.722315
5,1.0606,1.129086,0.865261,0.728269,0.739375,0.733501
6,0.8078,0.994581,0.870761,0.731243,0.74373,0.736996
7,0.6277,0.882058,0.877177,0.738576,0.748087,0.742634
8,0.4924,0.829495,0.875344,0.736217,0.74604,0.7409
9,0.4027,0.781066,0.882676,0.739448,0.752959,0.745869
10,0.344,0.798273,0.878093,0.734798,0.74917,0.741608


[I 2025-03-27 22:49:16,741] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 0.00033015690088901705, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.674,3.091394,0.577452,0.559737,0.470762,0.454902
2,2.6313,2.057622,0.79835,0.674153,0.683221,0.678227
3,1.6963,1.432866,0.853346,0.719233,0.728301,0.723623
4,1.123,1.14062,0.857012,0.726508,0.727922,0.726215
5,0.7938,0.94994,0.870761,0.732346,0.7432,0.737541
6,0.5819,0.908796,0.870761,0.729087,0.743935,0.735876
7,0.4517,0.850639,0.868928,0.734413,0.740692,0.736285
8,0.3332,0.784299,0.87901,0.903928,0.759115,0.761661
9,0.2722,0.770681,0.883593,0.884095,0.808276,0.828078
10,0.2314,0.766906,0.887259,0.889575,0.8199,0.840539


[I 2025-03-27 22:50:44,553] Trial 48 finished with value: 0.8498100972189356 and parameters: {'learning_rate': 0.00033015690088901705, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 49 with params: {'learning_rate': 0.00043274279238292534, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5989,2.892212,0.656279,0.601565,0.549883,0.553312
2,2.3428,1.736822,0.828598,0.697844,0.709115,0.703131
3,1.3482,1.151761,0.865261,0.72766,0.73952,0.733171
4,0.7978,0.907814,0.872594,0.736381,0.743401,0.73961
5,0.5276,0.799137,0.882676,0.739916,0.752573,0.745872
6,0.3617,0.757744,0.88451,0.907299,0.773801,0.782229
7,0.2701,0.74595,0.888176,0.879545,0.794255,0.81091
8,0.218,0.758694,0.883593,0.889063,0.816593,0.838586
9,0.1813,0.789539,0.885426,0.890966,0.827709,0.848004
10,0.1585,0.768486,0.88451,0.879044,0.825868,0.844703


[I 2025-03-27 22:51:40,441] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 0.0003511750010394002, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6575,3.056499,0.595784,0.571734,0.487035,0.476471
2,2.5703,1.983621,0.809349,0.681089,0.693677,0.686562
3,1.6106,1.358466,0.850596,0.715934,0.727682,0.72149
4,1.0329,1.074503,0.860678,0.727735,0.732211,0.72928
5,0.7244,0.919436,0.878093,0.737411,0.748751,0.742514
6,0.5206,0.87152,0.870761,0.729018,0.743886,0.735984
7,0.3986,0.810376,0.872594,0.734316,0.744527,0.738564
8,0.298,0.789958,0.877177,0.862347,0.77513,0.788971
9,0.2412,0.784809,0.879927,0.880754,0.804401,0.824665
10,0.2062,0.786181,0.878093,0.887146,0.820576,0.842654


[I 2025-03-27 22:52:40,277] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.000348159842365858, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6596,3.062428,0.593034,0.569455,0.484108,0.472478
2,2.5798,1.994081,0.808433,0.68038,0.692962,0.685905
3,1.6222,1.368331,0.849679,0.715213,0.726684,0.720662
4,1.0443,1.082965,0.860678,0.727735,0.732211,0.72928
5,0.7338,0.921839,0.87901,0.73792,0.749431,0.743183
6,0.5289,0.874192,0.870761,0.729223,0.743838,0.736017
7,0.4048,0.818132,0.872594,0.734316,0.744527,0.738564
8,0.3033,0.78759,0.878093,0.863066,0.775797,0.789662
9,0.2444,0.782802,0.880843,0.881418,0.805664,0.825685
10,0.2067,0.787081,0.87626,0.883629,0.810607,0.832848


[I 2025-03-27 22:54:11,955] Trial 51 finished with value: 0.8501320775102869 and parameters: {'learning_rate': 0.000348159842365858, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 2.5}. Best is trial 19 with value: 0.8627147902988012.


Trial 52 with params: {'learning_rate': 0.00047767141951285603, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.1, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.572,2.810472,0.660862,0.60541,0.552238,0.555972
2,2.2246,1.621256,0.832264,0.70049,0.711593,0.705608
3,1.2205,1.077355,0.869844,0.732571,0.742676,0.737355
4,0.6988,0.876759,0.882676,0.742698,0.751921,0.747137
5,0.4656,0.807568,0.880843,0.739971,0.751699,0.74499
6,0.3159,0.767808,0.88176,0.906424,0.778501,0.794228
7,0.2441,0.751818,0.891842,0.892461,0.823997,0.844216
8,0.1896,0.755539,0.888176,0.894722,0.828897,0.850546
9,0.1631,0.840282,0.87626,0.883982,0.820197,0.840554
10,0.1503,0.770692,0.87901,0.887368,0.822481,0.843382


[I 2025-03-27 22:55:11,916] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.000436461142954974, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5971,2.881734,0.672777,0.608119,0.565697,0.569957
2,2.3204,1.713546,0.828598,0.696912,0.70932,0.702487
3,1.322,1.139034,0.865261,0.727368,0.739695,0.733185
4,0.7797,0.897462,0.873511,0.736497,0.744166,0.739958
5,0.5179,0.832286,0.878093,0.735575,0.749885,0.742164
6,0.3581,0.790625,0.878093,0.903768,0.758883,0.761263
7,0.2697,0.77601,0.883593,0.878826,0.800334,0.817823
8,0.2065,0.776673,0.88176,0.89039,0.824195,0.846067
9,0.174,0.769475,0.887259,0.891233,0.830085,0.849535
10,0.1528,0.736095,0.891842,0.883479,0.832684,0.850604


[I 2025-03-27 22:56:41,138] Trial 53 finished with value: 0.8602998886593664 and parameters: {'learning_rate': 0.000436461142954974, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 4.0}. Best is trial 19 with value: 0.8627147902988012.


Trial 54 with params: {'learning_rate': 2.2869967933363696e-05, 'weight_decay': 0.007, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0085,3.905235,0.268561,0.358521,0.203401,0.160842
2,3.9031,3.830503,0.301558,0.190055,0.224545,0.178688
3,3.8442,3.75844,0.404216,0.198609,0.304551,0.239064
4,3.7787,3.686222,0.419798,0.205768,0.31694,0.240709
5,3.7108,3.610484,0.419798,0.367677,0.317683,0.238107


[I 2025-03-27 22:57:09,799] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 1.3699906998412503e-05, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.03,3.955448,0.242896,0.184367,0.212106,0.1575
2,3.9536,3.883435,0.264895,0.187796,0.195651,0.139544
3,3.9046,3.842147,0.305225,0.19698,0.226595,0.182884
4,3.8621,3.804745,0.340972,0.181506,0.256063,0.209962
5,3.8298,3.768665,0.382218,0.19242,0.287893,0.230631
6,3.798,3.731031,0.408799,0.199617,0.30823,0.240071
7,3.7725,3.69689,0.419798,0.206967,0.316989,0.240354
8,3.7451,3.666109,0.426214,0.209124,0.321815,0.244652
9,3.7118,3.638372,0.428048,0.375199,0.323448,0.247558
10,3.695,3.613572,0.428048,0.372872,0.323594,0.247111


[I 2025-03-27 22:58:11,365] Trial 55 pruned. 


Trial 56 with params: {'learning_rate': 0.00043703488150776966, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5967,2.880565,0.672777,0.608119,0.565697,0.569957
2,2.3188,1.711901,0.828598,0.696912,0.70932,0.702487
3,1.3205,1.138196,0.865261,0.727368,0.739695,0.733185
4,0.7786,0.89654,0.873511,0.736497,0.744166,0.739958
5,0.5169,0.83051,0.878093,0.735575,0.749885,0.742164
6,0.357,0.790026,0.877177,0.736187,0.749411,0.742418
7,0.2687,0.775563,0.883593,0.878826,0.800334,0.817823
8,0.2056,0.775162,0.879927,0.88874,0.822738,0.844416
9,0.1724,0.77472,0.886343,0.890016,0.829912,0.848742
10,0.153,0.738695,0.891842,0.883668,0.83238,0.850566


[I 2025-03-27 22:59:43,004] Trial 56 finished with value: 0.8639964697109707 and parameters: {'learning_rate': 0.00043703488150776966, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 57 with params: {'learning_rate': 0.0003112592568135511, 'weight_decay': 0.005, 'warmup_steps': 4, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6902,3.135479,0.568286,0.566756,0.459728,0.438394
2,2.6967,2.131202,0.793767,0.671778,0.679858,0.674952
3,1.7807,1.503945,0.846929,0.714499,0.72306,0.718526
4,1.2079,1.194115,0.855179,0.724517,0.727006,0.724851
5,0.87,0.992322,0.869844,0.731536,0.742714,0.736927
6,0.6424,0.914602,0.870761,0.730116,0.743595,0.736274
7,0.4959,0.852585,0.871677,0.734589,0.743596,0.738201
8,0.37,0.788933,0.877177,0.735591,0.747962,0.741669
9,0.3058,0.764158,0.879927,0.86266,0.777928,0.790356
10,0.2612,0.781518,0.882676,0.878941,0.798338,0.816721


[I 2025-03-27 23:00:41,825] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 0.0004273228715836958, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6044,2.900295,0.659028,0.601747,0.551788,0.555585
2,2.3463,1.740736,0.824015,0.69358,0.705545,0.698966
3,1.349,1.156289,0.867094,0.729029,0.741029,0.734726
4,0.8019,0.913216,0.874427,0.737444,0.74455,0.740649
5,0.5387,0.863059,0.875344,0.733596,0.747748,0.739727
6,0.3762,0.798806,0.878093,0.736681,0.749844,0.742924
7,0.2817,0.781834,0.877177,0.860965,0.776874,0.78899
8,0.2167,0.788594,0.882676,0.891024,0.825145,0.846708
9,0.1791,0.775977,0.885426,0.89165,0.827845,0.848778
10,0.1532,0.763175,0.889093,0.894788,0.830345,0.85149


[I 2025-03-27 23:02:13,377] Trial 58 finished with value: 0.8591995376352287 and parameters: {'learning_rate': 0.0004273228715836958, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 59 with params: {'learning_rate': 0.00045803375981362716, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5767,2.835602,0.67461,0.610913,0.567658,0.572237
2,2.2756,1.672468,0.831347,0.70012,0.711097,0.70526
3,1.277,1.109416,0.864345,0.727811,0.738208,0.731953
4,0.7386,0.880038,0.874427,0.736225,0.745309,0.740583
5,0.4785,0.777954,0.88451,0.742092,0.754987,0.748138
6,0.3243,0.754422,0.88451,0.908503,0.773616,0.782847
7,0.2394,0.789477,0.878093,0.871059,0.787055,0.802888
8,0.1888,0.786175,0.883593,0.890032,0.826671,0.847334
9,0.1676,0.799001,0.883593,0.889656,0.826789,0.847317
10,0.152,0.778416,0.883593,0.87614,0.825668,0.843595


[I 2025-03-27 23:03:13,078] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 0.00014630775437362437, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8477,3.587747,0.433547,0.378624,0.327345,0.255012
2,3.3752,2.998743,0.64253,0.598176,0.538782,0.538467
3,2.8039,2.495746,0.746104,0.653303,0.638164,0.636666
4,2.3295,2.108072,0.793767,0.677142,0.676761,0.675254
5,1.9515,1.79925,0.837764,0.705854,0.716347,0.711008
6,1.6397,1.576641,0.854262,0.718788,0.729776,0.724196
7,1.4072,1.410595,0.855179,0.723643,0.728468,0.72547
8,1.222,1.300404,0.863428,0.726564,0.737316,0.731834
9,1.0813,1.204681,0.868928,0.732319,0.742082,0.737002
10,0.9774,1.142644,0.866178,0.727909,0.739448,0.733544


[I 2025-03-27 23:04:16,795] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.00019674999278956242, 'weight_decay': 0.005, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7968,3.439235,0.48396,0.37,0.373381,0.322324
2,3.1444,2.685534,0.719523,0.634826,0.614139,0.611958
3,2.4292,2.104372,0.7956,0.674613,0.681891,0.676786
4,1.8953,1.729086,0.829514,0.704764,0.705889,0.704561
5,1.5118,1.435788,0.856095,0.721509,0.73123,0.725939


[I 2025-03-27 23:04:45,622] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.0003161542840032307, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6837,3.134118,0.573786,0.574377,0.463953,0.44155
2,2.6882,2.113091,0.8011,0.677877,0.687149,0.681406
3,1.7575,1.480197,0.848763,0.716115,0.725261,0.720244
4,1.175,1.165583,0.858845,0.728173,0.730255,0.728314
5,0.8504,0.985289,0.871677,0.731956,0.743854,0.737572
6,0.6229,0.905357,0.870761,0.730183,0.743577,0.736562
7,0.4747,0.82575,0.878093,0.740354,0.748761,0.743654
8,0.3555,0.791906,0.875344,0.734529,0.746218,0.740204
9,0.2887,0.775894,0.880843,0.864194,0.778328,0.791431
10,0.2471,0.77577,0.879927,0.873449,0.786534,0.803983


[I 2025-03-27 23:05:43,591] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 0.0003558754656671467, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6536,3.043819,0.619615,0.582945,0.510789,0.508133
2,2.5555,1.967706,0.809349,0.683034,0.693389,0.687567
3,1.5931,1.340286,0.856095,0.720599,0.73182,0.72595
4,1.0103,1.055681,0.864345,0.73196,0.735307,0.732853
5,0.7108,0.920277,0.87626,0.735855,0.748363,0.74146
6,0.504,0.843221,0.87901,0.73794,0.749542,0.743475
7,0.3897,0.794232,0.868011,0.895524,0.750143,0.75293
8,0.2843,0.816463,0.877177,0.869945,0.783885,0.800608
9,0.2336,0.782305,0.877177,0.878446,0.803109,0.822812
10,0.2008,0.778805,0.88176,0.883741,0.80581,0.827203


[I 2025-03-27 23:06:41,064] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 5.986275918990953e-05, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9434,3.804662,0.307058,0.1977,0.228782,0.181533
2,3.7635,3.60551,0.429881,0.208981,0.324573,0.248547
3,3.5658,3.361935,0.505958,0.547407,0.40181,0.362308
4,3.3124,3.115328,0.612282,0.581141,0.50856,0.507995
5,3.0809,2.899439,0.695692,0.619503,0.592655,0.591505
6,2.8745,2.720654,0.71769,0.623679,0.614078,0.610898
7,2.7012,2.569889,0.744271,0.644073,0.636013,0.634826
8,2.558,2.444194,0.753437,0.64651,0.644966,0.642095
9,2.4235,2.346401,0.762603,0.655565,0.653184,0.649957
10,2.3319,2.265095,0.769019,0.657066,0.659051,0.655183


[I 2025-03-27 23:07:41,247] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.00038720447917864255, 'weight_decay': 0.005, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6275,2.969181,0.616865,0.580791,0.510202,0.506028
2,2.4524,1.865418,0.815765,0.686416,0.699309,0.692489
3,1.4809,1.268933,0.855179,0.720092,0.730447,0.725052
4,0.9185,1.010457,0.864345,0.731693,0.734311,0.732042
5,0.6311,0.876507,0.875344,0.735623,0.746591,0.740651
6,0.4508,0.828018,0.874427,0.730952,0.746821,0.738498
7,0.3377,0.819168,0.870761,0.900446,0.760988,0.771352
8,0.2439,0.831383,0.879927,0.889075,0.822371,0.844183
9,0.2009,0.801573,0.883593,0.888176,0.827272,0.84649
10,0.1803,0.76679,0.888176,0.893206,0.829815,0.850718


[I 2025-03-27 23:09:10,182] Trial 65 finished with value: 0.8453851107679079 and parameters: {'learning_rate': 0.00038720447917864255, 'weight_decay': 0.005, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 66 with params: {'learning_rate': 0.0003185578080644487, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.68,3.115996,0.584785,0.562665,0.475387,0.459147
2,2.6673,2.095098,0.7956,0.673265,0.681124,0.676487
3,1.7397,1.468086,0.852429,0.718367,0.727202,0.722565
4,1.1656,1.166509,0.857012,0.727271,0.728075,0.726594
5,0.8368,0.974718,0.871677,0.732901,0.74366,0.738088
6,0.6183,0.90281,0.873511,0.732845,0.745431,0.738408
7,0.4749,0.843085,0.871677,0.734576,0.743879,0.738288
8,0.3525,0.787168,0.874427,0.73439,0.745802,0.73997
9,0.2883,0.768485,0.87901,0.863011,0.776892,0.789908
10,0.2518,0.763856,0.886343,0.883369,0.800907,0.820203


[I 2025-03-27 23:10:39,357] Trial 66 finished with value: 0.8507186518229416 and parameters: {'learning_rate': 0.0003185578080644487, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 67 with params: {'learning_rate': 0.00041066704402357746, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.617,2.934191,0.648029,0.596477,0.541172,0.544004
2,2.3932,1.789452,0.819432,0.690364,0.701451,0.695414
3,1.4006,1.194388,0.866178,0.727987,0.740896,0.734152
4,0.8455,0.945031,0.877177,0.73976,0.746612,0.742845
5,0.5765,0.855712,0.875344,0.7353,0.747386,0.740755
6,0.4034,0.81464,0.874427,0.733218,0.747341,0.739859
7,0.3031,0.793841,0.87626,0.902262,0.76623,0.776021
8,0.229,0.776058,0.883593,0.890473,0.816392,0.838852
9,0.1834,0.791006,0.885426,0.892217,0.827936,0.848701
10,0.16,0.755647,0.890009,0.894767,0.831873,0.852307


[I 2025-03-27 23:12:07,107] Trial 67 finished with value: 0.85570501593592 and parameters: {'learning_rate': 0.00041066704402357746, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 68 with params: {'learning_rate': 0.000339223219164977, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6666,3.082216,0.6022,0.578372,0.492833,0.484279
2,2.6102,2.02583,0.805683,0.680369,0.690695,0.684865
3,1.6589,1.395129,0.854262,0.720252,0.729803,0.724754
4,1.0756,1.098565,0.863428,0.730521,0.734543,0.731778
5,0.7664,0.939595,0.874427,0.734862,0.746663,0.74028
6,0.552,0.868918,0.87626,0.73558,0.747695,0.7413
7,0.4232,0.806257,0.870761,0.734198,0.742881,0.738014
8,0.3125,0.796197,0.879927,0.904868,0.768333,0.778256
9,0.2557,0.780254,0.878093,0.875836,0.794405,0.813273
10,0.215,0.768978,0.886343,0.887395,0.80967,0.83096


[I 2025-03-27 23:13:06,102] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 8.280370799388257e-05, 'weight_decay': 0.005, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8993,3.743199,0.384968,0.220126,0.288084,0.241537
2,3.6578,3.424992,0.511457,0.550586,0.400553,0.359805
3,3.3317,3.08533,0.615949,0.592367,0.510687,0.507481
4,2.9975,2.777088,0.714024,0.629638,0.607946,0.609058
5,2.7083,2.526244,0.746104,0.641786,0.639993,0.635243
6,2.4625,2.325861,0.774519,0.659832,0.663216,0.658833
7,2.2668,2.152317,0.79835,0.677059,0.683249,0.679529
8,2.0959,2.01671,0.813932,0.688017,0.696094,0.691734
9,1.9539,1.912984,0.822181,0.696425,0.703411,0.699307
10,1.8467,1.83091,0.83593,0.703435,0.715766,0.709241


[I 2025-03-27 23:14:06,064] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 0.00046754186426139, 'weight_decay': 0.002, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5687,2.814473,0.683776,0.614606,0.576227,0.580218
2,2.251,1.647324,0.834097,0.702255,0.713146,0.707335
3,1.2494,1.091149,0.864345,0.728655,0.73804,0.732146
4,0.7182,0.868465,0.873511,0.73536,0.744593,0.739773
5,0.4647,0.773467,0.883593,0.741386,0.754278,0.747423
6,0.3155,0.755916,0.887259,0.910235,0.775559,0.784729
7,0.243,0.780645,0.877177,0.880682,0.813591,0.832956
8,0.1868,0.772807,0.888176,0.891301,0.82038,0.841829
9,0.1574,0.818708,0.883593,0.88872,0.82687,0.846507
10,0.1442,0.799461,0.882676,0.892455,0.824591,0.846613


[I 2025-03-27 23:15:35,992] Trial 70 finished with value: 0.8430352232129527 and parameters: {'learning_rate': 0.00046754186426139, 'weight_decay': 0.002, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 71 with params: {'learning_rate': 0.0003866298701324124, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6338,2.982946,0.623281,0.58645,0.5155,0.513322
2,2.4628,1.866799,0.818515,0.688313,0.701636,0.694571
3,1.4815,1.256771,0.863428,0.725547,0.738313,0.731675
4,0.9151,0.995762,0.868011,0.733279,0.739049,0.735562
5,0.6318,0.883912,0.877177,0.737536,0.748105,0.742163
6,0.4436,0.818418,0.878093,0.735407,0.749556,0.742183
7,0.3439,0.797509,0.878093,0.904609,0.757517,0.760556
8,0.2537,0.807854,0.880843,0.878775,0.795853,0.815354
9,0.2056,0.781319,0.88176,0.887165,0.825281,0.845136
10,0.1705,0.778316,0.886343,0.894489,0.827752,0.849868


[I 2025-03-27 23:17:05,127] Trial 71 finished with value: 0.8506461138578892 and parameters: {'learning_rate': 0.0003866298701324124, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 72 with params: {'learning_rate': 0.00032908413374949406, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.672,3.094535,0.588451,0.569004,0.479939,0.466707
2,2.634,2.056333,0.8011,0.676109,0.686361,0.680558
3,1.6946,1.430354,0.852429,0.718463,0.727731,0.722931
4,1.1197,1.138299,0.859762,0.729085,0.730137,0.728566
5,0.7967,0.952841,0.877177,0.736715,0.748137,0.74219
6,0.5842,0.893501,0.872594,0.732359,0.744993,0.738032
7,0.4485,0.83475,0.871677,0.734554,0.743645,0.738264
8,0.3321,0.789385,0.875344,0.901769,0.75532,0.758675
9,0.2695,0.775535,0.880843,0.87953,0.796284,0.815959
10,0.2319,0.761509,0.883593,0.886252,0.807742,0.829185


[I 2025-03-27 23:18:32,410] Trial 72 finished with value: 0.8508555378709298 and parameters: {'learning_rate': 0.00032908413374949406, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 73 with params: {'learning_rate': 0.0004541566753242674, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5832,2.842131,0.691109,0.619506,0.583055,0.587727
2,2.2732,1.664415,0.830431,0.699061,0.710891,0.704425
3,1.2719,1.109061,0.869844,0.731194,0.74314,0.73675
4,0.7428,0.876868,0.877177,0.738971,0.748257,0.743383
5,0.4859,0.800969,0.880843,0.739086,0.751327,0.744959
6,0.3397,0.783973,0.87901,0.735517,0.751735,0.743294
7,0.2468,0.758937,0.887259,0.908695,0.802745,0.823427
8,0.1978,0.785001,0.88451,0.877758,0.826756,0.84455
9,0.1644,0.784255,0.888176,0.876412,0.831323,0.84641
10,0.1427,0.751177,0.889093,0.895191,0.829925,0.851597


[I 2025-03-27 23:19:58,999] Trial 73 finished with value: 0.8506280247895175 and parameters: {'learning_rate': 0.0004541566753242674, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 5.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 74 with params: {'learning_rate': 4.520286738939564e-05, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9648,3.837788,0.281393,0.199149,0.208248,0.158928
2,3.8192,3.700546,0.415215,0.204769,0.313844,0.236116
3,3.6895,3.529997,0.439963,0.376815,0.335515,0.266998
4,3.512,3.342148,0.530706,0.537079,0.425021,0.395935
5,3.3311,3.164638,0.600367,0.569558,0.497811,0.495966
6,3.1642,3.014409,0.670027,0.606374,0.568292,0.568771
7,3.018,2.88449,0.697525,0.621141,0.593891,0.593694
8,2.8972,2.771314,0.710357,0.625052,0.606579,0.606159
9,2.7811,2.684378,0.724106,0.630823,0.620052,0.615937
10,2.7033,2.6119,0.736939,0.638344,0.630262,0.627368


[I 2025-03-27 23:20:56,880] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 0.0004681684804752109, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5758,2.81886,0.694775,0.617145,0.587016,0.590704
2,2.2409,1.629753,0.832264,0.700188,0.711893,0.7056
3,1.2353,1.08174,0.868928,0.730347,0.74212,0.73591
4,0.7153,0.865364,0.875344,0.737283,0.746331,0.741618
5,0.4646,0.779041,0.882676,0.740359,0.75314,0.746453
6,0.325,0.775998,0.880843,0.9029,0.771321,0.778439
7,0.2424,0.765196,0.886343,0.880449,0.801545,0.81936
8,0.1928,0.773022,0.88451,0.891929,0.826984,0.848047
9,0.1564,0.772983,0.889093,0.890878,0.832219,0.8504
10,0.147,0.746495,0.890009,0.894731,0.831073,0.851733


[I 2025-03-27 23:22:27,957] Trial 75 finished with value: 0.8581737562373984 and parameters: {'learning_rate': 0.0004681684804752109, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 76 with params: {'learning_rate': 4.8942995480796015e-05, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9564,3.828604,0.28506,0.197782,0.211228,0.162021
2,3.8047,3.676428,0.417049,0.204535,0.315267,0.236456
3,3.6583,3.486791,0.465628,0.372362,0.35908,0.302011
4,3.4601,3.281018,0.551787,0.550063,0.446844,0.42981
5,3.2642,3.094058,0.627864,0.588608,0.524748,0.526165
6,3.0876,2.934479,0.683776,0.611431,0.582072,0.581953
7,2.9322,2.797698,0.707608,0.624103,0.603793,0.60304
8,2.8053,2.681931,0.730522,0.634824,0.6247,0.622458
9,2.6838,2.591705,0.736939,0.639961,0.630297,0.627995
10,2.6025,2.51653,0.747021,0.645276,0.639536,0.636569


[I 2025-03-27 23:23:25,350] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.00041999461692224086, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6101,2.915391,0.653529,0.598464,0.545835,0.549177
2,2.3671,1.7619,0.821265,0.691802,0.702833,0.696815
3,1.3714,1.172781,0.866178,0.728046,0.74068,0.734044
4,0.821,0.92722,0.87626,0.738796,0.745932,0.742012
5,0.5535,0.842861,0.877177,0.736516,0.749051,0.742168
6,0.3867,0.815636,0.873511,0.732771,0.747159,0.73955
7,0.2892,0.782131,0.878093,0.90267,0.777256,0.791835
8,0.222,0.777099,0.885426,0.888313,0.809165,0.830695
9,0.1787,0.771035,0.888176,0.89206,0.831334,0.850605
10,0.1548,0.767081,0.891842,0.897277,0.833074,0.854147


[I 2025-03-27 23:24:56,965] Trial 77 finished with value: 0.8540375831722127 and parameters: {'learning_rate': 0.00041999461692224086, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 78 with params: {'learning_rate': 0.0004423984561514521, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5907,2.871652,0.666361,0.607271,0.559632,0.563354
2,2.3173,1.712243,0.830431,0.699663,0.710448,0.704768
3,1.321,1.133936,0.866178,0.728629,0.73995,0.733803
4,0.7735,0.896662,0.872594,0.735276,0.743595,0.739254
5,0.5058,0.790504,0.883593,0.740863,0.753676,0.746896
6,0.3478,0.750632,0.880843,0.904927,0.77087,0.779609
7,0.2617,0.756174,0.888176,0.885199,0.803093,0.822331
8,0.2096,0.769753,0.883593,0.888196,0.816938,0.838294
9,0.1719,0.778969,0.887259,0.89271,0.828469,0.849346
10,0.1578,0.753665,0.889093,0.867917,0.82955,0.844018


[I 2025-03-27 23:25:56,309] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.00042610883908470126, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6053,2.902809,0.657195,0.600705,0.549509,0.552984
2,2.3498,1.744088,0.824015,0.693526,0.705545,0.698984
3,1.3525,1.158907,0.867094,0.728676,0.741029,0.73457
4,0.805,0.915533,0.874427,0.737444,0.74455,0.740649
5,0.5412,0.863097,0.874427,0.732859,0.747032,0.739055
6,0.378,0.799494,0.879927,0.738022,0.751159,0.74428
7,0.2835,0.780991,0.878093,0.90271,0.768501,0.777247
8,0.2177,0.792815,0.88451,0.893213,0.826492,0.848393
9,0.1821,0.780015,0.88176,0.890294,0.824123,0.846017
10,0.1524,0.766031,0.885426,0.892426,0.827665,0.849005


[I 2025-03-27 23:27:27,793] Trial 79 finished with value: 0.8588567608314666 and parameters: {'learning_rate': 0.00042610883908470126, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 80 with params: {'learning_rate': 0.00011982266138079141, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8695,3.654132,0.43538,0.216071,0.327464,0.260021
2,3.4948,3.169861,0.591201,0.56352,0.486317,0.47846
3,3.0161,2.736032,0.701192,0.632838,0.597666,0.597477
4,2.5894,2.36097,0.759853,0.6533,0.650731,0.647173
5,2.2382,2.062039,0.813932,0.686826,0.697897,0.691844


[I 2025-03-27 23:27:57,579] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.000490983292594775, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5613,2.777805,0.699358,0.624681,0.592435,0.596792
2,2.1932,1.583197,0.837764,0.7044,0.716475,0.710172
3,1.1874,1.041219,0.874427,0.734364,0.746474,0.740206
4,0.6758,0.851781,0.878093,0.739651,0.749064,0.744041
5,0.4409,0.755248,0.889093,0.746293,0.75841,0.752172
6,0.3063,0.807125,0.87626,0.87163,0.795055,0.810893
7,0.2337,0.780132,0.882676,0.886877,0.817494,0.837932
8,0.1885,0.758998,0.890009,0.898891,0.829995,0.852948
9,0.1503,0.778395,0.883593,0.874194,0.826574,0.842941
10,0.1313,0.7626,0.886343,0.878398,0.82797,0.84585


[I 2025-03-27 23:29:27,719] Trial 81 finished with value: 0.8546417377152019 and parameters: {'learning_rate': 0.000490983292594775, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 82 with params: {'learning_rate': 0.0002516554159735703, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7394,3.27606,0.538038,0.569194,0.427567,0.393766
2,2.9097,2.387589,0.762603,0.657117,0.652256,0.650445
3,2.0857,1.772167,0.827681,0.697501,0.70935,0.703131
4,1.5182,1.418898,0.848763,0.720127,0.72257,0.720435
5,1.1458,1.182369,0.863428,0.726414,0.737627,0.731774


[I 2025-03-27 23:29:56,398] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 0.00045887692990457553, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5805,2.83408,0.693859,0.619074,0.586049,0.590071
2,2.2619,1.652068,0.831347,0.699493,0.711226,0.704852
3,1.259,1.100159,0.869844,0.73122,0.743105,0.736814
4,0.7328,0.873906,0.874427,0.737638,0.745418,0.74118
5,0.479,0.7936,0.878093,0.737526,0.748933,0.742984
6,0.336,0.784576,0.877177,0.900386,0.759358,0.759728
7,0.2456,0.762896,0.888176,0.909257,0.803972,0.824279
8,0.196,0.765718,0.886343,0.893125,0.828275,0.849602
9,0.1608,0.76838,0.890926,0.879553,0.833509,0.848962
10,0.1498,0.759935,0.888176,0.892453,0.829468,0.849963


[I 2025-03-27 23:31:28,537] Trial 83 finished with value: 0.857580344909665 and parameters: {'learning_rate': 0.00045887692990457553, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 84 with params: {'learning_rate': 0.00037231439532970273, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6434,3.013102,0.614115,0.582793,0.5059,0.499996
2,2.5059,1.913743,0.814849,0.68561,0.69826,0.691408
3,1.5324,1.295838,0.858845,0.722286,0.734605,0.728202
4,0.9597,1.025295,0.865261,0.730695,0.736934,0.73324
5,0.6659,0.907474,0.877177,0.737607,0.748423,0.742204
6,0.4728,0.828438,0.880843,0.737571,0.751693,0.744403
7,0.3639,0.794137,0.875344,0.902412,0.755754,0.758699
8,0.269,0.807513,0.87626,0.869764,0.783302,0.800358
9,0.2173,0.785073,0.878093,0.883249,0.812608,0.833651
10,0.1889,0.763774,0.889093,0.893663,0.820419,0.843024


[I 2025-03-27 23:32:27,698] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 4.4166255288717016e-05, 'weight_decay': 0.0, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.959,3.83685,0.278643,0.198031,0.20595,0.153378
2,3.8195,3.702835,0.412466,0.200448,0.311603,0.237434
3,3.6933,3.537141,0.43538,0.373697,0.332079,0.264974
4,3.5212,3.354664,0.525206,0.54096,0.419418,0.3871
5,3.3455,3.181195,0.593951,0.56314,0.490229,0.486596
6,3.1828,3.033518,0.665445,0.605049,0.563682,0.564238
7,3.0399,2.905601,0.693859,0.619421,0.590513,0.590814
8,2.9208,2.794288,0.705775,0.623704,0.602001,0.60249
9,2.8071,2.708387,0.72319,0.63072,0.619372,0.615586
10,2.7303,2.636803,0.732356,0.634751,0.626616,0.623643


[I 2025-03-27 23:33:26,023] Trial 85 pruned. 


Trial 86 with params: {'learning_rate': 0.0004499998627174235, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5859,2.850324,0.687443,0.617319,0.57961,0.584399
2,2.2837,1.675932,0.828598,0.697756,0.709293,0.702923
3,1.2839,1.117042,0.867094,0.72916,0.741126,0.7347
4,0.7518,0.880657,0.877177,0.738434,0.747939,0.74299
5,0.4926,0.804874,0.882676,0.740363,0.752722,0.746313
6,0.3433,0.783725,0.878093,0.735062,0.75079,0.742613
7,0.2499,0.760118,0.883593,0.878672,0.799919,0.81754
8,0.1977,0.771945,0.886343,0.893499,0.828351,0.849751
9,0.166,0.761929,0.889093,0.877755,0.83144,0.847199
10,0.1446,0.73631,0.890009,0.880669,0.831731,0.848872


[I 2025-03-27 23:34:54,256] Trial 86 finished with value: 0.8564616654994902 and parameters: {'learning_rate': 0.0004499998627174235, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 87 with params: {'learning_rate': 0.00047431985428495805, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5744,2.817367,0.656279,0.602855,0.548012,0.551457
2,2.2335,1.630011,0.831347,0.699787,0.710878,0.704938
3,1.2291,1.082525,0.868928,0.731463,0.742292,0.736622
4,0.7051,0.879462,0.88176,0.741943,0.751157,0.746334
5,0.469,0.800448,0.882676,0.74072,0.753032,0.746188
6,0.3204,0.767001,0.880843,0.905498,0.777834,0.793346
7,0.2465,0.758379,0.888176,0.889752,0.82133,0.841507
8,0.1933,0.751186,0.883593,0.891168,0.82538,0.847058
9,0.1658,0.795053,0.882676,0.888017,0.82571,0.845643
10,0.1493,0.744564,0.890926,0.896645,0.831057,0.852808


[I 2025-03-27 23:36:21,964] Trial 87 finished with value: 0.8555904976655794 and parameters: {'learning_rate': 0.00047431985428495805, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 3.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 88 with params: {'learning_rate': 2.235261791293594e-05, 'weight_decay': 0.004, 'warmup_steps': 4, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0097,3.907708,0.264895,0.356092,0.200699,0.159737
2,3.9055,3.833164,0.299725,0.190098,0.223074,0.176519
3,3.8474,3.763885,0.39505,0.195347,0.297567,0.235264
4,3.7838,3.693326,0.418882,0.205436,0.316211,0.240558
5,3.7183,3.620001,0.421632,0.369739,0.318954,0.240437
6,3.6507,3.548611,0.437214,0.374061,0.333167,0.263432
7,3.5873,3.479429,0.48396,0.461437,0.37765,0.330027
8,3.528,3.416627,0.505958,0.524958,0.400164,0.361337
9,3.4629,3.361534,0.533456,0.530917,0.427676,0.399551
10,3.4172,3.31316,0.550871,0.54117,0.445612,0.427558


[I 2025-03-27 23:37:20,927] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 6.961472074236449e-05, 'weight_decay': 0.003, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9157,3.776879,0.329973,0.213398,0.246095,0.203392
2,3.7156,3.525902,0.463795,0.378004,0.352347,0.285483
3,3.4596,3.233404,0.553621,0.552612,0.449348,0.431464
4,3.1673,2.958155,0.662695,0.606311,0.559304,0.562606
5,2.9082,2.725535,0.72594,0.630786,0.621349,0.61699


[I 2025-03-27 23:37:51,137] Trial 89 pruned. 


Trial 90 with params: {'learning_rate': 0.0004563146892141975, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5818,2.838128,0.694775,0.621714,0.586317,0.591053
2,2.2678,1.658563,0.829514,0.698112,0.709893,0.703452
3,1.2657,1.104818,0.868928,0.730059,0.742424,0.735858
4,0.7382,0.875351,0.875344,0.73794,0.746395,0.741871
5,0.4827,0.798903,0.87901,0.738105,0.749613,0.743636
6,0.338,0.782369,0.877177,0.734066,0.750436,0.741888
7,0.246,0.760075,0.887259,0.908826,0.793987,0.811757
8,0.1957,0.781,0.882676,0.889963,0.825458,0.846555
9,0.163,0.790726,0.883593,0.872888,0.828114,0.842849
10,0.1432,0.765182,0.883593,0.890281,0.825674,0.84709


[I 2025-03-27 23:38:48,650] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 3.408392995012435e-05, 'weight_decay': 0.01, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9779,3.86093,0.271311,0.192181,0.200282,0.148394
2,3.8556,3.765876,0.373969,0.196606,0.281797,0.228337
3,3.7684,3.64461,0.425298,0.205642,0.321351,0.244137
4,3.6511,3.518097,0.453712,0.37419,0.348953,0.290186
5,3.5263,3.385228,0.520623,0.556551,0.414141,0.378502
6,3.4002,3.264131,0.560953,0.541616,0.457265,0.444093
7,3.2859,3.158472,0.610449,0.577233,0.507988,0.506859
8,3.1893,3.065454,0.649863,0.597828,0.547472,0.549807
9,3.0963,2.989439,0.670944,0.606208,0.571166,0.569433
10,3.0294,2.923475,0.688359,0.614385,0.586249,0.585452


[I 2025-03-27 23:39:48,996] Trial 91 pruned. 


Trial 92 with params: {'learning_rate': 0.0004728570434403027, 'weight_decay': 0.004, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5732,2.810402,0.700275,0.621008,0.59239,0.595949
2,2.2306,1.618708,0.834097,0.701816,0.71349,0.707248
3,1.2239,1.071504,0.872594,0.734239,0.744876,0.739228
4,0.7066,0.860575,0.87901,0.74037,0.749095,0.744515
5,0.4566,0.77432,0.885426,0.742967,0.754862,0.748568
6,0.3186,0.762703,0.882676,0.905461,0.763531,0.764345
7,0.2371,0.751794,0.890926,0.888413,0.814616,0.833784
8,0.1888,0.751915,0.887259,0.89458,0.829099,0.850682
9,0.1578,0.759608,0.892759,0.894316,0.834802,0.853453
10,0.1469,0.74353,0.891842,0.896246,0.832596,0.853358


[I 2025-03-27 23:41:17,883] Trial 92 finished with value: 0.8583770445721489 and parameters: {'learning_rate': 0.0004728570434403027, 'weight_decay': 0.004, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 93 with params: {'learning_rate': 0.000491589316657617, 'weight_decay': 0.005, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5608,2.776533,0.699358,0.623903,0.592435,0.59652
2,2.1916,1.581606,0.836847,0.703791,0.715795,0.70954
3,1.1856,1.039717,0.874427,0.734364,0.746474,0.740206
4,0.6743,0.851044,0.878093,0.739672,0.748848,0.743923
5,0.4399,0.754183,0.888176,0.745691,0.75773,0.751517
6,0.3056,0.804561,0.877177,0.872485,0.796053,0.811849
7,0.2334,0.77182,0.883593,0.887224,0.817923,0.838432
8,0.1899,0.75396,0.890926,0.899678,0.830958,0.853813
9,0.1507,0.776773,0.88451,0.889881,0.827208,0.847186
10,0.1318,0.740789,0.891842,0.896731,0.832122,0.853522


[I 2025-03-27 23:42:49,785] Trial 93 finished with value: 0.8572693528808234 and parameters: {'learning_rate': 0.000491589316657617, 'weight_decay': 0.005, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 94 with params: {'learning_rate': 0.00036657627182631305, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6445,3.018322,0.630614,0.581939,0.522794,0.521586
2,2.5197,1.929561,0.812099,0.684784,0.695632,0.689636
3,1.5522,1.307103,0.860678,0.723249,0.736359,0.72955
4,0.971,1.029482,0.862511,0.730468,0.733925,0.731448
5,0.6767,0.906892,0.87626,0.735828,0.748411,0.741414
6,0.4758,0.827614,0.878093,0.736486,0.749003,0.742587
7,0.3718,0.791538,0.870761,0.897461,0.752539,0.75488
8,0.2721,0.814005,0.879927,0.87268,0.785995,0.802963
9,0.2233,0.78242,0.880843,0.882338,0.80637,0.826402
10,0.1919,0.771435,0.885426,0.889101,0.817903,0.839501


[I 2025-03-27 23:43:47,835] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 0.0002342588850787988, 'weight_decay': 0.004, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7569,3.323721,0.523373,0.394575,0.412402,0.372692
2,2.9818,2.476404,0.756187,0.655887,0.646046,0.644978
3,2.1895,1.867988,0.821265,0.692969,0.703492,0.697768
4,1.6274,1.50651,0.845096,0.716962,0.719192,0.717191
5,1.2498,1.249202,0.862511,0.726131,0.736863,0.731202
6,0.9693,1.084755,0.869844,0.730434,0.743112,0.736521
7,0.7608,0.948009,0.87626,0.736963,0.74758,0.741915
8,0.6219,0.883966,0.879927,0.738214,0.750342,0.744145
9,0.5162,0.834126,0.88176,0.740179,0.751839,0.745843
10,0.4521,0.830757,0.87901,0.737523,0.749405,0.74318


[I 2025-03-27 23:44:48,710] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 1.0675005523304308e-05, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0381,3.975328,0.209899,0.164515,0.194678,0.131148
2,3.9756,3.910378,0.265811,0.195167,0.202929,0.157977
3,3.9311,3.868379,0.280477,0.210041,0.207662,0.157897
4,3.8903,3.839596,0.31439,0.191504,0.23416,0.193321
5,3.8642,3.814685,0.331806,0.18915,0.248443,0.205773


[I 2025-03-27 23:45:18,090] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 0.00030630723715882336, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6911,3.155781,0.563703,0.564309,0.453526,0.425596
2,2.7224,2.152683,0.791017,0.671557,0.677567,0.673258
3,1.8026,1.518574,0.845096,0.713191,0.722263,0.717279
4,1.2207,1.195511,0.859762,0.72895,0.730922,0.728996
5,0.8868,1.006262,0.870761,0.731986,0.743125,0.737262
6,0.6572,0.923527,0.868011,0.727464,0.741197,0.733944
7,0.5027,0.837548,0.874427,0.737755,0.745984,0.740829
8,0.3803,0.786965,0.878093,0.737489,0.748518,0.742883
9,0.31,0.771823,0.87626,0.900773,0.756622,0.758615
10,0.2671,0.771452,0.878093,0.871003,0.78505,0.801969


[I 2025-03-27 23:46:16,458] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 0.00046964319550051953, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5751,2.816234,0.694775,0.616967,0.587065,0.590566
2,2.2377,1.626227,0.833181,0.70076,0.712824,0.706366
3,1.2317,1.078447,0.869844,0.731387,0.742836,0.736759
4,0.7128,0.863795,0.877177,0.738613,0.747713,0.742974
5,0.4621,0.777335,0.883593,0.741146,0.753807,0.747151
6,0.3225,0.770421,0.88176,0.904239,0.772023,0.779579
7,0.2405,0.760089,0.886343,0.880464,0.801911,0.819524
8,0.189,0.76383,0.887259,0.894141,0.828817,0.850174
9,0.1575,0.770372,0.891842,0.89268,0.834101,0.85214
10,0.1455,0.743778,0.893676,0.898434,0.834094,0.855247


[I 2025-03-27 23:47:46,451] Trial 98 finished with value: 0.8610680994755452 and parameters: {'learning_rate': 0.00046964319550051953, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 99 with params: {'learning_rate': 0.0003564624938428186, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6537,3.045122,0.600367,0.571207,0.490995,0.47999
2,2.5537,1.965814,0.810266,0.681785,0.694595,0.687464
3,1.5908,1.342003,0.850596,0.715653,0.727696,0.721358
4,1.0137,1.061462,0.861595,0.728261,0.733209,0.73006
5,0.7094,0.917654,0.87626,0.73617,0.747417,0.741146
6,0.5073,0.862761,0.872594,0.730723,0.745463,0.737712
7,0.3875,0.796934,0.870761,0.732541,0.743013,0.737078
8,0.2898,0.795806,0.875344,0.859821,0.773762,0.786874
9,0.236,0.789254,0.87901,0.880724,0.803753,0.824302
10,0.202,0.785362,0.878093,0.887561,0.820576,0.842856


[I 2025-03-27 23:48:46,666] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.00035162822682321623, 'weight_decay': 0.002, 'warmup_steps': 4, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.656,3.04275,0.594867,0.565754,0.487795,0.476961
2,2.5602,1.980118,0.806599,0.680055,0.690243,0.684784
3,1.6086,1.364057,0.850596,0.716472,0.726869,0.721496
4,1.0371,1.087079,0.860678,0.729585,0.730901,0.729307
5,0.7226,0.9193,0.878093,0.738025,0.748612,0.742914
6,0.5246,0.887814,0.870761,0.727895,0.7442,0.735312
7,0.4032,0.832814,0.875344,0.738714,0.746549,0.741415
8,0.296,0.789857,0.880843,0.873966,0.787401,0.804638
9,0.2431,0.783179,0.878093,0.883741,0.812272,0.833427
10,0.206,0.780933,0.882676,0.890791,0.824654,0.846398


[I 2025-03-27 23:50:14,932] Trial 100 finished with value: 0.8507649924256837 and parameters: {'learning_rate': 0.00035162822682321623, 'weight_decay': 0.002, 'warmup_steps': 4, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 101 with params: {'learning_rate': 0.0004628814047643298, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5784,2.827394,0.692942,0.616762,0.5854,0.589061
2,2.2525,1.642029,0.831347,0.69944,0.711226,0.704868
3,1.2483,1.092183,0.869844,0.73122,0.743105,0.736814
4,0.7246,0.871363,0.877177,0.73914,0.747996,0.743322
5,0.4735,0.784863,0.87901,0.737581,0.749613,0.743353
6,0.3333,0.775565,0.87901,0.901867,0.760691,0.761121
7,0.2464,0.759049,0.887259,0.881503,0.802578,0.820388
8,0.1952,0.765095,0.886343,0.8939,0.828178,0.849785
9,0.1607,0.745895,0.890009,0.893452,0.832907,0.85221
10,0.1473,0.745749,0.889093,0.893207,0.830871,0.85103


[I 2025-03-27 23:51:41,929] Trial 101 finished with value: 0.8587637399251523 and parameters: {'learning_rate': 0.0004628814047643298, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 102 with params: {'learning_rate': 0.00027802004198722004, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.7146,3.210219,0.553621,0.571975,0.442972,0.414191
2,2.8093,2.264195,0.773602,0.660983,0.661816,0.659069
3,1.9384,1.640992,0.835014,0.704276,0.714247,0.709016
4,1.3685,1.306928,0.852429,0.722941,0.724989,0.722999
5,1.0139,1.096587,0.865261,0.727859,0.739291,0.733283
6,0.7679,0.97235,0.869844,0.730712,0.742799,0.736251
7,0.5947,0.869542,0.875344,0.737404,0.746621,0.741196
8,0.4602,0.819764,0.87626,0.736516,0.747136,0.741698
9,0.3755,0.776362,0.88176,0.738992,0.752543,0.745454
10,0.3198,0.784486,0.882676,0.905464,0.761778,0.763667


[I 2025-03-27 23:52:39,489] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 4.190718069183538e-05, 'weight_decay': 0.005, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9654,3.842864,0.27956,0.2009,0.206658,0.155598
2,3.8285,3.716999,0.406049,0.19713,0.306785,0.233798
3,3.7122,3.563208,0.428048,0.370546,0.325507,0.25297
4,3.5529,3.393518,0.513291,0.554843,0.407704,0.371006
5,3.3876,3.226714,0.573786,0.547292,0.469482,0.460044


[I 2025-03-27 23:53:10,112] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 9.400853497184947e-05, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8996,3.721222,0.394134,0.216546,0.295854,0.24544
2,3.6224,3.364953,0.527039,0.554323,0.417943,0.382045
3,3.25,2.990154,0.646196,0.609598,0.541981,0.542181
4,2.8812,2.654481,0.727773,0.63577,0.621591,0.620111
5,2.5665,2.382991,0.762603,0.650909,0.655429,0.648456
6,2.2986,2.169781,0.799267,0.678912,0.68292,0.679551
7,2.0869,1.988255,0.822181,0.695438,0.702797,0.698908
8,1.908,1.854122,0.830431,0.701166,0.709594,0.705217
9,1.7637,1.74872,0.837764,0.708953,0.715579,0.711877
10,1.651,1.667102,0.84418,0.711046,0.721254,0.715984


[I 2025-03-27 23:54:08,220] Trial 104 pruned. 


Trial 105 with params: {'learning_rate': 0.00016896026180779306, 'weight_decay': 0.002, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8307,3.533259,0.441797,0.36472,0.335869,0.267803
2,3.2776,2.864414,0.689276,0.619556,0.584502,0.583947
3,2.6358,2.312861,0.767186,0.659457,0.65666,0.653907
4,2.126,1.925167,0.813932,0.692092,0.69352,0.691837
5,1.7405,1.618461,0.848763,0.715987,0.724794,0.720205


[I 2025-03-27 23:54:37,615] Trial 105 pruned. 


Trial 106 with params: {'learning_rate': 2.777716320805797e-05, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9876,3.878354,0.254812,0.18799,0.187797,0.130142
2,3.878,3.803042,0.327223,0.197602,0.244929,0.200668
3,3.8109,3.710137,0.420715,0.2047,0.317309,0.244275
4,3.7251,3.613926,0.431714,0.375973,0.326282,0.25064
5,3.6322,3.514792,0.461962,0.36456,0.356246,0.298805
6,3.5379,3.418435,0.511457,0.552707,0.40488,0.36558
7,3.4485,3.327418,0.543538,0.550365,0.437877,0.415703
8,3.3681,3.247944,0.569203,0.545503,0.464806,0.454476
9,3.289,3.183944,0.595784,0.570623,0.49287,0.490152
10,3.2326,3.12693,0.629698,0.588063,0.527403,0.52821


[I 2025-03-27 23:55:38,394] Trial 106 pruned. 


Trial 107 with params: {'learning_rate': 0.00047777618577968294, 'weight_decay': 0.004, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5701,2.801473,0.698442,0.622694,0.591322,0.595457
2,2.2209,1.608713,0.834097,0.702182,0.713477,0.707477
3,1.2142,1.062219,0.870761,0.732698,0.743578,0.73777
4,0.6971,0.85488,0.877177,0.739095,0.747678,0.743164
5,0.4473,0.757156,0.885426,0.743871,0.754598,0.74886
6,0.3115,0.750059,0.882676,0.905328,0.78173,0.795172
7,0.231,0.738531,0.890009,0.88799,0.814178,0.833381
8,0.1826,0.774352,0.880843,0.871257,0.815776,0.833537
9,0.1496,0.791079,0.879927,0.868808,0.825814,0.839507
10,0.1464,0.76638,0.883593,0.873807,0.82713,0.843031


[I 2025-03-27 23:56:37,759] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 0.0004520369817040292, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.582,2.849557,0.670027,0.609008,0.563379,0.567768
2,2.2915,1.687873,0.829514,0.698562,0.70945,0.703659
3,1.2938,1.119027,0.862511,0.726174,0.736548,0.730434
4,0.7513,0.887344,0.875344,0.737186,0.746024,0.741417
5,0.4875,0.786584,0.88451,0.74204,0.754392,0.747883
6,0.3355,0.752695,0.88451,0.908224,0.773589,0.782604
7,0.2482,0.781994,0.88451,0.875137,0.792222,0.807769
8,0.1999,0.777054,0.885426,0.888096,0.819172,0.839533
9,0.1734,0.795485,0.88176,0.888727,0.824803,0.845634
10,0.1625,0.773599,0.886343,0.866992,0.827412,0.842298


[I 2025-03-27 23:57:37,626] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.0004505179186160738, 'weight_decay': 0.004, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5857,2.849717,0.687443,0.617319,0.57961,0.584399
2,2.2823,1.674345,0.829514,0.698313,0.710224,0.703688
3,1.2822,1.115869,0.868011,0.729816,0.741793,0.735388
4,0.7505,0.879918,0.877177,0.738828,0.747939,0.743186
5,0.4919,0.80422,0.882676,0.740363,0.752722,0.746313
6,0.343,0.784472,0.87901,0.735789,0.751457,0.743313
7,0.2497,0.758522,0.886343,0.884823,0.811153,0.830251
8,0.1967,0.779107,0.883593,0.891297,0.825922,0.8474
9,0.1671,0.752109,0.887259,0.876369,0.830093,0.845846
10,0.1442,0.738959,0.890009,0.895069,0.831449,0.852369


[I 2025-03-27 23:59:05,676] Trial 109 finished with value: 0.8544476675862024 and parameters: {'learning_rate': 0.0004505179186160738, 'weight_decay': 0.004, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 110 with params: {'learning_rate': 0.0004974348415894276, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5571,2.765752,0.706691,0.627218,0.598612,0.602565
2,2.1772,1.567269,0.83868,0.705716,0.717107,0.71119
3,1.1711,1.030762,0.872594,0.733216,0.745071,0.738938
4,0.6641,0.850322,0.877177,0.738637,0.748384,0.743123
5,0.4284,0.767813,0.879927,0.739841,0.751403,0.745217
6,0.3019,0.796256,0.877177,0.874069,0.795693,0.813087
7,0.2253,0.778899,0.878093,0.882316,0.813414,0.833749
8,0.1737,0.754048,0.888176,0.879109,0.830852,0.847659
9,0.1497,0.777624,0.889093,0.879686,0.831611,0.847948
10,0.1367,0.763522,0.887259,0.867234,0.828761,0.84318


[I 2025-03-28 00:00:33,847] Trial 110 finished with value: 0.8596404416575818 and parameters: {'learning_rate': 0.0004974348415894276, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 111 with params: {'learning_rate': 0.0004437759749957221, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5911,2.865315,0.681027,0.612857,0.573588,0.578485
2,2.3,1.692235,0.830431,0.698733,0.710737,0.704019
3,1.3007,1.126361,0.866178,0.728358,0.740411,0.734
4,0.7642,0.887474,0.87626,0.738121,0.746744,0.742156
5,0.5026,0.808991,0.883593,0.7404,0.753222,0.74653
6,0.3482,0.787358,0.88176,0.905607,0.762553,0.763992
7,0.2571,0.753764,0.888176,0.887257,0.812619,0.832282
8,0.1976,0.805138,0.877177,0.888459,0.820774,0.842517
9,0.1709,0.73146,0.890926,0.894757,0.832966,0.85268
10,0.1469,0.735623,0.893676,0.883352,0.834266,0.85137


[I 2025-03-28 00:02:03,742] Trial 111 finished with value: 0.8580154793883699 and parameters: {'learning_rate': 0.0004437759749957221, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 112 with params: {'learning_rate': 0.00012041223802555697, 'weight_decay': 0.002, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8769,3.66142,0.439963,0.219625,0.330798,0.263709
2,3.5025,3.178962,0.591201,0.569517,0.485566,0.47558
3,3.0225,2.743019,0.697525,0.630776,0.594735,0.594153
4,2.5927,2.363449,0.75802,0.652344,0.649384,0.645987
5,2.2383,2.06208,0.811182,0.685022,0.695619,0.689733
6,1.9361,1.834702,0.830431,0.700011,0.710345,0.704892
7,1.7084,1.657274,0.845096,0.714806,0.721225,0.717749
8,1.5218,1.532856,0.853346,0.71982,0.727788,0.723636
9,1.3757,1.433151,0.856095,0.722602,0.730273,0.726121
10,1.2642,1.357457,0.857929,0.721091,0.733055,0.726986


[I 2025-03-28 00:03:02,251] Trial 112 pruned. 


Trial 113 with params: {'learning_rate': 0.00030101677781851937, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6945,3.156183,0.566453,0.57427,0.456589,0.431927
2,2.7277,2.165553,0.786434,0.667611,0.673456,0.669338
3,1.8213,1.538186,0.845096,0.712791,0.721775,0.716979
4,1.2484,1.220195,0.852429,0.723273,0.724596,0.722843
5,0.9096,1.020224,0.868011,0.730112,0.741332,0.735522


[I 2025-03-28 00:03:31,199] Trial 113 pruned. 


Trial 114 with params: {'learning_rate': 0.00046814216034870264, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5758,2.818899,0.694775,0.617145,0.587016,0.590704
2,2.2408,1.629685,0.832264,0.700188,0.711893,0.7056
3,1.2352,1.081917,0.868928,0.730347,0.74212,0.73591
4,0.7152,0.865703,0.875344,0.737283,0.746331,0.741618
5,0.4648,0.778943,0.882676,0.740359,0.75314,0.746453
6,0.3252,0.77517,0.880843,0.9029,0.771321,0.778439
7,0.2423,0.764231,0.886343,0.880449,0.801545,0.81936
8,0.1924,0.777917,0.88176,0.890066,0.824887,0.845971
9,0.1567,0.763393,0.890009,0.891755,0.832886,0.851247
10,0.147,0.750277,0.891842,0.89668,0.832026,0.853017


[I 2025-03-28 00:04:56,180] Trial 114 finished with value: 0.8595834137811598 and parameters: {'learning_rate': 0.00046814216034870264, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 115 with params: {'learning_rate': 8.589423926025179e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9048,3.736035,0.373969,0.210282,0.280686,0.233859
2,3.6545,3.419329,0.519707,0.501127,0.408819,0.370891
3,3.3195,3.069686,0.621448,0.596947,0.516508,0.513971
4,2.975,2.752021,0.714024,0.630716,0.608159,0.608353
5,2.6771,2.493378,0.747938,0.642151,0.642235,0.636527


[I 2025-03-28 00:05:24,598] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.00032282399298829724, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6767,3.107118,0.584785,0.56362,0.476014,0.46081
2,2.6536,2.079084,0.797434,0.673713,0.683668,0.678039
3,1.721,1.452346,0.852429,0.718463,0.727731,0.722931
4,1.1467,1.154856,0.857929,0.727758,0.728755,0.727212
5,0.8199,0.965658,0.875344,0.735624,0.746755,0.74096
6,0.6046,0.898834,0.871677,0.731697,0.744264,0.737128
7,0.4642,0.838836,0.869844,0.733227,0.742214,0.736833
8,0.3439,0.78969,0.87626,0.90263,0.756238,0.759573
9,0.2816,0.774564,0.877177,0.870714,0.784383,0.801362
10,0.2432,0.764295,0.882676,0.885682,0.80731,0.828572


[I 2025-03-28 00:06:54,678] Trial 116 finished with value: 0.8484989686848351 and parameters: {'learning_rate': 0.00032282399298829724, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 5.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 117 with params: {'learning_rate': 0.00010846317855833546, 'weight_decay': 0.008, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.87,3.676151,0.428964,0.213903,0.322303,0.256958
2,3.5383,3.238133,0.56462,0.539663,0.457403,0.440568
3,3.1031,2.829867,0.692026,0.628976,0.587157,0.588769
4,2.7025,2.474108,0.75527,0.652225,0.64626,0.643826
5,2.3684,2.189322,0.802016,0.677489,0.687691,0.681117


[I 2025-03-28 00:07:23,435] Trial 117 pruned. 


Trial 118 with params: {'learning_rate': 0.00041942418918356953, 'weight_decay': 0.004, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6105,2.916547,0.653529,0.598464,0.545835,0.549177
2,2.3687,1.763633,0.820348,0.691108,0.702118,0.696151
3,1.3733,1.174007,0.866178,0.728046,0.74068,0.734044
4,0.8226,0.928443,0.87626,0.738796,0.745932,0.742012
5,0.555,0.844183,0.878093,0.737521,0.749717,0.743006
6,0.3877,0.815122,0.872594,0.731486,0.746161,0.738447
7,0.2901,0.781014,0.877177,0.90179,0.767452,0.776465
8,0.2225,0.767167,0.88451,0.886995,0.808264,0.82982
9,0.1792,0.777281,0.889093,0.893335,0.831868,0.851432
10,0.1567,0.748598,0.893676,0.89861,0.834373,0.85545


[I 2025-03-28 00:08:50,430] Trial 118 finished with value: 0.8544720088354758 and parameters: {'learning_rate': 0.00041942418918356953, 'weight_decay': 0.004, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 119 with params: {'learning_rate': 0.00041679306913721157, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6104,2.925182,0.628781,0.591792,0.519643,0.517175
2,2.3817,1.782226,0.821265,0.691768,0.703005,0.69711
3,1.3893,1.189623,0.864345,0.727318,0.739236,0.732934
4,0.8332,0.952975,0.867094,0.732641,0.737623,0.734567
5,0.5681,0.864587,0.874427,0.734106,0.746057,0.739303
6,0.3997,0.817163,0.873511,0.730286,0.746495,0.738041
7,0.305,0.792069,0.877177,0.903501,0.78441,0.803866
8,0.2281,0.751906,0.879927,0.884666,0.813874,0.835291
9,0.1873,0.782297,0.885426,0.891562,0.827969,0.848612
10,0.1635,0.763363,0.887259,0.878813,0.829007,0.846184


[I 2025-03-28 00:10:14,335] Trial 119 finished with value: 0.8472238782257356 and parameters: {'learning_rate': 0.00041679306913721157, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 120 with params: {'learning_rate': 0.00047388026980424257, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5727,2.808763,0.700275,0.622642,0.59239,0.596418
2,2.2288,1.616992,0.834097,0.701816,0.71349,0.707248
3,1.2222,1.069454,0.869844,0.731924,0.742849,0.737044
4,0.7046,0.859129,0.879927,0.741073,0.74981,0.745224
5,0.4547,0.7709,0.88451,0.742354,0.754147,0.747892
6,0.3168,0.760492,0.88176,0.904746,0.762816,0.763684
7,0.2334,0.752859,0.888176,0.889734,0.821607,0.841499
8,0.1894,0.750235,0.886343,0.893321,0.828383,0.849674
9,0.1575,0.770132,0.890009,0.877493,0.832754,0.847586
10,0.145,0.738409,0.892759,0.897168,0.833271,0.854127


[I 2025-03-28 00:11:39,736] Trial 120 finished with value: 0.8576139732422471 and parameters: {'learning_rate': 0.00047388026980424257, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 121 with params: {'learning_rate': 1.5745418122329243e-05, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0247,3.942669,0.247479,0.174362,0.208226,0.160871
2,3.9402,3.869138,0.267644,0.206824,0.197679,0.141297
3,3.8896,3.825488,0.318057,0.191749,0.237286,0.196532
4,3.8441,3.779621,0.378552,0.18967,0.28496,0.227677
5,3.8056,3.734799,0.405133,0.197284,0.305507,0.236945
6,3.767,3.691765,0.422548,0.205825,0.318858,0.245014
7,3.7338,3.650575,0.423465,0.206662,0.319912,0.241726
8,3.7002,3.612049,0.424381,0.37081,0.321073,0.242748
9,3.6596,3.577147,0.428964,0.371279,0.325491,0.250948
10,3.636,3.546288,0.454629,0.36639,0.348492,0.287271


[I 2025-03-28 00:12:37,735] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 3.480197626483917e-05, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9809,3.860907,0.272227,0.187767,0.201206,0.151409
2,3.8545,3.759274,0.373052,0.188549,0.281577,0.224406
3,3.7645,3.638503,0.420715,0.202432,0.318017,0.24032
4,3.6438,3.50816,0.458295,0.371275,0.353657,0.297384
5,3.515,3.371334,0.527956,0.545089,0.421938,0.390765


[I 2025-03-28 00:13:07,152] Trial 122 pruned. 


Trial 123 with params: {'learning_rate': 0.00046012046737579427, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5798,2.83205,0.692942,0.617845,0.585118,0.588941
2,2.2591,1.649145,0.831347,0.699493,0.711226,0.704852
3,1.2559,1.097768,0.869844,0.73122,0.743105,0.736814
4,0.7302,0.873277,0.874427,0.737518,0.745418,0.741138
5,0.4773,0.791374,0.87901,0.737788,0.749613,0.743488
6,0.3354,0.783235,0.87626,0.899672,0.758691,0.759019
7,0.2464,0.762984,0.887259,0.909252,0.794168,0.811992
8,0.1945,0.782226,0.880843,0.890508,0.823419,0.845362
9,0.1639,0.749917,0.890926,0.894141,0.833045,0.852582
10,0.1489,0.757541,0.883593,0.888863,0.826051,0.846433


[I 2025-03-28 00:14:31,124] Trial 123 finished with value: 0.8585564682190402 and parameters: {'learning_rate': 0.00046012046737579427, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 4.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 124 with params: {'learning_rate': 0.00033103472929355476, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6706,3.090832,0.586618,0.563614,0.478606,0.465044
2,2.6283,2.049722,0.802933,0.677301,0.688224,0.682045
3,1.6869,1.423926,0.851512,0.717621,0.727065,0.722173
4,1.1118,1.133167,0.859762,0.729085,0.730137,0.728566
5,0.7902,0.94932,0.878093,0.737384,0.748852,0.742864
6,0.5781,0.891089,0.871677,0.731646,0.744658,0.737558
7,0.4435,0.834687,0.873511,0.735915,0.745027,0.739632
8,0.3284,0.789833,0.874427,0.900747,0.754654,0.757842
9,0.266,0.775126,0.882676,0.880861,0.797666,0.817332
10,0.2289,0.770694,0.88176,0.884676,0.80611,0.827566


[I 2025-03-28 00:15:29,096] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 0.00043525094885982023, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5981,2.884207,0.673694,0.60871,0.566363,0.57069
2,2.3238,1.717148,0.828598,0.696912,0.70932,0.702487
3,1.3256,1.140994,0.866178,0.728044,0.740362,0.733875
4,0.7823,0.899481,0.874427,0.737523,0.744833,0.740798
5,0.5205,0.835882,0.878093,0.735575,0.749885,0.742164
6,0.3603,0.792024,0.87626,0.735557,0.748413,0.741616
7,0.2716,0.777053,0.882676,0.877902,0.799336,0.816872
8,0.2073,0.777183,0.882676,0.889891,0.825277,0.84637
9,0.1759,0.767358,0.888176,0.892263,0.83117,0.850648
10,0.1519,0.742969,0.891842,0.897299,0.83249,0.853756


[I 2025-03-28 00:16:55,083] Trial 125 finished with value: 0.8611045236003115 and parameters: {'learning_rate': 0.00043525094885982023, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 5.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 126 with params: {'learning_rate': 0.00032695636245912174, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6735,3.098344,0.589368,0.568979,0.480606,0.46751
2,2.6401,2.063607,0.79835,0.674125,0.684334,0.678582
3,1.7032,1.437533,0.852429,0.718463,0.727731,0.722931
4,1.1285,1.143696,0.859762,0.729085,0.730137,0.728566
5,0.8041,0.957003,0.87626,0.736291,0.74747,0.74165
6,0.5913,0.895249,0.871677,0.731775,0.744278,0.737335
7,0.454,0.8359,0.871677,0.734554,0.743645,0.738264
8,0.3362,0.789956,0.874427,0.901183,0.75464,0.758044
9,0.2739,0.774849,0.879927,0.878725,0.795618,0.815261
10,0.2356,0.762262,0.88451,0.886813,0.80874,0.82997


[I 2025-03-28 00:17:52,394] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 0.0004275712614921812, 'weight_decay': 0.001, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.603,2.902339,0.649863,0.59614,0.54391,0.546624
2,2.3554,1.749567,0.826764,0.696104,0.707768,0.701572
3,1.3627,1.162534,0.867094,0.729206,0.740867,0.734632
4,0.8109,0.914936,0.871677,0.736422,0.742135,0.738889
5,0.5403,0.811901,0.88176,0.737579,0.752349,0.744649
6,0.3682,0.766339,0.882676,0.90647,0.772058,0.780972
7,0.2754,0.740481,0.885426,0.909914,0.792171,0.811309
8,0.2185,0.786875,0.878093,0.885129,0.811684,0.833816
9,0.1736,0.789839,0.886343,0.877067,0.828881,0.845299
10,0.1541,0.756343,0.886343,0.892907,0.827175,0.849113


[I 2025-03-28 00:19:18,768] Trial 127 finished with value: 0.852420801382277 and parameters: {'learning_rate': 0.0004275712614921812, 'weight_decay': 0.001, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 128 with params: {'learning_rate': 1.1489587887855655e-05, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0359,3.969876,0.220898,0.169097,0.201557,0.140802
2,3.9695,3.902338,0.261228,0.197429,0.197074,0.150788
3,3.9233,3.860815,0.283226,0.189683,0.209406,0.158422
4,3.8823,3.830363,0.316224,0.185056,0.236277,0.196297
5,3.8549,3.802589,0.343721,0.186656,0.258087,0.212176
6,3.8296,3.773534,0.373052,0.190791,0.280855,0.226734
7,3.8114,3.744033,0.401467,0.195381,0.302819,0.233519
8,3.7895,3.720249,0.411549,0.201419,0.310594,0.236769
9,3.7631,3.699386,0.420715,0.206843,0.317434,0.24297
10,3.7531,3.680793,0.423465,0.206404,0.31958,0.24453


[I 2025-03-28 00:20:16,400] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 9.081766045823687e-05, 'weight_decay': 0.005, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.9064,3.730895,0.374885,0.211014,0.281409,0.233967
2,3.6396,3.391552,0.52154,0.49763,0.412303,0.374829
3,3.2815,3.02446,0.637947,0.607608,0.532682,0.532396
4,2.9212,2.695762,0.724106,0.636494,0.617248,0.616895
5,2.6122,2.428425,0.752521,0.643782,0.64676,0.639997


[I 2025-03-28 00:20:48,148] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.00041206423745911586, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6159,2.931135,0.648029,0.596477,0.541172,0.544004
2,2.3891,1.785079,0.819432,0.690364,0.701451,0.695414
3,1.3961,1.190969,0.866178,0.727987,0.740896,0.734152
4,0.8418,0.942228,0.877177,0.73976,0.746612,0.742845
5,0.5729,0.853635,0.87626,0.736023,0.748384,0.741583
6,0.4007,0.814107,0.874427,0.733218,0.747341,0.739859
7,0.3005,0.789526,0.874427,0.900631,0.764896,0.774587
8,0.2283,0.772604,0.88451,0.891536,0.817058,0.839705
9,0.1815,0.776789,0.889093,0.895252,0.83095,0.851793
10,0.1585,0.761345,0.889093,0.895175,0.830497,0.851834


[I 2025-03-28 00:22:17,896] Trial 130 finished with value: 0.8551794858814104 and parameters: {'learning_rate': 0.00041206423745911586, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 131 with params: {'learning_rate': 0.0004754158471713139, 'weight_decay': 0.002, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5736,2.815066,0.656279,0.602833,0.547977,0.55141
2,2.2305,1.627096,0.832264,0.70049,0.711593,0.705608
3,1.2264,1.080885,0.869844,0.732571,0.742676,0.737355
4,0.7032,0.878765,0.88176,0.742027,0.751171,0.746433
5,0.4681,0.803027,0.882676,0.740795,0.753032,0.746186
6,0.3187,0.768138,0.880843,0.905195,0.777785,0.793207
7,0.2455,0.76458,0.888176,0.889829,0.821317,0.841573
8,0.1941,0.752278,0.886343,0.893726,0.82738,0.84929
9,0.1658,0.802261,0.882676,0.88806,0.825508,0.845485
10,0.1498,0.751301,0.890009,0.894483,0.830784,0.85164


[I 2025-03-28 00:23:44,065] Trial 131 finished with value: 0.8570980760792409 and parameters: {'learning_rate': 0.0004754158471713139, 'weight_decay': 0.002, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 3.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 132 with params: {'learning_rate': 0.000478609439770032, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5695,2.799751,0.699358,0.623142,0.59232,0.596365
2,2.2192,1.607091,0.834097,0.702182,0.713477,0.707477
3,1.2127,1.060783,0.869844,0.731679,0.742911,0.736901
4,0.6958,0.854315,0.878093,0.739354,0.748358,0.743642
5,0.4461,0.75679,0.885426,0.744159,0.754598,0.749
6,0.3114,0.746751,0.883593,0.905839,0.782348,0.795738
7,0.2307,0.741701,0.889093,0.892654,0.831773,0.851267
8,0.1813,0.781525,0.883593,0.877564,0.826412,0.84427
9,0.1493,0.78951,0.885426,0.874251,0.829938,0.844394
10,0.1403,0.758318,0.890926,0.880929,0.832539,0.849458


[I 2025-03-28 00:25:09,241] Trial 132 finished with value: 0.8573789530034636 and parameters: {'learning_rate': 0.000478609439770032, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 133 with params: {'learning_rate': 0.00026789951797849684, 'weight_decay': 0.007, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.73,3.243443,0.544455,0.570002,0.433822,0.401476
2,2.8563,2.318679,0.764436,0.65617,0.653694,0.651891
3,2.0005,1.694209,0.833181,0.70164,0.71369,0.707417
4,1.4303,1.352941,0.849679,0.720921,0.723205,0.721161
5,1.0667,1.132279,0.862511,0.726489,0.736911,0.731439
6,0.8104,0.997759,0.868928,0.729691,0.742397,0.73561
7,0.6321,0.887898,0.87626,0.738513,0.747358,0.742133
8,0.4977,0.829901,0.87626,0.735982,0.747198,0.741495
9,0.4056,0.780164,0.88176,0.737623,0.752327,0.744628
10,0.3467,0.793485,0.87901,0.73513,0.749899,0.742219


[I 2025-03-28 00:26:09,151] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 0.0001418653757938253, 'weight_decay': 0.002, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8479,3.595653,0.441797,0.384511,0.333193,0.262696
2,3.3905,3.021677,0.635197,0.591422,0.531762,0.530923
3,2.8334,2.529248,0.745188,0.65438,0.637268,0.636266
4,2.367,2.143888,0.785518,0.67036,0.670782,0.668632
5,1.9933,1.836626,0.834097,0.703116,0.714012,0.708376


[I 2025-03-28 00:26:40,322] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 0.0002954992719017488, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6992,3.1691,0.56462,0.574923,0.4538,0.427746
2,2.7471,2.188637,0.785518,0.668752,0.672045,0.668894
3,1.8484,1.561642,0.84143,0.70992,0.71906,0.714185
4,1.2761,1.239075,0.851512,0.723024,0.724212,0.72247
5,0.933,1.036723,0.868011,0.730112,0.741332,0.735522
6,0.6972,0.93581,0.873511,0.733791,0.746393,0.739461
7,0.5382,0.854271,0.872594,0.734954,0.744311,0.738832
8,0.4091,0.799945,0.874427,0.734065,0.745886,0.739876
9,0.3364,0.775689,0.877177,0.901476,0.758139,0.75961
10,0.2881,0.786878,0.880843,0.871606,0.787568,0.803499


[I 2025-03-28 00:27:37,422] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 0.000144126682671934, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8327,3.580186,0.44363,0.34608,0.33487,0.269323
2,3.3665,2.994012,0.63703,0.590027,0.532437,0.533953
3,2.8035,2.498372,0.748854,0.653334,0.640073,0.639806
4,2.3356,2.116355,0.796517,0.678579,0.679258,0.677397
5,1.9638,1.811856,0.833181,0.703123,0.712256,0.707526


[I 2025-03-28 00:28:05,905] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 0.0004074926320741934, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6151,2.937114,0.626948,0.58944,0.517897,0.514532
2,2.4029,1.807442,0.820348,0.691081,0.702338,0.696375
3,1.4174,1.216788,0.862511,0.725899,0.73732,0.731369
4,0.8604,0.973607,0.866178,0.732794,0.736357,0.73383
5,0.5911,0.873061,0.87626,0.73669,0.747369,0.741387
6,0.4201,0.822166,0.87626,0.732563,0.748314,0.74011
7,0.3174,0.812315,0.874427,0.903161,0.772973,0.789042
8,0.2382,0.781656,0.88176,0.890052,0.824058,0.84605
9,0.1916,0.783056,0.883593,0.890193,0.826031,0.846903
10,0.1648,0.757923,0.889093,0.894651,0.83047,0.851672


[I 2025-03-28 00:29:34,053] Trial 137 finished with value: 0.8471787314525612 and parameters: {'learning_rate': 0.0004074926320741934, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 138 with params: {'learning_rate': 0.000496284607922274, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5579,2.767952,0.706691,0.62821,0.598329,0.602486
2,2.1802,1.570478,0.837764,0.705111,0.716426,0.710561
3,1.1743,1.032519,0.872594,0.733216,0.745071,0.738938
4,0.6662,0.850872,0.877177,0.73906,0.748384,0.743308
5,0.4303,0.766665,0.88176,0.741158,0.752737,0.746575
6,0.3037,0.794178,0.874427,0.866083,0.784604,0.799344
7,0.2249,0.771119,0.880843,0.884451,0.815414,0.83586
8,0.1795,0.756954,0.887259,0.878634,0.829907,0.846932
9,0.1496,0.792544,0.888176,0.87739,0.831042,0.846697
10,0.1371,0.759291,0.888176,0.867421,0.829908,0.843975


[I 2025-03-28 00:31:01,269] Trial 138 finished with value: 0.8570223358503432 and parameters: {'learning_rate': 0.000496284607922274, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 5.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 139 with params: {'learning_rate': 0.00038954511826543996, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6319,2.976993,0.623281,0.58422,0.5155,0.51315
2,2.4542,1.857057,0.818515,0.688313,0.701636,0.694571
3,1.4711,1.248888,0.862511,0.724767,0.737647,0.730957
4,0.9062,0.989845,0.868011,0.733612,0.739098,0.735763
5,0.6259,0.884646,0.872594,0.733701,0.744445,0.738438


[I 2025-03-28 00:31:32,284] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.00029497757037821496, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.9, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6997,3.170303,0.56462,0.574923,0.4538,0.427746
2,2.749,2.190859,0.784601,0.668159,0.671365,0.668283
3,1.851,1.56394,0.84143,0.70992,0.71906,0.714185
4,1.2788,1.241011,0.851512,0.723024,0.724212,0.72247
5,0.9354,1.038508,0.868928,0.730903,0.741999,0.73623
6,0.6993,0.937051,0.871677,0.732361,0.744464,0.737841
7,0.5396,0.854172,0.873511,0.735732,0.744978,0.739515
8,0.4104,0.800799,0.874427,0.734065,0.745886,0.739876
9,0.3375,0.775204,0.877177,0.901476,0.758139,0.75961
10,0.2891,0.785596,0.879927,0.862433,0.777764,0.790086


[I 2025-03-28 00:32:29,263] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 0.00042297165729851576, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.606,2.910068,0.649863,0.59653,0.543578,0.546303
2,2.3661,1.76103,0.824931,0.694664,0.706386,0.700176
3,1.3755,1.171766,0.868011,0.729909,0.741865,0.735458
4,0.822,0.921504,0.871677,0.735727,0.742467,0.738789
5,0.5502,0.825073,0.875344,0.731837,0.747669,0.739336


[I 2025-03-28 00:32:59,255] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 0.0004358474817772437, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5977,2.883048,0.673694,0.60871,0.566363,0.57069
2,2.3222,1.715456,0.828598,0.696912,0.70932,0.702487
3,1.3239,1.139812,0.865261,0.727368,0.739695,0.733185
4,0.781,0.898453,0.874427,0.737523,0.744833,0.740798
5,0.5194,0.834252,0.878093,0.735575,0.749885,0.742164
6,0.3594,0.790634,0.878093,0.903768,0.758883,0.761263
7,0.2708,0.776535,0.883593,0.878645,0.800002,0.81757
8,0.208,0.795714,0.878093,0.887041,0.821736,0.842993
9,0.1789,0.765225,0.889093,0.892809,0.831038,0.850884
10,0.1521,0.741924,0.890926,0.881717,0.83156,0.849224


[I 2025-03-28 00:34:28,555] Trial 142 finished with value: 0.8609285137996928 and parameters: {'learning_rate': 0.0004358474817772437, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.5}. Best is trial 56 with value: 0.8639964697109707.


Trial 143 with params: {'learning_rate': 0.00040051443410772544, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6189,2.945219,0.653529,0.595951,0.54713,0.549578
2,2.4175,1.817334,0.821265,0.691417,0.70372,0.697177
3,1.434,1.217167,0.866178,0.72781,0.740769,0.733956
4,0.8693,0.961611,0.872594,0.736728,0.742604,0.739211
5,0.5885,0.861249,0.875344,0.734239,0.746909,0.740261


[I 2025-03-28 00:34:57,263] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 1.1375872635111501e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0344,3.968575,0.220898,0.169202,0.201459,0.140403
2,3.9688,3.902242,0.260312,0.197033,0.196358,0.150004
3,3.9235,3.861357,0.27956,0.186989,0.206615,0.155048
4,3.883,3.831789,0.31714,0.189871,0.236736,0.19706
5,3.856,3.804914,0.340972,0.189255,0.255789,0.211245
6,3.8312,3.776793,0.370302,0.193262,0.278503,0.226601
7,3.8136,3.747493,0.398717,0.19448,0.300661,0.233824
8,3.792,3.723468,0.411549,0.200732,0.310497,0.237363
9,3.7659,3.702911,0.420715,0.206534,0.317385,0.24325
10,3.7562,3.684543,0.424381,0.206721,0.320246,0.245327


[I 2025-03-28 00:35:53,392] Trial 144 pruned. 


Trial 145 with params: {'learning_rate': 0.0003778902210541236, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6398,3.001477,0.619615,0.584426,0.511839,0.508511
2,2.4891,1.895279,0.814849,0.685563,0.698346,0.691657
3,1.5123,1.280388,0.858845,0.722286,0.734605,0.728202
4,0.942,1.013703,0.866178,0.731754,0.737649,0.734114
5,0.6524,0.898686,0.877177,0.737809,0.748423,0.74234
6,0.4599,0.822323,0.879927,0.736688,0.751075,0.743657
7,0.3558,0.804399,0.870761,0.898701,0.75201,0.754733
8,0.2646,0.807329,0.87626,0.876084,0.792328,0.812066
9,0.2119,0.78838,0.880843,0.887359,0.823975,0.844497
10,0.1842,0.771576,0.887259,0.893352,0.828373,0.850034


[I 2025-03-28 00:37:19,702] Trial 145 finished with value: 0.8531622675507963 and parameters: {'learning_rate': 0.0003778902210541236, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 146 with params: {'learning_rate': 0.0003311267076628163, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6706,3.090728,0.586618,0.563614,0.478606,0.465044
2,2.6281,2.049443,0.802933,0.677301,0.688224,0.682045
3,1.6867,1.423644,0.851512,0.717621,0.727065,0.722173
4,1.1115,1.132974,0.859762,0.729085,0.730137,0.728566
5,0.7901,0.949153,0.878093,0.737384,0.748852,0.742864
6,0.5779,0.890921,0.871677,0.731646,0.744658,0.737558
7,0.4433,0.83447,0.873511,0.735915,0.745027,0.739632
8,0.3282,0.789157,0.873511,0.899824,0.753723,0.756895
9,0.2658,0.774941,0.88176,0.880164,0.797,0.816647
10,0.229,0.768832,0.880843,0.884191,0.805429,0.826973


[I 2025-03-28 00:38:45,402] Trial 146 finished with value: 0.8500362733746778 and parameters: {'learning_rate': 0.0003311267076628163, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.0}. Best is trial 56 with value: 0.8639964697109707.


Trial 147 with params: {'learning_rate': 0.00031914175756970017, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.6000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6834,3.116971,0.577452,0.555382,0.470117,0.454195
2,2.6692,2.100063,0.7956,0.672357,0.681103,0.675937
3,1.7448,1.473286,0.847846,0.715282,0.723678,0.719253
4,1.1716,1.170517,0.857012,0.726173,0.728137,0.726214
5,0.8371,0.972004,0.869844,0.73171,0.742485,0.736866


[I 2025-03-28 00:39:15,642] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.00048013459807663277, 'weight_decay': 0.0, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5591,2.788498,0.694775,0.622195,0.587258,0.591377
2,2.2195,1.615663,0.834097,0.702267,0.713097,0.707349
3,1.2142,1.063234,0.865261,0.728259,0.738874,0.732637
4,0.6911,0.855028,0.87626,0.737313,0.747556,0.742216
5,0.4436,0.768001,0.88451,0.741797,0.755085,0.748037
6,0.3087,0.754175,0.889093,0.912351,0.79453,0.813793
7,0.2349,0.782648,0.882676,0.885029,0.817165,0.837001
8,0.1851,0.750016,0.890009,0.896512,0.830693,0.852239
9,0.1564,0.785068,0.885426,0.892354,0.827855,0.848582
10,0.1406,0.736983,0.889093,0.869083,0.830269,0.844756


[I 2025-03-28 00:40:13,126] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 0.00046921616487852295, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5753,2.816762,0.694775,0.61778,0.587065,0.590773
2,2.2387,1.627392,0.832264,0.700188,0.711893,0.7056
3,1.233,1.08012,0.868928,0.730453,0.74212,0.735939
4,0.7136,0.864935,0.87626,0.738005,0.746998,0.7423
5,0.4625,0.776164,0.883593,0.741146,0.753807,0.747151
6,0.3232,0.775091,0.88176,0.904239,0.772023,0.779579
7,0.2407,0.763121,0.887259,0.881195,0.802578,0.820225
8,0.1913,0.760994,0.887259,0.894084,0.829117,0.85033
9,0.157,0.764362,0.890009,0.891671,0.832775,0.851039
10,0.1444,0.743787,0.891842,0.895856,0.832768,0.853309


[I 2025-03-28 00:41:39,765] Trial 149 finished with value: 0.8565205530377753 and parameters: {'learning_rate': 0.00046921616487852295, 'weight_decay': 0.001, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 4.5}. Best is trial 56 with value: 0.8639964697109707.


In [22]:
print(best_trial2)

BestRun(run_id='56', objective=0.8639964697109707, hyperparameters={'learning_rate': 0.00043703488150776966, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}, run_summary=None)


In [23]:
#Nápočet epoch na steps
data_length = len(train_aug)
min_r = math.ceil(data_length/batch_size)*5
max_r = math.ceil(data_length/batch_size)*num_epochs
warm_up = math.ceil(data_length/batch_size/10)

In [24]:
base.reset_seed()

In [25]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-base_coarse_aug_hp-search", logging_dir=f"~/logs/{DATASET}/bert-base_coarse_aug_hp-search", epochs=num_epochs, batch_size=batch_size)

In [26]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [27]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [28]:
trainer = Trainer(
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_Bert()
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
best_trial3 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Test-base-aug",
    n_trials=150
)

[I 2025-03-28 00:41:40,866] A new study created in memory with name: Test-base-aug


Trial 0 with params: {'learning_rate': 4.3284502212938785e-05, 'weight_decay': 0.01, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3477,0.864301,0.774519,0.664628,0.659635,0.65889
2,0.6267,0.542831,0.852429,0.714985,0.729179,0.721744
3,0.3816,0.452832,0.869844,0.894396,0.770352,0.784002
4,0.2792,0.424973,0.883593,0.862096,0.826964,0.839647
5,0.2217,0.417501,0.874427,0.867224,0.820224,0.836156


[I 2025-03-28 00:43:12,467] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 0.00010401663679887307, 'weight_decay': 0.001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.935,0.493736,0.855179,0.720179,0.730055,0.724772
2,0.2895,0.435235,0.87626,0.861953,0.813596,0.827314
3,0.1655,0.414293,0.891842,0.880062,0.835096,0.850056
4,0.1184,0.427359,0.88451,0.876367,0.827957,0.844811
5,0.0871,0.472583,0.88176,0.862928,0.83572,0.845582
6,0.0699,0.478874,0.87901,0.853821,0.832437,0.841102
7,0.0583,0.499537,0.882676,0.855634,0.835368,0.843416
8,0.0475,0.561624,0.875344,0.849484,0.829169,0.837232
9,0.0403,0.564853,0.874427,0.842389,0.827696,0.833936
10,0.0365,0.590918,0.868928,0.838356,0.823108,0.829673


[I 2025-03-28 00:46:11,366] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 1.2551115172973821e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6666,1.514099,0.461045,0.40113,0.347203,0.279476
2,1.3686,1.181633,0.690192,0.620898,0.58359,0.590061
3,1.0765,0.941429,0.754354,0.648087,0.644028,0.642534
4,0.8654,0.787699,0.806599,0.682563,0.690154,0.68523
5,0.7244,0.689544,0.827681,0.697591,0.708532,0.702548


[I 2025-03-28 00:47:42,505] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 0.00015958573588141273, 'weight_decay': 0.0, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8121,0.423079,0.875344,0.901228,0.791359,0.813963
2,0.2035,0.441263,0.87901,0.867283,0.824824,0.838239
3,0.1101,0.459716,0.882676,0.8756,0.835586,0.849961
4,0.0741,0.49687,0.87901,0.872818,0.832768,0.847367
5,0.0511,0.559086,0.873511,0.857278,0.82829,0.838812
6,0.0391,0.580732,0.875344,0.84546,0.828278,0.835638
7,0.0337,0.603204,0.877177,0.853508,0.829624,0.839547
8,0.0261,0.672136,0.868928,0.820559,0.824167,0.822173
9,0.0228,0.68121,0.868928,0.845031,0.823351,0.832198
10,0.0199,0.682913,0.867094,0.820593,0.821036,0.820674


[I 2025-03-28 00:50:40,906] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00025959425503112657, 'weight_decay': 0.002, 'warmup_steps': 5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6142,0.409412,0.88451,0.895117,0.824028,0.847603
2,0.1376,0.45557,0.873511,0.871375,0.826971,0.843494
3,0.0704,0.540606,0.872594,0.8236,0.827566,0.825183
4,0.0475,0.601267,0.868928,0.838769,0.82383,0.830007
5,0.0309,0.638226,0.865261,0.830647,0.829992,0.829579


[I 2025-03-28 00:52:11,499] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 2.049268011541735e-05, 'weight_decay': 0.003, 'warmup_steps': 16}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5815,1.301046,0.612282,0.585078,0.502202,0.507006
2,1.0861,0.879056,0.780018,0.663237,0.666378,0.662935
3,0.748,0.66555,0.829514,0.699452,0.709618,0.704025
4,0.5621,0.562404,0.848763,0.712559,0.725882,0.718931
5,0.4563,0.505371,0.856095,0.717417,0.732364,0.724585
6,0.3864,0.46859,0.863428,0.723736,0.737693,0.730522
7,0.3406,0.447357,0.873511,0.897785,0.772924,0.787145
8,0.3069,0.433395,0.873511,0.87521,0.800295,0.820128
9,0.2804,0.428096,0.879927,0.874836,0.797154,0.814201
10,0.2609,0.421123,0.88176,0.868265,0.816247,0.832671


[I 2025-03-28 00:56:49,817] Trial 5 finished with value: 0.8442316287028672 and parameters: {'learning_rate': 2.049268011541735e-05, 'weight_decay': 0.003, 'warmup_steps': 16}. Best is trial 5 with value: 0.8442316287028672.


Trial 6 with params: {'learning_rate': 5.4182823195332406e-05, 'weight_decay': 0.003, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2482,0.737672,0.812099,0.68951,0.692619,0.689801
2,0.5153,0.49085,0.858845,0.718424,0.734984,0.726161
3,0.3079,0.42902,0.88176,0.868406,0.817002,0.832865
4,0.2245,0.417771,0.882676,0.861867,0.826835,0.839477
5,0.1763,0.423379,0.882676,0.873433,0.827505,0.842908
6,0.1448,0.426474,0.879927,0.871614,0.824936,0.840832
7,0.1244,0.42871,0.886343,0.876247,0.829643,0.845513
8,0.109,0.449294,0.885426,0.863815,0.828977,0.84153
9,0.0975,0.462303,0.880843,0.86344,0.834322,0.84541
10,0.0914,0.476708,0.877177,0.862102,0.830451,0.842594


[I 2025-03-28 00:59:54,669] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 1.7258215396625005e-05, 'weight_decay': 0.003, 'warmup_steps': 11}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6145,1.386307,0.51604,0.545184,0.402226,0.371085
2,1.1916,0.982253,0.749771,0.644143,0.640144,0.638179
3,0.8605,0.753085,0.813932,0.687785,0.696229,0.69119
4,0.6587,0.627598,0.836847,0.703709,0.716134,0.709594
5,0.5401,0.557769,0.849679,0.713255,0.726762,0.71969
6,0.4588,0.510436,0.859762,0.721102,0.734797,0.727679
7,0.4056,0.48159,0.860678,0.722206,0.73545,0.728598
8,0.3662,0.461721,0.865261,0.726501,0.73911,0.732658
9,0.3356,0.450867,0.87626,0.900022,0.775736,0.789637
10,0.312,0.441113,0.875344,0.898828,0.793295,0.813749


[I 2025-03-28 01:02:55,733] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 5.954553793888986e-05, 'weight_decay': 0.008, 'warmup_steps': 6}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1896,0.686842,0.819432,0.693196,0.699329,0.695531
2,0.4722,0.475177,0.862511,0.887621,0.746864,0.746988
3,0.2816,0.421382,0.882676,0.872005,0.826724,0.841749
4,0.2045,0.414615,0.88451,0.863411,0.828549,0.840948
5,0.1606,0.42649,0.878093,0.868853,0.824075,0.838814
6,0.1321,0.430051,0.880843,0.87177,0.825549,0.84127
7,0.1129,0.435581,0.886343,0.868807,0.838735,0.850344
8,0.0982,0.458962,0.883593,0.865723,0.836366,0.847595
9,0.0876,0.47204,0.879927,0.862917,0.833156,0.844621
10,0.0813,0.489393,0.875344,0.860632,0.82877,0.840978


[I 2025-03-28 01:07:30,434] Trial 8 finished with value: 0.8411878761285722 and parameters: {'learning_rate': 5.954553793888986e-05, 'weight_decay': 0.008, 'warmup_steps': 6}. Best is trial 5 with value: 0.8442316287028672.


Trial 9 with params: {'learning_rate': 7.475992999956501e-05, 'weight_decay': 0.006, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0775,0.588101,0.83868,0.707863,0.716085,0.711423
2,0.3847,0.44257,0.875344,0.871372,0.794116,0.810784
3,0.2254,0.407458,0.890009,0.866182,0.833547,0.84473
4,0.1635,0.407993,0.887259,0.877073,0.831096,0.846638
5,0.1276,0.447395,0.883593,0.874153,0.83776,0.849914
6,0.1031,0.444259,0.883593,0.877604,0.836986,0.851856
7,0.087,0.458398,0.883593,0.866405,0.836198,0.847871
8,0.0737,0.485563,0.885426,0.8791,0.837942,0.853125
9,0.0648,0.491462,0.88176,0.865564,0.834538,0.846655
10,0.0595,0.506162,0.88176,0.865829,0.834452,0.84669


[I 2025-03-28 01:10:31,711] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 1.1286255962651763e-05, 'weight_decay': 0.0, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6812,1.547807,0.453712,0.235116,0.341218,0.275921
2,1.4188,1.246564,0.656279,0.605444,0.549092,0.558221
3,1.1465,1.007983,0.742438,0.641521,0.63381,0.632628
4,0.9382,0.850891,0.788268,0.669685,0.674339,0.670216
5,0.7939,0.745132,0.819432,0.691355,0.701631,0.695884


[I 2025-03-28 01:12:03,474] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 3.771200161681978e-05, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3853,0.940956,0.753437,0.652592,0.640844,0.641231
2,0.7025,0.587498,0.845096,0.709639,0.723206,0.716006
3,0.4358,0.47802,0.862511,0.88915,0.746363,0.747771
4,0.3198,0.436584,0.878093,0.901201,0.795239,0.815794
5,0.2555,0.420975,0.87901,0.87066,0.823564,0.839732
6,0.2128,0.411346,0.880843,0.871692,0.825229,0.8411
7,0.1841,0.410619,0.885426,0.87517,0.82908,0.844679
8,0.1635,0.416751,0.880843,0.872278,0.825396,0.841522
9,0.1467,0.422165,0.88451,0.874183,0.828734,0.844002
10,0.1377,0.427257,0.883593,0.873872,0.827573,0.843312


[I 2025-03-28 01:16:39,012] Trial 11 finished with value: 0.8415826832136083 and parameters: {'learning_rate': 3.771200161681978e-05, 'weight_decay': 0.01, 'warmup_steps': 3}. Best is trial 5 with value: 0.8442316287028672.


Trial 12 with params: {'learning_rate': 2.038908512355954e-05, 'weight_decay': 0.008, 'warmup_steps': 7}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5778,1.296005,0.607699,0.581045,0.497911,0.502525
2,1.0866,0.883226,0.779102,0.662616,0.665231,0.661806
3,0.7525,0.670539,0.822181,0.693475,0.704034,0.698212
4,0.5668,0.566429,0.846929,0.711324,0.724271,0.717486
5,0.4603,0.508862,0.856095,0.717417,0.732364,0.724585


[I 2025-03-28 01:18:10,575] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 3.0246017282050585e-05, 'weight_decay': 0.01, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4627,1.069259,0.714024,0.630597,0.606275,0.609288
2,0.8343,0.674057,0.825848,0.695732,0.707185,0.700753
3,0.5332,0.529096,0.854262,0.717256,0.730192,0.723443
4,0.3932,0.468754,0.865261,0.892154,0.74853,0.750331
5,0.3159,0.439222,0.874427,0.897982,0.792528,0.812911
6,0.2651,0.41836,0.87901,0.854527,0.814056,0.827935
7,0.2325,0.413056,0.885426,0.874723,0.828294,0.844146
8,0.2076,0.411698,0.879927,0.870989,0.823926,0.84016
9,0.1878,0.415881,0.889093,0.8777,0.832421,0.847543
10,0.1746,0.413942,0.883593,0.873514,0.827649,0.84313


[I 2025-03-28 01:22:48,807] Trial 13 finished with value: 0.8456743306085545 and parameters: {'learning_rate': 3.0246017282050585e-05, 'weight_decay': 0.01, 'warmup_steps': 1}. Best is trial 13 with value: 0.8456743306085545.


Trial 14 with params: {'learning_rate': 3.70515995846801e-05, 'weight_decay': 0.004, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4145,0.957402,0.752521,0.651648,0.640588,0.640508
2,0.7133,0.590197,0.845096,0.7094,0.723355,0.716014
3,0.4399,0.477417,0.862511,0.722231,0.737204,0.7294
4,0.322,0.436372,0.877177,0.900377,0.776827,0.790129
5,0.2572,0.420728,0.877177,0.869061,0.822133,0.838169
6,0.2145,0.412066,0.88176,0.872133,0.826473,0.841891
7,0.1857,0.410815,0.88451,0.873958,0.828197,0.843583
8,0.165,0.417455,0.879927,0.870789,0.82446,0.840275
9,0.1479,0.42334,0.886343,0.875373,0.830561,0.845477
10,0.1389,0.427503,0.885426,0.874792,0.82915,0.844536


[I 2025-03-28 01:25:51,704] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.00037737682452583217, 'weight_decay': 0.007, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5622,0.424665,0.880843,0.888536,0.823382,0.844625
2,0.1103,0.497851,0.879927,0.848306,0.833,0.839363
3,0.0556,0.624104,0.868928,0.832071,0.82274,0.826535
4,0.035,0.70491,0.860678,0.821288,0.824459,0.822401
5,0.0252,0.753609,0.855179,0.821721,0.811269,0.814752
6,0.0203,0.775629,0.860678,0.838957,0.817211,0.826131
7,0.0167,0.821499,0.852429,0.802066,0.811025,0.805383
8,0.0124,0.866592,0.856095,0.793862,0.814337,0.802583
9,0.01,0.859157,0.857929,0.820693,0.815887,0.817502
10,0.0094,0.902581,0.859762,0.809341,0.81592,0.812286


[I 2025-03-28 01:28:55,966] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 1.226026449648691e-05, 'weight_decay': 0.004, 'warmup_steps': 21}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6702,1.521834,0.458295,0.400932,0.344995,0.278582
2,1.3802,1.196299,0.681027,0.615441,0.574989,0.582144
3,1.0923,0.956077,0.753437,0.647797,0.643048,0.641781
4,0.8815,0.801426,0.804766,0.681421,0.68857,0.683834
5,0.7394,0.701263,0.823098,0.693957,0.704921,0.69887


[I 2025-03-28 01:30:27,060] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 1.6269537137899774e-05, 'weight_decay': 0.008, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6197,1.408336,0.494959,0.545669,0.381074,0.3372
2,1.2234,1.017344,0.738772,0.638296,0.631086,0.62943
3,0.8999,0.78791,0.79835,0.676141,0.682601,0.678288
4,0.6959,0.654753,0.828598,0.698302,0.709445,0.703433
5,0.5727,0.579765,0.849679,0.713347,0.726762,0.719691
6,0.4878,0.528738,0.855179,0.717733,0.731123,0.724127
7,0.4315,0.496868,0.857929,0.720403,0.733158,0.726542
8,0.3899,0.474745,0.862511,0.723732,0.737034,0.730154
9,0.3579,0.461993,0.868011,0.893935,0.759816,0.768633
10,0.3329,0.451066,0.872594,0.897752,0.772443,0.786891


[I 2025-03-28 01:33:32,674] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.0002950137270531351, 'weight_decay': 0.01, 'warmup_steps': 10}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5939,0.414851,0.880843,0.891467,0.822756,0.845549
2,0.1257,0.488329,0.874427,0.857803,0.829233,0.839955
3,0.0652,0.546491,0.872594,0.836708,0.826322,0.830811
4,0.0425,0.635347,0.859762,0.828401,0.824759,0.826393
5,0.0287,0.634867,0.870761,0.84551,0.825581,0.833309
6,0.0239,0.702521,0.864345,0.844554,0.81983,0.830044
7,0.0194,0.74981,0.862511,0.818339,0.81985,0.818332
8,0.0149,0.779098,0.867094,0.813758,0.822363,0.817758
9,0.0115,0.796104,0.857929,0.814972,0.816605,0.815146
10,0.012,0.809654,0.855179,0.804922,0.813503,0.808594


[I 2025-03-28 01:36:34,772] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 2.1689644102833747e-05, 'weight_decay': 0.0, 'warmup_steps': 16}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5682,1.26861,0.629698,0.594462,0.520467,0.527805
2,1.0495,0.845665,0.788268,0.668882,0.673915,0.669752
3,0.7126,0.640589,0.830431,0.699872,0.710563,0.704842
4,0.5334,0.544393,0.850596,0.713892,0.72748,0.720405
5,0.4318,0.491661,0.858845,0.720129,0.734364,0.726984


[I 2025-03-28 01:38:06,695] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 9.909923996016043e-05, 'weight_decay': 0.01, 'warmup_steps': 12}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9677,0.50796,0.852429,0.718682,0.727199,0.722484
2,0.3037,0.436093,0.874427,0.860712,0.811619,0.825736
3,0.1737,0.413305,0.892759,0.880602,0.835762,0.850646
4,0.1251,0.425341,0.88451,0.876047,0.827992,0.844722
5,0.0925,0.47082,0.882676,0.863039,0.836846,0.84604
6,0.074,0.473608,0.88176,0.85626,0.834952,0.843631
7,0.0623,0.496433,0.883593,0.856845,0.836194,0.8445
8,0.0507,0.547215,0.874427,0.848843,0.828141,0.83648
9,0.0428,0.554231,0.874427,0.850251,0.828296,0.837242
10,0.0389,0.579737,0.866178,0.835233,0.820962,0.827


[I 2025-03-28 01:41:11,726] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 4.8395053456701976e-05, 'weight_decay': 0.01, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2784,0.794721,0.790101,0.674811,0.673479,0.671817
2,0.5667,0.515807,0.855179,0.715721,0.732018,0.723378
3,0.3432,0.438351,0.877177,0.900723,0.803738,0.826016
4,0.2508,0.419415,0.88176,0.860829,0.825948,0.838403
5,0.1978,0.419422,0.878093,0.869837,0.82363,0.839172
6,0.1629,0.416927,0.883593,0.874336,0.827762,0.843606
7,0.14,0.420819,0.885426,0.875192,0.829192,0.844753
8,0.1241,0.43604,0.886343,0.876064,0.830151,0.845707
9,0.111,0.445637,0.878093,0.862101,0.832141,0.843635
10,0.1047,0.459023,0.878093,0.862642,0.831717,0.8436


[I 2025-03-28 01:44:17,804] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 1.209647793536676e-05, 'weight_decay': 0.01, 'warmup_steps': 6}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6663,1.522141,0.457379,0.400067,0.344474,0.277682
2,1.3831,1.199972,0.68011,0.615444,0.573713,0.581307
3,1.0985,0.963793,0.748854,0.644857,0.638921,0.637776
4,0.8903,0.810572,0.797434,0.676226,0.682063,0.677793
5,0.7491,0.709716,0.823098,0.694564,0.704657,0.698965
6,0.6446,0.639748,0.83593,0.702941,0.715499,0.708789
7,0.5752,0.592417,0.84418,0.709655,0.721564,0.715294
8,0.5223,0.55862,0.851512,0.71534,0.727586,0.721203
9,0.4822,0.535846,0.856095,0.718551,0.731278,0.724635
10,0.4496,0.51809,0.860678,0.721591,0.736049,0.728529


[I 2025-03-28 01:47:22,610] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 3.9309543242353126e-05, 'weight_decay': 0.01, 'warmup_steps': 6}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.372,0.917828,0.758937,0.654905,0.645686,0.645845
2,0.6793,0.574003,0.849679,0.713223,0.727117,0.719801
3,0.4195,0.470128,0.865261,0.890795,0.748404,0.749537
4,0.3076,0.432036,0.878093,0.88069,0.813214,0.832717
5,0.2453,0.419158,0.87901,0.870437,0.823564,0.839589
6,0.2039,0.411571,0.879927,0.871188,0.8245,0.840487
7,0.1759,0.411569,0.882676,0.87281,0.827031,0.842438
8,0.1563,0.418948,0.88176,0.872778,0.826394,0.842229
9,0.14,0.424904,0.882676,0.87286,0.826823,0.842395
10,0.1318,0.431515,0.883593,0.873928,0.827309,0.843219


[I 2025-03-28 01:50:24,061] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 3.356036225718895e-05, 'weight_decay': 0.01, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4264,1.007501,0.735105,0.641545,0.625572,0.625961
2,0.7706,0.629986,0.837764,0.704711,0.716519,0.710047
3,0.485,0.502976,0.857012,0.718726,0.732268,0.725249
4,0.3568,0.451859,0.875344,0.899932,0.774854,0.789119
5,0.2861,0.428974,0.875344,0.863548,0.811393,0.827849
6,0.2393,0.412963,0.87901,0.870351,0.823497,0.83963
7,0.2086,0.410413,0.885426,0.863535,0.828563,0.841194
8,0.1856,0.413393,0.880843,0.872441,0.824862,0.841344
9,0.1672,0.417446,0.885426,0.87448,0.829428,0.844406
10,0.1559,0.418359,0.886343,0.875988,0.829851,0.845488


[I 2025-03-28 01:54:59,384] Trial 24 finished with value: 0.8431726332651289 and parameters: {'learning_rate': 3.356036225718895e-05, 'weight_decay': 0.01, 'warmup_steps': 1}. Best is trial 13 with value: 0.8456743306085545.


Trial 25 with params: {'learning_rate': 0.0003026895453749053, 'weight_decay': 0.0, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6048,0.407435,0.880843,0.890693,0.821548,0.844701
2,0.126,0.485591,0.87901,0.889078,0.832134,0.852023
3,0.0648,0.578922,0.868011,0.842144,0.832115,0.836041
4,0.0424,0.617348,0.864345,0.832001,0.821323,0.825224
5,0.0273,0.644379,0.870761,0.868324,0.824399,0.84094
6,0.0247,0.67568,0.866178,0.820522,0.820407,0.820253
7,0.019,0.734342,0.861595,0.81804,0.817958,0.817639
8,0.0142,0.790079,0.861595,0.813386,0.81862,0.815364
9,0.0126,0.804219,0.853346,0.825944,0.811151,0.817473
10,0.0118,0.822435,0.865261,0.828742,0.820765,0.823683


[I 2025-03-28 01:58:01,465] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 4.193034167165574e-05, 'weight_decay': 0.006, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3568,0.881724,0.769936,0.661998,0.655226,0.654638
2,0.6435,0.552395,0.851512,0.71434,0.728464,0.721054
3,0.3936,0.457753,0.867094,0.892796,0.759187,0.767683
4,0.2881,0.426894,0.882676,0.869028,0.817173,0.83335
5,0.2292,0.417547,0.875344,0.867648,0.820737,0.836717
6,0.1898,0.413922,0.880843,0.871511,0.825682,0.841168
7,0.1633,0.414646,0.882676,0.872575,0.827268,0.842482
8,0.1453,0.42494,0.883593,0.873626,0.828019,0.843389
9,0.1297,0.432405,0.880843,0.871187,0.825393,0.840805
10,0.1224,0.440452,0.882676,0.873074,0.826545,0.842344


[I 2025-03-28 02:02:35,952] Trial 26 finished with value: 0.850863037784695 and parameters: {'learning_rate': 4.193034167165574e-05, 'weight_decay': 0.006, 'warmup_steps': 19}. Best is trial 26 with value: 0.850863037784695.


Trial 27 with params: {'learning_rate': 4.79732368713819e-05, 'weight_decay': 0.006, 'warmup_steps': 16}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2989,0.80451,0.791934,0.676287,0.674578,0.67311
2,0.5736,0.517965,0.855179,0.716038,0.731458,0.72331
3,0.3464,0.440357,0.87626,0.899728,0.802907,0.825199
4,0.2532,0.420093,0.88176,0.860785,0.825617,0.838303
5,0.2,0.419699,0.878093,0.869656,0.823298,0.83894
6,0.1643,0.418835,0.880843,0.871961,0.825748,0.841368
7,0.1412,0.420814,0.886343,0.875836,0.829657,0.84528
8,0.1251,0.436956,0.885426,0.875024,0.829282,0.844735
9,0.1117,0.447206,0.878093,0.861711,0.831855,0.843289
10,0.1054,0.459315,0.87901,0.862629,0.832578,0.843988


[I 2025-03-28 02:07:10,990] Trial 27 finished with value: 0.8436704177571858 and parameters: {'learning_rate': 4.79732368713819e-05, 'weight_decay': 0.006, 'warmup_steps': 16}. Best is trial 26 with value: 0.850863037784695.


Trial 28 with params: {'learning_rate': 8.27550295019173e-05, 'weight_decay': 0.005, 'warmup_steps': 25}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0654,0.56062,0.843263,0.711547,0.719861,0.715233
2,0.3575,0.437675,0.875344,0.862669,0.812342,0.827573
3,0.2067,0.411134,0.888176,0.876388,0.831935,0.846621
4,0.1499,0.420288,0.888176,0.877258,0.831692,0.846994
5,0.1153,0.458986,0.885426,0.876002,0.839212,0.851762
6,0.092,0.465972,0.87901,0.873639,0.833026,0.847903
7,0.0769,0.477657,0.883593,0.866831,0.83574,0.847859
8,0.0647,0.514193,0.880843,0.855186,0.83377,0.8425
9,0.0561,0.527521,0.874427,0.851418,0.827911,0.83762
10,0.0512,0.538501,0.877177,0.86215,0.830716,0.842872


[I 2025-03-28 02:10:12,549] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 2.0641950878300647e-05, 'weight_decay': 0.003, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5702,1.285815,0.611366,0.583073,0.501836,0.506964
2,1.0768,0.875691,0.780018,0.663215,0.666229,0.662563
3,0.7447,0.665436,0.823098,0.694205,0.704701,0.698914
4,0.5608,0.562642,0.846929,0.711403,0.724271,0.717529
5,0.4553,0.50607,0.856095,0.717417,0.732364,0.724585
6,0.3857,0.469457,0.864345,0.891032,0.747828,0.749491
7,0.3401,0.44867,0.872594,0.897301,0.772258,0.786589
8,0.3069,0.434337,0.873511,0.870969,0.791172,0.809442
9,0.2803,0.428972,0.87626,0.876165,0.803034,0.821821
10,0.261,0.42198,0.879927,0.858779,0.823622,0.836396


[I 2025-03-28 02:14:48,195] Trial 29 finished with value: 0.8442107055080833 and parameters: {'learning_rate': 2.0641950878300647e-05, 'weight_decay': 0.003, 'warmup_steps': 0}. Best is trial 26 with value: 0.850863037784695.


Trial 30 with params: {'learning_rate': 3.728074305736512e-05, 'weight_decay': 0.007, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4059,0.95088,0.749771,0.649729,0.638491,0.638611
2,0.7089,0.588504,0.845096,0.709283,0.723355,0.715961
3,0.4381,0.477151,0.862511,0.722231,0.737204,0.7294
4,0.3211,0.43662,0.877177,0.900213,0.785964,0.803195
5,0.2566,0.420802,0.87901,0.870501,0.823529,0.839636
6,0.2138,0.41216,0.880843,0.871452,0.825757,0.841208
7,0.1851,0.411012,0.88451,0.873958,0.828197,0.843583
8,0.1645,0.417518,0.880843,0.871746,0.825458,0.841222
9,0.1474,0.423381,0.886343,0.875373,0.830561,0.845477
10,0.1384,0.427738,0.887259,0.876503,0.830532,0.846103


[I 2025-03-28 02:19:22,382] Trial 30 finished with value: 0.8424281619355719 and parameters: {'learning_rate': 3.728074305736512e-05, 'weight_decay': 0.007, 'warmup_steps': 24}. Best is trial 26 with value: 0.850863037784695.


Trial 31 with params: {'learning_rate': 5.0748544018172296e-05, 'weight_decay': 0.003, 'warmup_steps': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2584,0.768823,0.79835,0.6796,0.680321,0.678033
2,0.5437,0.505567,0.856095,0.71632,0.732685,0.723954
3,0.3278,0.433521,0.878093,0.865466,0.813493,0.829702
4,0.2393,0.418556,0.882676,0.861607,0.826518,0.839169
5,0.1883,0.420792,0.880843,0.872302,0.825908,0.84158
6,0.155,0.419531,0.88176,0.873147,0.826367,0.842292
7,0.1333,0.423505,0.887259,0.876573,0.830574,0.846105
8,0.1176,0.441032,0.886343,0.875814,0.830151,0.845571
9,0.1052,0.45155,0.87901,0.863501,0.832543,0.844553
10,0.0991,0.465952,0.87626,0.861382,0.830286,0.842209


[I 2025-03-28 02:22:25,168] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 1.8208943742256654e-05, 'weight_decay': 0.003, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5993,1.35338,0.554537,0.557913,0.441435,0.428992
2,1.1563,0.9498,0.752521,0.644987,0.643039,0.640494
3,0.8253,0.726299,0.820348,0.692295,0.701831,0.696345
4,0.6289,0.60761,0.83868,0.704993,0.717265,0.710776
5,0.5142,0.541265,0.850596,0.713571,0.727393,0.720176
6,0.4361,0.49766,0.860678,0.72178,0.735415,0.728369
7,0.3856,0.471299,0.861595,0.722939,0.736117,0.729298
8,0.348,0.453084,0.868928,0.895813,0.769536,0.784521
9,0.3185,0.44387,0.873511,0.897396,0.791481,0.81212
10,0.2963,0.435007,0.873511,0.857147,0.800521,0.816696


[I 2025-03-28 02:25:28,814] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 2.1294796715500925e-05, 'weight_decay': 0.003, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5742,1.282692,0.621448,0.588358,0.511251,0.516963
2,1.0627,0.856295,0.787351,0.668566,0.672984,0.66908
3,0.7238,0.647936,0.831347,0.70094,0.711216,0.705681
4,0.542,0.549482,0.850596,0.713892,0.72748,0.720405
5,0.4391,0.495421,0.858845,0.720129,0.734364,0.726984


[I 2025-03-28 02:27:00,396] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 1.1498771132186459e-05, 'weight_decay': 0.005, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6698,1.534673,0.451879,0.232809,0.340009,0.274026
2,1.4049,1.22832,0.666361,0.610276,0.559606,0.568533
3,1.1307,0.994891,0.748854,0.646202,0.638719,0.63759
4,0.9248,0.840926,0.790101,0.671468,0.675225,0.671708
5,0.7827,0.737051,0.819432,0.691863,0.701447,0.695891
6,0.6755,0.663502,0.830431,0.699402,0.710608,0.704437
7,0.6038,0.613241,0.837764,0.70476,0.716589,0.710254
8,0.549,0.577087,0.849679,0.71412,0.72619,0.719834
9,0.5075,0.552801,0.855179,0.717802,0.730611,0.723927
10,0.4737,0.533653,0.857929,0.71972,0.733471,0.726302


[I 2025-03-28 02:30:02,782] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 1.76565632955443e-05, 'weight_decay': 0.001, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.607,1.370029,0.531622,0.552033,0.418868,0.398267
2,1.1758,0.968589,0.749771,0.643532,0.640726,0.63846
3,0.8458,0.742705,0.817599,0.690229,0.699303,0.693917
4,0.6467,0.619926,0.836847,0.703752,0.715918,0.709482
5,0.5296,0.551337,0.850596,0.714085,0.727428,0.720446


[I 2025-03-28 02:31:34,048] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.0004180301872969493, 'weight_decay': 0.006, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4863,0.404668,0.882676,0.891783,0.82525,0.847211
2,0.1002,0.504942,0.875344,0.840053,0.829226,0.832885
3,0.0494,0.644375,0.869844,0.852583,0.825946,0.835216
4,0.0356,0.685661,0.855179,0.802394,0.821536,0.810138
5,0.0246,0.837704,0.843263,0.821639,0.800824,0.80735
6,0.0218,0.775249,0.858845,0.831769,0.814609,0.822252
7,0.0154,0.91976,0.846929,0.806254,0.797155,0.799932
8,0.0129,0.887614,0.854262,0.813888,0.810011,0.81152
9,0.0107,0.910092,0.852429,0.834682,0.802192,0.813352
10,0.0095,0.867745,0.851512,0.794774,0.808246,0.800824


[I 2025-03-28 02:34:37,495] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 2.870505282297996e-05, 'weight_decay': 0.005, 'warmup_steps': 12}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.487,1.103449,0.699358,0.624397,0.592898,0.59761
2,0.8672,0.695935,0.824015,0.694054,0.705852,0.699214
3,0.5572,0.541843,0.852429,0.715352,0.728859,0.721754
4,0.4112,0.476711,0.862511,0.723411,0.737282,0.730041
5,0.3307,0.444159,0.873511,0.897319,0.7736,0.787146


[I 2025-03-28 02:36:08,771] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 6.660664513238181e-05, 'weight_decay': 0.001, 'warmup_steps': 11}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1441,0.637079,0.832264,0.702976,0.710294,0.706009
2,0.4288,0.457911,0.867094,0.891514,0.777609,0.794649
3,0.2533,0.415059,0.883593,0.861058,0.828038,0.839461
4,0.1836,0.410411,0.88451,0.874718,0.829047,0.844411
5,0.1442,0.435045,0.88176,0.870631,0.827262,0.841161
6,0.1174,0.441921,0.880843,0.872081,0.825536,0.841413
7,0.0993,0.450594,0.880843,0.86404,0.834109,0.845654
8,0.0857,0.472564,0.882676,0.865501,0.835669,0.847071
9,0.0761,0.485324,0.878093,0.86196,0.83151,0.843275
10,0.0697,0.501885,0.878093,0.862878,0.831083,0.843285


[I 2025-03-28 02:40:43,755] Trial 38 finished with value: 0.8429280864394536 and parameters: {'learning_rate': 6.660664513238181e-05, 'weight_decay': 0.001, 'warmup_steps': 11}. Best is trial 26 with value: 0.850863037784695.


Trial 39 with params: {'learning_rate': 0.00013253735630179916, 'weight_decay': 0.009000000000000001, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.88,0.451913,0.865261,0.894059,0.755272,0.766229
2,0.2383,0.434529,0.879927,0.868552,0.82532,0.839164
3,0.1315,0.430046,0.889093,0.877402,0.831909,0.847162
4,0.091,0.466788,0.87901,0.863012,0.832123,0.844083
5,0.0644,0.506624,0.886343,0.867759,0.837818,0.84891
6,0.05,0.537239,0.875344,0.845132,0.828393,0.835384
7,0.0413,0.557271,0.880843,0.847836,0.833317,0.839447
8,0.0331,0.608877,0.87626,0.84346,0.829563,0.835437
9,0.0282,0.625991,0.873511,0.848922,0.827113,0.836025
10,0.0255,0.664111,0.866178,0.830765,0.820784,0.825145


[I 2025-03-28 02:43:45,424] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 3.371723902711669e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4366,1.007846,0.734189,0.641277,0.624864,0.625576
2,0.7679,0.625708,0.84143,0.707292,0.71993,0.713082
3,0.4813,0.4997,0.857929,0.719342,0.732983,0.725898
4,0.3537,0.449865,0.873511,0.897737,0.77352,0.787279
5,0.2835,0.427753,0.874427,0.862822,0.810678,0.827115


[I 2025-03-28 02:45:16,544] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 6.370365085168923e-05, 'weight_decay': 0.006, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1745,0.657449,0.828598,0.700778,0.706618,0.702949
2,0.4465,0.463284,0.867094,0.891645,0.768472,0.781579
3,0.2641,0.419233,0.88176,0.859727,0.826726,0.838151
4,0.1915,0.413544,0.879927,0.871453,0.825422,0.840953
5,0.1502,0.43207,0.883593,0.872408,0.828631,0.842804
6,0.1227,0.439969,0.87901,0.870526,0.824189,0.839918
7,0.1043,0.446217,0.88451,0.866965,0.837151,0.84865
8,0.0902,0.467468,0.88176,0.864958,0.834988,0.846462
9,0.0801,0.481655,0.880843,0.864786,0.83351,0.845681
10,0.0738,0.497132,0.87901,0.86326,0.831547,0.843712


[I 2025-03-28 02:49:50,436] Trial 41 finished with value: 0.8429878766007187 and parameters: {'learning_rate': 6.370365085168923e-05, 'weight_decay': 0.006, 'warmup_steps': 19}. Best is trial 26 with value: 0.850863037784695.


Trial 42 with params: {'learning_rate': 5.509360618360087e-05, 'weight_decay': 0.007, 'warmup_steps': 14}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.235,0.727781,0.814849,0.690676,0.695493,0.692021
2,0.5072,0.488262,0.858845,0.718549,0.734936,0.726175
3,0.3033,0.427639,0.87901,0.866461,0.814209,0.830552
4,0.2209,0.417772,0.882676,0.861746,0.8271,0.839521
5,0.1734,0.423632,0.882676,0.873433,0.827505,0.842908
6,0.1425,0.426982,0.87901,0.870689,0.824221,0.840006
7,0.1224,0.429767,0.886343,0.876339,0.829643,0.845552
8,0.1071,0.45105,0.886343,0.868236,0.838781,0.850048
9,0.0957,0.464213,0.879927,0.862874,0.833175,0.844564
10,0.0897,0.479436,0.877177,0.862265,0.830403,0.842589


[I 2025-03-28 02:52:50,342] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 2.3125018733011798e-05, 'weight_decay': 0.006, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5534,1.233726,0.650779,0.605066,0.542842,0.55211
2,1.0085,0.808364,0.79835,0.675686,0.683002,0.678338
3,0.6738,0.614031,0.84143,0.706855,0.719508,0.712872
4,0.5022,0.52523,0.854262,0.717061,0.730575,0.723481
5,0.4056,0.477197,0.860678,0.721235,0.735962,0.728301
6,0.3425,0.446275,0.873511,0.897673,0.772938,0.787084
7,0.3015,0.430904,0.87626,0.853174,0.793592,0.808554
8,0.2714,0.420491,0.87901,0.866749,0.813641,0.8307
9,0.2474,0.41915,0.885426,0.874381,0.828726,0.844178
10,0.2302,0.413619,0.885426,0.8752,0.828431,0.844474


[I 2025-03-28 02:57:22,814] Trial 43 finished with value: 0.8444292373687404 and parameters: {'learning_rate': 2.3125018733011798e-05, 'weight_decay': 0.006, 'warmup_steps': 18}. Best is trial 26 with value: 0.850863037784695.


Trial 44 with params: {'learning_rate': 1.9169194265002284e-05, 'weight_decay': 0.006, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.597,1.337719,0.571952,0.565681,0.459526,0.453448
2,1.1296,0.919187,0.76352,0.65231,0.652764,0.649725
3,0.7911,0.696956,0.823098,0.694335,0.704705,0.698883
4,0.5976,0.585213,0.84418,0.708863,0.721891,0.715046
5,0.4867,0.523203,0.856095,0.717357,0.732351,0.72452


[I 2025-03-28 02:58:54,292] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 1.1119465144252193e-05, 'weight_decay': 0.007, 'warmup_steps': 14}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6788,1.548356,0.452796,0.235323,0.340551,0.275654
2,1.4228,1.251334,0.648946,0.600707,0.541322,0.549958
3,1.1541,1.016469,0.743355,0.643433,0.634476,0.633723
4,0.9478,0.860204,0.782768,0.665617,0.669067,0.665515
5,0.8041,0.754141,0.815765,0.688932,0.698158,0.692692
6,0.6949,0.678468,0.827681,0.697256,0.708581,0.702378
7,0.6215,0.626395,0.834097,0.703062,0.713366,0.707686
8,0.5653,0.588897,0.846929,0.71171,0.724411,0.717725
9,0.5229,0.563567,0.855179,0.71765,0.730929,0.72394
10,0.4883,0.5437,0.856095,0.718536,0.731529,0.724762


[I 2025-03-28 03:01:52,263] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 2.7534592054740236e-05, 'weight_decay': 0.006, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5057,1.132243,0.698442,0.626978,0.59092,0.59732
2,0.8948,0.713924,0.819432,0.691252,0.702214,0.69594
3,0.5767,0.55251,0.852429,0.715312,0.728527,0.721601
4,0.4259,0.482811,0.862511,0.723468,0.73718,0.730087
5,0.3424,0.447871,0.874427,0.898122,0.774219,0.787881


[I 2025-03-28 03:03:31,117] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 2.7998130077074985e-05, 'weight_decay': 0.003, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4877,1.11557,0.695692,0.621033,0.5899,0.59431
2,0.8827,0.70976,0.818515,0.689763,0.701296,0.694712
3,0.5711,0.550913,0.853346,0.716014,0.729574,0.722438
4,0.4223,0.483279,0.860678,0.722035,0.735617,0.728541
5,0.3398,0.448469,0.872594,0.855814,0.772602,0.784387
6,0.2858,0.424711,0.878093,0.865923,0.813257,0.830087
7,0.2513,0.416792,0.88451,0.862517,0.827296,0.840105
8,0.225,0.411915,0.879927,0.859385,0.823326,0.836639
9,0.2042,0.415482,0.890009,0.878296,0.833115,0.848214
10,0.1898,0.412337,0.887259,0.865103,0.830047,0.842726


[I 2025-03-28 03:08:04,947] Trial 47 finished with value: 0.8441477251127399 and parameters: {'learning_rate': 2.7998130077074985e-05, 'weight_decay': 0.003, 'warmup_steps': 1}. Best is trial 26 with value: 0.850863037784695.


Trial 48 with params: {'learning_rate': 2.337348909924057e-05, 'weight_decay': 0.003, 'warmup_steps': 14}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5481,1.224697,0.657195,0.607232,0.549363,0.558458
2,1.0002,0.802978,0.800183,0.676831,0.684481,0.679655
3,0.668,0.610851,0.837764,0.70416,0.716312,0.709863
4,0.4979,0.523233,0.851512,0.715091,0.728311,0.721331
5,0.4021,0.475911,0.860678,0.721384,0.735976,0.728416
6,0.3395,0.445404,0.873511,0.897673,0.772938,0.787084
7,0.2989,0.430416,0.87626,0.853174,0.793592,0.808554
8,0.2691,0.420165,0.87901,0.866749,0.813641,0.8307
9,0.2452,0.419002,0.885426,0.874381,0.828726,0.844178
10,0.2282,0.413589,0.886343,0.875861,0.829146,0.845163


[I 2025-03-28 03:12:39,006] Trial 48 finished with value: 0.8444292373687404 and parameters: {'learning_rate': 2.337348909924057e-05, 'weight_decay': 0.003, 'warmup_steps': 14}. Best is trial 26 with value: 0.850863037784695.


Trial 49 with params: {'learning_rate': 1.3953407978753091e-05, 'weight_decay': 0.002, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6518,1.47779,0.462878,0.398467,0.349648,0.284659
2,1.3148,1.116111,0.71769,0.630841,0.611075,0.613506
3,1.0052,0.876354,0.783685,0.668057,0.669393,0.666959
4,0.7936,0.728303,0.816682,0.689831,0.699402,0.694018
5,0.6584,0.639793,0.832264,0.700323,0.712387,0.705955


[I 2025-03-28 03:14:09,539] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 1.5505459228319995e-05, 'weight_decay': 0.004, 'warmup_steps': 13}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6341,1.435524,0.47846,0.54734,0.363607,0.304967
2,1.2557,1.049123,0.731439,0.634984,0.624385,0.623667
3,0.9329,0.813458,0.797434,0.677234,0.681356,0.677945
4,0.7246,0.675126,0.824931,0.696306,0.706236,0.700794
5,0.5973,0.596365,0.845096,0.708905,0.723528,0.715856


[I 2025-03-28 03:15:40,022] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 2.6218743903424214e-05, 'weight_decay': 0.005, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5181,1.159961,0.687443,0.620369,0.579886,0.586761
2,0.9259,0.73942,0.817599,0.690003,0.700303,0.694357
3,0.6029,0.568907,0.850596,0.713892,0.726863,0.720083
4,0.4465,0.494049,0.861595,0.72288,0.736249,0.729325
5,0.3594,0.455189,0.869844,0.894709,0.770492,0.784321
6,0.3026,0.429468,0.877177,0.860351,0.803135,0.819709
7,0.2662,0.419354,0.885426,0.874848,0.827914,0.844098
8,0.2388,0.412748,0.88176,0.860775,0.824924,0.838143
9,0.2172,0.415245,0.888176,0.876652,0.831119,0.846456
10,0.202,0.41128,0.887259,0.864752,0.83006,0.842527


[I 2025-03-28 03:20:14,193] Trial 51 finished with value: 0.8484606073284771 and parameters: {'learning_rate': 2.6218743903424214e-05, 'weight_decay': 0.005, 'warmup_steps': 17}. Best is trial 26 with value: 0.850863037784695.


Trial 52 with params: {'learning_rate': 3.2143133140800936e-05, 'weight_decay': 0.004, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4532,1.036236,0.725023,0.636263,0.616559,0.618642
2,0.7968,0.644719,0.835014,0.702656,0.71447,0.70797
3,0.5025,0.510775,0.856095,0.717904,0.731574,0.724473
4,0.3696,0.456622,0.873511,0.898458,0.764167,0.77297
5,0.2965,0.431735,0.877177,0.859447,0.804097,0.819539


[I 2025-03-28 03:21:45,122] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 2.7928748815014052e-05, 'weight_decay': 0.003, 'warmup_steps': 11}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4949,1.11972,0.696609,0.624469,0.589918,0.595705
2,0.8846,0.709331,0.819432,0.690663,0.702227,0.695698
3,0.5712,0.550303,0.852429,0.715273,0.728907,0.721726
4,0.4221,0.482338,0.859762,0.721627,0.734902,0.728013
5,0.3396,0.447727,0.872594,0.855559,0.772602,0.784252
6,0.2856,0.423988,0.87901,0.866527,0.813937,0.830725
7,0.251,0.416136,0.88451,0.862517,0.827296,0.840105
8,0.2247,0.411399,0.880843,0.860111,0.823993,0.837345
9,0.2041,0.415243,0.890926,0.87901,0.833782,0.848895
10,0.1896,0.412109,0.887259,0.876554,0.830378,0.846054


[I 2025-03-28 03:26:18,725] Trial 53 finished with value: 0.8441477251127399 and parameters: {'learning_rate': 2.7928748815014052e-05, 'weight_decay': 0.003, 'warmup_steps': 11}. Best is trial 26 with value: 0.850863037784695.


Trial 54 with params: {'learning_rate': 5.8986909446740275e-05, 'weight_decay': 0.0, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2144,0.694793,0.823098,0.696485,0.702626,0.698826
2,0.4782,0.474849,0.861595,0.720538,0.737025,0.728296
3,0.284,0.42372,0.88176,0.871767,0.826475,0.84155
4,0.2065,0.416099,0.879927,0.859382,0.825024,0.83727
5,0.1617,0.427594,0.88176,0.871668,0.82727,0.841801
6,0.1327,0.433926,0.882676,0.873028,0.827729,0.842928
7,0.1137,0.437138,0.88451,0.863925,0.828014,0.841146
8,0.0988,0.459317,0.88451,0.866541,0.83735,0.848461
9,0.0881,0.473375,0.88451,0.866743,0.837385,0.848616
10,0.0818,0.48823,0.877177,0.862441,0.830403,0.842734


[I 2025-03-28 03:29:21,968] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 2.5163384809924603e-05, 'weight_decay': 0.005, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5306,1.184741,0.678277,0.616343,0.571253,0.579241
2,0.9531,0.761087,0.813016,0.686579,0.69606,0.690498
3,0.6253,0.582716,0.846013,0.709883,0.723168,0.71619
4,0.4639,0.503354,0.858845,0.720591,0.734235,0.727124
5,0.3738,0.461419,0.866178,0.89182,0.758375,0.766773
6,0.3149,0.434232,0.873511,0.870736,0.791234,0.809352
7,0.2772,0.422506,0.883593,0.869565,0.817443,0.833917
8,0.2489,0.41461,0.879927,0.854964,0.814405,0.828339
9,0.2266,0.415931,0.886343,0.875314,0.829441,0.844992
10,0.2108,0.411414,0.883593,0.862291,0.827,0.839799


[I 2025-03-28 03:33:56,306] Trial 55 finished with value: 0.847736908232406 and parameters: {'learning_rate': 2.5163384809924603e-05, 'weight_decay': 0.005, 'warmup_steps': 18}. Best is trial 26 with value: 0.850863037784695.


Trial 56 with params: {'learning_rate': 1.6644460906350915e-05, 'weight_decay': 0.006, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6269,1.407572,0.496792,0.546367,0.381957,0.335746
2,1.2188,1.007625,0.740605,0.636818,0.63383,0.631006
3,0.8867,0.772046,0.808433,0.684746,0.691376,0.687036
4,0.6799,0.641953,0.832264,0.700311,0.712689,0.70618
5,0.5575,0.569133,0.855179,0.717555,0.731555,0.724253


[I 2025-03-28 03:35:26,297] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 4.5896440070917636e-05, 'weight_decay': 0.004, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3229,0.830355,0.789184,0.674908,0.6723,0.670991
2,0.596,0.527413,0.855179,0.71666,0.731444,0.723651
3,0.3609,0.445255,0.870761,0.895357,0.771067,0.784879
4,0.264,0.422006,0.883593,0.86239,0.827012,0.839781
5,0.209,0.418502,0.87901,0.870684,0.823951,0.83971
6,0.172,0.417397,0.883593,0.873923,0.828326,0.843616
7,0.1479,0.418581,0.88451,0.874384,0.828539,0.844031
8,0.1315,0.433163,0.88451,0.874352,0.828602,0.844072
9,0.1172,0.442584,0.878093,0.857981,0.822766,0.835472
10,0.1107,0.453028,0.88176,0.864845,0.834689,0.846195


[I 2025-03-28 03:39:55,862] Trial 57 finished with value: 0.8443762929534229 and parameters: {'learning_rate': 4.5896440070917636e-05, 'weight_decay': 0.004, 'warmup_steps': 22}. Best is trial 26 with value: 0.850863037784695.


Trial 58 with params: {'learning_rate': 3.8481566253685874e-05, 'weight_decay': 0.005, 'warmup_steps': 21}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3916,0.931743,0.756187,0.653362,0.643686,0.643772
2,0.6908,0.578394,0.848763,0.71215,0.726451,0.718909
3,0.426,0.471667,0.864345,0.890165,0.747723,0.748901
4,0.3122,0.433689,0.880843,0.902928,0.816042,0.83869
5,0.2492,0.419518,0.879927,0.871148,0.824195,0.840243
6,0.2073,0.412385,0.88176,0.872344,0.82641,0.841992
7,0.1792,0.411713,0.882676,0.872487,0.826864,0.842166
8,0.1593,0.419155,0.882676,0.873141,0.82684,0.84261
9,0.1426,0.425178,0.885426,0.874708,0.829881,0.844806
10,0.1341,0.430749,0.88451,0.874287,0.82824,0.843854


[I 2025-03-28 03:44:28,020] Trial 58 finished with value: 0.8415686266596509 and parameters: {'learning_rate': 3.8481566253685874e-05, 'weight_decay': 0.005, 'warmup_steps': 21}. Best is trial 26 with value: 0.850863037784695.


Trial 59 with params: {'learning_rate': 2.2361562731410567e-05, 'weight_decay': 0.006, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5613,1.252147,0.63978,0.599131,0.531062,0.53936
2,1.0301,0.827801,0.7956,0.673074,0.680786,0.675925
3,0.694,0.627674,0.839597,0.705995,0.717896,0.711657
4,0.5183,0.53502,0.852429,0.715637,0.729193,0.722086
5,0.4191,0.484564,0.859762,0.720876,0.735031,0.727693


[I 2025-03-28 03:45:56,583] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 2.2018528373710634e-05, 'weight_decay': 0.005, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5657,1.262284,0.63703,0.598881,0.528117,0.536257
2,1.0405,0.83665,0.792851,0.672037,0.678306,0.67389
3,0.7033,0.633814,0.837764,0.704919,0.716452,0.710408
4,0.5256,0.539402,0.851512,0.714922,0.728195,0.721264
5,0.4252,0.487789,0.859762,0.720876,0.735031,0.727693
6,0.3596,0.454438,0.870761,0.895769,0.770889,0.78511
7,0.3167,0.436779,0.87626,0.899769,0.784172,0.802321
8,0.2852,0.424951,0.874427,0.858675,0.80101,0.817812
9,0.2602,0.422034,0.88451,0.873757,0.827962,0.843507
10,0.2422,0.415967,0.882676,0.872597,0.826051,0.842


[I 2025-03-28 03:50:31,635] Trial 60 finished with value: 0.8428483398403458 and parameters: {'learning_rate': 2.2018528373710634e-05, 'weight_decay': 0.005, 'warmup_steps': 18}. Best is trial 26 with value: 0.850863037784695.


Trial 61 with params: {'learning_rate': 3.322440569788184e-05, 'weight_decay': 0.003, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4457,1.018167,0.732356,0.640491,0.6232,0.624343
2,0.7768,0.630557,0.83868,0.705037,0.717882,0.710939
3,0.4869,0.501845,0.858845,0.720196,0.733636,0.726679
4,0.3576,0.450884,0.873511,0.897963,0.77352,0.787418
5,0.2866,0.428324,0.87626,0.863749,0.812492,0.82843
6,0.2398,0.413023,0.882676,0.872798,0.826755,0.842438
7,0.2092,0.410369,0.885426,0.875341,0.828347,0.844443
8,0.1862,0.41339,0.88176,0.872903,0.825542,0.841927
9,0.1678,0.418044,0.887259,0.876108,0.831074,0.846025
10,0.1563,0.418495,0.88451,0.874344,0.828518,0.843975


[I 2025-03-28 03:55:04,705] Trial 61 finished with value: 0.8424061608033937 and parameters: {'learning_rate': 3.322440569788184e-05, 'weight_decay': 0.003, 'warmup_steps': 22}. Best is trial 26 with value: 0.850863037784695.


Trial 62 with params: {'learning_rate': 9.765720978539949e-05, 'weight_decay': 0.003, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9827,0.512735,0.851512,0.717742,0.726797,0.7219
2,0.3086,0.435842,0.87626,0.862479,0.81275,0.827413
3,0.1761,0.413744,0.893676,0.881152,0.836451,0.851315
4,0.1271,0.428546,0.88176,0.873315,0.826159,0.842409
5,0.0939,0.469482,0.883593,0.873988,0.837879,0.849914
6,0.0748,0.474552,0.883593,0.867095,0.836519,0.848464
7,0.0629,0.496211,0.885426,0.858612,0.837033,0.845799
8,0.0518,0.546646,0.87626,0.850854,0.829806,0.838286
9,0.0435,0.553976,0.868928,0.838119,0.823307,0.829616
10,0.0397,0.577459,0.871677,0.840768,0.82501,0.831706


[I 2025-03-28 03:58:06,545] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 4.1990209313257634e-05, 'weight_decay': 0.007, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3374,0.875265,0.772686,0.663835,0.657522,0.65673
2,0.6405,0.552646,0.848763,0.71189,0.726416,0.718797
3,0.3936,0.457363,0.868928,0.894013,0.769671,0.783523
4,0.2884,0.424428,0.879927,0.882957,0.814547,0.834565
5,0.2292,0.416881,0.878093,0.869675,0.8228,0.838747
6,0.1899,0.410596,0.880843,0.87204,0.825449,0.84131
7,0.1634,0.412914,0.883593,0.873573,0.827886,0.843286
8,0.1454,0.421606,0.882676,0.873057,0.82729,0.842771
9,0.13,0.428454,0.88451,0.874292,0.828121,0.843728
10,0.1228,0.437836,0.882676,0.873482,0.826531,0.842534


[I 2025-03-28 04:01:08,563] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 5.2251859965165314e-05, 'weight_decay': 0.004, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2729,0.759218,0.804766,0.684233,0.685997,0.683772
2,0.5328,0.497285,0.857929,0.717912,0.734053,0.725515
3,0.3184,0.43212,0.88176,0.86839,0.817002,0.83289
4,0.2325,0.418208,0.88451,0.863254,0.828266,0.840862
5,0.1827,0.422778,0.88176,0.873057,0.826243,0.84209
6,0.15,0.424738,0.882676,0.873496,0.827562,0.84307
7,0.1291,0.426148,0.883593,0.874162,0.827379,0.843344
8,0.1136,0.445616,0.883593,0.862271,0.827908,0.840224
9,0.1014,0.458323,0.880843,0.863994,0.83437,0.845695
10,0.0954,0.471064,0.878093,0.862414,0.831647,0.843391


[I 2025-03-28 04:05:44,796] Trial 64 finished with value: 0.8432808439714986 and parameters: {'learning_rate': 5.2251859965165314e-05, 'weight_decay': 0.004, 'warmup_steps': 27}. Best is trial 26 with value: 0.850863037784695.


Trial 65 with params: {'learning_rate': 0.00018354250754825782, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7116,0.399939,0.889093,0.894546,0.829711,0.851219
2,0.18,0.456474,0.87626,0.865162,0.822664,0.835923
3,0.0971,0.474907,0.878093,0.861408,0.831832,0.843156
4,0.065,0.523427,0.879927,0.862123,0.833612,0.844015
5,0.044,0.567961,0.87901,0.884531,0.832847,0.850146
6,0.035,0.613131,0.866178,0.836895,0.82157,0.827947
7,0.0285,0.662774,0.867094,0.843864,0.821829,0.830825
8,0.0227,0.693437,0.866178,0.814699,0.821024,0.817609
9,0.02,0.715496,0.866178,0.843036,0.822178,0.830521
10,0.0166,0.707979,0.868011,0.82584,0.822514,0.823892


[I 2025-03-28 04:08:45,528] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 1.9033740053701716e-05, 'weight_decay': 0.01, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5887,1.328269,0.580202,0.568372,0.468272,0.465185
2,1.1279,0.923225,0.76352,0.652228,0.652182,0.64926
3,0.7964,0.704032,0.820348,0.692143,0.702441,0.696609
4,0.6042,0.590973,0.842346,0.707503,0.720293,0.713543
5,0.4927,0.527915,0.852429,0.71457,0.729155,0.721529
6,0.4177,0.487079,0.859762,0.720847,0.734748,0.727575
7,0.369,0.462699,0.865261,0.892243,0.748212,0.750347
8,0.333,0.445859,0.869844,0.896401,0.770217,0.785156
9,0.3046,0.437989,0.873511,0.850841,0.79153,0.806349
10,0.2835,0.429821,0.87626,0.859053,0.802632,0.818712


[I 2025-03-28 04:11:00,772] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 5.177128400309818e-05, 'weight_decay': 0.004, 'warmup_steps': 12}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.261,0.761099,0.79835,0.679411,0.680753,0.678498
2,0.536,0.501606,0.857012,0.71731,0.733387,0.724849
3,0.3221,0.432791,0.877177,0.864809,0.812562,0.828932
4,0.2351,0.418668,0.882676,0.861801,0.826849,0.839439
5,0.1849,0.421851,0.88176,0.873062,0.826574,0.842287
6,0.1518,0.422496,0.880843,0.872365,0.825916,0.841689
7,0.1305,0.42531,0.885426,0.875746,0.828963,0.844907
8,0.1148,0.44433,0.883593,0.862343,0.827859,0.840231
9,0.1028,0.45601,0.880843,0.864411,0.83389,0.845694
10,0.0966,0.470062,0.877177,0.861776,0.831002,0.842766


[I 2025-03-28 04:13:03,206] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 5.32962167516752e-05, 'weight_decay': 0.003, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2556,0.746619,0.808433,0.686595,0.689688,0.686869
2,0.5229,0.4943,0.857929,0.717739,0.734053,0.725377
3,0.3128,0.430302,0.880843,0.867743,0.816071,0.832099
4,0.2282,0.417979,0.882676,0.861867,0.826835,0.839477
5,0.1793,0.422846,0.883593,0.874424,0.828172,0.843739
6,0.1472,0.425297,0.879927,0.871729,0.824936,0.840878
7,0.1265,0.427486,0.886343,0.876247,0.829643,0.845513
8,0.111,0.447551,0.886343,0.864488,0.829908,0.84233
9,0.0993,0.46023,0.879927,0.863473,0.833126,0.84483
10,0.0933,0.474272,0.87626,0.860993,0.830049,0.841881


[I 2025-03-28 04:14:59,646] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.00043745040632071956, 'weight_decay': 0.003, 'warmup_steps': 21}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5241,0.432814,0.877177,0.871618,0.821739,0.839049
2,0.1022,0.575231,0.859762,0.801946,0.819078,0.809113
3,0.0535,0.687776,0.852429,0.82744,0.809836,0.817468
4,0.0337,0.710435,0.862511,0.850598,0.819006,0.831098
5,0.0222,0.811386,0.860678,0.829792,0.817644,0.821574
6,0.0206,0.818572,0.864345,0.865832,0.820634,0.837332
7,0.0152,0.943761,0.846929,0.812591,0.806414,0.80763
8,0.0119,0.922459,0.850596,0.820775,0.810074,0.813377
9,0.0097,0.973752,0.852429,0.850131,0.812515,0.824424
10,0.009,0.943513,0.858845,0.815916,0.815954,0.814684


[I 2025-03-28 04:16:59,461] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 2.2171647541058204e-05, 'weight_decay': 0.007, 'warmup_steps': 16}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5628,1.256063,0.63703,0.597417,0.5284,0.536556
2,1.0352,0.832874,0.791934,0.67061,0.67759,0.673004
3,0.6992,0.631397,0.837764,0.704756,0.716501,0.710348
4,0.5227,0.537764,0.850596,0.714318,0.727264,0.720479
5,0.4227,0.486651,0.859762,0.720876,0.735031,0.727693
6,0.3574,0.45361,0.870761,0.895769,0.770889,0.78511
7,0.3147,0.436226,0.877177,0.90036,0.793976,0.814884
8,0.2835,0.424549,0.874427,0.858675,0.80101,0.817812
9,0.2586,0.421782,0.885426,0.874433,0.828677,0.844198
10,0.2407,0.41575,0.882676,0.872597,0.826051,0.842


[I 2025-03-28 04:20:00,587] Trial 70 finished with value: 0.8434907631034756 and parameters: {'learning_rate': 2.2171647541058204e-05, 'weight_decay': 0.007, 'warmup_steps': 16}. Best is trial 26 with value: 0.850863037784695.


Trial 71 with params: {'learning_rate': 2.4214881473945043e-05, 'weight_decay': 0.003, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5392,1.205177,0.670944,0.613099,0.563734,0.57215
2,0.9774,0.782821,0.804766,0.680492,0.688626,0.683603
3,0.6473,0.597179,0.842346,0.707522,0.720223,0.713555
4,0.4815,0.513594,0.853346,0.716382,0.729658,0.72266
5,0.3884,0.468796,0.864345,0.890783,0.747855,0.749357


[I 2025-03-28 04:20:59,043] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 1.998052599003102e-05, 'weight_decay': 0.004, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5902,1.318631,0.594867,0.572913,0.483475,0.484034
2,1.1051,0.89472,0.778185,0.662163,0.665398,0.661847
3,0.7645,0.676647,0.828598,0.698561,0.709234,0.703313
4,0.5748,0.570114,0.849679,0.713129,0.7268,0.719663
5,0.4669,0.511232,0.857012,0.718163,0.733031,0.725298
6,0.3955,0.47329,0.862511,0.722846,0.737013,0.729709
7,0.3488,0.45088,0.871677,0.896875,0.762189,0.771317
8,0.3143,0.436289,0.873511,0.898856,0.800295,0.823644
9,0.2872,0.430187,0.87901,0.874161,0.796439,0.81351
10,0.2672,0.423037,0.879927,0.866653,0.814816,0.831109


[I 2025-03-28 04:22:59,292] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 1.1597714681187563e-05, 'weight_decay': 0.01, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6786,1.540125,0.453712,0.234089,0.34128,0.275375
2,1.407,1.230685,0.664528,0.609028,0.557874,0.566887
3,1.1291,0.990962,0.747938,0.644242,0.638303,0.636969
4,0.9196,0.834503,0.792851,0.672752,0.678201,0.673896
5,0.7757,0.730243,0.821265,0.692464,0.70331,0.697196


[I 2025-03-28 04:24:00,753] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 1.4119579421359505e-05, 'weight_decay': 0.004, 'warmup_steps': 15}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6493,1.472948,0.463795,0.398658,0.350329,0.285023
2,1.3079,1.108141,0.71769,0.629394,0.611393,0.612972
3,0.9967,0.869088,0.785518,0.669558,0.670678,0.668373
4,0.7856,0.722087,0.816682,0.689756,0.699402,0.693992
5,0.6514,0.634682,0.834097,0.701617,0.713734,0.707268
6,0.5571,0.575491,0.850596,0.713628,0.727693,0.720326
7,0.4947,0.536485,0.855179,0.717896,0.730562,0.723952
8,0.4473,0.508825,0.858845,0.721328,0.734027,0.727422
9,0.4116,0.491433,0.861595,0.7228,0.736632,0.729496
10,0.3835,0.478175,0.862511,0.723124,0.737514,0.730097


[I 2025-03-28 04:26:02,822] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 4.622086545925952e-05, 'weight_decay': 0.005, 'warmup_steps': 16}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3147,0.825534,0.787351,0.672721,0.670967,0.669394
2,0.5923,0.526818,0.854262,0.715883,0.730777,0.722939
3,0.3591,0.444737,0.872594,0.896926,0.781586,0.799516
4,0.2626,0.421349,0.883593,0.86209,0.827012,0.839641
5,0.2078,0.418636,0.878093,0.869884,0.823285,0.839008
6,0.171,0.417048,0.880843,0.872064,0.825748,0.841412
7,0.1469,0.418723,0.88451,0.874384,0.828539,0.844031
8,0.1305,0.433222,0.88451,0.874558,0.828615,0.844169
9,0.1165,0.442703,0.878093,0.857981,0.822766,0.835472
10,0.1099,0.453874,0.880843,0.864254,0.834009,0.845554


[I 2025-03-28 04:29:05,897] Trial 75 finished with value: 0.8443762929534229 and parameters: {'learning_rate': 4.622086545925952e-05, 'weight_decay': 0.005, 'warmup_steps': 16}. Best is trial 26 with value: 0.850863037784695.


Trial 76 with params: {'learning_rate': 2.9404775679017797e-05, 'weight_decay': 0.005, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4843,1.091359,0.710357,0.631311,0.603187,0.608019
2,0.8523,0.682696,0.826764,0.696505,0.707852,0.701467
3,0.5438,0.533121,0.852429,0.715268,0.72881,0.721735
4,0.4007,0.470362,0.866178,0.725726,0.740302,0.732719
5,0.3219,0.440155,0.877177,0.900012,0.794542,0.814921
6,0.2703,0.418579,0.87901,0.866897,0.814056,0.830958
7,0.2371,0.412794,0.885426,0.874723,0.828294,0.844146
8,0.2118,0.410644,0.882676,0.872987,0.825989,0.842185
9,0.1921,0.41541,0.890009,0.878143,0.833101,0.848045
10,0.1785,0.413174,0.885426,0.874762,0.829045,0.844426


Using the latest cached version of the module from /home/jovyan/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sat Oct 12 13:56:14 2024) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
[I 2025-03-28 04:35:11,532] Trial 77 finished with value: 0.8421179834739047 and parameters: {'learning_rate': 5.577616614888407e-05, 'weight_decay': 0.007, 'warmup_steps': 19}. Best is trial 26 with value: 0.850863037784695.


Trial 78 with params: {'learning_rate': 3.063887722916546e-05, 'weight_decay': 0.006, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4749,1.068892,0.72044,0.636724,0.61229,0.616274
2,0.8273,0.663661,0.829514,0.698749,0.710081,0.703744
3,0.5235,0.521279,0.855179,0.717546,0.730907,0.723951
4,0.385,0.462525,0.869844,0.895539,0.752217,0.753878
5,0.3089,0.435296,0.87901,0.877953,0.805492,0.823899


[I 2025-03-28 04:36:13,587] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 3.739481105676959e-05, 'weight_decay': 0.005, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4007,0.947898,0.752521,0.650757,0.640539,0.640419
2,0.7071,0.588106,0.846929,0.710793,0.724786,0.717444
3,0.4377,0.477413,0.862511,0.722231,0.737204,0.7294
4,0.3209,0.436869,0.87626,0.899648,0.77616,0.789408
5,0.2565,0.420939,0.879927,0.871369,0.824195,0.840384
6,0.2137,0.412161,0.88176,0.872205,0.826424,0.841924
7,0.185,0.411124,0.88451,0.873958,0.828197,0.843583
8,0.1644,0.417622,0.880843,0.871746,0.825458,0.841222
9,0.1473,0.423257,0.886343,0.875352,0.830561,0.845454
10,0.1383,0.427866,0.886343,0.87589,0.829851,0.845461


[I 2025-03-28 04:39:13,885] Trial 79 finished with value: 0.8424281619355719 and parameters: {'learning_rate': 3.739481105676959e-05, 'weight_decay': 0.005, 'warmup_steps': 19}. Best is trial 26 with value: 0.850863037784695.


Trial 80 with params: {'learning_rate': 0.00025631051413890516, 'weight_decay': 0.004, 'warmup_steps': 11}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6335,0.414086,0.87901,0.890177,0.820119,0.843295
2,0.1428,0.459999,0.875344,0.882978,0.82981,0.847979
3,0.0733,0.518645,0.865261,0.819608,0.820637,0.819785
4,0.0494,0.604401,0.867094,0.829214,0.822658,0.825191
5,0.0307,0.665971,0.872594,0.881078,0.82749,0.845072
6,0.0265,0.703929,0.858845,0.811624,0.815866,0.81356
7,0.0214,0.745692,0.853346,0.807817,0.811801,0.809154
8,0.0151,0.773473,0.858845,0.802942,0.82379,0.811638
9,0.015,0.770058,0.857929,0.821051,0.815193,0.817476
10,0.0129,0.754005,0.866178,0.809266,0.820532,0.814218


[I 2025-03-28 04:41:21,977] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 5.162009628136399e-05, 'weight_decay': 0.005, 'warmup_steps': 16}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2667,0.763527,0.799267,0.680274,0.681419,0.679237
2,0.5378,0.501588,0.857012,0.71731,0.733387,0.724849
3,0.3228,0.432998,0.878093,0.865453,0.813493,0.829706
4,0.2356,0.418563,0.88176,0.86117,0.826169,0.838795
5,0.1854,0.421871,0.88176,0.873205,0.826574,0.842331
6,0.1522,0.422784,0.88176,0.873029,0.826582,0.842354
7,0.1309,0.42525,0.886343,0.87633,0.829643,0.845545
8,0.1152,0.444325,0.885426,0.875191,0.829255,0.844822
9,0.103,0.456109,0.87901,0.862792,0.832508,0.844152
10,0.0969,0.469674,0.87901,0.863041,0.832362,0.844088


[I 2025-03-28 04:44:27,586] Trial 81 finished with value: 0.8436865695260525 and parameters: {'learning_rate': 5.162009628136399e-05, 'weight_decay': 0.005, 'warmup_steps': 16}. Best is trial 26 with value: 0.850863037784695.


Trial 82 with params: {'learning_rate': 0.0002891902282670203, 'weight_decay': 0.0, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5825,0.400479,0.88176,0.877905,0.823444,0.842629
2,0.1259,0.495748,0.878093,0.888848,0.830873,0.850904
3,0.065,0.555805,0.871677,0.8226,0.826089,0.823794
4,0.0425,0.626065,0.868928,0.8466,0.823371,0.833083
5,0.0292,0.697989,0.859762,0.847318,0.816564,0.827414


[I 2025-03-28 04:45:30,858] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 3.7363981343763786e-05, 'weight_decay': 0.005, 'warmup_steps': 13}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3965,0.947708,0.749771,0.649008,0.638178,0.638234
2,0.7079,0.589618,0.847846,0.711444,0.725453,0.718103
3,0.439,0.478839,0.862511,0.722526,0.737239,0.729596
4,0.322,0.437606,0.872594,0.896967,0.772951,0.786517
5,0.2574,0.421302,0.87901,0.87062,0.823529,0.839677
6,0.2144,0.412067,0.880843,0.871692,0.825229,0.8411
7,0.1856,0.411059,0.885426,0.875179,0.828864,0.844565
8,0.1649,0.41722,0.88176,0.872813,0.826111,0.842136
9,0.1479,0.422728,0.885426,0.874708,0.829881,0.844806
10,0.1388,0.427359,0.88451,0.874487,0.828254,0.843954


[I 2025-03-28 04:48:34,608] Trial 83 finished with value: 0.8417891996419513 and parameters: {'learning_rate': 3.7363981343763786e-05, 'weight_decay': 0.005, 'warmup_steps': 13}. Best is trial 26 with value: 0.850863037784695.


Trial 84 with params: {'learning_rate': 0.00017456084246660265, 'weight_decay': 0.008, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7578,0.408106,0.886343,0.889809,0.818609,0.84011
2,0.1902,0.459252,0.87901,0.870463,0.833578,0.845971
3,0.1027,0.468049,0.87901,0.872303,0.832032,0.846589
4,0.0686,0.523915,0.87626,0.849995,0.830596,0.83804
5,0.0465,0.589608,0.871677,0.856899,0.826811,0.837844
6,0.0366,0.582523,0.873511,0.843242,0.826877,0.834042
7,0.0314,0.623972,0.871677,0.848182,0.826553,0.83512
8,0.0244,0.661125,0.870761,0.817913,0.825616,0.821334
9,0.0211,0.677525,0.870761,0.838632,0.825263,0.830774
10,0.0177,0.681483,0.871677,0.819326,0.825676,0.822194


[I 2025-03-28 04:50:38,285] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 4.749961028292682e-05, 'weight_decay': 0.005, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3095,0.811183,0.792851,0.677114,0.675509,0.674122
2,0.5787,0.51902,0.854262,0.715664,0.730777,0.72282
3,0.3491,0.441243,0.877177,0.900417,0.803623,0.825893
4,0.2553,0.420739,0.882676,0.861754,0.826332,0.839115
5,0.2017,0.419549,0.879927,0.871249,0.824632,0.840345
6,0.1657,0.419034,0.883593,0.874035,0.828277,0.843648
7,0.1425,0.420415,0.88451,0.874367,0.828324,0.84392
8,0.1265,0.436469,0.885426,0.875289,0.829282,0.84488
9,0.1128,0.446682,0.87626,0.856699,0.821371,0.834136
10,0.1065,0.458007,0.88176,0.865158,0.834654,0.846305


[I 2025-03-28 04:53:40,925] Trial 85 finished with value: 0.844374788118822 and parameters: {'learning_rate': 4.749961028292682e-05, 'weight_decay': 0.005, 'warmup_steps': 23}. Best is trial 26 with value: 0.850863037784695.


Trial 86 with params: {'learning_rate': 3.5848963292846626e-05, 'weight_decay': 0.006, 'warmup_steps': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4147,0.971976,0.745188,0.647301,0.634413,0.634649
2,0.7316,0.603042,0.846013,0.710507,0.724057,0.716916
3,0.4552,0.486324,0.861595,0.721951,0.73651,0.728981
4,0.3341,0.441982,0.871677,0.896267,0.772235,0.785838
5,0.2674,0.423405,0.877177,0.869116,0.821864,0.838153
6,0.2231,0.412246,0.88176,0.860683,0.82581,0.838454
7,0.1937,0.410661,0.88451,0.874398,0.828197,0.843866
8,0.1721,0.41577,0.879927,0.871145,0.824196,0.840367
9,0.1545,0.42086,0.887259,0.876099,0.831228,0.846184
10,0.1447,0.424032,0.883593,0.873536,0.827803,0.843209


[I 2025-03-28 04:56:36,604] Trial 86 finished with value: 0.8424337243427763 and parameters: {'learning_rate': 3.5848963292846626e-05, 'weight_decay': 0.006, 'warmup_steps': 17}. Best is trial 26 with value: 0.850863037784695.


Trial 87 with params: {'learning_rate': 2.9526526749362293e-05, 'weight_decay': 0.01, 'warmup_steps': 5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4737,1.085573,0.704858,0.626612,0.598421,0.602299
2,0.8501,0.685227,0.824015,0.693686,0.705852,0.699056
3,0.545,0.535863,0.854262,0.71696,0.730241,0.723282
4,0.402,0.473258,0.864345,0.724923,0.738726,0.731521
5,0.3231,0.442096,0.874427,0.897978,0.783404,0.800977
6,0.2713,0.42035,0.879927,0.855137,0.814736,0.828575
7,0.2382,0.414304,0.885426,0.874723,0.828294,0.844146
8,0.2129,0.411626,0.880843,0.87181,0.824628,0.840914
9,0.1928,0.415845,0.888176,0.876997,0.831755,0.846859
10,0.1792,0.413458,0.88451,0.874248,0.828364,0.843854


[I 2025-03-28 04:59:35,055] Trial 87 finished with value: 0.8449660617652239 and parameters: {'learning_rate': 2.9526526749362293e-05, 'weight_decay': 0.01, 'warmup_steps': 5}. Best is trial 26 with value: 0.850863037784695.


Trial 88 with params: {'learning_rate': 3.536988479699025e-05, 'weight_decay': 0.01, 'warmup_steps': 8}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4131,0.979237,0.740605,0.645387,0.630022,0.630709
2,0.7401,0.609939,0.845096,0.71071,0.72291,0.716352
3,0.4623,0.491177,0.858845,0.720071,0.733981,0.726742
4,0.3396,0.444753,0.871677,0.896339,0.772187,0.785869
5,0.2718,0.424961,0.87626,0.864869,0.81206,0.828871
6,0.227,0.412216,0.878093,0.858173,0.822566,0.83561
7,0.1972,0.410423,0.886343,0.876117,0.82983,0.845556
8,0.1752,0.414786,0.880843,0.872085,0.824849,0.841174
9,0.1575,0.419546,0.886343,0.875437,0.830095,0.845285
10,0.1473,0.422255,0.886343,0.875637,0.829816,0.845258


[I 2025-03-28 05:02:34,039] Trial 88 finished with value: 0.8432927951859873 and parameters: {'learning_rate': 3.536988479699025e-05, 'weight_decay': 0.01, 'warmup_steps': 8}. Best is trial 26 with value: 0.850863037784695.


Trial 89 with params: {'learning_rate': 3.381400320856641e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4272,1.005051,0.734189,0.640938,0.624856,0.62535
2,0.7669,0.627465,0.83868,0.705533,0.71745,0.710987
3,0.4819,0.501601,0.857012,0.718726,0.732268,0.725249
4,0.3544,0.45113,0.874427,0.899008,0.774187,0.7883
5,0.284,0.428572,0.875344,0.863548,0.811393,0.827849


[I 2025-03-28 05:03:34,327] Trial 89 pruned. 


Trial 90 with params: {'learning_rate': 7.804181522315505e-05, 'weight_decay': 0.01, 'warmup_steps': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0579,0.572892,0.840513,0.709249,0.717764,0.712918
2,0.3707,0.438794,0.875344,0.875045,0.803204,0.821098
3,0.2163,0.406116,0.890926,0.866905,0.834262,0.845429
4,0.1569,0.408824,0.890009,0.878945,0.833387,0.848685
5,0.122,0.451382,0.88451,0.874665,0.838524,0.850494
6,0.098,0.445616,0.882676,0.87655,0.836319,0.851002
7,0.0825,0.462375,0.88451,0.867357,0.836914,0.848743
8,0.0695,0.491717,0.88176,0.87574,0.834962,0.849927
9,0.0608,0.496887,0.883593,0.877946,0.836203,0.851656
10,0.0559,0.5112,0.88176,0.866045,0.834473,0.846798


[I 2025-03-28 05:06:45,005] Trial 90 finished with value: 0.8355592854265427 and parameters: {'learning_rate': 7.804181522315505e-05, 'weight_decay': 0.01, 'warmup_steps': 1}. Best is trial 26 with value: 0.850863037784695.


Trial 91 with params: {'learning_rate': 3.376555996952573e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4262,1.005271,0.734189,0.640938,0.624856,0.62535
2,0.7676,0.628,0.839597,0.706299,0.718117,0.711698
3,0.4825,0.501889,0.857012,0.718726,0.732268,0.725249
4,0.3549,0.451288,0.874427,0.899008,0.774187,0.7883
5,0.2844,0.428687,0.875344,0.863548,0.811393,0.827849
6,0.2379,0.413069,0.878093,0.869696,0.822566,0.838835
7,0.2074,0.410551,0.885426,0.863535,0.828563,0.841194
8,0.1844,0.413568,0.880843,0.872152,0.824862,0.841206
9,0.1661,0.417653,0.886343,0.875136,0.830359,0.845197
10,0.1549,0.418809,0.886343,0.875926,0.829851,0.845452


[I 2025-03-28 05:09:53,007] Trial 91 finished with value: 0.8424698956148434 and parameters: {'learning_rate': 3.376555996952573e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3}. Best is trial 26 with value: 0.850863037784695.


Trial 92 with params: {'learning_rate': 2.9779930656995445e-05, 'weight_decay': 0.01, 'warmup_steps': 4}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4703,1.08024,0.709441,0.629111,0.602148,0.605516
2,0.8448,0.68154,0.825848,0.695732,0.707185,0.700753
3,0.5409,0.533616,0.854262,0.71696,0.730241,0.723282
4,0.399,0.471828,0.864345,0.724923,0.738726,0.731521
5,0.3206,0.441202,0.875344,0.89873,0.793195,0.813616


[I 2025-03-28 05:10:54,302] Trial 92 pruned. 


Trial 93 with params: {'learning_rate': 2.6825883865882788e-05, 'weight_decay': 0.006, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5128,1.14747,0.694775,0.624288,0.587971,0.59425
2,0.9115,0.727238,0.819432,0.69106,0.702214,0.695921
3,0.5905,0.561024,0.852429,0.715312,0.728527,0.721601
4,0.4366,0.488522,0.862511,0.723468,0.73718,0.730087
5,0.3513,0.451522,0.872594,0.89671,0.772554,0.786343
6,0.2956,0.42663,0.878093,0.861019,0.80385,0.820394
7,0.2599,0.417493,0.88451,0.862607,0.827247,0.840169
8,0.2329,0.411699,0.88176,0.860775,0.824924,0.838143
9,0.2117,0.41493,0.890009,0.866807,0.832784,0.844862
10,0.1969,0.4113,0.888176,0.865481,0.830727,0.843235


[I 2025-03-28 05:14:04,151] Trial 93 finished with value: 0.8484606073284771 and parameters: {'learning_rate': 2.6825883865882788e-05, 'weight_decay': 0.006, 'warmup_steps': 19}. Best is trial 26 with value: 0.850863037784695.


Trial 94 with params: {'learning_rate': 1.244193790218357e-05, 'weight_decay': 0.004, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6721,1.520267,0.459212,0.401319,0.34571,0.279085
2,1.3754,1.189281,0.686526,0.618941,0.579916,0.586995
3,1.0836,0.947505,0.754354,0.648343,0.644014,0.642676
4,0.8718,0.792778,0.805683,0.682044,0.689223,0.684448
5,0.7298,0.693558,0.826764,0.696632,0.707866,0.701702


[I 2025-03-28 05:15:08,038] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 2.1552915347302542e-05, 'weight_decay': 0.006, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5708,1.27475,0.627864,0.593586,0.518189,0.525158
2,1.0544,0.849178,0.788268,0.668882,0.673915,0.669752
3,0.7163,0.642868,0.831347,0.700855,0.711216,0.705644
4,0.5362,0.545932,0.850596,0.713892,0.72748,0.720405
5,0.4341,0.492746,0.858845,0.720129,0.734364,0.726984
6,0.3674,0.458338,0.866178,0.892094,0.757995,0.76679
7,0.3235,0.439606,0.874427,0.898235,0.77364,0.787699
8,0.2914,0.427196,0.873511,0.857684,0.800344,0.816961
9,0.266,0.423578,0.885426,0.874502,0.828628,0.844215
10,0.2475,0.417256,0.883593,0.873354,0.826717,0.842707


[I 2025-03-28 05:18:18,632] Trial 95 finished with value: 0.8435231036730043 and parameters: {'learning_rate': 2.1552915347302542e-05, 'weight_decay': 0.006, 'warmup_steps': 18}. Best is trial 26 with value: 0.850863037784695.


Trial 96 with params: {'learning_rate': 2.0108259968324582e-05, 'weight_decay': 0.007, 'warmup_steps': 8}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5815,1.303861,0.598533,0.577564,0.488131,0.491157
2,1.0956,0.89122,0.775435,0.659655,0.6623,0.658682
3,0.7611,0.676674,0.823098,0.694296,0.705032,0.699111
4,0.5739,0.57087,0.846013,0.710392,0.723604,0.716661
5,0.4664,0.512209,0.856095,0.717573,0.732351,0.724668


[I 2025-03-28 05:19:19,695] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 2.6845855351169957e-05, 'weight_decay': 0.004, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5126,1.146997,0.694775,0.624288,0.587971,0.59425
2,0.9111,0.726858,0.819432,0.69106,0.702214,0.695921
3,0.5901,0.560764,0.852429,0.715312,0.728527,0.721601
4,0.4363,0.488383,0.862511,0.723468,0.73718,0.730087
5,0.351,0.451423,0.872594,0.89671,0.772554,0.786343
6,0.2953,0.426555,0.878093,0.861019,0.80385,0.820394
7,0.2597,0.417429,0.88451,0.862607,0.827247,0.840169
8,0.2327,0.411693,0.88176,0.860775,0.824924,0.838143
9,0.2116,0.414923,0.889093,0.866077,0.832117,0.844151
10,0.1968,0.411286,0.888176,0.865481,0.830727,0.843235


[I 2025-03-28 05:22:29,241] Trial 97 finished with value: 0.8484606073284771 and parameters: {'learning_rate': 2.6845855351169957e-05, 'weight_decay': 0.004, 'warmup_steps': 19}. Best is trial 26 with value: 0.850863037784695.


Trial 98 with params: {'learning_rate': 1.5455469746057767e-05, 'weight_decay': 0.008, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6376,1.438665,0.47571,0.378519,0.361012,0.300465
2,1.2603,1.053086,0.733272,0.636345,0.625719,0.625058
3,0.9364,0.814868,0.800183,0.679591,0.683939,0.68043
4,0.7265,0.676154,0.824931,0.696028,0.706487,0.700829
5,0.5984,0.59703,0.846013,0.709848,0.724195,0.716692
6,0.5102,0.543555,0.855179,0.717769,0.730792,0.724012
7,0.4518,0.508759,0.858845,0.721156,0.733825,0.727248
8,0.4081,0.485092,0.859762,0.721826,0.734756,0.728054
9,0.3751,0.470847,0.866178,0.726228,0.740243,0.733013
10,0.3491,0.459225,0.867094,0.727003,0.740888,0.73378


[I 2025-03-28 05:24:35,321] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 1.6609798899362378e-05, 'weight_decay': 0.005, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6255,1.40721,0.495875,0.546514,0.381493,0.336306
2,1.2187,1.007971,0.741522,0.637374,0.634511,0.631592
3,0.8874,0.773159,0.807516,0.684065,0.690494,0.686188
4,0.6812,0.642953,0.832264,0.700268,0.712439,0.706015
5,0.5589,0.570026,0.855179,0.717555,0.731555,0.724253


[I 2025-03-28 05:25:37,013] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 2.125979762319959e-05, 'weight_decay': 0.005, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5741,1.282614,0.619615,0.587303,0.509654,0.515242
2,1.0633,0.857275,0.787351,0.668566,0.672984,0.66908
3,0.7248,0.648784,0.831347,0.70094,0.711216,0.705681
4,0.543,0.550195,0.850596,0.713892,0.72748,0.720405
5,0.4399,0.495949,0.858845,0.720129,0.734364,0.726984


[I 2025-03-28 05:26:38,030] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 2.7172893373246226e-05, 'weight_decay': 0.003, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5127,1.142954,0.697525,0.626136,0.590302,0.596475
2,0.9048,0.720285,0.819432,0.691652,0.7022,0.69616
3,0.5833,0.556172,0.852429,0.71527,0.728527,0.721575
4,0.4307,0.484813,0.862511,0.723468,0.73718,0.730087
5,0.3463,0.449088,0.874427,0.898149,0.774219,0.787867
6,0.2913,0.424753,0.880843,0.868213,0.815036,0.832141
7,0.256,0.416336,0.886343,0.875489,0.828961,0.844895
8,0.2292,0.411115,0.88176,0.872318,0.824991,0.841389
9,0.2083,0.414797,0.890926,0.87901,0.833782,0.848895
10,0.1938,0.411313,0.888176,0.87694,0.831045,0.846561


[I 2025-03-28 05:29:45,544] Trial 101 finished with value: 0.8474436822789065 and parameters: {'learning_rate': 2.7172893373246226e-05, 'weight_decay': 0.003, 'warmup_steps': 24}. Best is trial 26 with value: 0.850863037784695.


Trial 102 with params: {'learning_rate': 3.0448702788300253e-05, 'weight_decay': 0.003, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4803,1.075326,0.719523,0.636857,0.611672,0.61592
2,0.8324,0.666241,0.828598,0.698205,0.709401,0.703124
3,0.5262,0.522473,0.855179,0.717546,0.730907,0.723951
4,0.3867,0.462995,0.868011,0.727679,0.741733,0.734427
5,0.3102,0.435593,0.87901,0.87791,0.805492,0.823864


[I 2025-03-28 05:30:46,385] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 3.157739885515651e-05, 'weight_decay': 0.002, 'warmup_steps': 21}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4624,1.048598,0.725023,0.636516,0.616546,0.618801
2,0.8079,0.651154,0.832264,0.70067,0.712144,0.705741
3,0.5098,0.514059,0.855179,0.717266,0.730859,0.723801
4,0.3749,0.4583,0.870761,0.896476,0.752981,0.754713
5,0.3007,0.432736,0.87901,0.877953,0.805492,0.823899
6,0.2521,0.414517,0.882676,0.869209,0.817631,0.833823
7,0.2205,0.410856,0.88451,0.874033,0.827614,0.843446
8,0.1965,0.411936,0.882676,0.873684,0.825975,0.842548
9,0.1776,0.416781,0.887259,0.876096,0.831074,0.846018
10,0.1652,0.415683,0.886343,0.875853,0.829914,0.845447


[I 2025-03-28 05:33:47,661] Trial 103 finished with value: 0.8446184587290256 and parameters: {'learning_rate': 3.157739885515651e-05, 'weight_decay': 0.002, 'warmup_steps': 21}. Best is trial 26 with value: 0.850863037784695.


Trial 104 with params: {'learning_rate': 1.8881131725911846e-05, 'weight_decay': 0.002, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6026,1.347302,0.562786,0.561176,0.449333,0.438607
2,1.1415,0.929379,0.762603,0.651356,0.651896,0.648592
3,0.8017,0.704299,0.824015,0.695258,0.705358,0.69966
4,0.6057,0.590407,0.84418,0.709007,0.722107,0.715251
5,0.4934,0.527285,0.854262,0.715678,0.730753,0.722841


[I 2025-03-28 05:34:47,641] Trial 104 pruned. 


Trial 105 with params: {'learning_rate': 2.8169770285687927e-05, 'weight_decay': 0.0, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5033,1.122315,0.698442,0.626349,0.591556,0.597698
2,0.882,0.70239,0.821265,0.692304,0.703825,0.697316
3,0.5647,0.544706,0.853346,0.716021,0.729525,0.722422
4,0.4161,0.477204,0.862511,0.723061,0.737194,0.729882
5,0.3343,0.444307,0.874427,0.898122,0.774219,0.787881
6,0.281,0.421227,0.88176,0.868839,0.816083,0.83294
7,0.2468,0.414202,0.885426,0.874755,0.828294,0.84418
8,0.2206,0.410391,0.882676,0.873135,0.825975,0.842258
9,0.2003,0.41493,0.890009,0.878094,0.833101,0.848054
10,0.1863,0.411902,0.888176,0.877169,0.831093,0.846711


[I 2025-03-28 05:37:48,161] Trial 105 finished with value: 0.845364211849101 and parameters: {'learning_rate': 2.8169770285687927e-05, 'weight_decay': 0.0, 'warmup_steps': 26}. Best is trial 26 with value: 0.850863037784695.


Trial 106 with params: {'learning_rate': 3.4629928116921516e-05, 'weight_decay': 0.0, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4328,0.994266,0.740605,0.645734,0.63014,0.630706
2,0.7521,0.614562,0.84143,0.707273,0.720098,0.713277
3,0.4687,0.492273,0.859762,0.720755,0.734317,0.727326
4,0.3438,0.445171,0.874427,0.898353,0.774284,0.787932
5,0.2753,0.425089,0.877177,0.868609,0.822296,0.83807
6,0.2301,0.412301,0.882676,0.872798,0.826755,0.842438
7,0.2001,0.410293,0.885426,0.875073,0.828913,0.844555
8,0.1779,0.41459,0.879927,0.871503,0.824196,0.840544
9,0.16,0.419556,0.889093,0.877467,0.832623,0.847555
10,0.1495,0.421517,0.88451,0.87415,0.828483,0.843849


[I 2025-03-28 05:40:48,488] Trial 106 finished with value: 0.8417668650755075 and parameters: {'learning_rate': 3.4629928116921516e-05, 'weight_decay': 0.0, 'warmup_steps': 24}. Best is trial 26 with value: 0.850863037784695.


Trial 107 with params: {'learning_rate': 2.0447584636600805e-05, 'weight_decay': 0.001, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5841,1.305537,0.606783,0.579909,0.496796,0.500718
2,1.0894,0.880532,0.782768,0.665059,0.668974,0.665352
3,0.7494,0.665998,0.830431,0.699913,0.710616,0.704679
4,0.5627,0.562467,0.850596,0.713923,0.72748,0.72043
5,0.4567,0.505309,0.857012,0.718427,0.733031,0.725426
6,0.3868,0.468513,0.863428,0.723736,0.737693,0.730522
7,0.3409,0.447218,0.873511,0.897785,0.772924,0.787145
8,0.3072,0.433263,0.872594,0.874221,0.799628,0.819296
9,0.2806,0.427947,0.879927,0.874836,0.797154,0.814201
10,0.2611,0.421045,0.88176,0.868265,0.816247,0.832671


[I 2025-03-28 05:42:48,725] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 3.426814626010116e-05, 'weight_decay': 0.0, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4409,1.00313,0.737855,0.643999,0.627545,0.628332
2,0.7592,0.618056,0.840513,0.706508,0.719431,0.712569
3,0.4726,0.493929,0.860678,0.72149,0.734983,0.728034
4,0.3465,0.445763,0.873511,0.89766,0.773569,0.787253
5,0.2774,0.425484,0.879927,0.870766,0.824344,0.840175
6,0.2319,0.412247,0.882676,0.872798,0.826755,0.842438
7,0.2019,0.410057,0.886343,0.876071,0.829579,0.845401
8,0.1795,0.414181,0.880843,0.872217,0.824862,0.841244
9,0.1615,0.419231,0.888176,0.876806,0.831692,0.846765
10,0.1508,0.420903,0.885426,0.874819,0.829198,0.844547


[I 2025-03-28 05:45:49,229] Trial 108 finished with value: 0.8424061608033937 and parameters: {'learning_rate': 3.426814626010116e-05, 'weight_decay': 0.0, 'warmup_steps': 29}. Best is trial 26 with value: 0.850863037784695.


Trial 109 with params: {'learning_rate': 1.558163350423446e-05, 'weight_decay': 0.0, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.638,1.436494,0.474794,0.544675,0.360663,0.301575
2,1.2569,1.048957,0.733272,0.634978,0.626315,0.625023
3,0.9315,0.810222,0.800183,0.679591,0.683939,0.68043
4,0.7214,0.6722,0.824015,0.695031,0.705807,0.70001
5,0.5937,0.593726,0.847846,0.711533,0.725528,0.718236


[I 2025-03-28 05:46:48,594] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 2.725486244144337e-05, 'weight_decay': 0.003, 'warmup_steps': 21}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5096,1.139132,0.696609,0.625581,0.589587,0.595892
2,0.9018,0.718955,0.819432,0.691252,0.702214,0.69594
3,0.582,0.555572,0.852429,0.715312,0.728527,0.721601
4,0.4298,0.484678,0.862511,0.723468,0.73718,0.730087
5,0.3457,0.449024,0.873511,0.897449,0.773221,0.787056


[I 2025-03-28 05:47:48,590] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 2.6461884613481845e-05, 'weight_decay': 0.002, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.522,1.160274,0.692942,0.62378,0.585145,0.59183
2,0.923,0.733831,0.818515,0.690562,0.701533,0.695319
3,0.5974,0.564583,0.852429,0.715312,0.728527,0.721601
4,0.4414,0.490379,0.863428,0.724212,0.737847,0.730795
5,0.3551,0.452595,0.872594,0.89671,0.772554,0.786343
6,0.2988,0.427365,0.878093,0.861019,0.80385,0.820394
7,0.2628,0.418016,0.886343,0.875489,0.828961,0.844895
8,0.2355,0.411842,0.882676,0.86151,0.82559,0.83885
9,0.2142,0.414902,0.888176,0.86513,0.831119,0.843204
10,0.1992,0.411008,0.888176,0.865432,0.830727,0.8432


[I 2025-03-28 05:50:49,021] Trial 111 finished with value: 0.8483208726638273 and parameters: {'learning_rate': 2.6461884613481845e-05, 'weight_decay': 0.002, 'warmup_steps': 26}. Best is trial 26 with value: 0.850863037784695.


Trial 112 with params: {'learning_rate': 2.4661659308864655e-05, 'weight_decay': 0.001, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5429,1.204421,0.673694,0.614817,0.565932,0.574591
2,0.9703,0.771795,0.809349,0.684321,0.693089,0.687754
3,0.6363,0.588651,0.846013,0.710095,0.723154,0.716313
4,0.4716,0.506859,0.858845,0.720731,0.733971,0.72705
5,0.3799,0.463757,0.866178,0.892048,0.758326,0.766889


[I 2025-03-28 05:51:49,601] Trial 112 pruned. 


Trial 113 with params: {'learning_rate': 4.58173466256452e-05, 'weight_decay': 0.002, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3255,0.831844,0.789184,0.675276,0.6723,0.671189
2,0.5971,0.527642,0.854262,0.716098,0.730764,0.723018
3,0.3613,0.445413,0.871677,0.896021,0.772065,0.785656
4,0.2643,0.422193,0.882676,0.861736,0.826332,0.839136
5,0.2092,0.418521,0.879927,0.871249,0.824632,0.840345
6,0.1723,0.417484,0.883593,0.873923,0.828326,0.843616
7,0.1481,0.418557,0.882676,0.872964,0.826942,0.842528
8,0.1318,0.433215,0.88451,0.874352,0.828602,0.844072
9,0.1174,0.442507,0.878093,0.857981,0.822766,0.835472
10,0.1109,0.452861,0.879927,0.863315,0.833355,0.8448


[I 2025-03-28 05:54:49,273] Trial 113 finished with value: 0.8443762929534229 and parameters: {'learning_rate': 4.58173466256452e-05, 'weight_decay': 0.002, 'warmup_steps': 24}. Best is trial 26 with value: 0.850863037784695.


Trial 114 with params: {'learning_rate': 1.9481794227913714e-05, 'weight_decay': 0.003, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5977,1.333051,0.578368,0.570236,0.465782,0.461833
2,1.123,0.910879,0.769019,0.655043,0.657638,0.65386
3,0.7815,0.688721,0.828598,0.698737,0.709234,0.703441
4,0.5884,0.578786,0.848763,0.712183,0.726133,0.718819
5,0.4783,0.518035,0.856095,0.717357,0.732351,0.72452


[I 2025-03-28 05:55:48,674] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 6.888535367796023e-05, 'weight_decay': 0.002, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1505,0.626794,0.831347,0.702447,0.709363,0.705295
2,0.4185,0.452615,0.865261,0.890164,0.776262,0.793316
3,0.2457,0.416536,0.882676,0.860403,0.827824,0.839065
4,0.1779,0.413345,0.882676,0.873366,0.827687,0.843073
5,0.1391,0.439187,0.883593,0.872382,0.828631,0.842795
6,0.1128,0.449431,0.882676,0.8763,0.836521,0.85096
7,0.0957,0.455777,0.882676,0.865691,0.835289,0.847082
8,0.082,0.478784,0.879927,0.863689,0.833377,0.845015
9,0.0725,0.490977,0.877177,0.862665,0.830234,0.842969
10,0.0661,0.506683,0.87901,0.863977,0.832133,0.844569


[I 2025-03-28 05:58:47,806] Trial 115 finished with value: 0.8445035794178897 and parameters: {'learning_rate': 6.888535367796023e-05, 'weight_decay': 0.002, 'warmup_steps': 28}. Best is trial 26 with value: 0.850863037784695.


Trial 116 with params: {'learning_rate': 4.1112689788198324e-05, 'weight_decay': 0.001, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3637,0.892984,0.768103,0.66024,0.653607,0.652761
2,0.6542,0.558299,0.851512,0.71434,0.728464,0.721054
3,0.4011,0.460943,0.867094,0.892796,0.759187,0.767683
4,0.2937,0.428277,0.88176,0.868054,0.816458,0.83251
5,0.2338,0.417748,0.87626,0.868559,0.821404,0.83747
6,0.1938,0.413315,0.880843,0.871599,0.825682,0.841226
7,0.1669,0.413819,0.882676,0.872443,0.827282,0.842407
8,0.1485,0.423265,0.883593,0.873713,0.828054,0.843463
9,0.1326,0.430446,0.882676,0.872557,0.826775,0.842221
10,0.1251,0.437957,0.88451,0.874425,0.827927,0.84376


[I 2025-03-28 06:01:51,178] Trial 116 finished with value: 0.8516606919574525 and parameters: {'learning_rate': 4.1112689788198324e-05, 'weight_decay': 0.001, 'warmup_steps': 18}. Best is trial 116 with value: 0.8516606919574525.


Trial 117 with params: {'learning_rate': 2.9032939881709474e-05, 'weight_decay': 0.001, 'warmup_steps': 14}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4847,1.097048,0.704858,0.627879,0.598076,0.602745
2,0.86,0.689937,0.825848,0.695758,0.707185,0.700769
3,0.5511,0.537988,0.852429,0.715566,0.72881,0.72187
4,0.4065,0.473993,0.864345,0.724437,0.738907,0.731398
5,0.3267,0.442463,0.875344,0.898718,0.784071,0.801691
6,0.2745,0.420169,0.87901,0.866897,0.814056,0.830958
7,0.241,0.413745,0.885426,0.874723,0.828294,0.844146
8,0.2154,0.410861,0.882676,0.873135,0.825975,0.842258
9,0.1954,0.415284,0.890009,0.878363,0.833101,0.848185
10,0.1815,0.412889,0.885426,0.87482,0.829045,0.844457


[I 2025-03-28 06:04:55,839] Trial 117 finished with value: 0.844059391663972 and parameters: {'learning_rate': 2.9032939881709474e-05, 'weight_decay': 0.001, 'warmup_steps': 14}. Best is trial 116 with value: 0.8516606919574525.


Trial 118 with params: {'learning_rate': 4.8066577265738876e-05, 'weight_decay': 0.001, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3017,0.80391,0.793767,0.677881,0.676176,0.674826
2,0.5726,0.516755,0.854262,0.715471,0.730777,0.722678
3,0.3453,0.439976,0.87901,0.901849,0.804956,0.82731
4,0.2525,0.42015,0.883593,0.862487,0.826999,0.839834
5,0.1994,0.419762,0.879927,0.871249,0.824632,0.840345
6,0.1638,0.41924,0.88176,0.872736,0.826415,0.842082
7,0.1408,0.420947,0.885426,0.875172,0.82899,0.844618
8,0.1248,0.437336,0.88451,0.874627,0.828351,0.844079
9,0.1114,0.447742,0.877177,0.8611,0.831175,0.842647
10,0.1051,0.459535,0.880843,0.864571,0.833974,0.845666


[I 2025-03-28 06:07:59,938] Trial 118 finished with value: 0.844374788118822 and parameters: {'learning_rate': 4.8066577265738876e-05, 'weight_decay': 0.001, 'warmup_steps': 20}. Best is trial 116 with value: 0.8516606919574525.


Trial 119 with params: {'learning_rate': 1.2491306267059817e-05, 'weight_decay': 0.01, 'warmup_steps': 5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.662,1.512159,0.463795,0.402358,0.349265,0.281091
2,1.3676,1.180696,0.687443,0.618675,0.580994,0.587445
3,1.0773,0.944168,0.752521,0.646805,0.642179,0.64063
4,0.8685,0.791951,0.8011,0.678297,0.685523,0.680745
5,0.7285,0.69352,0.824931,0.695334,0.706268,0.700225


[I 2025-03-28 06:09:00,324] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 5.2800643670786514e-05, 'weight_decay': 0.008, 'warmup_steps': 23}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2641,0.752567,0.805683,0.684764,0.687043,0.684566
2,0.5276,0.495686,0.857929,0.717739,0.734053,0.725377
3,0.3154,0.43112,0.880843,0.867743,0.816071,0.832099
4,0.2302,0.417988,0.88451,0.863254,0.828266,0.840862
5,0.1809,0.422695,0.88451,0.875219,0.828839,0.844441
6,0.1485,0.424948,0.879927,0.871729,0.824936,0.840878
7,0.1277,0.426689,0.885426,0.875568,0.828712,0.844707
8,0.1122,0.446535,0.88451,0.863009,0.828574,0.840925
9,0.1003,0.459275,0.880843,0.863994,0.83437,0.845695
10,0.0943,0.472652,0.877177,0.861664,0.83098,0.842691


[I 2025-03-28 06:11:00,508] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 3.884820001389205e-05, 'weight_decay': 0.001, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3864,0.925859,0.75802,0.654424,0.64502,0.645011
2,0.6855,0.575663,0.848763,0.712035,0.726451,0.718857
3,0.4226,0.470338,0.864345,0.890165,0.747723,0.748901
4,0.3097,0.433017,0.880843,0.882691,0.816042,0.835079
5,0.2472,0.419235,0.879927,0.871193,0.82423,0.840297
6,0.2055,0.412442,0.88176,0.8724,0.826362,0.842012
7,0.1775,0.412035,0.880843,0.871184,0.825468,0.840839
8,0.1578,0.419692,0.88176,0.872463,0.826125,0.841925
9,0.1412,0.425741,0.88451,0.87404,0.828901,0.843999
10,0.1329,0.431613,0.88451,0.874287,0.82824,0.843854


[I 2025-03-28 06:14:02,338] Trial 121 finished with value: 0.8415686266596509 and parameters: {'learning_rate': 3.884820001389205e-05, 'weight_decay': 0.001, 'warmup_steps': 19}. Best is trial 116 with value: 0.8516606919574525.


Trial 122 with params: {'learning_rate': 3.668200474448707e-05, 'weight_decay': 0.002, 'warmup_steps': 20}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4086,0.959169,0.747938,0.648935,0.636893,0.637173
2,0.7181,0.594477,0.845096,0.709475,0.72339,0.716075
3,0.4453,0.481047,0.861595,0.721609,0.736524,0.728765
4,0.3266,0.438867,0.874427,0.898263,0.774813,0.788043
5,0.2612,0.42185,0.878093,0.869866,0.822531,0.838861
6,0.2177,0.412092,0.879927,0.859249,0.824759,0.837173
7,0.1887,0.410796,0.883593,0.873444,0.827531,0.84301
8,0.1677,0.416722,0.879927,0.870789,0.82446,0.840275
9,0.1504,0.422209,0.886343,0.875373,0.830561,0.845477
10,0.141,0.426093,0.88451,0.87421,0.828469,0.84388


[I 2025-03-28 06:17:02,518] Trial 122 finished with value: 0.8424448424317034 and parameters: {'learning_rate': 3.668200474448707e-05, 'weight_decay': 0.002, 'warmup_steps': 20}. Best is trial 116 with value: 0.8516606919574525.


Trial 123 with params: {'learning_rate': 2.6004107306320924e-05, 'weight_decay': 0.004, 'warmup_steps': 24}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5255,1.169338,0.683776,0.618591,0.576557,0.583688
2,0.9337,0.742964,0.817599,0.690286,0.700553,0.694687
3,0.6068,0.570583,0.851512,0.714628,0.727529,0.720792
4,0.4489,0.494542,0.862511,0.723626,0.736916,0.730034
5,0.3613,0.455382,0.869844,0.894368,0.770241,0.783975


[I 2025-03-28 06:18:02,500] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 3.922023162841399e-05, 'weight_decay': 0.01, 'warmup_steps': 0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3654,0.915426,0.758937,0.655191,0.645721,0.645773
2,0.6788,0.573719,0.847846,0.711946,0.725586,0.718354
3,0.4198,0.469402,0.865261,0.890795,0.748404,0.749537
4,0.308,0.430562,0.878093,0.880977,0.813165,0.832879
5,0.2457,0.41845,0.87901,0.870501,0.823515,0.839562
6,0.2042,0.409895,0.880843,0.871811,0.825215,0.841159
7,0.1762,0.410705,0.88176,0.872277,0.826302,0.841816
8,0.1566,0.417318,0.88176,0.87225,0.826394,0.841921
9,0.1404,0.423107,0.883593,0.873861,0.827538,0.843247
10,0.1322,0.430069,0.883593,0.873936,0.82726,0.843191


[I 2025-03-28 06:21:01,291] Trial 124 finished with value: 0.8414794814349857 and parameters: {'learning_rate': 3.922023162841399e-05, 'weight_decay': 0.01, 'warmup_steps': 0}. Best is trial 116 with value: 0.8516606919574525.


Trial 125 with params: {'learning_rate': 2.9361494266622753e-05, 'weight_decay': 0.001, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4872,1.094008,0.712191,0.632179,0.604799,0.609521
2,0.8539,0.682944,0.826764,0.696612,0.707852,0.70151
3,0.5441,0.533037,0.853346,0.715929,0.729525,0.722421
4,0.4007,0.470044,0.865261,0.725049,0.739587,0.732035
5,0.3218,0.439894,0.877177,0.900012,0.794542,0.814921


[I 2025-03-28 06:22:01,367] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 3.5770483479357655e-05, 'weight_decay': 0.003, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4162,0.973394,0.745188,0.647395,0.634413,0.634715
2,0.7328,0.603617,0.846013,0.710386,0.724057,0.716863
3,0.456,0.486573,0.861595,0.721951,0.73651,0.728981
4,0.3346,0.442104,0.871677,0.896267,0.772235,0.785838
5,0.2678,0.423437,0.877177,0.869116,0.821864,0.838153
6,0.2235,0.412231,0.882676,0.861313,0.82649,0.839102
7,0.1941,0.410592,0.885426,0.875031,0.828878,0.844506
8,0.1725,0.415647,0.879927,0.871145,0.824196,0.840367
9,0.1548,0.420799,0.887259,0.876099,0.831228,0.846184
10,0.1449,0.423872,0.883593,0.873536,0.827803,0.843209


[I 2025-03-28 06:25:02,506] Trial 126 finished with value: 0.8424337243427763 and parameters: {'learning_rate': 3.5770483479357655e-05, 'weight_decay': 0.003, 'warmup_steps': 18}. Best is trial 116 with value: 0.8516606919574525.


Trial 127 with params: {'learning_rate': 2.070569719567003e-05, 'weight_decay': 0.001, 'warmup_steps': 28}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5856,1.303176,0.611366,0.581197,0.501387,0.505853
2,1.0847,0.874231,0.782768,0.66521,0.669609,0.665681
3,0.7423,0.660062,0.830431,0.699863,0.710567,0.704686
4,0.5558,0.557695,0.851512,0.714595,0.728147,0.721099
5,0.4504,0.501626,0.857012,0.718427,0.733031,0.725426


[I 2025-03-28 06:26:03,380] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 3.0967684939393455e-05, 'weight_decay': 0.004, 'warmup_steps': 21}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4689,1.060567,0.722273,0.63664,0.614166,0.617622
2,0.82,0.659428,0.831347,0.700121,0.711463,0.705121
3,0.5189,0.518961,0.855179,0.717546,0.730907,0.723951
4,0.3817,0.461286,0.870761,0.896494,0.752933,0.754735
5,0.3063,0.434516,0.87901,0.877953,0.805492,0.823899
6,0.2569,0.415385,0.879927,0.855192,0.814771,0.828614
7,0.2249,0.411139,0.883593,0.873389,0.826933,0.842802
8,0.2005,0.411422,0.882676,0.87364,0.825975,0.842535
9,0.1814,0.416399,0.889093,0.877539,0.832408,0.84744
10,0.1687,0.414816,0.886343,0.875853,0.829914,0.845447


[I 2025-03-28 06:29:06,510] Trial 128 finished with value: 0.8453251325035679 and parameters: {'learning_rate': 3.0967684939393455e-05, 'weight_decay': 0.004, 'warmup_steps': 21}. Best is trial 116 with value: 0.8516606919574525.


Trial 129 with params: {'learning_rate': 3.384669601276606e-05, 'weight_decay': 0.005, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4392,1.007034,0.734189,0.641503,0.624533,0.625557
2,0.7656,0.623324,0.840513,0.706737,0.719431,0.71263
3,0.4788,0.497674,0.858845,0.720024,0.73365,0.726618
4,0.3515,0.448396,0.873511,0.897706,0.77352,0.78729
5,0.2816,0.426895,0.877177,0.868278,0.822344,0.837912
6,0.2355,0.4126,0.88176,0.872171,0.826074,0.841792
7,0.2052,0.410297,0.887259,0.876744,0.830012,0.845961
8,0.1825,0.413931,0.879927,0.871582,0.824196,0.840585
9,0.1643,0.418608,0.887259,0.875869,0.831026,0.845913
10,0.1533,0.419782,0.885426,0.874951,0.829198,0.844614


[I 2025-03-28 06:32:12,802] Trial 129 finished with value: 0.8424061608033937 and parameters: {'learning_rate': 3.384669601276606e-05, 'weight_decay': 0.005, 'warmup_steps': 22}. Best is trial 116 with value: 0.8516606919574525.


Trial 130 with params: {'learning_rate': 2.571455231964029e-05, 'weight_decay': 0.005, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5273,1.174658,0.681027,0.617185,0.573881,0.581472
2,0.9403,0.749027,0.816682,0.689397,0.699587,0.693685
3,0.613,0.574594,0.850596,0.713682,0.726863,0.719951
4,0.4539,0.49745,0.862511,0.723593,0.736916,0.73001
5,0.3655,0.45729,0.868928,0.893824,0.769561,0.783384
6,0.3078,0.430966,0.87901,0.861741,0.804517,0.821095
7,0.2708,0.420307,0.88451,0.870536,0.81811,0.834771
8,0.2429,0.413199,0.88176,0.860775,0.824924,0.838143
9,0.2211,0.415317,0.887259,0.876008,0.830439,0.845814
10,0.2057,0.411096,0.886343,0.864262,0.829394,0.841947


[I 2025-03-28 06:34:14,314] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 2.2509877273196477e-05, 'weight_decay': 0.004, 'warmup_steps': 18}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5602,1.249416,0.64253,0.600407,0.533989,0.542491
2,1.0262,0.823806,0.797434,0.674312,0.682384,0.677403
3,0.6899,0.624776,0.839597,0.705995,0.717896,0.711657
4,0.515,0.532896,0.852429,0.715637,0.729193,0.722086
5,0.4163,0.48292,0.860678,0.72146,0.735962,0.728451


[I 2025-03-28 06:35:13,685] Trial 131 pruned. 


Trial 132 with params: {'learning_rate': 3.475231352121727e-05, 'weight_decay': 0.007, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4274,0.990472,0.742438,0.646674,0.631501,0.631841
2,0.7498,0.613911,0.846013,0.71078,0.724057,0.717049
3,0.4679,0.492497,0.858845,0.720024,0.73365,0.726618
4,0.3436,0.445555,0.872594,0.896971,0.772854,0.786573
5,0.2752,0.425319,0.878093,0.869651,0.823011,0.838973
6,0.2299,0.412395,0.88176,0.872147,0.825824,0.841648
7,0.2,0.410463,0.88451,0.874389,0.827915,0.843741
8,0.1778,0.414693,0.879927,0.871145,0.824196,0.840367
9,0.1598,0.419529,0.889093,0.877467,0.832623,0.847555
10,0.1493,0.421571,0.88451,0.87415,0.828483,0.843849


[I 2025-03-28 06:38:16,334] Trial 132 finished with value: 0.8432927951859873 and parameters: {'learning_rate': 3.475231352121727e-05, 'weight_decay': 0.007, 'warmup_steps': 19}. Best is trial 116 with value: 0.8516606919574525.


Trial 133 with params: {'learning_rate': 0.00031614748075250965, 'weight_decay': 0.001, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.605,0.416455,0.877177,0.890144,0.818108,0.84227
2,0.1239,0.48513,0.878093,0.875589,0.830235,0.84711
3,0.0641,0.559378,0.872594,0.838666,0.828104,0.831879
4,0.042,0.598735,0.868928,0.836036,0.825793,0.829533
5,0.0276,0.687976,0.871677,0.855221,0.827518,0.837109
6,0.0235,0.708585,0.863428,0.834408,0.818606,0.825331
7,0.0198,0.754132,0.860678,0.814042,0.818331,0.814827
8,0.0144,0.799315,0.859762,0.812869,0.816505,0.813972
9,0.0122,0.875106,0.850596,0.821376,0.81026,0.813981
10,0.0113,0.819045,0.864345,0.823523,0.820189,0.821467


[I 2025-03-28 06:40:15,112] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 1.7020695136396012e-05, 'weight_decay': 0.01, 'warmup_steps': 14}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6184,1.394116,0.507791,0.535955,0.39305,0.354951
2,1.2013,0.991142,0.746104,0.641316,0.637415,0.635401
3,0.8698,0.759983,0.813932,0.688381,0.696264,0.691443
4,0.6665,0.632818,0.836847,0.703709,0.716134,0.709594
5,0.5467,0.562013,0.851512,0.715066,0.728028,0.72126
6,0.4647,0.513855,0.858845,0.720505,0.734117,0.727047
7,0.4108,0.484321,0.860678,0.722206,0.73545,0.728598
8,0.371,0.464063,0.866178,0.727257,0.739777,0.733366
9,0.3401,0.452787,0.87626,0.900272,0.775736,0.789791
10,0.3162,0.442857,0.875344,0.898969,0.784158,0.801898


[I 2025-03-28 06:42:15,833] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 2.751349205561589e-05, 'weight_decay': 0.003, 'warmup_steps': 22}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5075,1.134033,0.697525,0.62634,0.590253,0.596628
2,0.896,0.714123,0.819432,0.691252,0.702214,0.69594
3,0.577,0.552471,0.853346,0.716021,0.729525,0.722422
4,0.4259,0.482562,0.862511,0.723468,0.73718,0.730087
5,0.3424,0.447641,0.874427,0.898122,0.774219,0.787881


[I 2025-03-28 06:43:15,519] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 0.00010587424753231928, 'weight_decay': 0.001, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9515,0.493951,0.853346,0.719198,0.728179,0.72331
2,0.2879,0.43605,0.87626,0.861972,0.813359,0.827272
3,0.1628,0.418202,0.890009,0.877433,0.833687,0.847957
4,0.1169,0.433031,0.883593,0.87579,0.827228,0.844176
5,0.085,0.477289,0.879927,0.861387,0.833708,0.843802
6,0.0678,0.483289,0.88176,0.856544,0.834326,0.843447
7,0.0565,0.50907,0.880843,0.855689,0.833408,0.842554
8,0.0457,0.567108,0.872594,0.840392,0.82653,0.832344
9,0.0384,0.574234,0.874427,0.842151,0.828116,0.833967
10,0.0348,0.59958,0.868011,0.837393,0.82233,0.828711


[I 2025-03-28 06:45:14,487] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 2.2317503571952495e-05, 'weight_decay': 0.01, 'warmup_steps': 3}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5536,1.246223,0.641613,0.600387,0.533408,0.542016
2,1.0282,0.831229,0.787351,0.667933,0.672868,0.668936
3,0.697,0.631827,0.834097,0.701921,0.713583,0.707374
4,0.5217,0.538605,0.848763,0.713042,0.725666,0.719021
5,0.4221,0.48782,0.858845,0.71978,0.734413,0.726834


[I 2025-03-28 06:46:14,366] Trial 137 pruned. 


Trial 138 with params: {'learning_rate': 3.188398290886231e-05, 'weight_decay': 0.004, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4575,1.04176,0.724106,0.635748,0.615879,0.618065
2,0.8017,0.647519,0.834097,0.701861,0.713755,0.707163
3,0.5058,0.512166,0.857012,0.718642,0.732241,0.72518
4,0.3719,0.45734,0.870761,0.896476,0.752981,0.754713
5,0.2984,0.432132,0.877177,0.876672,0.804097,0.822568


[I 2025-03-28 06:47:14,736] Trial 138 pruned. 


Trial 139 with params: {'learning_rate': 7.586481554893302e-05, 'weight_decay': 0.005, 'warmup_steps': 19}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0957,0.588508,0.840513,0.709384,0.71723,0.712865
2,0.3842,0.443208,0.872594,0.87299,0.800264,0.818643
3,0.224,0.410548,0.885426,0.86252,0.829838,0.841138
4,0.1624,0.412546,0.887259,0.876732,0.831047,0.846475
5,0.1264,0.448513,0.888176,0.878291,0.84124,0.85393
6,0.1017,0.456799,0.883593,0.877069,0.836986,0.851581
7,0.0855,0.466964,0.87901,0.862589,0.83226,0.843973
8,0.0723,0.49439,0.88176,0.86519,0.834988,0.846609
9,0.0633,0.506346,0.87626,0.862097,0.82954,0.842357
10,0.0579,0.518876,0.87901,0.864367,0.832133,0.844742


[I 2025-03-28 06:50:18,627] Trial 139 finished with value: 0.8397613338863109 and parameters: {'learning_rate': 7.586481554893302e-05, 'weight_decay': 0.005, 'warmup_steps': 19}. Best is trial 116 with value: 0.8516606919574525.


Trial 140 with params: {'learning_rate': 3.232859704030805e-05, 'weight_decay': 0.005, 'warmup_steps': 14}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4491,1.03232,0.726856,0.63738,0.618272,0.620007
2,0.7935,0.643263,0.83593,0.703307,0.715137,0.70864
3,0.5006,0.510277,0.856095,0.717838,0.731623,0.724451
4,0.3683,0.456508,0.872594,0.897459,0.763452,0.772125
5,0.2955,0.431676,0.874427,0.857705,0.801554,0.817432


[I 2025-03-28 06:51:19,370] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 0.00011218241025953979, 'weight_decay': 0.001, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9406,0.483431,0.854262,0.720168,0.728846,0.724107
2,0.2747,0.436034,0.87626,0.865941,0.822469,0.836182
3,0.1541,0.424501,0.889093,0.877415,0.832125,0.847355
4,0.1104,0.447134,0.877177,0.873112,0.831024,0.84661
5,0.0787,0.491266,0.879927,0.862341,0.833667,0.844277
6,0.0627,0.503434,0.87901,0.854921,0.832034,0.841414
7,0.0523,0.526407,0.87626,0.851083,0.829978,0.838574
8,0.0425,0.591503,0.870761,0.838638,0.825382,0.830782
9,0.035,0.598892,0.871677,0.840064,0.825819,0.831796
10,0.032,0.625854,0.864345,0.834986,0.819019,0.825854


[I 2025-03-28 06:53:21,222] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 4.885615799096653e-05, 'weight_decay': 0.002, 'warmup_steps': 31}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3058,0.797626,0.796517,0.679044,0.678705,0.677006
2,0.5655,0.511827,0.854262,0.715471,0.730777,0.722678
3,0.3393,0.43823,0.879927,0.879214,0.806152,0.824815
4,0.2481,0.420032,0.882676,0.861848,0.826553,0.839358
5,0.1956,0.420673,0.879927,0.871598,0.824632,0.840517
6,0.1606,0.421174,0.883593,0.873933,0.828277,0.843605
7,0.1383,0.422398,0.883593,0.873829,0.827643,0.843275
8,0.1225,0.439537,0.883593,0.873999,0.827671,0.843435
9,0.1092,0.450571,0.877177,0.861037,0.831175,0.842618
10,0.103,0.461959,0.879927,0.863988,0.833293,0.845027


[I 2025-03-28 06:56:22,766] Trial 142 finished with value: 0.844374788118822 and parameters: {'learning_rate': 4.885615799096653e-05, 'weight_decay': 0.002, 'warmup_steps': 31}. Best is trial 116 with value: 0.8516606919574525.


Trial 143 with params: {'learning_rate': 7.8451904548004e-05, 'weight_decay': 0.003, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0901,0.578459,0.84143,0.710056,0.718228,0.713672
2,0.3742,0.440556,0.872594,0.842367,0.800657,0.812999
3,0.2174,0.411465,0.885426,0.862474,0.829887,0.841119
4,0.1575,0.416064,0.888176,0.877379,0.831679,0.847135
5,0.122,0.452695,0.886343,0.876713,0.839928,0.852502
6,0.0979,0.461289,0.882676,0.87608,0.836271,0.850724
7,0.0821,0.470885,0.880843,0.864404,0.833677,0.845626
8,0.0691,0.502695,0.882676,0.866009,0.835669,0.847403
9,0.0604,0.51537,0.877177,0.862834,0.830255,0.843089
10,0.0551,0.528079,0.87901,0.864068,0.832133,0.844607


[I 2025-03-28 06:59:24,977] Trial 143 finished with value: 0.8397507328471714 and parameters: {'learning_rate': 7.8451904548004e-05, 'weight_decay': 0.003, 'warmup_steps': 26}. Best is trial 116 with value: 0.8516606919574525.


Trial 144 with params: {'learning_rate': 7.840543853416233e-05, 'weight_decay': 0.001, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0944,0.579497,0.84143,0.710056,0.718228,0.713672
2,0.3749,0.440522,0.871677,0.841589,0.799991,0.812294
3,0.2177,0.412153,0.885426,0.862474,0.829887,0.841119
4,0.1577,0.416727,0.888176,0.877379,0.831679,0.847135
5,0.1221,0.452862,0.886343,0.877029,0.839914,0.852714
6,0.0979,0.462133,0.882676,0.876404,0.836271,0.850892
7,0.0822,0.471062,0.88176,0.865255,0.834344,0.846371
8,0.0691,0.503682,0.882676,0.866009,0.835669,0.847403
9,0.0604,0.516108,0.87626,0.862218,0.82954,0.842403
10,0.0552,0.528975,0.87901,0.864068,0.832133,0.844607


[I 2025-03-28 07:02:34,075] Trial 144 finished with value: 0.8392054724003013 and parameters: {'learning_rate': 7.840543853416233e-05, 'weight_decay': 0.001, 'warmup_steps': 29}. Best is trial 116 with value: 0.8516606919574525.


Trial 145 with params: {'learning_rate': 2.770996663992327e-05, 'weight_decay': 0.003, 'warmup_steps': 27}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5091,1.133068,0.696609,0.625581,0.589587,0.595892
2,0.8931,0.710444,0.818515,0.690715,0.701533,0.695321
3,0.5731,0.549681,0.853346,0.716021,0.729525,0.722422
4,0.4225,0.480377,0.863428,0.724074,0.73786,0.730724
5,0.3395,0.446261,0.874427,0.898122,0.774219,0.787881
6,0.2855,0.422655,0.88176,0.86888,0.815751,0.832824
7,0.2508,0.415045,0.886343,0.875489,0.828961,0.844895
8,0.2244,0.410603,0.882676,0.87303,0.825989,0.842216
9,0.2038,0.414797,0.890009,0.866591,0.833101,0.844834
10,0.1896,0.411564,0.887259,0.876192,0.830378,0.845856


[I 2025-03-28 07:05:35,583] Trial 145 finished with value: 0.8454007452463607 and parameters: {'learning_rate': 2.770996663992327e-05, 'weight_decay': 0.003, 'warmup_steps': 27}. Best is trial 116 with value: 0.8516606919574525.


Trial 146 with params: {'learning_rate': 2.777282677463979e-05, 'weight_decay': 0.002, 'warmup_steps': 26}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5077,1.131003,0.697525,0.626139,0.590585,0.596759
2,0.8913,0.709337,0.818515,0.690715,0.701533,0.695321
3,0.5719,0.549039,0.853346,0.716021,0.729525,0.722422
4,0.4217,0.480038,0.863428,0.724074,0.73786,0.730724
5,0.3389,0.446051,0.874427,0.898122,0.774219,0.787881


[I 2025-03-28 07:06:35,515] Trial 146 pruned. 


Trial 147 with params: {'learning_rate': 3.1650669174577075e-05, 'weight_decay': 0.005, 'warmup_steps': 29}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4683,1.051902,0.727773,0.640531,0.618241,0.62161
2,0.8082,0.649633,0.833181,0.701341,0.713026,0.706529
3,0.508,0.512525,0.855179,0.717266,0.730859,0.723801
4,0.373,0.456805,0.873511,0.898519,0.755294,0.756918
5,0.299,0.431874,0.880843,0.882451,0.815963,0.835024
6,0.2506,0.414115,0.883593,0.873514,0.827435,0.843148
7,0.2192,0.410612,0.88451,0.874033,0.827614,0.843446
8,0.1952,0.411903,0.883593,0.874398,0.826642,0.843248
9,0.1764,0.416911,0.887259,0.876096,0.831074,0.846018
10,0.1642,0.415896,0.886343,0.875631,0.829914,0.845305


[I 2025-03-28 07:09:39,345] Trial 147 finished with value: 0.8446184587290256 and parameters: {'learning_rate': 3.1650669174577075e-05, 'weight_decay': 0.005, 'warmup_steps': 29}. Best is trial 116 with value: 0.8516606919574525.


Trial 148 with params: {'learning_rate': 2.3915063786644546e-05, 'weight_decay': 0.003, 'warmup_steps': 25}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5491,1.220504,0.665445,0.611739,0.557549,0.56646
2,0.9894,0.788955,0.805683,0.681039,0.689293,0.68422
3,0.6539,0.600199,0.842346,0.707469,0.720439,0.71366
4,0.4857,0.515065,0.855179,0.717663,0.731256,0.724118
5,0.3917,0.469631,0.863428,0.723528,0.738051,0.730546
6,0.3305,0.44034,0.873511,0.897731,0.782061,0.80028
7,0.2909,0.42663,0.883593,0.869565,0.817443,0.833917


[W 2025-03-28 07:11:08,404] Trial 148 failed with parameters: {'learning_rate': 2.3915063786644546e-05, 'weight_decay': 0.003, 'warmup_steps': 25} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/integration_utils.py", line 250, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2241, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2581, in _inner_training_loop
    _grad_norm = self.accelerator.clip_grad_norm_(
  File "/usr/local/lib/python3.10/dist-packages/accelerate/accelerator.py", line 2510, in clip_grad_norm_
    return torch.nn.utils.clip_grad_norm_(parameters

KeyboardInterrupt: 

In [30]:
print(best_trial3)

NameError: name 'best_trial3' is not defined

In [13]:
base.reset_seed()

In [14]:
training_args = base.get_training_args(output_dir=f"~/results/{DATASET}/bert-distill_coarse_aug_hp-search", logging_dir=f"~/logs/{DATASET}/bert-distill_coarse_aug_hp-search", remove_unused_columns=False, epochs=num_epochs, batch_size=batch_size)

In [15]:
def hp_space(trial):
    params =  {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0, 1e-2, step=1e-3),
        "warmup_steps" : trial.suggest_int("warmup_steps", 0, warm_up),
        "lambda_param": trial.suggest_float("lambda_param",0,1,step=.1),
        "temperature": trial.suggest_float("temperature", 2,7, step=.5)
    }
    print(f"Trial {trial.number} with params: {params}")
    return params

In [16]:
pruner = optuna.pruners.HyperbandPruner(min_resource=min_r, max_resource=max_r, reduction_factor=2, bootstrap_count=2)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)



In [17]:
trainer = base.DistilTrainer(
    args=training_args,
    train_dataset=train_aug,
    eval_dataset=eval,
    compute_metrics=base.compute_metrics,
    model_init = lambda: get_Bert()
)
  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
best_trial4 = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    pruner=pruner,
    sampler=sampler,
    study_name="Test-Distill-aug",
    n_trials=150
)

[I 2025-03-28 11:31:23,503] A new study created in memory with name: Test-Distill-aug


Trial 0 with params: {'learning_rate': 4.3284502212938785e-05, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0564,2.337134,0.789184,0.674431,0.672664,0.671305


[I 2025-03-28 11:31:42,739] Trial 0 pruned. 


Trial 1 with params: {'learning_rate': 1.8408992080552506e-05, 'weight_decay': 0.0, 'warmup_steps': 4, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5252,3.297567,0.485793,0.572307,0.370993,0.318883
2,2.7802,2.551758,0.757104,0.650491,0.647379,0.644446


[I 2025-03-28 11:32:43,577] Trial 1 pruned. 


Trial 2 with params: {'learning_rate': 1.0838581269344744e-05, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6559,3.625613,0.428048,0.256814,0.321663,0.267995


[I 2025-03-28 11:33:13,378] Trial 2 pruned. 


Trial 3 with params: {'learning_rate': 2.049268011541735e-05, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4853,3.203858,0.542621,0.574206,0.427393,0.408764
2,2.6578,2.417004,0.779102,0.664628,0.66616,0.66307


[I 2025-03-28 11:34:21,517] Trial 3 pruned. 


Trial 4 with params: {'learning_rate': 0.00010952662748632558, 'weight_decay': 0.001, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1967,1.240707,0.857929,0.722136,0.732543,0.726902
2,0.6487,0.763742,0.882676,0.882819,0.808424,0.827696


[I 2025-03-28 11:35:20,928] Trial 4 pruned. 


Trial 5 with params: {'learning_rate': 0.0002157696745589684, 'weight_decay': 0.002, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5721,0.815709,0.875344,0.735716,0.746965,0.740739
2,0.319,0.73811,0.887259,0.889549,0.831265,0.849069
3,0.1986,0.656571,0.890926,0.895073,0.842664,0.860547


[I 2025-03-28 11:36:58,266] Trial 5 pruned. 


Trial 6 with params: {'learning_rate': 0.00010769622478263136, 'weight_decay': 0.001, 'warmup_steps': 0, 'lambda_param': 1.0, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2061,1.254543,0.859762,0.723836,0.733876,0.728407
2,0.6602,0.768629,0.880843,0.872029,0.788816,0.804363


[I 2025-03-28 11:37:57,675] Trial 6 pruned. 


Trial 7 with params: {'learning_rate': 0.000236288641842364, 'weight_decay': 0.003, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4806,0.789329,0.877177,0.7358,0.748216,0.741609
2,0.2964,0.711221,0.888176,0.876352,0.831112,0.846168
3,0.1889,0.636599,0.892759,0.88346,0.843401,0.858051


[I 2025-03-28 11:39:27,214] Trial 7 pruned. 


Trial 8 with params: {'learning_rate': 1.6119044727609182e-05, 'weight_decay': 0.005, 'warmup_steps': 0, 'lambda_param': 1.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5624,3.400874,0.463795,0.236104,0.348987,0.279674
2,2.9162,2.712078,0.730522,0.636889,0.622679,0.622602


[I 2025-03-28 11:40:33,880] Trial 8 pruned. 


Trial 9 with params: {'learning_rate': 0.00013353819088790598, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0096,1.073325,0.862511,0.726548,0.736785,0.73131
2,0.5156,0.724685,0.887259,0.886605,0.812593,0.831544
3,0.2691,0.639508,0.900092,0.899713,0.840655,0.859028


[I 2025-03-28 11:42:03,801] Trial 9 pruned. 


Trial 10 with params: {'learning_rate': 0.0003740714100285732, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1773,0.727851,0.885426,0.889844,0.818382,0.839755
2,0.2348,0.727183,0.882676,0.874955,0.835667,0.849782
3,0.163,0.710652,0.88451,0.877174,0.835698,0.851057


[I 2025-03-28 11:43:41,336] Trial 10 pruned. 


Trial 11 with params: {'learning_rate': 9.196440396296317e-05, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3742,1.424998,0.852429,0.718393,0.726739,0.722071
2,0.8015,0.82077,0.87901,0.736124,0.750643,0.742759
3,0.3965,0.696067,0.892759,0.89282,0.825292,0.844476


[I 2025-03-28 11:45:16,522] Trial 11 pruned. 


Trial 12 with params: {'learning_rate': 0.0002853534364565168, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3621,0.740758,0.88176,0.906015,0.778943,0.793957
2,0.2641,0.755874,0.879927,0.873366,0.834008,0.847811
3,0.1793,0.648263,0.888176,0.893293,0.839253,0.858052


[I 2025-03-28 11:46:47,996] Trial 12 pruned. 


Trial 13 with params: {'learning_rate': 0.00010892978013873446, 'weight_decay': 0.001, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2061,1.247016,0.857929,0.722136,0.732543,0.726902
2,0.6536,0.763921,0.88176,0.878215,0.79862,0.816523
3,0.3258,0.650167,0.901925,0.89973,0.833231,0.852078


[I 2025-03-28 11:48:25,680] Trial 13 pruned. 


Trial 14 with params: {'learning_rate': 0.0003781217064086381, 'weight_decay': 0.007, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1628,0.740194,0.87626,0.879932,0.80128,0.822554
2,0.2344,0.708586,0.882676,0.864484,0.835683,0.846477
3,0.159,0.721523,0.877177,0.870997,0.830793,0.845021


[I 2025-03-28 11:49:58,639] Trial 14 pruned. 


Trial 15 with params: {'learning_rate': 0.00013240413524436405, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0176,1.0798,0.862511,0.726729,0.736785,0.731368
2,0.5206,0.724419,0.887259,0.886756,0.812593,0.831592
3,0.2709,0.639242,0.899175,0.899013,0.839657,0.858224
4,0.2068,0.666648,0.892759,0.897615,0.843581,0.862152
5,0.1712,0.646479,0.897342,0.900422,0.847755,0.865604


[I 2025-03-28 11:52:37,652] Trial 15 pruned. 


Trial 16 with params: {'learning_rate': 0.00021991508439980585, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5611,0.809811,0.875344,0.73565,0.746898,0.740731
2,0.3159,0.741587,0.882676,0.885226,0.827821,0.845085


[I 2025-03-28 11:53:39,556] Trial 16 pruned. 


Trial 17 with params: {'learning_rate': 0.0001142713235432526, 'weight_decay': 0.005, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1627,1.201543,0.859762,0.724077,0.733876,0.728539
2,0.6173,0.751489,0.886343,0.885433,0.811884,0.830692
3,0.3101,0.64352,0.900092,0.898033,0.831801,0.850548


[I 2025-03-28 11:55:16,148] Trial 17 pruned. 


Trial 18 with params: {'learning_rate': 0.00016916468903299534, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7872,0.923194,0.868011,0.730003,0.74061,0.734788
2,0.3962,0.73387,0.88176,0.884711,0.817752,0.83681
3,0.2281,0.641678,0.901925,0.90333,0.850559,0.868716
4,0.1793,0.677535,0.891842,0.881529,0.833065,0.850062
5,0.1482,0.67224,0.891842,0.883976,0.84215,0.857309


[I 2025-03-28 11:57:52,437] Trial 18 pruned. 


Trial 19 with params: {'learning_rate': 0.00037270043993891376, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.176,0.743707,0.882676,0.885184,0.806379,0.82775
2,0.2324,0.714311,0.887259,0.87911,0.839076,0.853509
3,0.1562,0.728497,0.88176,0.875493,0.832558,0.848537


[I 2025-03-28 11:59:24,518] Trial 19 pruned. 


Trial 20 with params: {'learning_rate': 7.596341819113778e-05, 'weight_decay': 0.008, 'warmup_steps': 0, 'lambda_param': 0.8, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5523,1.635606,0.839597,0.70899,0.717046,0.712376
2,0.9936,0.937624,0.873511,0.730975,0.745932,0.737987


[I 2025-03-28 12:00:28,706] Trial 20 pruned. 


Trial 21 with params: {'learning_rate': 7.262041187705736e-05, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.6068,1.696548,0.834097,0.705082,0.712706,0.708129


[I 2025-03-28 12:00:58,586] Trial 21 pruned. 


Trial 22 with params: {'learning_rate': 5.1893600283546746e-05, 'weight_decay': 0.008, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9037,2.102856,0.810266,0.688698,0.692398,0.689726
2,1.4523,1.286729,0.863428,0.722857,0.738844,0.730554


[I 2025-03-28 12:02:02,555] Trial 22 pruned. 


Trial 23 with params: {'learning_rate': 5.7933901797472217e-05, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8108,1.96884,0.814849,0.691687,0.69678,0.693586


[I 2025-03-28 12:02:35,391] Trial 23 pruned. 


Trial 24 with params: {'learning_rate': 9.66749103222849e-05, 'weight_decay': 0.008, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3207,1.368877,0.853346,0.718909,0.727754,0.722882
2,0.7539,0.797169,0.88176,0.905492,0.780338,0.794342


[I 2025-03-28 12:03:35,443] Trial 24 pruned. 


Trial 25 with params: {'learning_rate': 0.00023681744529668657, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.502,0.793296,0.877177,0.737051,0.747981,0.741939
2,0.2985,0.733673,0.890926,0.893301,0.833998,0.852303
3,0.1911,0.649683,0.890926,0.882499,0.842138,0.856864
4,0.1532,0.707643,0.889093,0.882604,0.83863,0.855267
5,0.128,0.68183,0.887259,0.880196,0.837227,0.853371


[I 2025-03-28 12:06:14,160] Trial 25 pruned. 


Trial 26 with params: {'learning_rate': 0.0001753313363349859, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7584,0.90572,0.865261,0.728212,0.738249,0.732719
2,0.3824,0.732053,0.886343,0.88789,0.821708,0.840431
3,0.2223,0.653721,0.900092,0.90221,0.849107,0.867426
4,0.1754,0.687492,0.886343,0.880265,0.837554,0.85358
5,0.1447,0.680808,0.885426,0.879193,0.83613,0.852073


[I 2025-03-28 12:08:51,802] Trial 26 pruned. 


Trial 27 with params: {'learning_rate': 8.362954082151853e-05, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.4711,1.532261,0.846929,0.713743,0.722482,0.717596


[I 2025-03-28 12:09:25,042] Trial 27 pruned. 


Trial 28 with params: {'learning_rate': 0.000343422476232343, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.237,0.736662,0.887259,0.891011,0.819179,0.840801
2,0.2416,0.743146,0.885426,0.890463,0.837651,0.855541
3,0.1692,0.702445,0.883593,0.877025,0.834794,0.850451


[I 2025-03-28 12:10:59,429] Trial 28 pruned. 


Trial 29 with params: {'learning_rate': 5.9801429014075965e-05, 'weight_decay': 0.005, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7825,1.930053,0.818515,0.694399,0.699496,0.696266
2,1.2781,1.145696,0.868928,0.726817,0.742784,0.734421


[I 2025-03-28 12:12:04,219] Trial 29 pruned. 


Trial 30 with params: {'learning_rate': 0.00030713498902625464, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.1, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3079,0.727224,0.88451,0.887109,0.817008,0.837823
2,0.2551,0.757271,0.88451,0.888828,0.837493,0.854575
3,0.1739,0.66178,0.887259,0.881267,0.837584,0.854185


[I 2025-03-28 12:13:35,304] Trial 30 pruned. 


Trial 31 with params: {'learning_rate': 0.00027492022436761325, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3895,0.75253,0.873511,0.900924,0.753988,0.757153
2,0.2697,0.752315,0.88451,0.889821,0.837895,0.855262
3,0.1813,0.669444,0.890009,0.881966,0.840131,0.855744
4,0.1445,0.711733,0.885426,0.876775,0.837472,0.851619
5,0.1201,0.687486,0.886343,0.87919,0.837452,0.852906


[I 2025-03-28 12:16:12,623] Trial 31 pruned. 


Trial 32 with params: {'learning_rate': 0.00016681359349865035, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8005,0.930609,0.869844,0.731264,0.742041,0.736159
2,0.4023,0.732849,0.88176,0.884711,0.817752,0.83681
3,0.2302,0.641555,0.901925,0.90333,0.850559,0.868716
4,0.1809,0.675091,0.889093,0.879573,0.830787,0.847935
5,0.1492,0.672559,0.892759,0.884367,0.842865,0.857874
6,0.1297,0.696264,0.877177,0.874753,0.829145,0.846229
7,0.1168,0.722192,0.879927,0.86394,0.832092,0.844155
8,0.1099,0.67839,0.885426,0.868759,0.83506,0.848575
9,0.1028,0.695834,0.88176,0.874909,0.833294,0.848717
10,0.0955,0.692497,0.88451,0.878933,0.83453,0.851168


[I 2025-03-28 12:21:31,339] Trial 32 pruned. 


Trial 33 with params: {'learning_rate': 0.00011605193236133345, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1441,1.187626,0.861595,0.725397,0.735474,0.729997


[I 2025-03-28 12:22:05,859] Trial 33 pruned. 


Trial 34 with params: {'learning_rate': 4.6920384774303475e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.9896,2.230602,0.7956,0.678179,0.678803,0.677071
2,1.5819,1.396433,0.857012,0.718349,0.733858,0.725767


[I 2025-03-28 12:23:06,028] Trial 34 pruned. 


Trial 35 with params: {'learning_rate': 0.00014906729150440562, 'weight_decay': 0.008, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9023,0.997014,0.866178,0.728486,0.739312,0.733534
2,0.456,0.726079,0.885426,0.887802,0.820365,0.839752


[I 2025-03-28 12:24:10,668] Trial 35 pruned. 


Trial 36 with params: {'learning_rate': 0.0002220324458974667, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5484,0.806886,0.874427,0.735057,0.745967,0.739936
2,0.3126,0.740345,0.882676,0.885612,0.827807,0.845297
3,0.1952,0.665963,0.888176,0.878588,0.840054,0.853601


[I 2025-03-28 12:25:44,882] Trial 36 pruned. 


Trial 37 with params: {'learning_rate': 1.0728159166824396e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6549,3.627757,0.422548,0.256039,0.31756,0.264605


[I 2025-03-28 12:26:15,634] Trial 37 pruned. 


Trial 38 with params: {'learning_rate': 0.00017980683664008923, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7308,0.89236,0.865261,0.728279,0.738531,0.732836
2,0.3728,0.739416,0.886343,0.889197,0.830762,0.848633
3,0.2186,0.656516,0.898258,0.900197,0.847558,0.865609
4,0.1736,0.692963,0.888176,0.881112,0.839095,0.85471
5,0.1428,0.674459,0.891842,0.883807,0.841385,0.857128
6,0.1241,0.71192,0.871677,0.869414,0.824187,0.841162
7,0.1123,0.751336,0.874427,0.858728,0.827227,0.83928
8,0.106,0.691465,0.88176,0.865303,0.831965,0.8453
9,0.1002,0.708259,0.87901,0.873134,0.831479,0.846896
10,0.0925,0.709745,0.88176,0.866219,0.83242,0.845646


[I 2025-03-28 12:31:36,132] Trial 38 pruned. 


Trial 39 with params: {'learning_rate': 0.000235846602217156, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1, 'lambda_param': 0.8, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4947,0.792963,0.877177,0.736425,0.747994,0.741707
2,0.2982,0.734334,0.888176,0.889961,0.831945,0.849562
3,0.1909,0.643816,0.894592,0.885228,0.84492,0.85966
4,0.1517,0.694998,0.879927,0.874031,0.832442,0.847905
5,0.1268,0.69803,0.885426,0.868204,0.83681,0.848948


[I 2025-03-28 12:34:17,058] Trial 39 pruned. 


Trial 40 with params: {'learning_rate': 0.0002586417015808675, 'weight_decay': 0.007, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4234,0.778098,0.872594,0.733885,0.744132,0.738405
2,0.2799,0.719972,0.889093,0.878092,0.831884,0.847418


[I 2025-03-28 12:35:20,990] Trial 40 pruned. 


Trial 41 with params: {'learning_rate': 0.00012478932967710795, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0741,1.126772,0.863428,0.727308,0.737466,0.731997
2,0.5572,0.729124,0.887259,0.886999,0.812523,0.831657
3,0.2848,0.641604,0.897342,0.897555,0.838342,0.856823


[I 2025-03-28 12:36:56,801] Trial 41 pruned. 


Trial 42 with params: {'learning_rate': 0.00013478139707972256, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0009,1.066502,0.862511,0.726548,0.736785,0.73131


[I 2025-03-28 12:37:28,366] Trial 42 pruned. 


Trial 43 with params: {'learning_rate': 0.0003598808907708501, 'weight_decay': 0.004, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1957,0.746072,0.878093,0.881343,0.803258,0.824261
2,0.234,0.749033,0.885426,0.878542,0.837772,0.85266
3,0.1625,0.735603,0.871677,0.867633,0.8258,0.840912


[I 2025-03-28 12:39:02,624] Trial 43 pruned. 


Trial 44 with params: {'learning_rate': 9.618899449630941e-05, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3295,1.375712,0.853346,0.718821,0.727719,0.722842


[I 2025-03-28 12:39:36,979] Trial 44 pruned. 


Trial 45 with params: {'learning_rate': 0.00043371026242218253, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0956,0.768706,0.874427,0.881439,0.808992,0.830925
2,0.2226,0.722314,0.88451,0.877723,0.837458,0.852003
3,0.159,0.771256,0.871677,0.869095,0.825483,0.841372


[I 2025-03-28 12:41:08,570] Trial 45 pruned. 


Trial 46 with params: {'learning_rate': 0.00023539427682875763, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5106,0.794187,0.87626,0.736651,0.7473,0.741344
2,0.3001,0.739136,0.887259,0.889885,0.831224,0.849039
3,0.1916,0.67265,0.887259,0.878068,0.839286,0.852845


[I 2025-03-28 12:42:45,853] Trial 46 pruned. 


Trial 47 with params: {'learning_rate': 0.00018760541323443894, 'weight_decay': 0.008, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6877,0.872749,0.869844,0.731992,0.74284,0.736693
2,0.3594,0.760609,0.87901,0.883522,0.825439,0.842778
3,0.2148,0.654752,0.894592,0.897385,0.844891,0.862779
4,0.1705,0.688789,0.889093,0.882335,0.840493,0.856067
5,0.1401,0.675442,0.88451,0.878916,0.835332,0.851689


[I 2025-03-28 12:45:24,462] Trial 47 pruned. 


Trial 48 with params: {'learning_rate': 0.0003010042867185686, 'weight_decay': 0.005, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2975,0.742986,0.885426,0.909165,0.791183,0.810277
2,0.2559,0.743412,0.885426,0.878508,0.838324,0.852792
3,0.1718,0.688733,0.88451,0.875035,0.837164,0.850125


[I 2025-03-28 12:47:03,118] Trial 48 pruned. 


Trial 49 with params: {'learning_rate': 0.00010294599463819358, 'weight_decay': 0.004, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2582,1.302776,0.855179,0.720359,0.729418,0.724391


[I 2025-03-28 12:47:33,847] Trial 49 pruned. 


Trial 50 with params: {'learning_rate': 0.00015744525062901543, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8557,0.963877,0.868011,0.729986,0.740645,0.734823
2,0.4296,0.724068,0.88451,0.887216,0.819766,0.839117


[I 2025-03-28 12:48:34,915] Trial 50 pruned. 


Trial 51 with params: {'learning_rate': 0.00026351727656108735, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.415,0.767602,0.87626,0.736529,0.74709,0.741286
2,0.2774,0.702981,0.890926,0.894881,0.84281,0.860424
3,0.1793,0.661129,0.889093,0.882237,0.83901,0.855373


[I 2025-03-28 12:50:13,153] Trial 51 pruned. 


Trial 52 with params: {'learning_rate': 0.00019304903438059422, 'weight_decay': 0.004, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6715,0.859309,0.868011,0.730635,0.74096,0.735198
2,0.3516,0.761645,0.87901,0.883471,0.82525,0.842716


[I 2025-03-28 12:51:18,259] Trial 52 pruned. 


Trial 53 with params: {'learning_rate': 0.00024698686592202265, 'weight_decay': 0.007, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4698,0.783069,0.875344,0.735201,0.746626,0.740441
2,0.2906,0.732508,0.887259,0.889835,0.831189,0.849186
3,0.1874,0.664124,0.890009,0.88111,0.84148,0.855681


[I 2025-03-28 12:52:56,194] Trial 53 pruned. 


Trial 54 with params: {'learning_rate': 2.2869967933363696e-05, 'weight_decay': 0.007, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4426,3.098711,0.603116,0.588554,0.488697,0.488296


[I 2025-03-28 12:53:26,518] Trial 54 pruned. 


Trial 55 with params: {'learning_rate': 5.7799966463908516e-05, 'weight_decay': 0.005, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.8171,1.973393,0.815765,0.692143,0.697213,0.694066
2,1.3215,1.180357,0.866178,0.72474,0.740439,0.732281


[I 2025-03-28 12:54:30,444] Trial 55 pruned. 


Trial 56 with params: {'learning_rate': 0.0002850907928358517, 'weight_decay': 0.01, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3628,0.741041,0.88176,0.906015,0.778943,0.793957
2,0.2642,0.753085,0.88176,0.887734,0.835721,0.852978
3,0.1793,0.647305,0.885426,0.890989,0.837351,0.85578
4,0.142,0.710418,0.887259,0.892684,0.839116,0.857558
5,0.1179,0.735881,0.882676,0.877021,0.833777,0.849049


[I 2025-03-28 12:57:10,471] Trial 56 pruned. 


Trial 57 with params: {'learning_rate': 2.5913119493443803e-05, 'weight_decay': 0.001, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3821,2.959979,0.664528,0.615596,0.553186,0.562724
2,2.3692,2.121408,0.813016,0.686615,0.69636,0.690649


[I 2025-03-28 12:58:15,137] Trial 57 pruned. 


Trial 58 with params: {'learning_rate': 0.00022931781750740578, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.531,0.799059,0.875344,0.735851,0.746634,0.74064
2,0.3056,0.738386,0.886343,0.889193,0.830307,0.848404
3,0.1936,0.655056,0.889093,0.89336,0.841052,0.858658
4,0.1534,0.722126,0.882676,0.875904,0.834253,0.849543
5,0.1291,0.671204,0.890009,0.881851,0.8405,0.855877


[I 2025-03-28 13:00:56,211] Trial 58 pruned. 


Trial 59 with params: {'learning_rate': 0.00014790027689514903, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.922,1.004948,0.866178,0.728659,0.739263,0.733559
2,0.4621,0.725384,0.88451,0.887642,0.819602,0.839148


[I 2025-03-28 13:01:56,950] Trial 59 pruned. 


Trial 60 with params: {'learning_rate': 2.9068676100418608e-05, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 1.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.3165,2.823639,0.703941,0.632872,0.594888,0.60197
2,2.2156,1.970862,0.827681,0.69755,0.709466,0.703151


[I 2025-03-28 13:03:06,247] Trial 60 pruned. 


Trial 61 with params: {'learning_rate': 0.00023516012912734582, 'weight_decay': 0.007, 'warmup_steps': 4, 'lambda_param': 0.4, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5113,0.794434,0.875344,0.735851,0.746634,0.74064
2,0.3005,0.738633,0.885426,0.88818,0.829758,0.84747
3,0.1918,0.668121,0.889093,0.879874,0.840646,0.854425


[I 2025-03-28 13:04:37,134] Trial 61 pruned. 


Trial 62 with params: {'learning_rate': 0.0002073395053896816, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6057,0.829261,0.874427,0.735575,0.74625,0.740275
2,0.3306,0.741331,0.88451,0.887147,0.829119,0.846777
3,0.2018,0.658405,0.894592,0.897225,0.845244,0.862812
4,0.1608,0.69594,0.889093,0.881706,0.839329,0.855105
5,0.1348,0.682157,0.887259,0.880306,0.837906,0.853681


[I 2025-03-28 13:07:15,950] Trial 62 pruned. 


Trial 63 with params: {'learning_rate': 9.62200245563008e-05, 'weight_decay': 0.005, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3292,1.375421,0.853346,0.718821,0.727719,0.722842


[I 2025-03-28 13:07:53,614] Trial 63 pruned. 


Trial 64 with params: {'learning_rate': 2.311211641086512e-05, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 0.2, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4327,3.080495,0.60495,0.589812,0.491307,0.492598
2,2.5111,2.263888,0.79835,0.676079,0.683399,0.678517


[I 2025-03-28 13:08:55,791] Trial 64 pruned. 


Trial 65 with params: {'learning_rate': 0.00015041103307543853, 'weight_decay': 0.006, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8938,0.991173,0.867094,0.729282,0.739979,0.734237
2,0.4515,0.725888,0.887259,0.889189,0.821747,0.841142


[I 2025-03-28 13:09:58,624] Trial 65 pruned. 


Trial 66 with params: {'learning_rate': 0.0001689961352690977, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7882,0.923826,0.868928,0.730793,0.741277,0.735486
2,0.3966,0.733795,0.88176,0.884711,0.817752,0.83681
3,0.2282,0.641807,0.901925,0.90333,0.850559,0.868716
4,0.1794,0.67751,0.892759,0.885156,0.842869,0.85874
5,0.1483,0.673388,0.891842,0.883976,0.84215,0.857309
6,0.1288,0.696009,0.875344,0.872332,0.827431,0.844205
7,0.1159,0.724835,0.88176,0.865465,0.833819,0.845809
8,0.1089,0.685373,0.88451,0.878766,0.834032,0.851007
9,0.1024,0.70165,0.88176,0.875156,0.833509,0.848969
10,0.0949,0.697245,0.883593,0.86758,0.833484,0.846935


[I 2025-03-28 13:15:18,891] Trial 66 pruned. 


Trial 67 with params: {'learning_rate': 0.00023866784754810727, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.5, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5,0.791244,0.875344,0.735698,0.746634,0.74059
2,0.2972,0.739326,0.887259,0.88982,0.831189,0.849114
3,0.1908,0.67263,0.88451,0.876593,0.837312,0.851432


[I 2025-03-28 13:16:54,049] Trial 67 pruned. 


Trial 68 with params: {'learning_rate': 0.0002480522678885345, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.461,0.783404,0.874427,0.73458,0.746126,0.739899
2,0.2901,0.730205,0.890926,0.893401,0.833799,0.852203
3,0.1875,0.64226,0.893676,0.885365,0.843831,0.859043
4,0.1501,0.710727,0.87626,0.871859,0.828875,0.844716
5,0.1257,0.688429,0.890009,0.882864,0.84066,0.856245
6,0.11,0.702511,0.880843,0.863625,0.832607,0.844811
7,0.1003,0.765784,0.87626,0.859192,0.829451,0.840586
8,0.0939,0.689336,0.885426,0.867095,0.835863,0.848091
9,0.0895,0.731794,0.878093,0.860451,0.830926,0.8421
10,0.0824,0.717635,0.882676,0.864967,0.833015,0.845478


[I 2025-03-28 13:22:14,395] Trial 68 pruned. 


Trial 69 with params: {'learning_rate': 0.000333095880962392, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2505,0.73245,0.886343,0.887344,0.809437,0.830426
2,0.2442,0.723459,0.882676,0.888297,0.83541,0.853504
3,0.1711,0.682352,0.888176,0.880591,0.839334,0.854495


[I 2025-03-28 13:23:50,621] Trial 69 pruned. 


Trial 70 with params: {'learning_rate': 0.0002061158689026567, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6107,0.831493,0.874427,0.735575,0.74625,0.740275
2,0.3324,0.743518,0.88451,0.88706,0.829154,0.846697
3,0.2024,0.655049,0.895509,0.897955,0.845959,0.863512
4,0.1618,0.690986,0.887259,0.880596,0.837658,0.8538
5,0.135,0.673376,0.890009,0.882317,0.839787,0.855707


[I 2025-03-28 13:26:28,675] Trial 70 pruned. 


Trial 71 with params: {'learning_rate': 0.0003720717424063221, 'weight_decay': 0.007, 'warmup_steps': 1, 'lambda_param': 0.30000000000000004, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1729,0.727899,0.879927,0.882538,0.80471,0.825736
2,0.2335,0.697329,0.888176,0.880502,0.840444,0.85504
3,0.1609,0.710658,0.877177,0.885525,0.83105,0.849234


[I 2025-03-28 13:28:05,623] Trial 71 pruned. 


Trial 72 with params: {'learning_rate': 0.00013102153800851205, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0275,1.087909,0.862511,0.726729,0.736785,0.731368
2,0.5268,0.724227,0.886343,0.88615,0.811878,0.830897


[I 2025-03-28 13:29:07,170] Trial 72 pruned. 


Trial 73 with params: {'learning_rate': 0.0004567498133669025, 'weight_decay': 0.01, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0556,0.76805,0.873511,0.88092,0.808249,0.830085
2,0.2187,0.70807,0.88451,0.878633,0.83649,0.852243
3,0.1593,0.753703,0.875344,0.860362,0.828139,0.840601


[I 2025-03-28 13:30:46,602] Trial 73 pruned. 


Trial 74 with params: {'learning_rate': 0.0001418498919907416, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9532,1.030997,0.864345,0.727157,0.73793,0.732179
2,0.4825,0.724948,0.882676,0.886322,0.817955,0.837661
3,0.2568,0.641094,0.900092,0.899625,0.840641,0.858934
4,0.1989,0.667806,0.893676,0.898292,0.844475,0.862949
5,0.1651,0.655233,0.895509,0.898644,0.846087,0.86386
6,0.1433,0.681969,0.880843,0.876464,0.833062,0.849162
7,0.1285,0.705793,0.882676,0.876608,0.834808,0.850059
8,0.1207,0.659241,0.888176,0.881278,0.83689,0.853719
9,0.1115,0.678292,0.887259,0.879677,0.837774,0.85341
10,0.1037,0.681384,0.886343,0.869779,0.83578,0.848942


[I 2025-03-28 13:35:59,971] Trial 74 pruned. 


Trial 75 with params: {'learning_rate': 0.00014856613635179827, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9102,1.000122,0.867094,0.729282,0.739979,0.734237
2,0.4585,0.725345,0.885426,0.888101,0.820317,0.839795


[I 2025-03-28 13:37:01,809] Trial 75 pruned. 


Trial 76 with params: {'learning_rate': 0.00017733910066847252, 'weight_decay': 0.01, 'warmup_steps': 4, 'lambda_param': 0.2, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7525,0.900482,0.865261,0.728036,0.738297,0.732703
2,0.3788,0.731179,0.887259,0.888229,0.822424,0.840965
3,0.2211,0.64964,0.901008,0.902556,0.849871,0.867951
4,0.1745,0.685065,0.890926,0.883415,0.841729,0.857295
5,0.1442,0.686799,0.889093,0.882199,0.840122,0.855379


[I 2025-03-28 13:39:42,423] Trial 76 pruned. 


Trial 77 with params: {'learning_rate': 0.0003025378851694067, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.4, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3193,0.735287,0.880843,0.881457,0.80511,0.825492
2,0.258,0.745362,0.880843,0.885814,0.834729,0.851627
3,0.1751,0.662344,0.894592,0.886332,0.843452,0.859612
4,0.1402,0.730243,0.878093,0.885135,0.831489,0.85008
5,0.1168,0.755691,0.879927,0.888144,0.83248,0.850871


[I 2025-03-28 13:42:22,012] Trial 77 pruned. 


Trial 78 with params: {'learning_rate': 6.210843826280204e-05, 'weight_decay': 0.006, 'warmup_steps': 0, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.7418,1.880383,0.819432,0.69462,0.700925,0.697202


[I 2025-03-28 13:42:56,349] Trial 78 pruned. 


Trial 79 with params: {'learning_rate': 0.00010570895260099929, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2358,1.277034,0.856095,0.721124,0.730416,0.725228
2,0.6778,0.769305,0.88176,0.905594,0.789531,0.807711


[I 2025-03-28 13:43:58,265] Trial 79 pruned. 


Trial 80 with params: {'learning_rate': 4.57424582908879e-05, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.5, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.0103,2.26331,0.796517,0.679715,0.679187,0.67795


[I 2025-03-28 13:44:28,238] Trial 80 pruned. 


Trial 81 with params: {'learning_rate': 0.00016039525453366506, 'weight_decay': 0.008, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8327,0.951968,0.870761,0.731942,0.742756,0.736853
2,0.42,0.727294,0.885426,0.887724,0.82023,0.839665
3,0.2354,0.64989,0.898258,0.899559,0.847491,0.865179
4,0.1846,0.672205,0.891842,0.897613,0.842887,0.862053
5,0.1523,0.663373,0.892759,0.883988,0.843415,0.857991
6,0.1329,0.685576,0.88176,0.877531,0.83277,0.849428
7,0.1191,0.696869,0.88176,0.864584,0.83282,0.845086
8,0.1127,0.667938,0.886343,0.879398,0.835574,0.852119
9,0.1046,0.687935,0.885426,0.867,0.836255,0.848332
10,0.0974,0.680921,0.88451,0.869259,0.834088,0.847877


[I 2025-03-28 13:49:50,356] Trial 81 pruned. 


Trial 82 with params: {'learning_rate': 0.00011180991757143405, 'weight_decay': 0.009000000000000001, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1692,1.218109,0.859762,0.723914,0.733876,0.728436


[I 2025-03-28 13:50:25,582] Trial 82 pruned. 


Trial 83 with params: {'learning_rate': 0.00027919465779551687, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3722,0.754778,0.872594,0.900402,0.753175,0.756529
2,0.2679,0.750719,0.885426,0.890064,0.838248,0.855444
3,0.1798,0.644504,0.890009,0.88326,0.839924,0.856347


[I 2025-03-28 13:52:00,103] Trial 83 pruned. 


Trial 84 with params: {'learning_rate': 1.0484571408639922e-05, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6606,3.638675,0.418882,0.258417,0.314749,0.262153


[I 2025-03-28 13:52:30,833] Trial 84 pruned. 


Trial 85 with params: {'learning_rate': 0.00019378139842687124, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6641,0.857406,0.868928,0.731435,0.741627,0.735898
2,0.3503,0.769719,0.877177,0.881686,0.824035,0.841258
3,0.2112,0.657549,0.892759,0.89608,0.843765,0.861626
4,0.1673,0.687216,0.889093,0.882678,0.84021,0.856148
5,0.1384,0.676497,0.892759,0.884651,0.842686,0.858276
6,0.1198,0.697457,0.878093,0.863487,0.830088,0.843348
7,0.1084,0.756051,0.874427,0.860042,0.826963,0.839675
8,0.1021,0.693666,0.882676,0.866375,0.832349,0.846015
9,0.0968,0.701457,0.88176,0.863716,0.833941,0.845393
10,0.0895,0.706347,0.880843,0.875299,0.832198,0.848081


[I 2025-03-28 13:57:50,661] Trial 85 pruned. 


Trial 86 with params: {'learning_rate': 0.00013942238008437586, 'weight_decay': 0.007, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9646,1.041714,0.862511,0.725993,0.736534,0.730863


[I 2025-03-28 13:58:26,084] Trial 86 pruned. 


Trial 87 with params: {'learning_rate': 0.0003261411104184409, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2606,0.735181,0.88451,0.887806,0.817211,0.83817
2,0.2456,0.73972,0.88451,0.889718,0.83748,0.85503
3,0.1701,0.703084,0.878093,0.873978,0.831014,0.846825


[I 2025-03-28 13:59:56,141] Trial 87 pruned. 


Trial 88 with params: {'learning_rate': 0.00012733803933693, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0547,1.110527,0.863428,0.727308,0.737466,0.731997
2,0.5444,0.72627,0.887259,0.886814,0.812774,0.831637


[I 2025-03-28 14:01:04,607] Trial 88 pruned. 


Trial 89 with params: {'learning_rate': 0.00018699603023442728, 'weight_decay': 0.005, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6955,0.873845,0.868928,0.731441,0.741842,0.736013
2,0.3604,0.755115,0.880843,0.885497,0.826583,0.844381


[I 2025-03-28 14:02:04,686] Trial 89 pruned. 


Trial 90 with params: {'learning_rate': 0.00025645804260917985, 'weight_decay': 0.005, 'warmup_steps': 1, 'lambda_param': 0.1, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.43,0.780815,0.874427,0.734922,0.745708,0.739772
2,0.2813,0.711941,0.893676,0.895297,0.835492,0.854146
3,0.1828,0.654654,0.890926,0.882753,0.841402,0.85668
4,0.148,0.723057,0.877177,0.872897,0.829838,0.84549
5,0.124,0.721093,0.88176,0.876818,0.832869,0.849002


[I 2025-03-28 14:04:44,671] Trial 90 pruned. 


Trial 91 with params: {'learning_rate': 0.000177446554432553, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7429,0.89888,0.865261,0.728279,0.738531,0.732836
2,0.3774,0.735777,0.887259,0.889839,0.831477,0.849334
3,0.2204,0.656256,0.898258,0.90051,0.84776,0.865898
4,0.1742,0.690739,0.887259,0.881376,0.837956,0.854284
5,0.1441,0.671089,0.891842,0.883462,0.841817,0.857184


[I 2025-03-28 14:07:24,460] Trial 91 pruned. 


Trial 92 with params: {'learning_rate': 0.0001374165732801677, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9828,1.052831,0.862511,0.726548,0.736785,0.73131


[I 2025-03-28 14:07:57,178] Trial 92 pruned. 


Trial 93 with params: {'learning_rate': 0.00026197866171262226, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4194,0.769084,0.877177,0.737042,0.747736,0.741906
2,0.2788,0.706333,0.893676,0.89684,0.844894,0.862405
3,0.1803,0.655668,0.888176,0.881613,0.838405,0.854711


[I 2025-03-28 14:09:33,023] Trial 93 pruned. 


Trial 94 with params: {'learning_rate': 9.132655522948801e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3812,1.432742,0.852429,0.718393,0.726739,0.722071


[I 2025-03-28 14:10:03,672] Trial 94 pruned. 


Trial 95 with params: {'learning_rate': 0.0001026742208447073, 'weight_decay': 0.009000000000000001, 'warmup_steps': 4, 'lambda_param': 0.7000000000000001, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2721,1.310246,0.854262,0.719695,0.729035,0.723848
2,0.7033,0.775736,0.882676,0.906573,0.781074,0.795299


[I 2025-03-28 14:11:13,372] Trial 95 pruned. 


Trial 96 with params: {'learning_rate': 1.0675005523304308e-05, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.30000000000000004, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6581,3.631625,0.422548,0.257273,0.317533,0.264613


[I 2025-03-28 14:11:44,261] Trial 96 pruned. 


Trial 97 with params: {'learning_rate': 7.328860102245757e-05, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.5953,1.684294,0.836847,0.707267,0.714719,0.710273
2,1.0385,0.968099,0.873511,0.730866,0.745649,0.737867


[I 2025-03-28 14:12:44,644] Trial 97 pruned. 


Trial 98 with params: {'learning_rate': 2.025662008519137e-05, 'weight_decay': 0.008, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4914,3.215491,0.538955,0.580016,0.423252,0.401646
2,2.6723,2.432163,0.773602,0.660538,0.6613,0.658377


[I 2025-03-28 14:13:52,636] Trial 98 pruned. 


Trial 99 with params: {'learning_rate': 0.00020418597237509528, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 0.6000000000000001, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6019,0.83058,0.874427,0.734664,0.746263,0.739881
2,0.3329,0.737572,0.88451,0.886865,0.829251,0.846617


[I 2025-03-28 14:14:53,425] Trial 99 pruned. 


Trial 100 with params: {'learning_rate': 0.0004540061556428691, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0674,0.786371,0.87626,0.884163,0.810787,0.832672
2,0.2191,0.734173,0.878093,0.874497,0.830959,0.847271
3,0.1555,0.748369,0.875344,0.860374,0.829087,0.841166


[I 2025-03-28 14:16:33,090] Trial 100 pruned. 


Trial 101 with params: {'learning_rate': 0.00012385080025508166, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.8, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.085,1.132839,0.864345,0.727935,0.738097,0.732633


[I 2025-03-28 14:17:02,770] Trial 101 pruned. 


Trial 102 with params: {'learning_rate': 0.0002969042768317243, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.327,0.740593,0.880843,0.88266,0.80518,0.826095
2,0.26,0.73956,0.883593,0.889122,0.837042,0.854482
3,0.1762,0.65187,0.891842,0.897308,0.841716,0.861342
4,0.1401,0.740938,0.883593,0.87662,0.835248,0.850626
5,0.1175,0.684453,0.891842,0.884632,0.841504,0.857518
6,0.105,0.718716,0.885426,0.866087,0.836593,0.847805
7,0.095,0.773632,0.87901,0.862909,0.830852,0.843198
8,0.0889,0.742365,0.879927,0.861941,0.832471,0.843607
9,0.0842,0.756657,0.878093,0.859685,0.83104,0.841766
10,0.0779,0.752514,0.879927,0.873016,0.831995,0.846954


[I 2025-03-28 14:22:19,288] Trial 102 pruned. 


Trial 103 with params: {'learning_rate': 0.00034773261125749537, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2178,0.744142,0.879927,0.882393,0.804139,0.825157
2,0.2382,0.725182,0.883593,0.875623,0.836258,0.850347
3,0.1654,0.742925,0.874427,0.870113,0.827822,0.843294


[I 2025-03-28 14:23:57,759] Trial 103 pruned. 


Trial 104 with params: {'learning_rate': 0.00045570924262504944, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0654,0.795307,0.872594,0.881658,0.807511,0.829743
2,0.2175,0.749898,0.880843,0.888505,0.844268,0.859651
3,0.1578,0.775088,0.863428,0.84107,0.819441,0.827494


[I 2025-03-28 14:25:32,353] Trial 104 pruned. 


Trial 105 with params: {'learning_rate': 0.00016896026180779306, 'weight_decay': 0.002, 'warmup_steps': 4, 'lambda_param': 1.0, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7973,0.925483,0.868928,0.730578,0.741075,0.735394
2,0.3978,0.733349,0.885426,0.887436,0.820745,0.839738


[I 2025-03-28 14:26:37,055] Trial 105 pruned. 


Trial 106 with params: {'learning_rate': 0.0001824191165136387, 'weight_decay': 0.01, 'warmup_steps': 1, 'lambda_param': 0.2, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7127,0.885688,0.867094,0.730097,0.740412,0.734552
2,0.3678,0.746506,0.886343,0.889191,0.830867,0.848645
3,0.2175,0.657078,0.898258,0.900441,0.847593,0.865757
4,0.1726,0.688147,0.890926,0.896736,0.842026,0.861205
5,0.1418,0.670254,0.887259,0.869727,0.837359,0.850185


[I 2025-03-28 14:29:15,955] Trial 106 pruned. 


Trial 107 with params: {'learning_rate': 0.0002850420863174203, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3568,0.750551,0.877177,0.903189,0.775123,0.790682
2,0.2655,0.752017,0.883593,0.876391,0.836322,0.85053
3,0.1797,0.662716,0.887259,0.880983,0.837514,0.853953


[I 2025-03-28 14:30:53,787] Trial 107 pruned. 


Trial 108 with params: {'learning_rate': 0.0002521998191737194, 'weight_decay': 0.005, 'warmup_steps': 1, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4427,0.784497,0.875344,0.735487,0.74664,0.74055
2,0.2851,0.719256,0.892759,0.896422,0.844151,0.861833
3,0.1859,0.649778,0.891842,0.882432,0.842612,0.856994
4,0.1494,0.703319,0.88451,0.889666,0.836648,0.854932
5,0.1245,0.727563,0.880843,0.887786,0.832098,0.851438


[I 2025-03-28 14:33:33,788] Trial 108 pruned. 


Trial 109 with params: {'learning_rate': 0.00011365832523607101, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.9, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1644,1.205935,0.859762,0.724077,0.733876,0.728539
2,0.621,0.754057,0.88451,0.8839,0.810551,0.829311


[I 2025-03-28 14:34:32,156] Trial 109 pruned. 


Trial 110 with params: {'learning_rate': 0.00010411015351181881, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.247,1.291297,0.856095,0.721124,0.730416,0.725228
2,0.6897,0.7734,0.882676,0.906274,0.790212,0.808384


[I 2025-03-28 14:35:39,938] Trial 110 pruned. 


Trial 111 with params: {'learning_rate': 0.0001896764680585084, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.30000000000000004, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6828,0.867311,0.869844,0.732015,0.742889,0.73682
2,0.3562,0.759991,0.879927,0.884519,0.825987,0.843615


[I 2025-03-28 14:36:42,027] Trial 111 pruned. 


Trial 112 with params: {'learning_rate': 0.00018005515625529216, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7295,0.891596,0.865261,0.728279,0.738531,0.732836
2,0.3723,0.739536,0.886343,0.889197,0.830762,0.848633
3,0.2184,0.656768,0.897342,0.899535,0.846891,0.864933
4,0.1735,0.694039,0.888176,0.881112,0.839095,0.85471
5,0.1427,0.673797,0.890009,0.882244,0.839707,0.855578


[I 2025-03-28 14:39:21,357] Trial 112 pruned. 


Trial 113 with params: {'learning_rate': 9.919351336125201e-05, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.0, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.299,1.342832,0.854262,0.719598,0.72842,0.723553


[I 2025-03-28 14:39:51,726] Trial 113 pruned. 


Trial 114 with params: {'learning_rate': 0.00043306163705016814, 'weight_decay': 0.009000000000000001, 'warmup_steps': 4, 'lambda_param': 0.30000000000000004, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.101,0.779356,0.880843,0.887547,0.813887,0.836135
2,0.2232,0.749099,0.889093,0.870559,0.841213,0.852203
3,0.1565,0.7867,0.870761,0.855705,0.8247,0.836172


[I 2025-03-28 14:41:30,426] Trial 114 pruned. 


Trial 115 with params: {'learning_rate': 0.00015605814032519834, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1, 'lambda_param': 0.8, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.859,0.968462,0.868928,0.730429,0.741326,0.735428
2,0.4332,0.726539,0.885426,0.887839,0.820481,0.839809


[I 2025-03-28 14:42:32,499] Trial 115 pruned. 


Trial 116 with params: {'learning_rate': 0.00019678204048657536, 'weight_decay': 0.005, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6549,0.850728,0.872594,0.734359,0.744488,0.738799
2,0.3461,0.766508,0.877177,0.881496,0.824049,0.841214
3,0.2089,0.655344,0.893676,0.896722,0.844494,0.862231
4,0.1649,0.685068,0.886343,0.879439,0.838431,0.853479
5,0.1369,0.676184,0.889093,0.881788,0.83949,0.85501


[I 2025-03-28 14:45:12,252] Trial 116 pruned. 


Trial 117 with params: {'learning_rate': 0.00015027285574006327, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.1, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8995,0.992604,0.867094,0.729282,0.739979,0.734237
2,0.4526,0.724714,0.886343,0.88885,0.821315,0.840612


[I 2025-03-28 14:46:13,118] Trial 117 pruned. 


Trial 118 with params: {'learning_rate': 0.00023497366287061837, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5031,0.794212,0.877177,0.737205,0.747981,0.74199
2,0.2992,0.731889,0.888176,0.89078,0.831618,0.849913
3,0.1918,0.656441,0.893676,0.884077,0.8442,0.858622
4,0.1526,0.707316,0.883593,0.876888,0.834599,0.850402
5,0.1283,0.682558,0.890926,0.882477,0.842279,0.856706
6,0.1118,0.718906,0.887259,0.880621,0.837484,0.853823
7,0.1015,0.768738,0.872594,0.858694,0.825862,0.838595
8,0.095,0.70818,0.88451,0.878866,0.834489,0.85133
9,0.0908,0.716623,0.88176,0.863234,0.834303,0.84533
10,0.083,0.726976,0.87901,0.862249,0.83055,0.842844


[I 2025-03-28 14:51:28,511] Trial 118 pruned. 


Trial 119 with params: {'learning_rate': 0.00016121710388565795, 'weight_decay': 0.006, 'warmup_steps': 1, 'lambda_param': 0.8, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8278,0.948982,0.869844,0.731149,0.74209,0.736152
2,0.4175,0.727912,0.88451,0.88715,0.81955,0.839017


[I 2025-03-28 14:52:35,765] Trial 119 pruned. 


Trial 120 with params: {'learning_rate': 0.0003791496115871342, 'weight_decay': 0.008, 'warmup_steps': 0, 'lambda_param': 0.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1418,0.721023,0.87901,0.881161,0.804227,0.824806
2,0.2309,0.698012,0.88176,0.863555,0.835022,0.845835


[I 2025-03-28 14:53:36,934] Trial 120 pruned. 


Trial 121 with params: {'learning_rate': 1.5745418122329243e-05, 'weight_decay': 0.003, 'warmup_steps': 3, 'lambda_param': 1.0, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.572,3.419045,0.464711,0.236405,0.349654,0.280184
2,2.9416,2.741977,0.727773,0.636984,0.620084,0.62105


[I 2025-03-28 14:54:44,361] Trial 121 pruned. 


Trial 122 with params: {'learning_rate': 0.0002362398781984195, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4988,0.792915,0.877177,0.736832,0.747994,0.741883
2,0.2981,0.723732,0.890009,0.892478,0.833014,0.851544
3,0.1913,0.647091,0.890009,0.881467,0.841353,0.856018
4,0.1535,0.699652,0.890009,0.882277,0.839743,0.855756
5,0.1279,0.698058,0.887259,0.880375,0.838404,0.853698
6,0.11,0.710037,0.882676,0.86708,0.833649,0.847157
7,0.101,0.759155,0.877177,0.861255,0.829773,0.841909
8,0.0943,0.700918,0.880843,0.864795,0.831862,0.844968
9,0.0896,0.709354,0.885426,0.867486,0.836287,0.848499
10,0.0828,0.714674,0.882676,0.865239,0.833761,0.845975


[I 2025-03-28 15:02:45,330] Trial 122 finished with value: 0.8428319571162907 and parameters: {'learning_rate': 0.0002362398781984195, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}. Best is trial 122 with value: 0.8428319571162907.


Trial 123 with params: {'learning_rate': 0.00038307908163366395, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1613,0.735943,0.87901,0.882296,0.803496,0.82498
2,0.2321,0.722005,0.888176,0.868575,0.840541,0.850964
3,0.1581,0.717576,0.87901,0.874047,0.831353,0.847392


[I 2025-03-28 15:04:20,126] Trial 123 pruned. 


Trial 124 with params: {'learning_rate': 0.0002523101368650241, 'weight_decay': 0.008, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4535,0.776507,0.878093,0.737018,0.748785,0.742419
2,0.287,0.742255,0.888176,0.891253,0.831667,0.850176
3,0.1871,0.662898,0.890009,0.8821,0.841432,0.856299
4,0.1487,0.709956,0.879927,0.873983,0.832386,0.847928
5,0.1249,0.703931,0.886343,0.879285,0.837692,0.852914


[I 2025-03-28 15:07:00,719] Trial 124 pruned. 


Trial 125 with params: {'learning_rate': 0.0001296811629436767, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0373,1.095953,0.862511,0.726776,0.736785,0.731359
2,0.5331,0.724437,0.885426,0.885546,0.811163,0.830202


[I 2025-03-28 15:08:03,589] Trial 125 pruned. 


Trial 126 with params: {'learning_rate': 0.00011704977597501867, 'weight_decay': 0.0, 'warmup_steps': 3, 'lambda_param': 0.0, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1393,1.180463,0.861595,0.725234,0.735439,0.729932


[I 2025-03-28 15:08:33,815] Trial 126 pruned. 


Trial 127 with params: {'learning_rate': 1.9042756876605174e-05, 'weight_decay': 0.01, 'warmup_steps': 1, 'lambda_param': 0.4, 'temperature': 7.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.5111,3.267437,0.498625,0.566569,0.383995,0.341536
2,2.7409,2.508491,0.764436,0.65498,0.653158,0.650421


[I 2025-03-28 15:09:41,501] Trial 127 pruned. 


Trial 128 with params: {'learning_rate': 0.00022603940479354003, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5339,0.802713,0.874427,0.735057,0.745967,0.739936
2,0.3086,0.733975,0.889093,0.891204,0.832668,0.850566
3,0.1936,0.66521,0.889093,0.87944,0.840735,0.854367


[I 2025-03-28 15:11:12,598] Trial 128 pruned. 


Trial 129 with params: {'learning_rate': 0.00019281421483645332, 'weight_decay': 0.009000000000000001, 'warmup_steps': 3, 'lambda_param': 0.7000000000000001, 'temperature': 3.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6726,0.860018,0.868011,0.730635,0.74096,0.735198
2,0.3519,0.761566,0.87901,0.88312,0.825263,0.842539


[I 2025-03-28 15:12:22,819] Trial 129 pruned. 


Trial 130 with params: {'learning_rate': 0.00025279144727131616, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1, 'lambda_param': 0.8, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4409,0.784343,0.874427,0.734915,0.745708,0.739757
2,0.2849,0.717344,0.893676,0.896858,0.844867,0.862391
3,0.1857,0.65365,0.892759,0.883234,0.843076,0.857607
4,0.1496,0.697976,0.877177,0.8843,0.83063,0.849189
5,0.1252,0.697597,0.883593,0.876583,0.835093,0.850209


[I 2025-03-28 15:15:00,626] Trial 130 pruned. 


Trial 131 with params: {'learning_rate': 0.00011147227759502049, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.6000000000000001, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1834,1.224212,0.857012,0.721264,0.731876,0.726169
2,0.6357,0.758054,0.883593,0.883586,0.809091,0.828385


[I 2025-03-28 15:16:01,751] Trial 131 pruned. 


Trial 132 with params: {'learning_rate': 0.0002350885771508221, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5026,0.794026,0.877177,0.737205,0.747981,0.74199
2,0.2991,0.731604,0.887259,0.890181,0.830938,0.849253
3,0.1917,0.655668,0.892759,0.883273,0.843485,0.857861
4,0.1525,0.706458,0.883593,0.876888,0.834599,0.850402
5,0.128,0.679594,0.892759,0.883861,0.843661,0.858186
6,0.1116,0.719225,0.88451,0.879009,0.834342,0.851358
7,0.1015,0.767577,0.874427,0.859781,0.827411,0.8399
8,0.095,0.700996,0.887259,0.880347,0.836883,0.853305
9,0.0912,0.714391,0.882676,0.864923,0.834719,0.846397
10,0.0829,0.723139,0.88176,0.865071,0.832634,0.845332


[I 2025-03-28 15:21:17,204] Trial 132 pruned. 


Trial 133 with params: {'learning_rate': 0.0001912274455548727, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6758,0.863665,0.870761,0.73282,0.743556,0.737522
2,0.3539,0.762243,0.87901,0.883584,0.82532,0.842803
3,0.2122,0.653027,0.894592,0.897399,0.845209,0.862927
4,0.1688,0.691495,0.890009,0.883359,0.840856,0.856705
5,0.1391,0.675235,0.890009,0.882253,0.840087,0.855806


[I 2025-03-28 15:23:57,419] Trial 133 pruned. 


Trial 134 with params: {'learning_rate': 0.00013026648449214178, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1, 'lambda_param': 0.5, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0286,1.091373,0.862511,0.726729,0.736785,0.731368


[I 2025-03-28 15:24:32,575] Trial 134 pruned. 


Trial 135 with params: {'learning_rate': 5.704978541670567e-05, 'weight_decay': 0.004, 'warmup_steps': 1, 'lambda_param': 1.0, 'temperature': 6.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.822,1.986453,0.816682,0.693565,0.698445,0.695265
2,1.3347,1.191702,0.865261,0.724073,0.739724,0.731589


[I 2025-03-28 15:25:35,634] Trial 135 pruned. 


Trial 136 with params: {'learning_rate': 0.000144126682671934, 'weight_decay': 0.0, 'warmup_steps': 0, 'lambda_param': 0.4, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9257,1.015308,0.868011,0.729561,0.740805,0.734846


[I 2025-03-28 15:26:05,831] Trial 136 pruned. 


Trial 137 with params: {'learning_rate': 0.00018672893520468124, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 1.0, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6968,0.874523,0.868928,0.731441,0.741842,0.736013
2,0.3608,0.754681,0.880843,0.885497,0.826583,0.844381
3,0.2149,0.651527,0.898258,0.900415,0.847571,0.865687
4,0.1707,0.688353,0.890009,0.882669,0.841041,0.856524
5,0.1406,0.678531,0.887259,0.870201,0.837906,0.850655


[I 2025-03-28 15:28:42,776] Trial 137 pruned. 


Trial 138 with params: {'learning_rate': 0.0002393631929640189, 'weight_decay': 0.006, 'warmup_steps': 2, 'lambda_param': 0.2, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4885,0.79039,0.87626,0.736052,0.747328,0.741179
2,0.2956,0.718919,0.891842,0.893594,0.834472,0.852836
3,0.1899,0.648064,0.893676,0.884327,0.844179,0.858778
4,0.1527,0.702069,0.886343,0.87879,0.836763,0.852478
5,0.1263,0.679268,0.889093,0.880548,0.840448,0.85496


[I 2025-03-28 15:31:21,131] Trial 138 pruned. 


Trial 139 with params: {'learning_rate': 0.00012945768684487043, 'weight_decay': 0.003, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.039,1.097329,0.862511,0.726776,0.736785,0.731359


[I 2025-03-28 15:31:55,651] Trial 139 pruned. 


Trial 140 with params: {'learning_rate': 0.00033458481464474914, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 2.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2431,0.736207,0.886343,0.890222,0.818733,0.840189
2,0.2431,0.768168,0.882676,0.874227,0.836028,0.849341
3,0.1677,0.760933,0.87626,0.8835,0.82895,0.847688


[I 2025-03-28 15:33:25,604] Trial 140 pruned. 


Trial 141 with params: {'learning_rate': 4.075479103416734e-05, 'weight_decay': 0.002, 'warmup_steps': 3, 'lambda_param': 0.5, 'temperature': 4.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.1026,2.415956,0.780018,0.67039,0.664974,0.664468


[I 2025-03-28 15:34:00,345] Trial 141 pruned. 


Trial 142 with params: {'learning_rate': 0.0001650985977148528, 'weight_decay': 0.006, 'warmup_steps': 1, 'lambda_param': 0.7000000000000001, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8049,0.935786,0.868011,0.729868,0.740659,0.734783
2,0.4063,0.734253,0.882676,0.885333,0.818467,0.837501


[I 2025-03-28 15:35:04,853] Trial 142 pruned. 


Trial 143 with params: {'learning_rate': 0.00023839616536102163, 'weight_decay': 0.009000000000000001, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4917,0.791052,0.87626,0.736052,0.747328,0.741179
2,0.2964,0.719297,0.890926,0.892684,0.833805,0.851998
3,0.1901,0.643286,0.894592,0.885223,0.844894,0.859629
4,0.1522,0.698193,0.883593,0.875955,0.835593,0.850373
5,0.1264,0.693273,0.88176,0.875128,0.833614,0.848807


[I 2025-03-28 15:37:43,464] Trial 143 pruned. 


Trial 144 with params: {'learning_rate': 1.1375872635111501e-05, 'weight_decay': 0.009000000000000001, 'warmup_steps': 1, 'lambda_param': 0.0, 'temperature': 5.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.6453,3.603332,0.441797,0.255241,0.332013,0.275662
2,3.2159,3.104942,0.608616,0.589907,0.496557,0.499927


[I 2025-03-28 15:38:45,220] Trial 144 pruned. 


Trial 145 with params: {'learning_rate': 0.00013462228938406439, 'weight_decay': 0.006, 'warmup_steps': 3, 'lambda_param': 0.2, 'temperature': 6.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.0057,1.067918,0.861595,0.72575,0.736119,0.730608


[I 2025-03-28 15:39:20,415] Trial 145 pruned. 


Trial 146 with params: {'learning_rate': 0.00022948616811289355, 'weight_decay': 0.007, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 2.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5218,0.799185,0.87626,0.736651,0.7473,0.741344
2,0.3049,0.739603,0.887259,0.889753,0.831071,0.848983
3,0.1928,0.65828,0.892759,0.883226,0.843649,0.857946


[I 2025-03-28 15:40:52,980] Trial 146 pruned. 


Trial 147 with params: {'learning_rate': 0.0001885930583357555, 'weight_decay': 0.004, 'warmup_steps': 2, 'lambda_param': 0.4, 'temperature': 4.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.688,0.870152,0.869844,0.73213,0.74284,0.736817
2,0.3579,0.757811,0.880843,0.885387,0.826667,0.844431


[I 2025-03-28 15:42:00,092] Trial 147 pruned. 


Trial 148 with params: {'learning_rate': 0.00034220148166248327, 'weight_decay': 0.007, 'warmup_steps': 0, 'lambda_param': 0.1, 'temperature': 5.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2083,0.741013,0.883593,0.888167,0.817137,0.838274
2,0.2376,0.705569,0.882676,0.875716,0.835759,0.850202
3,0.1656,0.695523,0.887259,0.892868,0.837717,0.856782
4,0.1319,0.748343,0.882676,0.863802,0.834878,0.84547
5,0.1108,0.73337,0.883593,0.876855,0.835002,0.850222


[I 2025-03-28 15:44:38,292] Trial 148 pruned. 


Trial 149 with params: {'learning_rate': 0.0001810064352145765, 'weight_decay': 0.008, 'warmup_steps': 2, 'lambda_param': 0.8, 'temperature': 3.5}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7247,0.889067,0.866178,0.729022,0.739529,0.733661
2,0.3705,0.741424,0.886343,0.889197,0.830762,0.848633
3,0.2179,0.656731,0.896425,0.898804,0.846176,0.86421
4,0.1732,0.695757,0.889093,0.882016,0.839776,0.855508
5,0.1423,0.675283,0.889093,0.881389,0.839089,0.85488


[I 2025-03-28 15:47:14,523] Trial 149 pruned. 


In [19]:
print(best_trial4)

BestRun(run_id='122', objective=0.8428319571162907, hyperparameters={'learning_rate': 0.0002362398781984195, 'weight_decay': 0.01, 'warmup_steps': 2, 'lambda_param': 0.7000000000000001, 'temperature': 2.5}, run_summary=None)


In [20]:
print("Best normal training score: ", best_trial)
print("Best distilation trianing score: ", best_trial2)
print("Best normal training score with augmentations: ", best_trial3)
print("Best distilation trianing score with augmentations: ",best_trial4)