# Veri Aktarma

In [None]:
# =======================
# 0. Paketler
# =======================
import os
import shutil
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, Features, Sequence, Value
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForTokenClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
import evaluate
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# =======================
# 1. Read the data (train + validation)
# =======================
train_df = pd.read_json("realistic_train.jsonl", lines=True)
val_df   = pd.read_json("realistic_val.jsonl", lines=True)


def _stringify_entities(entities):
    """Return the entity records with every field converted to ``str``."""
    return [[str(field) for field in ent] for ent in entities]


# Store every field of the "entities" column as a string so that each
# [start, end, label] record has a single type -> avoids the Arrow error.
for df in (train_df, val_df):
    df["entities"] = df["entities"].apply(_stringify_entities)

# Declare the dataset schema explicitly.
features = Features({
    "text": Value("string"),
    "entities": Sequence(Sequence(Value("string"))),  # [[start, end, label], ...]
})

train_ds = Dataset.from_pandas(train_df, features=features)
val_ds   = Dataset.from_pandas(val_df, features=features)



# TOKENİZE ETME

In [2]:
# =======================
# 2. Prepare the tokenizer
# =======================
model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Label list -> BIO tags built from the entity types found in the training
# data (only the label field, index 2, of every entity record is used).
unique_labels = {e[2] for ents in train_df["entities"] for e in ents}

# Layout: "O" first, then all B-* tags, then all I-* tags (types sorted).
label_list = ["O"]
for prefix in ("B", "I"):
    label_list.extend(f"{prefix}-{l}" for l in sorted(unique_labels))

label2id = {l: i for i, l in enumerate(label_list)}
id2label = dict(enumerate(label_list))

# =======================
# 3. Label alignment function (BIO)
# =======================
MAX_LEN = 128  # can be raised to 256 (less truncation on long texts)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of texts and attach one BIO label id per token.

    Args:
        examples: Batch dict with a "text" list and a parallel "entities"
            list. Every entity is a ``[start, end, label]`` record; the
            offsets may arrive as strings and are converted with ``int``.
            They are compared against the tokenizer's offset mapping.

    Returns:
        The tokenizer output (padded/truncated to ``MAX_LEN``) with a
        "labels" list added and "offset_mapping" removed.

    Raises:
        KeyError: If a produced tag is not present in ``label2id``.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_offsets_mapping=True
    )

    labels = []
    for offsets, entity_list in zip(tokenized["offset_mapping"], examples["entities"]):
        # Values read from the JSONL arrive as strings; convert offsets to int.
        ents = [(int(start), int(end), label) for start, end, label in entity_list]

        # Indices of entities that already received their B- tag. The first
        # token labelled for an entity gets B-, every later one gets I-.
        # Relying on `start == ent_start` alone would leave an entity without
        # any B- tag when its span does not begin exactly on a token boundary
        # (e.g. the annotated span starts on whitespace).
        started = set()

        label_ids = []
        for start, end in offsets:
            if start == end:   # zero-width offset: special token (CLS/SEP etc.)
                label_ids.append(-100)
                continue

            tag = "O"
            for ent_idx, (ent_start, ent_end, ent_label) in enumerate(ents):
                # A token belongs to an entity only if it lies fully inside it;
                # the first matching entity wins.
                if start >= ent_start and end <= ent_end:
                    if ent_idx in started:
                        tag = f"I-{ent_label}"
                    else:
                        tag = f"B-{ent_label}"
                        started.add(ent_idx)
                    break
            label_ids.append(label2id[tag])
        labels.append(label_ids)

    tokenized["labels"] = labels
    tokenized.pop("offset_mapping")
    return tokenized

# Tokenize and label both splits in batches (train first, then validation).
train_tokenized, val_tokenized = [
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_ds, val_ds)
]



Map: 100%|██████████| 4500/4500 [00:01<00:00, 3393.60 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 3596.40 examples/s]


# Model ve Metrik Tanımlama

In [3]:
# =======================
# 4. model_init (dropouts come from the trial)
# =======================
def model_init(trial=None):
    """Build a fresh token-classification model, optionally with tuned dropouts.

    Args:
        trial: ``None`` (both dropouts default to 0.1), an object exposing
            ``suggest_float`` (e.g. an Optuna trial; the dropouts are sampled
            from [0.0, 0.3]), or a mapping of already chosen hyperparameters
            (the dropouts are read from it, falling back to 0.1).

    Returns:
        An ``AutoModelForTokenClassification`` loaded from ``model_name`` with
        the label maps and dropout probabilities applied to its config.
    """
    hidden_dropout = 0.1
    attn_dropout = 0.1
    if trial is not None:
        if hasattr(trial, "suggest_float"):
            hidden_dropout = trial.suggest_float("hidden_dropout_prob", 0.0, 0.3)
            attn_dropout = trial.suggest_float("attention_probs_dropout_prob", 0.0, 0.3)
        else:
            # Plain mapping of hyperparameters (e.g. a finished search result).
            hidden_dropout = trial.get("hidden_dropout_prob", hidden_dropout)
            attn_dropout = trial.get("attention_probs_dropout_prob", attn_dropout)

    # NOTE: `problem_type` is deliberately not set. "token_classification" is
    # not among the values the config class accepts ("regression",
    # "single_label_classification", "multi_label_classification"); storing it
    # would end up in the saved config.json and is expected to make reloading
    # the saved model fail.
    cfg = AutoConfig.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        hidden_dropout_prob=hidden_dropout,
        attention_probs_dropout_prob=attn_dropout,
    )
    return AutoModelForTokenClassification.from_pretrained(model_name, config=cfg)

# =======================
# 5. Evaluation metric
# =======================
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute seqeval precision/recall/F1/accuracy for an evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. The predictions are reduced with
            ``argmax`` over axis 2; positions whose label is -100 are ignored.

    Returns:
        Dict with the "precision", "recall", "f1" and "accuracy" overall scores.
    """
    scores, label_rows = p
    pred_rows = np.argmax(scores, axis=2)

    true_labels = []
    true_preds = []
    for pred_row, label_row in zip(pred_rows, label_rows):
        gold_tags = []
        pred_tags = []
        for pred, gold in zip(pred_row, label_row):
            if gold == -100:
                # Ignored position: excluded from both sequences.
                continue
            gold_tags.append(id2label[gold])
            pred_tags.append(id2label[pred])
        true_labels.append(gold_tags)
        true_preds.append(pred_tags)

    results = metric.compute(predictions=true_preds, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary



# Train Hazırlığı

In [None]:
# =======================
# 6. Arguments + HP search Trainer
# =======================
hp_output_dir = "./_tmp_hp"      # trial checkpoints (meant to be deleted at the end; cleanup not in this cell)
best_output_dir = "./_tmp_best"  # checkpoints of the training run with the best hyperparameters (meant to be deleted at the end)
final_dir = "./bert-turkish-ner-final/yeni_veri3"  # the BEST model goes here

# Enable bf16 automatically if the GPU supports it
# (the check used here: CUDA is available and device 0 has compute capability major >= 8).
supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

# Baseline training configuration shared by every search trial.
base_args = TrainingArguments(
    output_dir=hp_output_dir,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=200,                  # tune to the dataset size (50–500)
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=1,

    # these values will be overridden by hp_space
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_ratio=0.0,
    lr_scheduler_type="linear",
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,
    label_smoothing_factor=0.0,

    gradient_checkpointing=True,
    fp16=False,
    bf16=bool(supports_bf16),
    no_cuda=False,
    optim="adamw_8bit",              # use "adamw_torch" if bitsandbytes is not installed
    group_by_length=True,
    dataloader_num_workers=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    seed=42
)

# Trainer used for the hyperparameter search below.
trainer = Trainer(
    model_init=model_init,  # >>> important: pass the model_init factory rather than a model instance
    args=base_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop a run once 4 consecutive evaluations fail to improve by at least 2e-4.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4, early_stopping_threshold=2e-4)]
)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Optimizasyon

In [5]:
# =======================
# 7. Optuna search space (extended)
# =======================
def hp_space(trial):
    """Sample one hyperparameter configuration from the given trial.

    Args:
        trial: Object providing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical``.

    Returns:
        Dict mapping each hyperparameter name to the sampled value.
    """
    space = {}

    # Optimisation schedule.
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 4, 12)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8]
    )
    space["gradient_accumulation_steps"] = trial.suggest_categorical(
        "gradient_accumulation_steps", [1, 2, 4]
    )

    # Regularisation and scheduler.
    space["weight_decay"] = trial.suggest_float("weight_decay", 1e-5, 1e-1, log=True)
    space["warmup_ratio"] = trial.suggest_float("warmup_ratio", 0.0, 0.3)
    scheduler_choices = ["linear", "cosine", "cosine_with_restarts", "polynomial"]
    space["lr_scheduler_type"] = trial.suggest_categorical("lr_scheduler_type", scheduler_choices)
    space["max_grad_norm"] = trial.suggest_float("max_grad_norm", 0.5, 1.0)
    space["label_smoothing_factor"] = trial.suggest_float("label_smoothing_factor", 0.0, 0.1)

    # Dropouts that model_init reads under the same names.
    space["hidden_dropout_prob"] = trial.suggest_float("hidden_dropout_prob", 0.0, 0.3)
    space["attention_probs_dropout_prob"] = trial.suggest_float(
        "attention_probs_dropout_prob", 0.0, 0.3
    )

    return space

def compute_objective(metrics):
    """Return the ``eval_f1`` entry of *metrics* as the search objective."""
    objective = metrics["eval_f1"]
    return objective



# Optimizasyon Çalıştırma (OPTUNA)

In [6]:
# =======================
# 8. Hyperparameter search (maximize validation F1)
# =======================
os.makedirs(hp_output_dir, exist_ok=True)

# NOTE(review): in the recorded output of this cell the value reported for a
# trial equals the eval_f1 of its last evaluation, not of its best one —
# confirm that this is the intended objective.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    hp_space=hp_space,
    compute_objective=compute_objective,
    n_trials=40,                             # can be raised to 80-100 depending on your budget
    backend="optuna",
    sampler=TPESampler(seed=42, multivariate=True),
    pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=0),
)
print("Best trial:", best_trial)
print("Best hyperparameters:", best_trial.hyperparameters)

# =======================
# 9. Retrain with the best hyperparameters
# =======================
os.makedirs(best_output_dir, exist_ok=True)

best_hparams = best_trial.hyperparameters


def best_model_init(trial=None):
    """Build the model for the final run with the best dropouts found.

    The plain ``model_init`` is called without a trial here and would silently
    fall back to its 0.1 defaults, discarding the tuned
    ``hidden_dropout_prob`` / ``attention_probs_dropout_prob``. This factory
    reads both from the search result instead (0.1 if a key is absent).

    Args:
        trial: Ignored; accepted so the factory has the same signature as
            ``model_init``.

    Returns:
        An ``AutoModelForTokenClassification`` loaded from ``model_name``.
    """
    cfg = AutoConfig.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        hidden_dropout_prob=best_hparams.get("hidden_dropout_prob", 0.1),
        attention_probs_dropout_prob=best_hparams.get("attention_probs_dropout_prob", 0.1),
    )
    return AutoModelForTokenClassification.from_pretrained(model_name, config=cfg)


best_args = TrainingArguments(
    output_dir=best_output_dir,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=200,
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,

    # Values chosen by the search; the eval batch size mirrors the train one.
    learning_rate=best_hparams["learning_rate"],
    per_device_train_batch_size=best_hparams["per_device_train_batch_size"],
    per_device_eval_batch_size=best_hparams["per_device_train_batch_size"],
    num_train_epochs=best_hparams["num_train_epochs"],
    weight_decay=best_hparams["weight_decay"],
    warmup_ratio=best_hparams["warmup_ratio"],
    lr_scheduler_type=best_hparams["lr_scheduler_type"],
    gradient_accumulation_steps=best_hparams.get("gradient_accumulation_steps", 1),
    max_grad_norm=best_hparams.get("max_grad_norm", 1.0),
    label_smoothing_factor=best_hparams.get("label_smoothing_factor", 0.0),

    gradient_checkpointing=True,
    fp16=False,
    bf16=bool(supports_bf16),
    no_cuda=False,
    optim="adamw_8bit",
    group_by_length=True,
    dataloader_num_workers=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    seed=42
)

best_trainer = Trainer(
    model_init=best_model_init,  # >>> a model factory again, now with the tuned dropouts
    args=best_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop once 4 consecutive evaluations fail to improve by at least 2e-4.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4, early_stopping_threshold=2e-4)]
)






[I 2025-08-21 13:50:07,542] A new study created in memory with name: no-name-091889de-63df-4856-abf3-d3dfdc222526
Trying to set hidden_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set attention_probs_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return fn(*args, **kwargs)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  0%|          | 52/13500 [00:17<20:37, 10.86it/s]  

{'loss': 3.702, 'grad_norm': 8.353860855102539, 'learning_rate': 8.887988134073672e-07, 'epoch': 0.04}


  1%|          | 102/13500 [00:22<20:30, 10.88it/s]

{'loss': 3.1448, 'grad_norm': 7.468728065490723, 'learning_rate': 1.7775976268147343e-06, 'epoch': 0.09}


  1%|          | 152/13500 [00:26<20:46, 10.71it/s]

{'loss': 2.0772, 'grad_norm': 3.6064555644989014, 'learning_rate': 2.6663964402221016e-06, 'epoch': 0.13}


  1%|▏         | 200/13500 [00:31<20:22, 10.88it/s]

{'loss': 1.2093, 'grad_norm': 3.5722079277038574, 'learning_rate': 3.5551952536294686e-06, 'epoch': 0.18}


  _warn_prf(average, modifier, msg_start, len(result))
                                                   
  1%|▏         | 200/13500 [00:46<20:22, 10.88it/s]

{'eval_loss': 0.8110863566398621, 'eval_precision': 0.335602899187349, 'eval_recall': 0.325314030232063, 'eval_f1': 0.3303783783783784, 'eval_accuracy': 0.7903366336633664, 'eval_runtime': 15.8415, 'eval_samples_per_second': 31.563, 'eval_steps_per_second': 7.891, 'epoch': 0.18}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  2%|▏         | 252/13500 [00:52<20:20, 10.85it/s]   

{'loss': 0.6707, 'grad_norm': 2.9042699337005615, 'learning_rate': 4.443994067036836e-06, 'epoch': 0.22}


  2%|▏         | 302/13500 [00:57<20:08, 10.92it/s]

{'loss': 0.3507, 'grad_norm': 1.8381259441375732, 'learning_rate': 5.332792880444203e-06, 'epoch': 0.27}


  3%|▎         | 352/13500 [01:01<20:02, 10.93it/s]

{'loss': 0.162, 'grad_norm': 1.0773353576660156, 'learning_rate': 6.221591693851571e-06, 'epoch': 0.31}


  3%|▎         | 400/13500 [01:06<20:07, 10.85it/s]

{'loss': 0.0729, 'grad_norm': 4.987400054931641, 'learning_rate': 7.110390507258937e-06, 'epoch': 0.36}


                                                   
  3%|▎         | 400/13500 [01:21<20:07, 10.85it/s]

{'eval_loss': 0.04206102713942528, 'eval_precision': 0.9661337820782498, 'eval_recall': 0.977858207366404, 'eval_f1': 0.9719606390858111, 'eval_accuracy': 0.9923762376237624, 'eval_runtime': 15.6736, 'eval_samples_per_second': 31.901, 'eval_steps_per_second': 7.975, 'epoch': 0.36}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  3%|▎         | 452/13500 [01:27<20:07, 10.80it/s]   

{'loss': 0.0431, 'grad_norm': 0.5634477138519287, 'learning_rate': 7.999189320666306e-06, 'epoch': 0.4}


  4%|▎         | 502/13500 [01:31<20:14, 10.70it/s]

{'loss': 0.0161, 'grad_norm': 0.19973011314868927, 'learning_rate': 8.887988134073671e-06, 'epoch': 0.44}


  4%|▍         | 552/13500 [01:36<19:42, 10.95it/s]

{'loss': 0.0107, 'grad_norm': 0.05458303168416023, 'learning_rate': 9.776786947481039e-06, 'epoch': 0.49}


  4%|▍         | 600/13500 [01:40<19:47, 10.87it/s]

{'loss': 0.0085, 'grad_norm': 0.038223765790462494, 'learning_rate': 1.0665585760888406e-05, 'epoch': 0.53}


                                                   
  4%|▍         | 600/13500 [01:56<19:47, 10.87it/s]

{'eval_loss': 0.0028270843904465437, 'eval_precision': 0.9978718876356671, 'eval_recall': 0.9982967851820311, 'eval_f1': 0.9980842911877394, 'eval_accuracy': 0.9996237623762376, 'eval_runtime': 15.7843, 'eval_samples_per_second': 31.677, 'eval_steps_per_second': 7.919, 'epoch': 0.53}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  5%|▍         | 651/13500 [02:02<19:36, 10.92it/s]   

{'loss': 0.0085, 'grad_norm': 0.7075756788253784, 'learning_rate': 1.1554384574295775e-05, 'epoch': 0.58}


  5%|▌         | 701/13500 [02:06<19:28, 10.96it/s]

{'loss': 0.0102, 'grad_norm': 0.04562601074576378, 'learning_rate': 1.2443183387703141e-05, 'epoch': 0.62}


  6%|▌         | 751/13500 [02:11<19:32, 10.88it/s]

{'loss': 0.004, 'grad_norm': 0.030317798256874084, 'learning_rate': 1.3331982201110509e-05, 'epoch': 0.67}


  6%|▌         | 800/13500 [02:15<19:19, 10.96it/s]

{'loss': 0.0031, 'grad_norm': 0.02245190180838108, 'learning_rate': 1.4220781014517874e-05, 'epoch': 0.71}


                                                   
  6%|▌         | 800/13500 [02:31<19:19, 10.96it/s]

{'eval_loss': 0.0012573737185448408, 'eval_precision': 0.998935717326522, 'eval_recall': 0.9991483925910155, 'eval_f1': 0.9990420436402342, 'eval_accuracy': 0.9999009900990099, 'eval_runtime': 15.6508, 'eval_samples_per_second': 31.947, 'eval_steps_per_second': 7.987, 'epoch': 0.71}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  6%|▋         | 851/13500 [02:37<19:32, 10.78it/s]  

{'loss': 0.0039, 'grad_norm': 0.3443985879421234, 'learning_rate': 1.5109579827925244e-05, 'epoch': 0.76}


  7%|▋         | 901/13500 [02:41<19:16, 10.89it/s]

{'loss': 0.0021, 'grad_norm': 0.4846867024898529, 'learning_rate': 1.599837864133261e-05, 'epoch': 0.8}


  7%|▋         | 951/13500 [02:46<19:33, 10.70it/s]

{'loss': 0.004, 'grad_norm': 0.02691871114075184, 'learning_rate': 1.688717745473998e-05, 'epoch': 0.84}


  7%|▋         | 1000/13500 [02:50<19:18, 10.79it/s]

{'loss': 0.0023, 'grad_norm': 0.022746799513697624, 'learning_rate': 1.7775976268147343e-05, 'epoch': 0.89}


                                                    
  7%|▋         | 1000/13500 [03:06<19:18, 10.79it/s]

{'eval_loss': 0.0009187837713398039, 'eval_precision': 0.9995741056218058, 'eval_recall': 0.9993612944432616, 'eval_f1': 0.9994676887043543, 'eval_accuracy': 0.9999207920792079, 'eval_runtime': 15.6746, 'eval_samples_per_second': 31.899, 'eval_steps_per_second': 7.975, 'epoch': 0.89}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  8%|▊         | 1052/13500 [03:12<19:03, 10.89it/s]  

{'loss': 0.0024, 'grad_norm': 0.008280666545033455, 'learning_rate': 1.8664775081554714e-05, 'epoch': 0.93}


  8%|▊         | 1102/13500 [03:16<18:57, 10.90it/s]

{'loss': 0.0018, 'grad_norm': 0.0765613466501236, 'learning_rate': 1.9553573894962078e-05, 'epoch': 0.98}


  9%|▊         | 1151/13500 [03:34<22:50,  9.01it/s]  

{'loss': 0.0014, 'grad_norm': 0.888707160949707, 'learning_rate': 2.044237270836945e-05, 'epoch': 1.02}


  9%|▉         | 1200/13500 [03:38<18:57, 10.81it/s]

{'loss': 0.0009, 'grad_norm': 0.004078365862369537, 'learning_rate': 2.1331171521776813e-05, 'epoch': 1.07}


                                                    
  9%|▉         | 1200/13500 [03:54<18:57, 10.81it/s]

{'eval_loss': 0.0004679140402004123, 'eval_precision': 0.9991485738612176, 'eval_recall': 0.9993612944432616, 'eval_f1': 0.9992549228312931, 'eval_accuracy': 0.9999207920792079, 'eval_runtime': 15.722, 'eval_samples_per_second': 31.803, 'eval_steps_per_second': 7.951, 'epoch': 1.07}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  9%|▉         | 1252/13500 [04:00<18:57, 10.77it/s]  

{'loss': 0.0015, 'grad_norm': 0.009042259305715561, 'learning_rate': 2.221997033518418e-05, 'epoch': 1.11}


 10%|▉         | 1302/13500 [04:04<18:34, 10.94it/s]

{'loss': 0.0008, 'grad_norm': 0.011725695803761482, 'learning_rate': 2.310876914859155e-05, 'epoch': 1.16}


 10%|█         | 1352/13500 [04:09<18:42, 10.82it/s]

{'loss': 0.0017, 'grad_norm': 0.007900822907686234, 'learning_rate': 2.3997567961998915e-05, 'epoch': 1.2}


 10%|█         | 1400/13500 [04:13<18:31, 10.89it/s]

{'loss': 0.0023, 'grad_norm': 0.017811244353652, 'learning_rate': 2.4886366775406282e-05, 'epoch': 1.24}


                                                    
 10%|█         | 1400/13500 [04:29<18:31, 10.89it/s]

{'eval_loss': 0.0011975248344242573, 'eval_precision': 0.9974462651628005, 'eval_recall': 0.9978709814775388, 'eval_f1': 0.9976585781183482, 'eval_accuracy': 0.9997029702970297, 'eval_runtime': 15.6233, 'eval_samples_per_second': 32.003, 'eval_steps_per_second': 8.001, 'epoch': 1.24}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 11%|█         | 1452/13500 [04:35<18:30, 10.85it/s]   

{'loss': 0.0032, 'grad_norm': 0.009410947561264038, 'learning_rate': 2.5775165588813653e-05, 'epoch': 1.29}


 11%|█         | 1502/13500 [04:39<18:18, 10.93it/s]

{'loss': 0.0026, 'grad_norm': 0.005000211298465729, 'learning_rate': 2.6663964402221017e-05, 'epoch': 1.33}


 11%|█▏        | 1552/13500 [04:44<18:41, 10.66it/s]

{'loss': 0.0026, 'grad_norm': 0.8769811391830444, 'learning_rate': 2.7552763215628385e-05, 'epoch': 1.38}


 12%|█▏        | 1600/13500 [04:48<18:01, 11.00it/s]

{'loss': 0.0058, 'grad_norm': 0.022304717451334, 'learning_rate': 2.844156202903575e-05, 'epoch': 1.42}


                                                    
 12%|█▏        | 1600/13500 [05:04<18:01, 11.00it/s]

{'eval_loss': 0.003000806551426649, 'eval_precision': 0.9951178093822968, 'eval_recall': 0.9980838833297849, 'eval_f1': 0.9965986394557823, 'eval_accuracy': 0.9996039603960396, 'eval_runtime': 16.3299, 'eval_samples_per_second': 30.619, 'eval_steps_per_second': 7.655, 'epoch': 1.42}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 12%|█▏        | 1652/13500 [05:10<18:43, 10.55it/s]   

{'loss': 0.0147, 'grad_norm': 6.977432727813721, 'learning_rate': 2.933036084244312e-05, 'epoch': 1.47}


 13%|█▎        | 1702/13500 [05:15<18:23, 10.70it/s]

{'loss': 0.0041, 'grad_norm': 1.1790859699249268, 'learning_rate': 3.0219159655850487e-05, 'epoch': 1.51}


 13%|█▎        | 1750/13500 [05:18<18:31, 10.57it/s]

{'loss': 0.0019, 'grad_norm': 0.18083156645298004, 'learning_rate': 3.110795846925785e-05, 'epoch': 1.56}


 13%|█▎        | 1800/13500 [05:22<18:59, 10.27it/s]

{'loss': 0.0014, 'grad_norm': 0.0019908652175217867, 'learning_rate': 3.199675728266522e-05, 'epoch': 1.6}


                                                    
 13%|█▎        | 1800/13500 [05:39<18:59, 10.27it/s]

{'eval_loss': 0.00141254928894341, 'eval_precision': 0.9982975101085337, 'eval_recall': 0.9987225888865233, 'eval_f1': 0.9985100042571307, 'eval_accuracy': 0.9996831683168317, 'eval_runtime': 16.9507, 'eval_samples_per_second': 29.497, 'eval_steps_per_second': 7.374, 'epoch': 1.6}


 13%|█▎        | 1800/13500 [05:42<37:03,  5.26it/s]
[I 2025-08-21 13:55:51,479] Trial 0 finished with value: 0.9985100042571307 and parameters: {'learning_rate': 4.3284502212938785e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 1, 'weight_decay': 0.029154431891537533, 'warmup_ratio': 0.18033450352296262, 'lr_scheduler_type': 'cosine_with_restarts', 'max_grad_norm': 0.6061695553391381, 'label_smoothing_factor': 0.018182496720710064, 'hidden_dropout_prob': 0.055021352956030146, 'attention_probs_dropout_prob': 0.09127267288786131}. Best is trial 0 with value: 0.9985100042571307.
Trying to set hidden_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set attention_probs_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


{'train_runtime': 342.0624, 'train_samples_per_second': 157.866, 'train_steps_per_second': 39.466, 'train_loss': 0.32097837971730364, 'epoch': 1.6}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  5%|▌         | 50/980 [00:32<06:08,  2.52it/s] 

{'loss': 2.455, 'grad_norm': 1.5622904300689697, 'learning_rate': 1.6861781658606575e-05, 'epoch': 0.36}


 10%|█         | 100/980 [00:52<05:49,  2.52it/s]

{'loss': 0.2211, 'grad_norm': 0.233993262052536, 'learning_rate': 3.372356331721315e-05, 'epoch': 0.71}


 15%|█▌        | 150/980 [01:25<07:43,  1.79it/s]  

{'loss': 0.0083, 'grad_norm': 0.5613688230514526, 'learning_rate': 5.058534497581973e-05, 'epoch': 1.07}


 20%|██        | 200/980 [01:45<05:10,  2.51it/s]

{'loss': 0.0072, 'grad_norm': 0.41084593534469604, 'learning_rate': 6.74471266344263e-05, 'epoch': 1.42}



 20%|██        | 200/980 [02:00<05:10,  2.51it/s]

{'eval_loss': 0.0024460384156554937, 'eval_precision': 0.9974473516273133, 'eval_recall': 0.9982967851820311, 'eval_f1': 0.9978718876356671, 'eval_accuracy': 0.9997623762376238, 'eval_runtime': 15.6461, 'eval_samples_per_second': 31.957, 'eval_steps_per_second': 7.989, 'epoch': 1.42}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 26%|██▌       | 250/980 [02:21<04:49,  2.52it/s]  

{'loss': 0.0039, 'grad_norm': 0.07353706657886505, 'learning_rate': 7.777780837212964e-05, 'epoch': 1.78}


 31%|███       | 300/980 [02:54<04:34,  2.47it/s]

{'loss': 0.0029, 'grad_norm': 0.05288831889629364, 'learning_rate': 7.628153852101098e-05, 'epoch': 2.13}


 36%|███▌      | 350/980 [03:13<04:11,  2.50it/s]

{'loss': 0.0016, 'grad_norm': 0.4230625033378601, 'learning_rate': 7.314938908640968e-05, 'epoch': 2.49}


 41%|████      | 400/980 [03:33<03:51,  2.51it/s]

{'loss': 0.0015, 'grad_norm': 0.2836463153362274, 'learning_rate': 6.851861445801946e-05, 'epoch': 2.84}



 41%|████      | 400/980 [03:49<03:51,  2.51it/s]

{'eval_loss': 0.0007544470718130469, 'eval_precision': 0.9978732454274777, 'eval_recall': 0.9989354907387694, 'eval_f1': 0.998404085541015, 'eval_accuracy': 0.9998217821782178, 'eval_runtime': 15.8385, 'eval_samples_per_second': 31.569, 'eval_steps_per_second': 7.892, 'epoch': 2.84}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 46%|████▌     | 450/980 [04:23<03:30,  2.51it/s]

{'loss': 0.001, 'grad_norm': 0.0056531066074967384, 'learning_rate': 6.25921405047045e-05, 'epoch': 3.2}


 51%|█████     | 500/980 [04:43<03:11,  2.50it/s]

{'loss': 0.0008, 'grad_norm': 0.04355726018548012, 'learning_rate': 5.56296721295936e-05, 'epoch': 3.55}


 56%|█████▌    | 550/980 [05:03<02:51,  2.50it/s]

{'loss': 0.0003, 'grad_norm': 0.00266621564514935, 'learning_rate': 4.7936312702606095e-05, 'epoch': 3.91}


 61%|██████    | 600/980 [05:36<02:30,  2.52it/s]

{'loss': 0.0004, 'grad_norm': 0.12314776331186295, 'learning_rate': 3.984919407996202e-05, 'epoch': 4.26}



 61%|██████    | 600/980 [05:52<02:30,  2.52it/s]

{'eval_loss': 0.00025424783234484494, 'eval_precision': 0.9995742869306088, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9996806812134114, 'eval_accuracy': 0.9999405940594059, 'eval_runtime': 15.7206, 'eval_samples_per_second': 31.805, 'eval_steps_per_second': 7.951, 'epoch': 4.26}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 66%|██████▋   | 650/980 [06:12<02:11,  2.52it/s]

{'loss': 0.0002, 'grad_norm': 0.0013684665318578482, 'learning_rate': 3.1722703098595016e-05, 'epoch': 4.62}


 71%|███████▏  | 700/980 [06:32<01:51,  2.50it/s]

{'loss': 0.0002, 'grad_norm': 0.0016468738904222846, 'learning_rate': 2.391295193746987e-05, 'epoch': 4.97}


 77%|███████▋  | 750/980 [07:05<01:31,  2.52it/s]

{'loss': 0.0002, 'grad_norm': 0.0011348576517775655, 'learning_rate': 1.676217287242836e-05, 'epoch': 5.33}


 82%|████████▏ | 800/980 [07:25<01:11,  2.51it/s]

{'loss': 0.0003, 'grad_norm': 0.00797371007502079, 'learning_rate': 1.0583721264349651e-05, 'epoch': 5.68}



 82%|████████▏ | 800/980 [07:41<01:11,  2.51it/s]

{'eval_loss': 0.00014918424130883068, 'eval_precision': 0.9995742869306088, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9996806812134114, 'eval_accuracy': 0.9999405940594059, 'eval_runtime': 15.8797, 'eval_samples_per_second': 31.487, 'eval_steps_per_second': 7.872, 'epoch': 5.68}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 87%|████████▋ | 850/980 [08:14<02:15,  1.04s/it]

{'loss': 0.0002, 'grad_norm': 0.0009271166636608541, 'learning_rate': 5.6483439669281866e-06, 'epoch': 6.04}


 92%|█████████▏| 900/980 [08:34<00:31,  2.52it/s]

{'loss': 0.0002, 'grad_norm': 0.0027078159619122744, 'learning_rate': 2.172314888229314e-06, 'epoch': 6.39}


 97%|█████████▋| 950/980 [08:54<00:11,  2.52it/s]

{'loss': 0.0002, 'grad_norm': 0.11093872040510178, 'learning_rate': 3.0795761935756554e-07, 'epoch': 6.75}


100%|██████████| 980/980 [09:08<00:00,  2.50it/s]

{'train_runtime': 548.1576, 'train_samples_per_second': 57.465, 'train_steps_per_second': 1.788, 'train_loss': 0.1380430312257032, 'epoch': 6.96}


100%|██████████| 980/980 [09:08<00:00,  1.79it/s]
[I 2025-08-21 14:05:03,049] Trial 1 finished with value: 0.9996806812134114 and parameters: {'learning_rate': 7.790143126276238e-05, 'num_train_epochs': 7, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 4, 'weight_decay': 0.0006672367170464204, 'warmup_ratio': 0.23555278841790406, 'lr_scheduler_type': 'cosine_with_restarts', 'max_grad_norm': 0.8037724259507192, 'label_smoothing_factor': 0.017052412368729154, 'hidden_dropout_prob': 0.019515477895583853, 'attention_probs_dropout_prob': 0.28466566117599995}. Best is trial 1 with value: 0.9996806812134114.
Trying to set hidden_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set attention_probs_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-c

{'loss': 1.3033, 'grad_norm': 2.885141372680664, 'learning_rate': 0.0001707418151613477, 'epoch': 0.04}


  1%|          | 102/12375 [00:21<19:09, 10.68it/s]

{'loss': 0.1003, 'grad_norm': 3.9878382682800293, 'learning_rate': 0.0003414836303226954, 'epoch': 0.09}


  1%|          | 152/12375 [00:26<19:00, 10.72it/s]

{'loss': 0.1226, 'grad_norm': 5.3155388832092285, 'learning_rate': 0.00043631386031595804, 'epoch': 0.13}


  2%|▏         | 200/12375 [00:30<18:34, 10.92it/s]

{'loss': 0.1394, 'grad_norm': 7.709505081176758, 'learning_rate': 0.0004345293455498396, 'epoch': 0.18}


  2%|▏         | 200/12375 [00:42<18:34, 10.92it/s]
  2%|▏         | 200/12375 [00:46<18:34, 10.92it/s]

{'eval_loss': 0.08586582541465759, 'eval_precision': 0.8038922155688623, 'eval_recall': 0.9146263572493081, 'eval_f1': 0.8556916641768748, 'eval_accuracy': 0.9798217821782178, 'eval_runtime': 15.8106, 'eval_samples_per_second': 31.624, 'eval_steps_per_second': 7.906, 'epoch': 0.18}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  2%|▏         | 252/12375 [00:52<18:40, 10.82it/s]   

{'loss': 0.1469, 'grad_norm': 3.9265854358673096, 'learning_rate': 0.00043274483078372116, 'epoch': 0.22}


  2%|▏         | 302/12375 [00:56<18:33, 10.84it/s]

{'loss': 0.1536, 'grad_norm': 5.377829074859619, 'learning_rate': 0.0004309603160176027, 'epoch': 0.27}


  3%|▎         | 352/12375 [01:01<18:25, 10.87it/s]

{'loss': 0.1688, 'grad_norm': 7.733010768890381, 'learning_rate': 0.00042917580125148433, 'epoch': 0.31}


  3%|▎         | 400/12375 [01:05<18:13, 10.96it/s]

{'loss': 0.128, 'grad_norm': 5.07607364654541, 'learning_rate': 0.00042739128648536587, 'epoch': 0.36}


  3%|▎         | 400/12375 [01:16<18:13, 10.96it/s]
  3%|▎         | 400/12375 [01:21<18:13, 10.96it/s]

{'eval_loss': 0.04446425661444664, 'eval_precision': 0.8793275965579348, 'eval_recall': 0.9354907387694273, 'eval_f1': 0.9065401279141738, 'eval_accuracy': 0.9900990099009901, 'eval_runtime': 15.7894, 'eval_samples_per_second': 31.667, 'eval_steps_per_second': 7.917, 'epoch': 0.36}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  4%|▎         | 452/12375 [01:27<18:25, 10.78it/s]  

{'loss': 0.1307, 'grad_norm': 3.5316171646118164, 'learning_rate': 0.0004256067717192474, 'epoch': 0.4}


  4%|▍         | 502/12375 [01:31<18:08, 10.90it/s]

{'loss': 0.1148, 'grad_norm': 3.1509270668029785, 'learning_rate': 0.000423822256953129, 'epoch': 0.44}


  4%|▍         | 552/12375 [01:36<18:09, 10.85it/s]

{'loss': 0.1355, 'grad_norm': 5.745420932769775, 'learning_rate': 0.0004220377421870105, 'epoch': 0.49}


  5%|▍         | 600/12375 [01:40<17:56, 10.94it/s]

{'loss': 0.1211, 'grad_norm': 13.083442687988281, 'learning_rate': 0.0004202532274208921, 'epoch': 0.53}


  5%|▍         | 600/12375 [01:53<17:56, 10.94it/s]
  5%|▍         | 600/12375 [01:56<17:56, 10.94it/s]

{'eval_loss': 0.047389619052410126, 'eval_precision': 0.8950950950950951, 'eval_recall': 0.9518841813923781, 'eval_f1': 0.9226165910028891, 'eval_accuracy': 0.989009900990099, 'eval_runtime': 15.7792, 'eval_samples_per_second': 31.687, 'eval_steps_per_second': 7.922, 'epoch': 0.53}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  5%|▌         | 652/12375 [02:02<18:16, 10.70it/s]  

{'loss': 0.1268, 'grad_norm': 4.3417158126831055, 'learning_rate': 0.0004184687126547737, 'epoch': 0.58}


  6%|▌         | 702/12375 [02:06<18:01, 10.79it/s]

{'loss': 0.082, 'grad_norm': 5.168877601623535, 'learning_rate': 0.00041668419788865523, 'epoch': 0.62}


  6%|▌         | 752/12375 [02:11<17:46, 10.90it/s]

{'loss': 0.1177, 'grad_norm': 5.022995948791504, 'learning_rate': 0.00041489968312253677, 'epoch': 0.67}


  6%|▋         | 800/12375 [02:15<17:32, 11.00it/s]

{'loss': 0.093, 'grad_norm': 3.1634531021118164, 'learning_rate': 0.00041311516835641836, 'epoch': 0.71}


  6%|▋         | 800/12375 [02:26<17:32, 11.00it/s]
  6%|▋         | 800/12375 [02:31<17:32, 11.00it/s]

{'eval_loss': 0.03598732873797417, 'eval_precision': 0.9135928265742816, 'eval_recall': 0.9544390036193315, 'eval_f1': 0.9335693461057892, 'eval_accuracy': 0.9921386138613861, 'eval_runtime': 15.8163, 'eval_samples_per_second': 31.613, 'eval_steps_per_second': 7.903, 'epoch': 0.71}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  7%|▋         | 852/12375 [02:37<17:51, 10.75it/s]  

{'loss': 0.0983, 'grad_norm': 5.038382053375244, 'learning_rate': 0.0004113306535902999, 'epoch': 0.76}


  7%|▋         | 902/12375 [02:41<17:33, 10.89it/s]

{'loss': 0.0944, 'grad_norm': 5.910884857177734, 'learning_rate': 0.0004095461388241815, 'epoch': 0.8}


  8%|▊         | 952/12375 [02:46<17:36, 10.81it/s]

{'loss': 0.0961, 'grad_norm': 1.66644287109375, 'learning_rate': 0.00040776162405806307, 'epoch': 0.84}


  8%|▊         | 1000/12375 [02:50<17:21, 10.92it/s]

{'loss': 0.0948, 'grad_norm': 10.518210411071777, 'learning_rate': 0.0004059771092919446, 'epoch': 0.89}


  8%|▊         | 1000/12375 [03:03<17:21, 10.92it/s]
  8%|▊         | 1000/12375 [03:06<17:21, 10.92it/s]

{'eval_loss': 0.03153349086642265, 'eval_precision': 0.9160211985324093, 'eval_recall': 0.9567809239940388, 'eval_f1': 0.9359575132771009, 'eval_accuracy': 0.9939009900990099, 'eval_runtime': 15.8428, 'eval_samples_per_second': 31.56, 'eval_steps_per_second': 7.89, 'epoch': 0.89}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  8%|▊         | 1051/12375 [03:12<17:37, 10.71it/s]  

{'loss': 0.083, 'grad_norm': 6.649545192718506, 'learning_rate': 0.0004041925945258262, 'epoch': 0.93}


  9%|▉         | 1101/12375 [03:16<17:19, 10.85it/s]

{'loss': 0.0728, 'grad_norm': 6.488632678985596, 'learning_rate': 0.0004024080797597077, 'epoch': 0.98}


  9%|▉         | 1151/12375 [03:34<20:41,  9.04it/s]  

{'loss': 0.07, 'grad_norm': 7.940118312835693, 'learning_rate': 0.0004006235649935893, 'epoch': 1.02}


 10%|▉         | 1200/12375 [03:39<17:15, 10.80it/s]

{'loss': 0.0722, 'grad_norm': 1.6772500276565552, 'learning_rate': 0.00039883905022747084, 'epoch': 1.07}


 10%|▉         | 1200/12375 [03:53<17:15, 10.80it/s]
 10%|▉         | 1200/12375 [03:54<17:15, 10.80it/s]

{'eval_loss': 0.01660447008907795, 'eval_precision': 0.9557448576771245, 'eval_recall': 0.9793485203321269, 'eval_f1': 0.9674027339642481, 'eval_accuracy': 0.9960792079207921, 'eval_runtime': 15.7286, 'eval_samples_per_second': 31.789, 'eval_steps_per_second': 7.947, 'epoch': 1.07}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 10%|█         | 1252/12375 [04:00<17:06, 10.83it/s]  

{'loss': 0.0711, 'grad_norm': 2.5223875045776367, 'learning_rate': 0.00039705453546135243, 'epoch': 1.11}


 11%|█         | 1302/12375 [04:05<17:02, 10.83it/s]

{'loss': 0.0842, 'grad_norm': 2.926459550857544, 'learning_rate': 0.00039527002069523397, 'epoch': 1.16}


 11%|█         | 1352/12375 [04:09<16:59, 10.81it/s]

{'loss': 0.0718, 'grad_norm': 6.8750739097595215, 'learning_rate': 0.00039348550592911556, 'epoch': 1.2}


 11%|█▏        | 1400/12375 [04:14<16:46, 10.90it/s]

{'loss': 0.0777, 'grad_norm': 3.7661101818084717, 'learning_rate': 0.0003917009911629971, 'epoch': 1.24}


 11%|█▏        | 1400/12375 [04:26<16:46, 10.90it/s]
 11%|█▏        | 1400/12375 [04:29<16:46, 10.90it/s]

{'eval_loss': 0.03057040460407734, 'eval_precision': 0.9081037603056505, 'eval_recall': 0.9614647647434532, 'eval_f1': 0.9340227507755945, 'eval_accuracy': 0.9935841584158416, 'eval_runtime': 15.7797, 'eval_samples_per_second': 31.686, 'eval_steps_per_second': 7.922, 'epoch': 1.24}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 12%|█▏        | 1451/12375 [04:35<16:51, 10.80it/s]  

{'loss': 0.069, 'grad_norm': 1.8836387395858765, 'learning_rate': 0.0003899164763968787, 'epoch': 1.29}


 12%|█▏        | 1501/12375 [04:39<16:38, 10.89it/s]

{'loss': 0.0595, 'grad_norm': 0.8567374348640442, 'learning_rate': 0.00038813196163076027, 'epoch': 1.33}


 13%|█▎        | 1551/12375 [04:44<16:33, 10.89it/s]

{'loss': 0.0701, 'grad_norm': 4.959644317626953, 'learning_rate': 0.0003863474468646418, 'epoch': 1.38}


 13%|█▎        | 1600/12375 [04:49<16:21, 10.98it/s]

{'loss': 0.062, 'grad_norm': 3.4058079719543457, 'learning_rate': 0.00038456293209852333, 'epoch': 1.42}


 13%|█▎        | 1600/12375 [05:03<16:21, 10.98it/s]
 13%|█▎        | 1600/12375 [05:04<16:21, 10.98it/s]

{'eval_loss': 0.03155333548784256, 'eval_precision': 0.9296811120196239, 'eval_recall': 0.9682776240153289, 'eval_f1': 0.9485869225153822, 'eval_accuracy': 0.9946732673267327, 'eval_runtime': 15.8031, 'eval_samples_per_second': 31.639, 'eval_steps_per_second': 7.91, 'epoch': 1.42}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 13%|█▎        | 1652/12375 [05:10<16:28, 10.84it/s]  

{'loss': 0.0495, 'grad_norm': 1.2430461645126343, 'learning_rate': 0.0003827784173324049, 'epoch': 1.47}


 14%|█▍        | 1702/12375 [05:15<16:32, 10.76it/s]

{'loss': 0.0648, 'grad_norm': 3.4658164978027344, 'learning_rate': 0.0003809939025662865, 'epoch': 1.51}


 14%|█▍        | 1752/12375 [05:19<16:20, 10.83it/s]

{'loss': 0.0607, 'grad_norm': 3.760335683822632, 'learning_rate': 0.00037920938780016804, 'epoch': 1.56}


 15%|█▍        | 1800/12375 [05:24<16:09, 10.91it/s]

{'loss': 0.0667, 'grad_norm': 2.2100751399993896, 'learning_rate': 0.00037742487303404963, 'epoch': 1.6}


 15%|█▍        | 1800/12375 [05:36<16:09, 10.91it/s]
 15%|█▍        | 1800/12375 [05:40<16:09, 10.91it/s]

{'eval_loss': 0.021115189418196678, 'eval_precision': 0.9604496253122398, 'eval_recall': 0.9823291462635725, 'eval_f1': 0.9712661825071045, 'eval_accuracy': 0.9961584158415842, 'eval_runtime': 15.8699, 'eval_samples_per_second': 31.506, 'eval_steps_per_second': 7.877, 'epoch': 1.6}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 15%|█▍        | 1852/12375 [05:45<16:18, 10.76it/s]  

{'loss': 0.0702, 'grad_norm': 3.5139083862304688, 'learning_rate': 0.00037564035826793117, 'epoch': 1.64}


 15%|█▌        | 1902/12375 [05:50<16:08, 10.81it/s]

{'loss': 0.0513, 'grad_norm': 1.7609179019927979, 'learning_rate': 0.0003738558435018127, 'epoch': 1.69}


 16%|█▌        | 1952/12375 [05:54<15:54, 10.92it/s]

{'loss': 0.06, 'grad_norm': 5.518508434295654, 'learning_rate': 0.0003720713287356943, 'epoch': 1.73}


 16%|█▌        | 2000/12375 [05:59<15:51, 10.90it/s]

{'loss': 0.065, 'grad_norm': 1.789190649986267, 'learning_rate': 0.0003702868139695759, 'epoch': 1.78}


 16%|█▌        | 2000/12375 [06:13<15:51, 10.90it/s]
 16%|█▌        | 2000/12375 [06:15<15:51, 10.90it/s]

{'eval_loss': 0.01671353168785572, 'eval_precision': 0.9618352450469239, 'eval_recall': 0.9819033425590803, 'eval_f1': 0.9717656974294143, 'eval_accuracy': 0.9966336633663366, 'eval_runtime': 15.8193, 'eval_samples_per_second': 31.607, 'eval_steps_per_second': 7.902, 'epoch': 1.78}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 17%|█▋        | 2051/12375 [06:20<15:55, 10.80it/s]  

{'loss': 0.0609, 'grad_norm': 6.432090759277344, 'learning_rate': 0.0003685022992034574, 'epoch': 1.82}


 17%|█▋        | 2101/12375 [06:25<15:36, 10.97it/s]

{'loss': 0.0646, 'grad_norm': 9.399792671203613, 'learning_rate': 0.000366717784437339, 'epoch': 1.87}


 17%|█▋        | 2151/12375 [06:29<15:34, 10.95it/s]

{'loss': 0.0505, 'grad_norm': 0.6206274032592773, 'learning_rate': 0.00036493326967122053, 'epoch': 1.91}


 18%|█▊        | 2200/12375 [06:34<15:31, 10.92it/s]

{'loss': 0.0447, 'grad_norm': 3.6403181552886963, 'learning_rate': 0.00036314875490510207, 'epoch': 1.96}


 18%|█▊        | 2200/12375 [06:46<15:31, 10.92it/s]
 18%|█▊        | 2200/12375 [06:50<15:31, 10.92it/s]

{'eval_loss': 0.010149499401450157, 'eval_precision': 0.9771008403361344, 'eval_recall': 0.9902065147966788, 'eval_f1': 0.9836100243206091, 'eval_accuracy': 0.998019801980198, 'eval_runtime': 15.7275, 'eval_samples_per_second': 31.791, 'eval_steps_per_second': 7.948, 'epoch': 1.96}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 18%|█▊        | 2250/12375 [06:56<15:23, 10.97it/s]  

{'loss': 0.0461, 'grad_norm': 3.831984519958496, 'learning_rate': 0.00036136424013898365, 'epoch': 2.0}


 19%|█▊        | 2302/12375 [07:13<15:24, 10.89it/s]  

{'loss': 0.0492, 'grad_norm': 1.9920756816864014, 'learning_rate': 0.00035957972537286524, 'epoch': 2.04}


 19%|█▉        | 2352/12375 [07:17<15:18, 10.91it/s]

{'loss': 0.0652, 'grad_norm': 4.738366603851318, 'learning_rate': 0.0003577952106067468, 'epoch': 2.09}


 19%|█▉        | 2400/12375 [07:22<15:21, 10.82it/s]

{'loss': 0.0522, 'grad_norm': 5.371719837188721, 'learning_rate': 0.00035601069584062836, 'epoch': 2.13}


 19%|█▉        | 2400/12375 [07:33<15:21, 10.82it/s]
 19%|█▉        | 2400/12375 [07:37<15:21, 10.82it/s]

{'eval_loss': 0.011576171033084393, 'eval_precision': 0.9716088328075709, 'eval_recall': 0.9836065573770492, 'eval_f1': 0.9775708844688955, 'eval_accuracy': 0.9981188118811881, 'eval_runtime': 15.6581, 'eval_samples_per_second': 31.932, 'eval_steps_per_second': 7.983, 'epoch': 2.13}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 20%|█▉        | 2452/12375 [07:43<15:14, 10.85it/s]  

{'loss': 0.042, 'grad_norm': 8.99142837524414, 'learning_rate': 0.0003542261810745099, 'epoch': 2.18}


 20%|██        | 2502/12375 [07:47<15:13, 10.81it/s]

{'loss': 0.0414, 'grad_norm': 1.5327024459838867, 'learning_rate': 0.00035244166630839143, 'epoch': 2.22}


 21%|██        | 2552/12375 [07:52<14:57, 10.94it/s]

{'loss': 0.0518, 'grad_norm': 1.9452606439590454, 'learning_rate': 0.0003506571515422731, 'epoch': 2.27}


 21%|██        | 2600/12375 [07:57<15:08, 10.75it/s]

{'loss': 0.0381, 'grad_norm': 1.9838929176330566, 'learning_rate': 0.0003488726367761546, 'epoch': 2.31}



 21%|██        | 2600/12375 [08:12<15:08, 10.75it/s]

{'eval_loss': 0.02342493273317814, 'eval_precision': 0.9522628642281463, 'eval_recall': 0.9810517351500958, 'eval_f1': 0.9664429530201343, 'eval_accuracy': 0.9966138613861386, 'eval_runtime': 15.7139, 'eval_samples_per_second': 31.819, 'eval_steps_per_second': 7.955, 'epoch': 2.31}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 21%|██▏       | 2652/12375 [08:18<15:10, 10.68it/s]  

{'loss': 0.0493, 'grad_norm': 3.5290439128875732, 'learning_rate': 0.00034708812201003614, 'epoch': 2.36}


 22%|██▏       | 2702/12375 [08:23<14:56, 10.79it/s]

{'loss': 0.043, 'grad_norm': 1.4618293046951294, 'learning_rate': 0.00034530360724391773, 'epoch': 2.4}


 22%|██▏       | 2752/12375 [08:27<14:43, 10.89it/s]

{'loss': 0.0382, 'grad_norm': 7.257384300231934, 'learning_rate': 0.00034351909247779926, 'epoch': 2.44}


 23%|██▎       | 2800/12375 [08:32<14:40, 10.88it/s]

{'loss': 0.0374, 'grad_norm': 0.49048054218292236, 'learning_rate': 0.0003417345777116808, 'epoch': 2.49}


 23%|██▎       | 2800/12375 [08:43<14:40, 10.88it/s]
 23%|██▎       | 2800/12375 [08:47<14:40, 10.88it/s]

{'eval_loss': 0.012965220026671886, 'eval_precision': 0.964688675302967, 'eval_recall': 0.9829678518203109, 'eval_f1': 0.9737424865548878, 'eval_accuracy': 0.9975643564356436, 'eval_runtime': 15.7132, 'eval_samples_per_second': 31.82, 'eval_steps_per_second': 7.955, 'epoch': 2.49}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 23%|██▎       | 2851/12375 [08:53<14:40, 10.82it/s]  

{'loss': 0.0418, 'grad_norm': 1.8633257150650024, 'learning_rate': 0.00033995006294556244, 'epoch': 2.53}


 23%|██▎       | 2901/12375 [08:57<14:33, 10.85it/s]

{'loss': 0.0398, 'grad_norm': 0.5808850526809692, 'learning_rate': 0.000338165548179444, 'epoch': 2.58}


 24%|██▍       | 2951/12375 [09:02<14:36, 10.75it/s]

{'loss': 0.0396, 'grad_norm': 3.456127643585205, 'learning_rate': 0.0003363810334133255, 'epoch': 2.62}


 24%|██▍       | 3000/12375 [09:06<14:28, 10.79it/s]

{'loss': 0.0435, 'grad_norm': 1.5147459506988525, 'learning_rate': 0.0003345965186472071, 'epoch': 2.67}


 24%|██▍       | 3000/12375 [09:16<14:28, 10.79it/s]
 24%|██▍       | 3000/12375 [09:22<14:28, 10.79it/s]

{'eval_loss': 0.014233194291591644, 'eval_precision': 0.9715123586091328, 'eval_recall': 0.9874387907174792, 'eval_f1': 0.9794108330693696, 'eval_accuracy': 0.9970891089108911, 'eval_runtime': 15.5557, 'eval_samples_per_second': 32.143, 'eval_steps_per_second': 8.036, 'epoch': 2.67}


 24%|██▍       | 3000/12375 [09:24<29:23,  5.32it/s]
[I 2025-08-21 14:14:29,722] Trial 2 finished with value: 0.9794108330693696 and parameters: {'learning_rate': 0.00043709904681305016, 'num_train_epochs': 11, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 1, 'weight_decay': 0.0009565499215943821, 'warmup_ratio': 0.010316556334565519, 'lr_scheduler_type': 'linear', 'max_grad_norm': 0.7600340105889054, 'label_smoothing_factor': 0.054671027934327966, 'hidden_dropout_prob': 0.05545633665765811, 'attention_probs_dropout_prob': 0.29087538832936755}. Best is trial 1 with value: 0.9996806812134114.
Trying to set hidden_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set attention_probs_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


{'train_runtime': 564.1872, 'train_samples_per_second': 87.737, 'train_steps_per_second': 21.934, 'train_loss': 0.09764930681387583, 'epoch': 2.67}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  0%|          | 52/13500 [00:17<20:30, 10.93it/s]  

{'loss': 3.0835, 'grad_norm': 4.26278018951416, 'learning_rate': 7.870131887994132e-06, 'epoch': 0.04}


  1%|          | 102/13500 [00:21<20:35, 10.84it/s]

{'loss': 1.0682, 'grad_norm': 3.6991894245147705, 'learning_rate': 1.5740263775988263e-05, 'epoch': 0.09}


  1%|          | 152/13500 [00:26<20:22, 10.92it/s]

{'loss': 0.217, 'grad_norm': 6.375947952270508, 'learning_rate': 2.3610395663982395e-05, 'epoch': 0.13}


  1%|▏         | 200/13500 [00:30<20:18, 10.91it/s]

{'loss': 0.0414, 'grad_norm': 0.4459840953350067, 'learning_rate': 3.1480527551976527e-05, 'epoch': 0.18}


  1%|▏         | 200/13500 [00:40<20:18, 10.91it/s]
  1%|▏         | 200/13500 [00:46<20:18, 10.91it/s]

{'eval_loss': 0.009548045694828033, 'eval_precision': 0.993195832447374, 'eval_recall': 0.994464551841601, 'eval_f1': 0.9938297872340426, 'eval_accuracy': 0.9988118811881188, 'eval_runtime': 15.6577, 'eval_samples_per_second': 31.933, 'eval_steps_per_second': 7.983, 'epoch': 0.18}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  2%|▏         | 250/13500 [00:51<20:36, 10.71it/s]   

{'loss': 0.026, 'grad_norm': 0.3585526645183563, 'learning_rate': 3.935065943997066e-05, 'epoch': 0.22}


  2%|▏         | 302/13500 [00:56<20:29, 10.73it/s]

{'loss': 0.0119, 'grad_norm': 0.029560642316937447, 'learning_rate': 4.722079132796479e-05, 'epoch': 0.27}


  3%|▎         | 352/13500 [01:01<20:12, 10.85it/s]

{'loss': 0.0133, 'grad_norm': 0.06426829844713211, 'learning_rate': 5.509092321595892e-05, 'epoch': 0.31}


  3%|▎         | 400/13500 [01:05<19:56, 10.95it/s]

{'loss': 0.0078, 'grad_norm': 0.44093647599220276, 'learning_rate': 6.296105510395305e-05, 'epoch': 0.36}


  3%|▎         | 400/13500 [01:17<19:56, 10.95it/s]
  3%|▎         | 400/13500 [01:21<19:56, 10.95it/s]

{'eval_loss': 0.0023029742296785116, 'eval_precision': 0.9978732454274777, 'eval_recall': 0.9989354907387694, 'eval_f1': 0.998404085541015, 'eval_accuracy': 0.9997227722772277, 'eval_runtime': 15.6787, 'eval_samples_per_second': 31.89, 'eval_steps_per_second': 7.973, 'epoch': 0.36}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  3%|▎         | 452/13500 [01:26<20:17, 10.72it/s]   

{'loss': 0.0139, 'grad_norm': 0.02912736125290394, 'learning_rate': 7.083118699194718e-05, 'epoch': 0.4}


  4%|▎         | 502/13500 [01:31<20:25, 10.60it/s]

{'loss': 0.0087, 'grad_norm': 0.07943978160619736, 'learning_rate': 7.870131887994132e-05, 'epoch': 0.44}


  4%|▍         | 552/13500 [01:36<19:54, 10.84it/s]

{'loss': 0.0142, 'grad_norm': 3.654648542404175, 'learning_rate': 8.657145076793544e-05, 'epoch': 0.49}


  4%|▍         | 600/13500 [01:40<19:38, 10.95it/s]

{'loss': 0.0131, 'grad_norm': 0.014213279820978642, 'learning_rate': 9.444158265592958e-05, 'epoch': 0.53}


  4%|▍         | 600/13500 [01:50<19:38, 10.95it/s]
  4%|▍         | 600/13500 [01:56<19:38, 10.95it/s]

{'eval_loss': 0.005331751424819231, 'eval_precision': 0.9849288898323074, 'eval_recall': 0.9878645944219715, 'eval_f1': 0.9863945578231292, 'eval_accuracy': 0.9988514851485149, 'eval_runtime': 15.8802, 'eval_samples_per_second': 31.486, 'eval_steps_per_second': 7.871, 'epoch': 0.53}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  5%|▍         | 652/13500 [02:01<19:41, 10.87it/s]   

{'loss': 0.0273, 'grad_norm': 1.0590275526046753, 'learning_rate': 0.00010231171454392371, 'epoch': 0.58}


  5%|▌         | 702/13500 [02:06<19:41, 10.83it/s]

{'loss': 0.0155, 'grad_norm': 3.1975862979888916, 'learning_rate': 0.00011018184643191784, 'epoch': 0.62}


  6%|▌         | 752/13500 [02:11<19:34, 10.85it/s]

{'loss': 0.0397, 'grad_norm': 43.00090408325195, 'learning_rate': 0.00011805197831991198, 'epoch': 0.67}


  6%|▌         | 800/13500 [02:15<19:19, 10.95it/s]

{'loss': 0.0183, 'grad_norm': 1.1930180788040161, 'learning_rate': 0.0001259221102079061, 'epoch': 0.71}


  6%|▌         | 800/13500 [02:27<19:19, 10.95it/s]
  6%|▌         | 800/13500 [02:31<19:19, 10.95it/s]

{'eval_loss': 0.023196792230010033, 'eval_precision': 0.9535655058043118, 'eval_recall': 0.9793485203321269, 'eval_f1': 0.966285054090957, 'eval_accuracy': 0.9938613861386139, 'eval_runtime': 15.8115, 'eval_samples_per_second': 31.623, 'eval_steps_per_second': 7.906, 'epoch': 0.71}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  6%|▋         | 851/13500 [02:36<19:35, 10.76it/s]   

{'loss': 0.0325, 'grad_norm': 8.675963401794434, 'learning_rate': 0.00013379224209590022, 'epoch': 0.76}


  7%|▋         | 901/13500 [02:41<19:23, 10.83it/s]

{'loss': 0.0179, 'grad_norm': 0.033762913197278976, 'learning_rate': 0.00014166237398389436, 'epoch': 0.8}


  7%|▋         | 951/13500 [02:46<19:18, 10.83it/s]

{'loss': 0.0166, 'grad_norm': 3.7183103561401367, 'learning_rate': 0.00014953250587188848, 'epoch': 0.84}


  7%|▋         | 1000/13500 [02:50<19:11, 10.85it/s]

{'loss': 0.0295, 'grad_norm': 2.600788116455078, 'learning_rate': 0.00015740263775988265, 'epoch': 0.89}


  7%|▋         | 1000/13500 [03:00<19:11, 10.85it/s]
  7%|▋         | 1000/13500 [03:06<19:11, 10.85it/s]

{'eval_loss': 0.007750596385449171, 'eval_precision': 0.9835164835164835, 'eval_recall': 0.990845220353417, 'eval_f1': 0.987167249973486, 'eval_accuracy': 0.9981782178217822, 'eval_runtime': 15.8559, 'eval_samples_per_second': 31.534, 'eval_steps_per_second': 7.884, 'epoch': 0.89}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  8%|▊         | 1052/13500 [03:12<19:10, 10.82it/s]  

{'loss': 0.0212, 'grad_norm': 0.9293193817138672, 'learning_rate': 0.00016527276964787676, 'epoch': 0.93}


  8%|▊         | 1102/13500 [03:16<19:06, 10.82it/s]

{'loss': 0.0446, 'grad_norm': 4.932656288146973, 'learning_rate': 0.00017314290153587088, 'epoch': 0.98}


  9%|▊         | 1151/13500 [03:34<23:44,  8.67it/s]  

{'loss': 0.0286, 'grad_norm': 4.650859355926514, 'learning_rate': 0.00018101303342386502, 'epoch': 1.02}


  9%|▉         | 1200/13500 [03:38<18:37, 11.00it/s]

{'loss': 0.0224, 'grad_norm': 5.4942626953125, 'learning_rate': 0.00018888316531185916, 'epoch': 1.07}


  9%|▉         | 1200/13500 [03:50<18:37, 11.00it/s]
  9%|▉         | 1200/13500 [03:54<18:37, 11.00it/s]

{'eval_loss': 0.017893780022859573, 'eval_precision': 0.9789251844046365, 'eval_recall': 0.9889291036832021, 'eval_f1': 0.9839017157381912, 'eval_accuracy': 0.9966138613861386, 'eval_runtime': 16.0954, 'eval_samples_per_second': 31.065, 'eval_steps_per_second': 7.766, 'epoch': 1.07}


  9%|▉         | 1200/13500 [03:56<40:27,  5.07it/s]
[I 2025-08-21 14:18:28,487] Trial 3 finished with value: 0.9839017157381912 and parameters: {'learning_rate': 0.0002074566765675253, 'num_train_epochs': 12, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 1, 'weight_decay': 1.5167330688076188e-05, 'warmup_ratio': 0.0975990992289793, 'lr_scheduler_type': 'cosine_with_restarts', 'max_grad_norm': 0.6404672548436904, 'label_smoothing_factor': 0.05426960831582485, 'hidden_dropout_prob': 0.04227726749242879, 'attention_probs_dropout_prob': 0.2406590942262119}. Best is trial 1 with value: 0.9996806812134114.
Trying to set hidden_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set attention_probs_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


{'train_runtime': 236.7155, 'train_samples_per_second': 228.122, 'train_steps_per_second': 57.03, 'train_loss': 0.20138750712076822, 'epoch': 1.07}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  1%|          | 51/6744 [00:20<17:04,  6.53it/s]  

{'loss': 3.7411, 'grad_norm': 8.142027854919434, 'learning_rate': 4.287719918175497e-07, 'epoch': 0.09}


  1%|▏         | 101/6744 [00:28<17:02,  6.49it/s]

{'loss': 3.4427, 'grad_norm': 8.099677085876465, 'learning_rate': 8.575439836350994e-07, 'epoch': 0.18}


  2%|▏         | 151/6744 [00:35<16:54,  6.50it/s]

{'loss': 2.7681, 'grad_norm': 6.514291763305664, 'learning_rate': 1.2863159754526492e-06, 'epoch': 0.27}


  3%|▎         | 200/6744 [00:43<16:34,  6.58it/s]

{'loss': 1.8626, 'grad_norm': 3.3089470863342285, 'learning_rate': 1.7150879672701988e-06, 'epoch': 0.36}


  _warn_prf(average, modifier, msg_start, len(result))

  3%|▎         | 200/6744 [00:59<16:34,  6.58it/s]

{'eval_loss': 1.4255576133728027, 'eval_precision': 0.12256888168557536, 'eval_recall': 0.1288056206088993, 'eval_f1': 0.12560988269490292, 'eval_accuracy': 0.674930693069307, 'eval_runtime': 16.0119, 'eval_samples_per_second': 31.227, 'eval_steps_per_second': 7.807, 'epoch': 0.36}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  4%|▎         | 251/6744 [01:07<16:41,  6.48it/s]  

{'loss': 1.1512, 'grad_norm': 2.105520725250244, 'learning_rate': 2.143859959087749e-06, 'epoch': 0.44}


  4%|▍         | 301/6744 [01:15<16:23,  6.55it/s]

{'loss': 0.6778, 'grad_norm': 2.031259298324585, 'learning_rate': 2.5726319509052984e-06, 'epoch': 0.53}


  5%|▌         | 351/6744 [01:23<16:22,  6.51it/s]

{'loss': 0.3699, 'grad_norm': 1.6868095397949219, 'learning_rate': 3.001403942722848e-06, 'epoch': 0.62}


  6%|▌         | 400/6744 [01:30<16:11,  6.53it/s]

{'loss': 0.1761, 'grad_norm': 2.160975217819214, 'learning_rate': 3.4301759345403976e-06, 'epoch': 0.71}



  6%|▌         | 400/6744 [01:46<16:11,  6.53it/s]

{'eval_loss': 0.09987612813711166, 'eval_precision': 0.9243298969072165, 'eval_recall': 0.9544390036193315, 'eval_f1': 0.939143186341259, 'eval_accuracy': 0.9872871287128713, 'eval_runtime': 15.8941, 'eval_samples_per_second': 31.458, 'eval_steps_per_second': 7.865, 'epoch': 0.71}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  7%|▋         | 451/6744 [01:55<16:02,  6.54it/s]  

{'loss': 0.0836, 'grad_norm': 0.5555436611175537, 'learning_rate': 3.858947926357948e-06, 'epoch': 0.8}


  7%|▋         | 501/6744 [02:02<15:58,  6.51it/s]

{'loss': 0.0437, 'grad_norm': 1.3201504945755005, 'learning_rate': 4.287719918175498e-06, 'epoch': 0.89}


  8%|▊         | 551/6744 [02:10<15:41,  6.58it/s]

{'loss': 0.0214, 'grad_norm': 0.24258214235305786, 'learning_rate': 4.716491909993047e-06, 'epoch': 0.98}


  9%|▉         | 600/6744 [02:31<15:52,  6.45it/s]  

{'loss': 0.0137, 'grad_norm': 0.6399999856948853, 'learning_rate': 5.145263901810597e-06, 'epoch': 1.07}



  9%|▉         | 600/6744 [02:47<15:52,  6.45it/s]

{'eval_loss': 0.007260311860591173, 'eval_precision': 0.9963814389101745, 'eval_recall': 0.9965935703640622, 'eval_f1': 0.9964874933475253, 'eval_accuracy': 0.9995841584158416, 'eval_runtime': 15.8673, 'eval_samples_per_second': 31.511, 'eval_steps_per_second': 7.878, 'epoch': 1.07}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 10%|▉         | 651/6744 [02:59<15:35,  6.52it/s]   

{'loss': 0.0097, 'grad_norm': 0.3526693284511566, 'learning_rate': 5.574035893628146e-06, 'epoch': 1.16}


 10%|█         | 701/6744 [03:06<15:26,  6.52it/s]

{'loss': 0.0091, 'grad_norm': 0.08967069536447525, 'learning_rate': 6.002807885445696e-06, 'epoch': 1.24}


 11%|█         | 751/6744 [03:14<15:28,  6.45it/s]

{'loss': 0.0063, 'grad_norm': 0.0648423284292221, 'learning_rate': 6.431579877263245e-06, 'epoch': 1.33}


 12%|█▏        | 800/6744 [03:22<15:03,  6.58it/s]

{'loss': 0.0049, 'grad_norm': 0.028808532282710075, 'learning_rate': 6.860351869080795e-06, 'epoch': 1.42}



 12%|█▏        | 800/6744 [03:37<15:03,  6.58it/s]

{'eval_loss': 0.0035112276673316956, 'eval_precision': 0.995537611559711, 'eval_recall': 0.9974451777730466, 'eval_f1': 0.9964904817611401, 'eval_accuracy': 0.9996831683168317, 'eval_runtime': 15.8862, 'eval_samples_per_second': 31.474, 'eval_steps_per_second': 7.868, 'epoch': 1.42}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 13%|█▎        | 851/6744 [03:46<14:57,  6.56it/s]  

{'loss': 0.0044, 'grad_norm': 0.4770607650279999, 'learning_rate': 7.289123860898345e-06, 'epoch': 1.51}


 13%|█▎        | 901/6744 [03:54<14:57,  6.51it/s]

{'loss': 0.0033, 'grad_norm': 0.24125386774539948, 'learning_rate': 7.717895852715895e-06, 'epoch': 1.6}


 14%|█▍        | 951/6744 [04:01<14:53,  6.48it/s]

{'loss': 0.0035, 'grad_norm': 0.028314754366874695, 'learning_rate': 8.146667844533445e-06, 'epoch': 1.69}


 15%|█▍        | 1000/6744 [04:09<14:39,  6.53it/s]

{'loss': 0.0031, 'grad_norm': 0.6858077645301819, 'learning_rate': 8.575439836350995e-06, 'epoch': 1.78}



 15%|█▍        | 1000/6744 [04:25<14:39,  6.53it/s]

{'eval_loss': 0.0017561836866661906, 'eval_precision': 0.9976600723250372, 'eval_recall': 0.9985096870342772, 'eval_f1': 0.9980846988721005, 'eval_accuracy': 0.9998217821782178, 'eval_runtime': 15.8386, 'eval_samples_per_second': 31.568, 'eval_steps_per_second': 7.892, 'epoch': 1.78}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 16%|█▌        | 1051/6744 [04:34<14:28,  6.55it/s]  

{'loss': 0.0036, 'grad_norm': 0.1184505820274353, 'learning_rate': 9.004211828168545e-06, 'epoch': 1.87}


 16%|█▋        | 1101/6744 [04:41<14:30,  6.48it/s]

{'loss': 0.0029, 'grad_norm': 0.03173959255218506, 'learning_rate': 9.432983819986094e-06, 'epoch': 1.96}


 17%|█▋        | 1151/6744 [05:02<14:15,  6.54it/s]  

{'loss': 0.0016, 'grad_norm': 0.10064098984003067, 'learning_rate': 9.861755811803644e-06, 'epoch': 2.04}


 18%|█▊        | 1200/6744 [05:10<14:16,  6.48it/s]

{'loss': 0.0018, 'grad_norm': 0.013503440655767918, 'learning_rate': 1.0290527803621194e-05, 'epoch': 2.13}



 18%|█▊        | 1200/6744 [05:26<14:16,  6.48it/s]

{'eval_loss': 0.0008097671088762581, 'eval_precision': 0.9993614303959132, 'eval_recall': 0.9995741962955078, 'eval_f1': 0.9994678020223524, 'eval_accuracy': 0.999960396039604, 'eval_runtime': 15.9773, 'eval_samples_per_second': 31.294, 'eval_steps_per_second': 7.824, 'epoch': 2.13}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 19%|█▊        | 1251/6744 [05:34<14:00,  6.54it/s]  

{'loss': 0.0025, 'grad_norm': 0.06960248947143555, 'learning_rate': 1.0719299795438744e-05, 'epoch': 2.22}


 19%|█▉        | 1301/6744 [05:42<13:50,  6.55it/s]

{'loss': 0.0011, 'grad_norm': 0.010491807013750076, 'learning_rate': 1.1148071787256292e-05, 'epoch': 2.31}


 20%|██        | 1351/6744 [05:50<13:49,  6.50it/s]

{'loss': 0.0009, 'grad_norm': 0.011288771405816078, 'learning_rate': 1.1576843779073842e-05, 'epoch': 2.4}


 21%|██        | 1400/6744 [05:57<13:34,  6.56it/s]

{'loss': 0.001, 'grad_norm': 0.2625957131385803, 'learning_rate': 1.2005615770891392e-05, 'epoch': 2.49}



 21%|██        | 1400/6744 [06:13<13:34,  6.56it/s]

{'eval_loss': 0.000491767656058073, 'eval_precision': 0.9995742869306088, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9996806812134114, 'eval_accuracy': 0.999960396039604, 'eval_runtime': 15.9179, 'eval_samples_per_second': 31.411, 'eval_steps_per_second': 7.853, 'epoch': 2.49}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 22%|██▏       | 1451/6744 [06:22<13:28,  6.55it/s]  

{'loss': 0.0015, 'grad_norm': 0.00733600091189146, 'learning_rate': 1.2434387762708942e-05, 'epoch': 2.58}


 22%|██▏       | 1501/6744 [06:29<13:25,  6.51it/s]

{'loss': 0.0011, 'grad_norm': 0.007678571157157421, 'learning_rate': 1.286315975452649e-05, 'epoch': 2.67}


 23%|██▎       | 1551/6744 [06:37<13:14,  6.54it/s]

{'loss': 0.0008, 'grad_norm': 0.008255078457295895, 'learning_rate': 1.329193174634404e-05, 'epoch': 2.76}


 24%|██▎       | 1600/6744 [06:44<13:10,  6.51it/s]

{'loss': 0.0016, 'grad_norm': 0.009681508876383305, 'learning_rate': 1.3286287785239019e-05, 'epoch': 2.84}



 24%|██▎       | 1600/6744 [07:00<13:10,  6.51it/s]

{'eval_loss': 0.0008326931856572628, 'eval_precision': 0.9989359438178336, 'eval_recall': 0.9993612944432616, 'eval_f1': 0.9991485738612174, 'eval_accuracy': 0.9999207920792079, 'eval_runtime': 15.8917, 'eval_samples_per_second': 31.463, 'eval_steps_per_second': 7.866, 'epoch': 2.84}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 24%|██▍       | 1651/6744 [07:09<13:20,  6.37it/s]  

{'loss': 0.0018, 'grad_norm': 0.03940179944038391, 'learning_rate': 1.3158116247668655e-05, 'epoch': 2.93}


 25%|██▌       | 1701/6744 [07:30<16:00,  5.25it/s]  

{'loss': 0.0008, 'grad_norm': 0.018018851056694984, 'learning_rate': 1.3029944710098292e-05, 'epoch': 3.02}


 26%|██▌       | 1751/6744 [07:37<12:48,  6.50it/s]

{'loss': 0.0005, 'grad_norm': 0.005477603524923325, 'learning_rate': 1.2901773172527926e-05, 'epoch': 3.11}


 27%|██▋       | 1800/6744 [07:45<12:35,  6.54it/s]

{'loss': 0.0004, 'grad_norm': 0.07193367928266525, 'learning_rate': 1.2773601634957565e-05, 'epoch': 3.2}



 27%|██▋       | 1800/6744 [08:01<12:35,  6.54it/s]

{'eval_loss': 0.00032340496545657516, 'eval_precision': 0.9997870981477539, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9997870981477539, 'eval_accuracy': 0.999980198019802, 'eval_runtime': 15.8526, 'eval_samples_per_second': 31.54, 'eval_steps_per_second': 7.885, 'epoch': 3.2}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 27%|██▋       | 1851/6744 [08:09<12:21,  6.60it/s]  

{'loss': 0.0004, 'grad_norm': 0.048741795122623444, 'learning_rate': 1.26454300973872e-05, 'epoch': 3.29}


 28%|██▊       | 1901/6744 [08:17<12:29,  6.46it/s]

{'loss': 0.0004, 'grad_norm': 0.0026706154458224773, 'learning_rate': 1.2517258559816838e-05, 'epoch': 3.38}


 29%|██▉       | 1951/6744 [08:25<12:29,  6.39it/s]

{'loss': 0.0004, 'grad_norm': 0.012313742190599442, 'learning_rate': 1.2389087022246472e-05, 'epoch': 3.47}


 30%|██▉       | 2000/6744 [08:32<12:03,  6.56it/s]

{'loss': 0.0006, 'grad_norm': 0.025610903277993202, 'learning_rate': 1.2260915484676109e-05, 'epoch': 3.56}



 30%|██▉       | 2000/6744 [08:48<12:03,  6.56it/s]

{'eval_loss': 0.0003714697959367186, 'eval_precision': 0.9997871434653044, 'eval_recall': 1.0, 'eval_f1': 0.9998935604044704, 'eval_accuracy': 0.999980198019802, 'eval_runtime': 15.9303, 'eval_samples_per_second': 31.387, 'eval_steps_per_second': 7.847, 'epoch': 3.56}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 30%|███       | 2051/6744 [08:57<12:12,  6.41it/s]  

{'loss': 0.001, 'grad_norm': 0.010401778854429722, 'learning_rate': 1.2132743947105745e-05, 'epoch': 3.64}


 31%|███       | 2101/6744 [09:05<11:53,  6.51it/s]

{'loss': 0.0015, 'grad_norm': 0.00794356781989336, 'learning_rate': 1.2004572409535382e-05, 'epoch': 3.73}


 32%|███▏      | 2151/6744 [09:12<11:48,  6.48it/s]

{'loss': 0.0007, 'grad_norm': 0.16946206986904144, 'learning_rate': 1.1876400871965018e-05, 'epoch': 3.82}


 33%|███▎      | 2200/6744 [09:20<11:36,  6.52it/s]

{'loss': 0.0011, 'grad_norm': 0.01061338372528553, 'learning_rate': 1.1748229334394653e-05, 'epoch': 3.91}



 33%|███▎      | 2200/6744 [09:36<11:36,  6.52it/s]

{'eval_loss': 0.00032322161132469773, 'eval_precision': 0.9993614303959132, 'eval_recall': 0.9995741962955078, 'eval_f1': 0.9994678020223524, 'eval_accuracy': 0.999960396039604, 'eval_runtime': 15.9218, 'eval_samples_per_second': 31.403, 'eval_steps_per_second': 7.851, 'epoch': 3.91}


 33%|███▎      | 2200/6744 [09:37<19:53,  3.81it/s]
[I 2025-08-21 14:28:08,407] Trial 4 finished with value: 0.9994678020223524 and parameters: {'learning_rate': 1.3386261584543902e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 2, 'weight_decay': 0.00824192526487645, 'warmup_ratio': 0.2313811040057837, 'lr_scheduler_type': 'polynomial', 'max_grad_norm': 0.811649063413779, 'label_smoothing_factor': 0.03308980248526492, 'hidden_dropout_prob': 0.01906750508580709, 'attention_probs_dropout_prob': 0.09329469651469866}. Best is trial 1 with value: 0.9996806812134114.
Trying to set hidden_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set attention_probs_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


{'train_runtime': 577.9504, 'train_samples_per_second': 93.434, 'train_steps_per_second': 11.669, 'train_loss': 0.3278630990865217, 'epoch': 3.91}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  4%|▎         | 50/1400 [00:32<08:59,  2.50it/s] 

{'loss': 2.9605, 'grad_norm': 2.9827404022216797, 'learning_rate': 7.560224837405551e-06, 'epoch': 0.36}


  7%|▋         | 100/1400 [00:52<08:40,  2.50it/s]

{'loss': 0.6764, 'grad_norm': 0.9374064207077026, 'learning_rate': 1.5120449674811102e-05, 'epoch': 0.71}


 11%|█         | 150/1400 [01:25<11:38,  1.79it/s]  

{'loss': 0.0313, 'grad_norm': 0.10365892201662064, 'learning_rate': 2.2680674512216653e-05, 'epoch': 1.07}


 14%|█▍        | 200/1400 [01:45<07:58,  2.51it/s]

{'loss': 0.0047, 'grad_norm': 0.018719473853707314, 'learning_rate': 3.0240899349622205e-05, 'epoch': 1.42}



 14%|█▍        | 200/1400 [02:01<07:58,  2.51it/s]

{'eval_loss': 0.0015988602535799146, 'eval_precision': 0.9997870981477539, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9997870981477539, 'eval_accuracy': 0.999980198019802, 'eval_runtime': 15.7483, 'eval_samples_per_second': 31.749, 'eval_steps_per_second': 7.937, 'epoch': 1.42}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 18%|█▊        | 250/1400 [02:21<07:38,  2.51it/s]  

{'loss': 0.0034, 'grad_norm': 0.10702957212924957, 'learning_rate': 3.5255069087145476e-05, 'epoch': 1.78}


 21%|██▏       | 300/1400 [02:55<07:25,  2.47it/s]  

{'loss': 0.0023, 'grad_norm': 0.013218460604548454, 'learning_rate': 3.372223999640002e-05, 'epoch': 2.13}


 25%|██▌       | 350/1400 [03:15<07:00,  2.49it/s]

{'loss': 0.0012, 'grad_norm': 0.007502885535359383, 'learning_rate': 3.218941090565457e-05, 'epoch': 2.49}


 29%|██▊       | 400/1400 [03:35<06:41,  2.49it/s]

{'loss': 0.001, 'grad_norm': 0.029056943953037262, 'learning_rate': 3.065658181490911e-05, 'epoch': 2.84}



 29%|██▊       | 400/1400 [03:51<06:41,  2.49it/s]

{'eval_loss': 0.00045586228952743113, 'eval_precision': 0.9991489361702127, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9994679152921144, 'eval_accuracy': 0.999960396039604, 'eval_runtime': 16.1884, 'eval_samples_per_second': 30.886, 'eval_steps_per_second': 7.722, 'epoch': 2.84}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 32%|███▏      | 450/1400 [04:25<06:19,  2.51it/s]  

{'loss': 0.0005, 'grad_norm': 0.005311929155141115, 'learning_rate': 2.9123752724163655e-05, 'epoch': 3.2}


 36%|███▌      | 500/1400 [04:45<05:59,  2.50it/s]

{'loss': 0.0006, 'grad_norm': 0.21245291829109192, 'learning_rate': 2.75909236334182e-05, 'epoch': 3.55}


 39%|███▉      | 550/1400 [05:05<05:41,  2.49it/s]

{'loss': 0.0005, 'grad_norm': 0.002782347146421671, 'learning_rate': 2.6058094542672743e-05, 'epoch': 3.91}


 43%|████▎     | 600/1400 [05:39<05:19,  2.50it/s]  

{'loss': 0.0005, 'grad_norm': 0.0025353641249239445, 'learning_rate': 2.4525265451927288e-05, 'epoch': 4.26}



 43%|████▎     | 600/1400 [05:55<05:19,  2.50it/s]

{'eval_loss': 0.0002216025604866445, 'eval_precision': 0.9997870981477539, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9997870981477539, 'eval_accuracy': 0.999980198019802, 'eval_runtime': 16.0802, 'eval_samples_per_second': 31.094, 'eval_steps_per_second': 7.774, 'epoch': 4.26}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 46%|████▋     | 650/1400 [06:16<04:59,  2.50it/s]  

{'loss': 0.0004, 'grad_norm': 0.0031133643351495266, 'learning_rate': 2.2992436361181834e-05, 'epoch': 4.62}


 50%|█████     | 700/1400 [06:36<04:42,  2.48it/s]

{'loss': 0.0003, 'grad_norm': 0.01740073226392269, 'learning_rate': 2.1459607270436376e-05, 'epoch': 4.97}


 54%|█████▎    | 750/1400 [07:09<04:20,  2.49it/s]

{'loss': 0.0003, 'grad_norm': 0.0013869153335690498, 'learning_rate': 1.992677817969092e-05, 'epoch': 5.33}


 57%|█████▋    | 800/1400 [07:29<04:01,  2.49it/s]

{'loss': 0.0002, 'grad_norm': 0.003294189926236868, 'learning_rate': 1.8393949088945464e-05, 'epoch': 5.68}



 57%|█████▋    | 800/1400 [07:45<04:01,  2.49it/s]

{'eval_loss': 0.0002164638281101361, 'eval_precision': 1.0, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9998935377408709, 'eval_accuracy': 0.999960396039604, 'eval_runtime': 16.0567, 'eval_samples_per_second': 31.14, 'eval_steps_per_second': 7.785, 'epoch': 5.68}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 61%|██████    | 850/1400 [08:19<09:48,  1.07s/it]

{'loss': 0.0003, 'grad_norm': 0.0011143573792651296, 'learning_rate': 1.686111999820001e-05, 'epoch': 6.04}


 64%|██████▍   | 900/1400 [08:39<03:18,  2.51it/s]

{'loss': 0.0002, 'grad_norm': 0.0023568118922412395, 'learning_rate': 1.5328290907454555e-05, 'epoch': 6.39}


 68%|██████▊   | 950/1400 [08:59<02:59,  2.51it/s]

{'loss': 0.0002, 'grad_norm': 0.0010490475688129663, 'learning_rate': 1.37954618167091e-05, 'epoch': 6.75}


 71%|███████▏  | 1000/1400 [09:33<02:50,  2.34it/s]

{'loss': 0.0002, 'grad_norm': 0.0008968600886873901, 'learning_rate': 1.2262632725963644e-05, 'epoch': 7.1}



 71%|███████▏  | 1000/1400 [09:49<02:50,  2.34it/s]

{'eval_loss': 0.00014089957403484732, 'eval_precision': 0.9997871434653044, 'eval_recall': 1.0, 'eval_f1': 0.9998935604044704, 'eval_accuracy': 0.999980198019802, 'eval_runtime': 15.9292, 'eval_samples_per_second': 31.389, 'eval_steps_per_second': 7.847, 'epoch': 7.1}


 71%|███████▏  | 1000/1400 [09:51<03:56,  1.69it/s]
[I 2025-08-21 14:38:01,802] Trial 5 finished with value: 0.9998935604044704 and parameters: {'learning_rate': 3.56842612325542e-05, 'num_train_epochs': 10, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 4, 'weight_decay': 0.011044350847124691, 'warmup_ratio': 0.16838315927084888, 'lr_scheduler_type': 'linear', 'max_grad_norm': 0.5127095633720475, 'label_smoothing_factor': 0.010789142699330446, 'hidden_dropout_prob': 0.009428755706020276, 'attention_probs_dropout_prob': 0.1909231233791341}. Best is trial 5 with value: 0.9998935604044704.
Trying to set hidden_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set attention_probs_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


{'train_runtime': 591.008, 'train_samples_per_second': 76.141, 'train_steps_per_second': 2.369, 'train_loss': 0.18425862412527202, 'epoch': 7.1}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  1%|          | 51/4496 [00:20<11:25,  6.48it/s]  

{'loss': 3.3443, 'grad_norm': 5.476739406585693, 'learning_rate': 4.373958211208214e-06, 'epoch': 0.09}


  2%|▏         | 101/4496 [00:28<11:10,  6.56it/s]

{'loss': 1.5228, 'grad_norm': 2.5694963932037354, 'learning_rate': 8.747916422416429e-06, 'epoch': 0.18}


  3%|▎         | 151/4496 [00:35<11:10,  6.48it/s]

{'loss': 0.4355, 'grad_norm': 1.0008890628814697, 'learning_rate': 1.3121874633624642e-05, 'epoch': 0.27}


  4%|▍         | 200/4496 [00:43<10:56,  6.54it/s]

{'loss': 0.0867, 'grad_norm': 1.384345531463623, 'learning_rate': 1.7495832844832857e-05, 'epoch': 0.36}



  4%|▍         | 200/4496 [00:59<10:56,  6.54it/s]

{'eval_loss': 0.02176855504512787, 'eval_precision': 0.9845469940728196, 'eval_recall': 0.9902065147966788, 'eval_f1': 0.9873686445175672, 'eval_accuracy': 0.9972079207920792, 'eval_runtime': 16.1302, 'eval_samples_per_second': 30.998, 'eval_steps_per_second': 7.749, 'epoch': 0.36}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  6%|▌         | 251/4496 [01:08<10:55,  6.47it/s]  

{'loss': 0.0325, 'grad_norm': 2.0364508628845215, 'learning_rate': 2.1869791056041073e-05, 'epoch': 0.44}


  7%|▋         | 301/4496 [01:15<10:50,  6.45it/s]

{'loss': 0.0121, 'grad_norm': 0.18882375955581665, 'learning_rate': 2.6243749267249284e-05, 'epoch': 0.53}


  8%|▊         | 351/4496 [01:23<10:42,  6.45it/s]

{'loss': 0.0062, 'grad_norm': 0.6147652268409729, 'learning_rate': 3.06177074784575e-05, 'epoch': 0.62}


  9%|▉         | 400/4496 [01:31<10:50,  6.30it/s]

{'loss': 0.0046, 'grad_norm': 0.044427111744880676, 'learning_rate': 3.420394753676896e-05, 'epoch': 0.71}



  9%|▉         | 400/4496 [01:47<10:50,  6.30it/s]

{'eval_loss': 0.0025029622483998537, 'eval_precision': 0.9965971926839643, 'eval_recall': 0.9976580796252927, 'eval_f1': 0.997127353973827, 'eval_accuracy': 0.9995841584158416, 'eval_runtime': 16.1892, 'eval_samples_per_second': 30.885, 'eval_steps_per_second': 7.721, 'epoch': 0.71}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 10%|█         | 451/4496 [01:55<10:33,  6.39it/s]  

{'loss': 0.0054, 'grad_norm': 0.6053141951560974, 'learning_rate': 3.4186922101445006e-05, 'epoch': 0.8}


 11%|█         | 501/4496 [02:03<10:17,  6.47it/s]

{'loss': 0.0041, 'grad_norm': 0.9672493934631348, 'learning_rate': 3.414488348364242e-05, 'epoch': 0.89}


 12%|█▏        | 551/4496 [02:11<10:10,  6.47it/s]

{'loss': 0.0026, 'grad_norm': 0.041053008288145065, 'learning_rate': 3.40778932306414e-05, 'epoch': 0.98}


 13%|█▎        | 600/4496 [02:31<09:57,  6.52it/s]  

{'loss': 0.0023, 'grad_norm': 0.8986262083053589, 'learning_rate': 3.3986049420544605e-05, 'epoch': 1.07}



 13%|█▎        | 600/4496 [02:47<09:57,  6.52it/s]

{'eval_loss': 0.0008023252594284713, 'eval_precision': 0.998510321344967, 'eval_recall': 0.9989354907387694, 'eval_f1': 0.9987228607918263, 'eval_accuracy': 0.9998613861386139, 'eval_runtime': 15.8871, 'eval_samples_per_second': 31.472, 'eval_steps_per_second': 7.868, 'epoch': 1.07}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 14%|█▍        | 651/4496 [02:56<09:51,  6.50it/s]  

{'loss': 0.0067, 'grad_norm': 0.668337345123291, 'learning_rate': 3.3869486518684406e-05, 'epoch': 1.16}


 16%|█▌        | 701/4496 [03:04<09:44,  6.50it/s]

{'loss': 0.0022, 'grad_norm': 0.032032519578933716, 'learning_rate': 3.372837518075685e-05, 'epoch': 1.24}


 17%|█▋        | 751/4496 [03:11<09:36,  6.50it/s]

{'loss': 0.0038, 'grad_norm': 0.009115275926887989, 'learning_rate': 3.356292200297061e-05, 'epoch': 1.33}


 18%|█▊        | 800/4496 [03:19<09:29,  6.49it/s]

{'loss': 0.0011, 'grad_norm': 0.005571444518864155, 'learning_rate': 3.337336921957661e-05, 'epoch': 1.42}



 18%|█▊        | 800/4496 [03:35<09:29,  6.49it/s]

{'eval_loss': 0.0005433787009678781, 'eval_precision': 0.9987234042553191, 'eval_recall': 0.9993612944432616, 'eval_f1': 0.9990422475258061, 'eval_accuracy': 0.9999009900990099, 'eval_runtime': 16.3639, 'eval_samples_per_second': 30.555, 'eval_steps_per_second': 7.639, 'epoch': 1.42}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 19%|█▉        | 851/4496 [03:44<09:15,  6.56it/s]  

{'loss': 0.0017, 'grad_norm': 1.3668292760849, 'learning_rate': 3.3159994348221334e-05, 'epoch': 1.51}


 20%|██        | 901/4496 [03:52<09:13,  6.49it/s]

{'loss': 0.0026, 'grad_norm': 0.008016865700483322, 'learning_rate': 3.292310978364281e-05, 'epoch': 1.6}


 21%|██        | 951/4496 [03:59<09:03,  6.53it/s]

{'loss': 0.0011, 'grad_norm': 0.11549237370491028, 'learning_rate': 3.266306234030436e-05, 'epoch': 1.69}


 22%|██▏       | 1000/4496 [04:07<09:01,  6.46it/s]

{'loss': 0.0016, 'grad_norm': 0.8651622533798218, 'learning_rate': 3.2380232744635615e-05, 'epoch': 1.78}



 22%|██▏       | 1000/4496 [04:23<09:01,  6.46it/s]

{'eval_loss': 0.0009137267479673028, 'eval_precision': 0.9987231325814003, 'eval_recall': 0.9991483925910155, 'eval_f1': 0.998935717326522, 'eval_accuracy': 0.9998811881188119, 'eval_runtime': 16.0744, 'eval_samples_per_second': 31.105, 'eval_steps_per_second': 7.776, 'epoch': 1.78}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 23%|██▎       | 1051/4496 [04:32<08:51,  6.48it/s]  

{'loss': 0.0039, 'grad_norm': 0.007806077599525452, 'learning_rate': 3.20750350776241e-05, 'epoch': 1.87}


 24%|██▍       | 1101/4496 [04:39<08:43,  6.48it/s]

{'loss': 0.0009, 'grad_norm': 0.009013031609356403, 'learning_rate': 3.1747916168573724e-05, 'epoch': 1.96}


 26%|██▌       | 1151/4496 [05:00<08:35,  6.48it/s]  

{'loss': 0.0007, 'grad_norm': 0.0027162579353898764, 'learning_rate': 3.139935494091745e-05, 'epoch': 2.04}


 27%|██▋       | 1200/4496 [05:08<08:32,  6.43it/s]

{'loss': 0.0014, 'grad_norm': 0.012943603098392487, 'learning_rate': 3.1029861711042186e-05, 'epoch': 2.13}



 27%|██▋       | 1200/4496 [05:24<08:32,  6.43it/s]

{'eval_loss': 0.0007039154879748821, 'eval_precision': 0.9968105464597066, 'eval_recall': 0.9980838833297849, 'eval_f1': 0.9974468085106383, 'eval_accuracy': 0.9998217821782178, 'eval_runtime': 15.8955, 'eval_samples_per_second': 31.455, 'eval_steps_per_second': 7.864, 'epoch': 2.13}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 28%|██▊       | 1251/4496 [05:32<08:14,  6.56it/s]  

{'loss': 0.0013, 'grad_norm': 0.5397002100944519, 'learning_rate': 3.0639977441152244e-05, 'epoch': 2.22}


 29%|██▉       | 1301/4496 [05:40<08:08,  6.55it/s]

{'loss': 0.0015, 'grad_norm': 0.007882140576839447, 'learning_rate': 3.023027294726538e-05, 'epoch': 2.31}


 30%|███       | 1351/4496 [05:48<08:02,  6.51it/s]

{'loss': 0.0008, 'grad_norm': 0.0054051573388278484, 'learning_rate': 2.9801348063500905e-05, 'epoch': 2.4}


 31%|███       | 1400/4496 [05:55<07:54,  6.53it/s]

{'loss': 0.0005, 'grad_norm': 0.46548134088516235, 'learning_rate': 2.9353830763883364e-05, 'epoch': 2.49}



 31%|███       | 1400/4496 [06:11<07:54,  6.53it/s]

{'eval_loss': 0.00032548478338867426, 'eval_precision': 0.9987231325814003, 'eval_recall': 0.9991483925910155, 'eval_f1': 0.998935717326522, 'eval_accuracy': 0.9999207920792079, 'eval_runtime': 16.1164, 'eval_samples_per_second': 31.024, 'eval_steps_per_second': 7.756, 'epoch': 2.49}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 32%|███▏      | 1451/4496 [06:20<07:46,  6.53it/s]  

{'loss': 0.0004, 'grad_norm': 0.0019023144850507379, 'learning_rate': 2.8888376242947586e-05, 'epoch': 2.58}


 33%|███▎      | 1501/4496 [06:28<07:36,  6.56it/s]

{'loss': 0.001, 'grad_norm': 1.3894859552383423, 'learning_rate': 2.8405665956491124e-05, 'epoch': 2.67}


 34%|███▍      | 1551/4496 [06:35<07:38,  6.43it/s]

{'loss': 0.0008, 'grad_norm': 0.02805793285369873, 'learning_rate': 2.7906406623878486e-05, 'epoch': 2.76}


 36%|███▌      | 1600/4496 [06:43<07:29,  6.45it/s]

{'loss': 0.0008, 'grad_norm': 0.004024108871817589, 'learning_rate': 2.739132919335786e-05, 'epoch': 2.84}



 36%|███▌      | 1600/4496 [06:59<07:29,  6.45it/s]

{'eval_loss': 0.0003013944660779089, 'eval_precision': 0.9995741962955078, 'eval_recall': 0.9995741962955078, 'eval_f1': 0.9995741962955078, 'eval_accuracy': 0.9999405940594059, 'eval_runtime': 16.1223, 'eval_samples_per_second': 31.013, 'eval_steps_per_second': 7.753, 'epoch': 2.84}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 37%|███▋      | 1651/4496 [07:08<07:17,  6.50it/s]  

{'loss': 0.0005, 'grad_norm': 0.0844673439860344, 'learning_rate': 2.6861187771905178e-05, 'epoch': 2.93}


 38%|███▊      | 1701/4496 [07:29<08:54,  5.23it/s]  

{'loss': 0.0007, 'grad_norm': 0.006612530909478664, 'learning_rate': 2.631675852116227e-05, 'epoch': 3.02}


 39%|███▉      | 1751/4496 [07:36<07:03,  6.48it/s]

{'loss': 0.0011, 'grad_norm': 0.10979136824607849, 'learning_rate': 2.57588385210856e-05, 'epoch': 3.11}


 40%|████      | 1800/4496 [07:44<06:53,  6.52it/s]

{'loss': 0.001, 'grad_norm': 0.027425631880760193, 'learning_rate': 2.518824460296913e-05, 'epoch': 3.2}



 40%|████      | 1800/4496 [08:00<06:53,  6.52it/s]

{'eval_loss': 0.00010741217556642368, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 16.1235, 'eval_samples_per_second': 31.011, 'eval_steps_per_second': 7.753, 'epoch': 3.2}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 41%|████      | 1851/4496 [08:09<06:46,  6.50it/s]  

{'loss': 0.0008, 'grad_norm': 0.005205526482313871, 'learning_rate': 2.4605812153550038e-05, 'epoch': 3.29}


 42%|████▏     | 1901/4496 [08:16<06:40,  6.49it/s]

{'loss': 0.0012, 'grad_norm': 0.004638276062905788, 'learning_rate': 2.4012393891947986e-05, 'epoch': 3.38}


 43%|████▎     | 1951/4496 [08:24<06:36,  6.42it/s]

{'loss': 0.0006, 'grad_norm': 0.030375923961400986, 'learning_rate': 2.3408858621228702e-05, 'epoch': 3.47}


 44%|████▍     | 2000/4496 [08:32<06:24,  6.49it/s]

{'loss': 0.0002, 'grad_norm': 0.0017825504764914513, 'learning_rate': 2.2796089956419558e-05, 'epoch': 3.56}



 44%|████▍     | 2000/4496 [08:48<06:24,  6.49it/s]

{'eval_loss': 6.205201498232782e-05, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 15.9983, 'eval_samples_per_second': 31.253, 'eval_steps_per_second': 7.813, 'epoch': 3.56}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 46%|████▌     | 2051/4496 [08:56<06:18,  6.46it/s]  

{'loss': 0.0001, 'grad_norm': 0.0008944177534431219, 'learning_rate': 2.2174985030839507e-05, 'epoch': 3.64}


 47%|████▋     | 2101/4496 [09:04<06:07,  6.51it/s]

{'loss': 0.0002, 'grad_norm': 0.043790847063064575, 'learning_rate': 2.1546453182637365e-05, 'epoch': 3.73}


 48%|████▊     | 2151/4496 [09:12<05:57,  6.57it/s]

{'loss': 0.0002, 'grad_norm': 0.0009368934552185237, 'learning_rate': 2.0911414623461402e-05, 'epoch': 3.82}


 49%|████▉     | 2200/4496 [09:19<05:54,  6.48it/s]

{'loss': 0.0003, 'grad_norm': 0.027071280404925346, 'learning_rate': 2.027079909120942e-05, 'epoch': 3.91}



 49%|████▉     | 2200/4496 [09:35<05:54,  6.48it/s]

{'eval_loss': 5.256315853330307e-05, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 16.0912, 'eval_samples_per_second': 31.073, 'eval_steps_per_second': 7.768, 'epoch': 3.91}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 50%|█████     | 2250/4496 [09:44<13:33,  2.76it/s]  

{'loss': 0.0004, 'grad_norm': 0.0019831168465316296, 'learning_rate': 1.9625544488831798e-05, 'epoch': 4.0}


 51%|█████     | 2301/4496 [10:05<05:39,  6.47it/s]  

{'loss': 0.0005, 'grad_norm': 0.015249661169946194, 'learning_rate': 1.8976595511180366e-05, 'epoch': 4.09}


 52%|█████▏    | 2351/4496 [10:12<05:32,  6.46it/s]

{'loss': 0.0003, 'grad_norm': 0.009803751483559608, 'learning_rate': 1.8324902261913467e-05, 'epoch': 4.18}


 53%|█████▎    | 2400/4496 [10:20<05:22,  6.49it/s]

{'loss': 0.0005, 'grad_norm': 0.0009434163803234696, 'learning_rate': 1.7671418862482206e-05, 'epoch': 4.27}



 53%|█████▎    | 2400/4496 [10:36<05:22,  6.49it/s]

{'eval_loss': 5.058860551798716e-05, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 15.9943, 'eval_samples_per_second': 31.261, 'eval_steps_per_second': 7.815, 'epoch': 4.27}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 55%|█████▍    | 2451/4496 [10:45<05:15,  6.48it/s]  

{'loss': 0.0002, 'grad_norm': 0.0010502960067242384, 'learning_rate': 1.701710205523436e-05, 'epoch': 4.36}


 56%|█████▌    | 2501/4496 [10:52<05:07,  6.50it/s]

{'loss': 0.0001, 'grad_norm': 0.0015527030918747187, 'learning_rate': 1.6362909802681154e-05, 'epoch': 4.44}


 57%|█████▋    | 2551/4496 [11:00<05:01,  6.44it/s]

{'loss': 0.0002, 'grad_norm': 0.013412266038358212, 'learning_rate': 1.5709799884977608e-05, 'epoch': 4.53}


 58%|█████▊    | 2600/4496 [11:08<04:48,  6.56it/s]

{'loss': 0.0001, 'grad_norm': 0.05494792014360428, 'learning_rate': 1.5058728497669873e-05, 'epoch': 4.62}



 58%|█████▊    | 2600/4496 [11:24<04:48,  6.56it/s]

{'eval_loss': 4.248062759870663e-05, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 16.176, 'eval_samples_per_second': 30.91, 'eval_steps_per_second': 7.727, 'epoch': 4.62}


 58%|█████▊    | 2600/4496 [11:29<04:48,  6.56it/s]

{'train_runtime': 689.1919, 'train_samples_per_second': 52.235, 'train_steps_per_second': 6.524, 'train_loss': 0.10582748024724424, 'epoch': 4.62}


 58%|█████▊    | 2600/4496 [11:29<08:23,  3.77it/s]
[I 2025-08-21 14:49:33,750] Trial 6 finished with value: 1.0 and parameters: {'learning_rate': 3.4204353211648236e-05, 'num_train_epochs': 8, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 2, 'weight_decay': 2.0319809838424957e-05, 'warmup_ratio': 0.0869254358741304, 'lr_scheduler_type': 'cosine', 'max_grad_norm': 0.9357302950938589, 'label_smoothing_factor': 0.08036720768991146, 'hidden_dropout_prob': 0.05597101766581075, 'attention_probs_dropout_prob': 0.2677676995469933}. Best is trial 6 with value: 1.0.
Trying to set hidden_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set attention_probs_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier

{'loss': 3.3268, 'grad_norm': 4.337559223175049, 'learning_rate': 5.161186065984375e-06, 'epoch': 0.18}


  3%|▎         | 100/3091 [00:39<13:36,  3.66it/s]

{'loss': 1.5936, 'grad_norm': 1.751987338066101, 'learning_rate': 1.032237213196875e-05, 'epoch': 0.36}


  5%|▍         | 150/3091 [00:53<13:21,  3.67it/s]

{'loss': 0.5409, 'grad_norm': 1.2112290859222412, 'learning_rate': 1.548355819795312e-05, 'epoch': 0.53}


  6%|▋         | 200/3091 [01:07<13:18,  3.62it/s]

{'loss': 0.1505, 'grad_norm': 2.016571521759033, 'learning_rate': 2.06447442639375e-05, 'epoch': 0.71}



  6%|▋         | 200/3091 [01:23<13:18,  3.62it/s]

{'eval_loss': 0.027008559554815292, 'eval_precision': 0.9765278071473884, 'eval_recall': 0.983180753672557, 'eval_f1': 0.9798429874814343, 'eval_accuracy': 0.9941386138613861, 'eval_runtime': 16.1152, 'eval_samples_per_second': 31.027, 'eval_steps_per_second': 7.757, 'epoch': 0.71}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  8%|▊         | 250/3091 [01:41<12:56,  3.66it/s]  

{'loss': 0.045, 'grad_norm': 1.1613984107971191, 'learning_rate': 2.5805930329921872e-05, 'epoch': 0.89}


 10%|▉         | 300/3091 [02:08<13:00,  3.58it/s]  

{'loss': 0.0178, 'grad_norm': 0.16006916761398315, 'learning_rate': 3.096711639590624e-05, 'epoch': 1.07}


 11%|█▏        | 350/3091 [02:22<12:36,  3.62it/s]

{'loss': 0.0082, 'grad_norm': 0.9875632524490356, 'learning_rate': 3.612830246189062e-05, 'epoch': 1.24}


 13%|█▎        | 400/3091 [02:35<12:22,  3.63it/s]

{'loss': 0.0057, 'grad_norm': 0.05756624788045883, 'learning_rate': 4.1289488527875e-05, 'epoch': 1.42}



 13%|█▎        | 400/3091 [02:51<12:22,  3.63it/s]

{'eval_loss': 0.00369080388918519, 'eval_precision': 0.9965964688364178, 'eval_recall': 0.9974451777730466, 'eval_f1': 0.9970206426899341, 'eval_accuracy': 0.9993267326732673, 'eval_runtime': 16.1068, 'eval_samples_per_second': 31.043, 'eval_steps_per_second': 7.761, 'epoch': 1.42}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 15%|█▍        | 450/3091 [03:06<11:56,  3.68it/s]  

{'loss': 0.0053, 'grad_norm': 0.38849228620529175, 'learning_rate': 4.645067459385937e-05, 'epoch': 1.6}


 16%|█▌        | 500/3091 [03:19<12:00,  3.60it/s]

{'loss': 0.0062, 'grad_norm': 0.07503032684326172, 'learning_rate': 5.1611860659843745e-05, 'epoch': 1.78}


 18%|█▊        | 550/3091 [03:33<11:38,  3.64it/s]

{'loss': 0.0078, 'grad_norm': 0.022305559366941452, 'learning_rate': 5.677304672582812e-05, 'epoch': 1.96}


 19%|█▉        | 600/3091 [04:00<11:22,  3.65it/s]  

{'loss': 0.0055, 'grad_norm': 0.3796388804912567, 'learning_rate': 6.193423279181249e-05, 'epoch': 2.13}



 19%|█▉        | 600/3091 [04:16<11:22,  3.65it/s]

{'eval_loss': 0.001048747799359262, 'eval_precision': 0.9980846988721005, 'eval_recall': 0.9985096870342772, 'eval_f1': 0.9982971477224352, 'eval_accuracy': 0.9997821782178218, 'eval_runtime': 15.9847, 'eval_samples_per_second': 31.28, 'eval_steps_per_second': 7.82, 'epoch': 2.13}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 21%|██        | 650/3091 [04:31<11:06,  3.66it/s]  

{'loss': 0.0031, 'grad_norm': 0.6895585656166077, 'learning_rate': 6.709541885779687e-05, 'epoch': 2.31}


 23%|██▎       | 700/3091 [04:44<10:55,  3.65it/s]

{'loss': 0.0028, 'grad_norm': 0.2739419937133789, 'learning_rate': 7.225660492378124e-05, 'epoch': 2.49}


 24%|██▍       | 750/3091 [04:58<10:39,  3.66it/s]

{'loss': 0.0024, 'grad_norm': 0.8399205803871155, 'learning_rate': 7.741779098976561e-05, 'epoch': 2.67}


 26%|██▌       | 800/3091 [05:12<10:29,  3.64it/s]

{'loss': 0.0034, 'grad_norm': 1.7219592332839966, 'learning_rate': 8.247571459646993e-05, 'epoch': 2.84}



 26%|██▌       | 800/3091 [05:28<10:29,  3.64it/s]

{'eval_loss': 0.000472404615720734, 'eval_precision': 0.998297147722435, 'eval_recall': 0.9985096870342772, 'eval_f1': 0.998403406067057, 'eval_accuracy': 0.9998415841584158, 'eval_runtime': 16.0703, 'eval_samples_per_second': 31.113, 'eval_steps_per_second': 7.778, 'epoch': 2.84}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 27%|██▋       | 850/3091 [05:56<27:38,  1.35it/s]  

{'loss': 0.0069, 'grad_norm': 1.8499435186386108, 'learning_rate': 8.237503690754037e-05, 'epoch': 3.02}


 29%|██▉       | 900/3091 [06:09<09:56,  3.67it/s]

{'loss': 0.0032, 'grad_norm': 0.3303965628147125, 'learning_rate': 8.208121805522934e-05, 'epoch': 3.2}


 31%|███       | 950/3091 [06:23<09:44,  3.66it/s]

{'loss': 0.0052, 'grad_norm': 0.01053374819457531, 'learning_rate': 8.159563753473358e-05, 'epoch': 3.38}


 32%|███▏      | 1000/3091 [06:37<09:48,  3.55it/s]

{'loss': 0.0024, 'grad_norm': 0.004319536965340376, 'learning_rate': 8.092057517251886e-05, 'epoch': 3.56}



 32%|███▏      | 1000/3091 [06:53<09:48,  3.55it/s]

{'eval_loss': 0.0003066498029511422, 'eval_precision': 0.9991487550542668, 'eval_recall': 0.9995741962955078, 'eval_f1': 0.9993614303959131, 'eval_accuracy': 0.9999207920792079, 'eval_runtime': 16.1056, 'eval_samples_per_second': 31.045, 'eval_steps_per_second': 7.761, 'epoch': 3.56}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 34%|███▍      | 1050/3091 [07:08<09:19,  3.65it/s]  

{'loss': 0.0016, 'grad_norm': 0.005567383486777544, 'learning_rate': 8.005920042241312e-05, 'epoch': 3.73}


 36%|███▌      | 1100/3091 [07:21<09:04,  3.66it/s]

{'loss': 0.0018, 'grad_norm': 1.9211962223052979, 'learning_rate': 7.901555748485146e-05, 'epoch': 3.91}


 37%|███▋      | 1150/3091 [07:48<08:53,  3.64it/s]  

{'loss': 0.0028, 'grad_norm': 0.004499984439462423, 'learning_rate': 7.779454631913923e-05, 'epoch': 4.09}


 39%|███▉      | 1200/3091 [08:02<08:38,  3.65it/s]

{'loss': 0.0015, 'grad_norm': 0.03643811494112015, 'learning_rate': 7.640189963788153e-05, 'epoch': 4.27}



 39%|███▉      | 1200/3091 [08:18<08:38,  3.65it/s]

{'eval_loss': 0.00042471030610613525, 'eval_precision': 0.9985106382978723, 'eval_recall': 0.9991483925910155, 'eval_f1': 0.9988294136426519, 'eval_accuracy': 0.9998811881188119, 'eval_runtime': 16.1163, 'eval_samples_per_second': 31.025, 'eval_steps_per_second': 7.756, 'epoch': 4.27}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 40%|████      | 1250/3091 [08:33<08:20,  3.68it/s]  

{'loss': 0.0018, 'grad_norm': 0.006080925930291414, 'learning_rate': 7.484415599159147e-05, 'epoch': 4.44}


 42%|████▏     | 1300/3091 [08:47<08:09,  3.66it/s]

{'loss': 0.0023, 'grad_norm': 0.27170494198799133, 'learning_rate': 7.312862906984649e-05, 'epoch': 4.62}


 44%|████▎     | 1350/3091 [09:00<07:57,  3.64it/s]

{'loss': 0.0025, 'grad_norm': 0.004199398681521416, 'learning_rate': 7.12633733631253e-05, 'epoch': 4.8}


 45%|████▌     | 1400/3091 [09:14<07:43,  3.65it/s]

{'loss': 0.0019, 'grad_norm': 0.0016721044667065144, 'learning_rate': 6.92571463465452e-05, 'epoch': 4.98}



 45%|████▌     | 1400/3091 [09:30<07:43,  3.65it/s]

{'eval_loss': 0.0001651358907110989, 'eval_precision': 0.9995742869306088, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9996806812134114, 'eval_accuracy': 0.999960396039604, 'eval_runtime': 16.1154, 'eval_samples_per_second': 31.026, 'eval_steps_per_second': 7.757, 'epoch': 4.98}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 47%|████▋     | 1450/3091 [10:01<07:30,  3.65it/s]  

{'loss': 0.004, 'grad_norm': 0.12532413005828857, 'learning_rate': 6.711936736304881e-05, 'epoch': 5.16}


 49%|████▊     | 1500/3091 [10:15<07:17,  3.64it/s]

{'loss': 0.0011, 'grad_norm': 0.007016350515186787, 'learning_rate': 6.486007339908618e-05, 'epoch': 5.33}


 50%|█████     | 1550/3091 [10:29<07:02,  3.65it/s]

{'loss': 0.0014, 'grad_norm': 0.15773451328277588, 'learning_rate': 6.248987196042753e-05, 'epoch': 5.51}


 52%|█████▏    | 1600/3091 [10:42<06:49,  3.64it/s]

{'loss': 0.001, 'grad_norm': 0.001944790710695088, 'learning_rate': 6.001989126935746e-05, 'epoch': 5.69}



 52%|█████▏    | 1600/3091 [10:59<06:49,  3.64it/s]

{'eval_loss': 0.00030759142828173935, 'eval_precision': 0.9995742869306088, 'eval_recall': 0.9997870981477539, 'eval_f1': 0.9996806812134114, 'eval_accuracy': 0.9999405940594059, 'eval_runtime': 16.3263, 'eval_samples_per_second': 30.625, 'eval_steps_per_second': 7.656, 'epoch': 5.69}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 53%|█████▎    | 1650/3091 [11:13<06:35,  3.65it/s]  

{'loss': 0.0005, 'grad_norm': 0.004753422923386097, 'learning_rate': 5.7461728017077196e-05, 'epoch': 5.87}


 55%|█████▍    | 1700/3091 [11:40<07:39,  3.03it/s]  

{'loss': 0.0005, 'grad_norm': 0.005123198963701725, 'learning_rate': 5.4827392916620384e-05, 'epoch': 6.04}


 57%|█████▋    | 1750/3091 [11:54<06:07,  3.65it/s]

{'loss': 0.0006, 'grad_norm': 0.027924364432692528, 'learning_rate': 5.212925431191418e-05, 'epoch': 6.22}


 58%|█████▊    | 1800/3091 [12:08<05:54,  3.64it/s]

{'loss': 0.0009, 'grad_norm': 0.0019316410180181265, 'learning_rate': 4.937998010774426e-05, 'epoch': 6.4}



 58%|█████▊    | 1800/3091 [12:24<05:54,  3.64it/s]

{'eval_loss': 9.161227353615686e-05, 'eval_precision': 0.9997871434653044, 'eval_recall': 1.0, 'eval_f1': 0.9998935604044704, 'eval_accuracy': 0.999980198019802, 'eval_runtime': 15.9997, 'eval_samples_per_second': 31.251, 'eval_steps_per_second': 7.813, 'epoch': 6.4}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 60%|█████▉    | 1850/3091 [12:38<05:40,  3.65it/s]  

{'loss': 0.0007, 'grad_norm': 0.0009397069807164371, 'learning_rate': 4.659247829326557e-05, 'epoch': 6.58}


 61%|██████▏   | 1900/3091 [12:52<05:26,  3.64it/s]

{'loss': 0.0006, 'grad_norm': 0.7045295238494873, 'learning_rate': 4.377983633830421e-05, 'epoch': 6.76}


 63%|██████▎   | 1950/3091 [13:06<05:21,  3.54it/s]

{'loss': 0.0002, 'grad_norm': 0.007232037838548422, 'learning_rate': 4.095525974698793e-05, 'epoch': 6.93}


 65%|██████▍   | 2000/3091 [13:33<05:01,  3.62it/s]  

{'loss': 0.0005, 'grad_norm': 0.5254523158073425, 'learning_rate': 3.813201005719924e-05, 'epoch': 7.11}



 65%|██████▍   | 2000/3091 [13:49<05:01,  3.62it/s]

{'eval_loss': 2.337977093702648e-05, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 16.0485, 'eval_samples_per_second': 31.156, 'eval_steps_per_second': 7.789, 'epoch': 7.11}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 66%|██████▋   | 2050/3091 [14:03<04:44,  3.66it/s]  

{'loss': 0.0004, 'grad_norm': 0.0009909480577334762, 'learning_rate': 3.53233425769472e-05, 'epoch': 7.29}


 68%|██████▊   | 2100/3091 [14:17<04:30,  3.67it/s]

{'loss': 0.0005, 'grad_norm': 0.004645457491278648, 'learning_rate': 3.254244414998855e-05, 'epoch': 7.47}


 70%|██████▉   | 2150/3091 [14:31<04:15,  3.68it/s]

{'loss': 0.0002, 'grad_norm': 0.003124655457213521, 'learning_rate': 2.980237124289252e-05, 'epoch': 7.64}


 71%|███████   | 2200/3091 [14:45<04:04,  3.65it/s]

{'loss': 0.0004, 'grad_norm': 0.0007797955768182874, 'learning_rate': 2.7115988644233076e-05, 'epoch': 7.82}



 71%|███████   | 2200/3091 [15:01<04:04,  3.65it/s]

{'eval_loss': 0.00020580014097504318, 'eval_precision': 0.9989366227137388, 'eval_recall': 1.0, 'eval_f1': 0.9994680285136717, 'eval_accuracy': 0.9999009900990099, 'eval_runtime': 16.1764, 'eval_samples_per_second': 30.909, 'eval_steps_per_second': 7.727, 'epoch': 7.82}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 73%|███████▎  | 2250/3091 [15:16<06:58,  2.01it/s]  

{'loss': 0.0001, 'grad_norm': 0.004367300774902105, 'learning_rate': 2.449590906371994e-05, 'epoch': 8.0}


 74%|███████▍  | 2300/3091 [15:43<04:29,  2.94it/s]

{'loss': 0.0001, 'grad_norm': 0.0007245203014463186, 'learning_rate': 2.195443391485311e-05, 'epoch': 8.18}


 76%|███████▌  | 2350/3091 [15:57<03:23,  3.64it/s]

{'loss': 0.0001, 'grad_norm': 0.0010822514304891229, 'learning_rate': 1.950349555912985e-05, 'epoch': 8.36}


 78%|███████▊  | 2400/3091 [16:11<03:11,  3.61it/s]

{'loss': 0.0002, 'grad_norm': 0.002382257953286171, 'learning_rate': 1.715460128297036e-05, 'epoch': 8.53}



 78%|███████▊  | 2400/3091 [16:28<03:11,  3.61it/s]

{'eval_loss': 0.00140680733602494, 'eval_precision': 0.9991491172091045, 'eval_recall': 1.0, 'eval_f1': 0.9995743775271334, 'eval_accuracy': 0.9997623762376238, 'eval_runtime': 16.8451, 'eval_samples_per_second': 29.682, 'eval_steps_per_second': 7.421, 'epoch': 8.53}


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 79%|███████▉  | 2450/3091 [16:43<02:57,  3.60it/s]  

{'loss': 0.0003, 'grad_norm': 0.0004854469152633101, 'learning_rate': 1.491877927039387e-05, 'epoch': 8.71}


 81%|████████  | 2500/3091 [16:57<02:47,  3.54it/s]

{'loss': 0.0001, 'grad_norm': 0.0007040274213068187, 'learning_rate': 1.2806526825105808e-05, 'epoch': 8.89}


 82%|████████▏ | 2550/3091 [17:24<02:33,  3.53it/s]

{'loss': 0.0003, 'grad_norm': 0.00044646760215982795, 'learning_rate': 1.082776108509664e-05, 'epoch': 9.07}


 84%|████████▍ | 2600/3091 [17:38<02:14,  3.64it/s]

{'loss': 0.0001, 'grad_norm': 0.0011547369649633765, 'learning_rate': 8.991772461149505e-06, 'epoch': 9.24}



 84%|████████▍ | 2600/3091 [17:54<02:14,  3.64it/s]

{'eval_loss': 7.976683264132589e-05, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 16.4075, 'eval_samples_per_second': 30.474, 'eval_steps_per_second': 7.618, 'epoch': 9.24}


 84%|████████▍ | 2600/3091 [17:56<03:23,  2.41it/s]
[I 2025-08-21 15:07:32,485] Trial 7 finished with value: 1.0 and parameters: {'learning_rate': 8.24757533344303e-05, 'num_train_epochs': 11, 'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'weight_decay': 0.018709365688887352, 'warmup_ratio': 0.258219174976903, 'lr_scheduler_type': 'cosine', 'max_grad_norm': 0.5599326836668415, 'label_smoothing_factor': 0.033761517140362796, 'hidden_dropout_prob': 0.28287291117375574, 'attention_probs_dropout_prob': 0.09696087960622657}. Best is trial 6 with value: 1.0.
Trying to set hidden_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set attention_probs_dropout_prob in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


{'train_runtime': 1076.6842, 'train_samples_per_second': 45.974, 'train_steps_per_second': 2.871, 'train_loss': 0.1110292919866669, 'epoch': 9.24}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[W 2025-08-21 15:07:33,985] Trial 8 failed with parameters: {'learning_rate': 7.610438922351127e-05, 'num_train_epochs': 10, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 1, 'weight_decay': 0.0001597766235483386, 'warmup_ratio': 0.08545214831324029, 'lr_scheduler_type': 'cosine', 'max_grad_norm': 0.6393232321183058, 'label_smoothing_factor': 0.09082658859666537, 'hidden_dropout_prob': 0.07186856720009173, 'attention_probs_dropout_prob': 0.04346846162736693} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\90532\anaconda3\envs\yolov5-env\lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_

KeyboardInterrupt: 

# En İyi Parametrelerle Eğitim

In [None]:
# Run the final training pass. `best_trainer` is created in an earlier cell
# (presumably configured with the best hyperparameters found by the search
# above -- confirm against that cell).
train_result = best_trainer.train()
# Print the object returned by train() so the run summary appears in the cell output.
print(train_result)

# En İyi Modeli Kaydetme

In [None]:
# Save the final artefacts into `final_dir` (created if it does not exist).
os.makedirs(final_dir, exist_ok=True)
best_trainer.save_model(final_dir)   # model + config
tokenizer.save_pretrained(final_dir)

# Optional: remove the temporary output directories.
# Cleanup is best-effort: a failure is reported but never aborts the cell.
# `ignore_errors=True` is deliberately NOT used, because it would swallow the
# very errors the handler below is meant to report.
for p in [hp_output_dir, best_output_dir]:
    try:
        shutil.rmtree(p)
    except FileNotFoundError:
        # Directory was never created or is already gone: nothing to clean.
        pass
    except OSError as e:
        print(f"Temizlik hatası ({p}):", e)

print("Final model saved to:", final_dir)

# Model Çıktılarını İnceleme

In [None]:
from html import escape

import torch
from IPython.display import HTML, display
from transformers import pipeline

# --- NER pipeline: load model and tokenizer from the saved checkpoint directory.
# The device is chosen automatically so the cell also runs on CPU-only
# machines: 0 = first CUDA GPU, -1 = CPU.
ckpt = "./bert-turkish-ner-final/yeni_veri3/"
ner = pipeline(
    "token-classification",
    model=ckpt,
    tokenizer=ckpt,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)

# --- Sample text used to exercise the NER pipeline.
# The fragments are joined without a separator; each fragment already carries
# its own trailing space where one is needed.
text = "".join((
    "Müşteri: “Merhaba, dün 15.03.2025 tarihinde hesabımdan ",
    "TR12 3456 7890 1234 5678 9012 34 IBAN’ına 7.850,00 TL havale yapılmış gözüküyor. ",
    "Benim bilgim dışında gerçekleşmiş, acil iptal edilmesini istiyorum. ",
    "Ayrıca TCKN 12345678901 üzerinden yaptığım başvuruda da halen dönüş olmadı. ",
    "Telefonum 0532 456 78 90, mail adresim ayse.kaya@example.com. ",
    "Bu işlem Ziraat Teknoloji A.Ş. hesabına mı yapılmış, bilgi rica ederim.”",
))

# --- Colour palette: one hex colour per entity label.
# Insertion order is kept, so the legend lists the labels in this order.
colors = dict(
    PERSON="#4E79A7",      # blue
    ORG="#F28E2B",         # orange
    DATE="#59A14F",        # green
    TIME="#FF33A6",        # pink
    IBAN="#E15759",        # red
    AMOUNT="#EDC948",      # yellow
    CURRENCY="#B07AA1",    # purple
    TCKN="#76B7B2",        # teal
    CARD_NO="#9C755F",     # brown
    PLATE="#FF9DA7",       # light pink
    ACCOUNT_NO="#33B2FF",  # light blue
    ADDRESS="#8CD17D",     # light green
    LANDLINE="#F39C12",    # orange-yellow
    EMAIL="#7F7F7F",       # grey
    PHONE="#17BECF",       # cyan
)

# --- HTML rendering helper
def render_ner_html(text, preds, palette):
    """Render NER predictions as highlighted HTML followed by a colour legend.

    Args:
        text: The string the predictions refer to.
        preds: Iterable of dicts with ``"start"`` and ``"end"`` offsets into
            ``text`` and an ``"entity_group"`` label.
        palette: Mapping of label -> CSS colour. Labels missing from the
            mapping are drawn with the fallback colour ``#FFD54F``.

    Returns:
        An ``HTML`` display object containing the annotated text and one
        legend entry per palette item.
    """
    pieces = []
    cursor = 0  # offset in `text` up to which output has already been emitted
    for p in sorted(preds, key=lambda x: x["start"]):
        s, e, lab = p["start"], p["end"], p["entity_group"]
        # Clip spans that overlap what was already rendered so no part of the
        # text is emitted twice; skip spans that are empty or fully covered.
        s = max(s, cursor)
        if s >= e:
            continue
        pieces.append(escape(text[cursor:s]))
        color = palette.get(lab, "#FFD54F")
        pieces.append(
            f"<span style='background:{color}33;"
            f"border-bottom:2px solid {color}; border-radius:3px;"
            # escape() also encodes quotes, so the label cannot break out of
            # the single-quoted title attribute.
            f"padding:0 2px;' title='{escape(str(lab))}'>"
            f"{escape(text[s:e])}"
            f"</span>"
        )
        cursor = e
    pieces.append(escape(text[cursor:]))

    # Legend: one swatch per palette entry, in palette order.
    legend = "".join(
        f"<span style='background:{c}33;border-bottom:2px solid {c};"
        f"border-radius:3px;padding:2px 6px;margin:3px;display:inline-block'>"
        f"{escape(str(lbl))}</span>"
        for lbl, c in palette.items()
    )

    markup = (
        "<div style='font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial;"
        "line-height:1.8;font-size:16px;white-space:pre-wrap;'>"
        + "".join(pieces) +
        "</div>"
        "<div style='margin-top:10px'>" + legend + "</div>"
    )
    return HTML(markup)

# --- Run the model on the sample text and display the highlighted result
preds = ner(text)
display(render_ner_html(text, preds, colors))
