In [1]:
import json

# Đọc dữ liệu từ file
train_data = []
with open("duration_training_dataset.txt", "r", encoding="utf-8") as f:  # ← đổi tên file cho đúng với file bạn đang dùng
    for line in f:
        if line.strip():  # Bỏ qua dòng trống
            train_data.append(json.loads(line.strip()))

print(f"Số lượng mẫu: {len(train_data)}")
print("Ví dụ một mẫu đầu tiên:")
print(json.dumps(train_data[0], indent=2, ensure_ascii=False))


Số lượng mẫu: 1490
Ví dụ một mẫu đầu tiên:
{
  "context": "Trong một lớp học, các học sinh đang học về các chủ đề khác nhau. Một số em rất chăm chỉ và thường xuyên hoàn thành bài tập về nhà, trong khi những em khác lại bỏ lỡ nhiều bài. Cô giáo luôn cố gắng khuyến khích cả lớp bằng cách tổ chức các buổi thảo luận thú vị.",
  "labels": [
    "yes",
    "no",
    "yes",
    "no"
  ],
  "options": [
    "1 tuần",
    "5 năm",
    "10 ngày",
    "10 giây"
  ],
  "qid": 650,
  "question": "Mất bao lâu để hoàn thành tất cả bài tập về nhà của lớp học?"
}


In [2]:
from collections import Counter

label_counter = Counter()
option_lengths = []

for sample in train_data:
    label_counter.update(sample["labels"])
    option_lengths.append(len(sample["options"]))

print("Tổng số nhãn:")
print(label_counter)
print(f"Số option trung bình mỗi câu hỏi: {sum(option_lengths) / len(option_lengths):.2f}")


Tổng số nhãn:
Counter({'no': 3003, 'yes': 2957})
Số option trung bình mỗi câu hỏi: 4.00


In [4]:

# BƯỚC 2: Rule-based labeling
def rule_based_predict(sample):
    context = sample["context"].lower()
    question = sample["question"].lower()
    options = sample["options"]

    short_units = ["giây", "phút", "giờ"]
    medium_units = ["ngày", "tuần"]
    long_units = ["tháng", "năm"]

    short_context = ["biểu diễn", "sửa", "trình diễn", "hòa nhạc", "vẽ", "chơi", "làm bài", "bài tập"]
    long_context = ["chuẩn bị", "triển lãm", "phát triển", "hoàn thành", "tổ chức"]

    predicted_labels = []

    for opt in options:
        label = "no"  # default

        # Check type of time unit in option
        if any(unit in opt for unit in short_units):
            if any(word in context for word in short_context):
                label = "yes"
        elif any(unit in opt for unit in medium_units):
            if any(word in context for word in long_context + short_context):
                label = "yes"
        elif any(unit in opt for unit in long_units):
            if any(word in context for word in long_context):
                label = "yes"

        predicted_labels.append(label)

    return predicted_labels

# BƯỚC 3: Đánh giá
from sklearn.metrics import classification_report

y_true = []
y_pred = []

for sample in train_data:
    pred = rule_based_predict(sample)
    gold = sample["labels"]
    y_true.extend(gold)
    y_pred.extend(pred)

print(classification_report(y_true, y_pred, target_names=["no", "yes"]))


              precision    recall  f1-score   support

          no       0.51      0.83      0.63      3003
         yes       0.51      0.18      0.26      2957

    accuracy                           0.51      5960
   macro avg       0.51      0.50      0.45      5960
weighted avg       0.51      0.51      0.45      5960



In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceCl

In [5]:
import json

examples = []
with open("duration_training_dataset.txt", "r", encoding="utf-8") as f:
    for line in f:
        sample = json.loads(line.strip())
        for opt, label in zip(sample["options"], sample["labels"]):
            text = sample["context"] + " " + sample["question"] + " " + opt
            label_id = 1 if label == "yes" else 0
            examples.append({
                "text": text,
                "label": label_id
            })

print(f"Tổng số sample: {len(examples)}")

Tổng số sample: 5960


In [6]:
from datasets import Dataset

dataset = Dataset.from_list(examples)

def preprocess(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

encoded_dataset = dataset.map(preprocess, batched=True)
encoded_dataset = encoded_dataset.train_test_split(test_size=0.2)

Map: 100%|██████████| 5960/5960 [00:00<00:00, 6818.10 examples/s]


In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./phobert_duration_qa",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
)

trainer.train()

  0%|          | 10/5960 [00:06<35:53,  2.76it/s] 

{'loss': 0.7066, 'learning_rate': 4.9916107382550336e-05, 'epoch': 0.02}


  0%|          | 20/5960 [00:09<31:59,  3.09it/s]

{'loss': 0.7102, 'learning_rate': 4.983221476510067e-05, 'epoch': 0.03}


  1%|          | 30/5960 [00:13<31:43,  3.12it/s]

{'loss': 0.694, 'learning_rate': 4.974832214765101e-05, 'epoch': 0.05}


  1%|          | 40/5960 [00:16<31:42,  3.11it/s]

{'loss': 0.6616, 'learning_rate': 4.966442953020135e-05, 'epoch': 0.07}


  1%|          | 50/5960 [00:19<31:40,  3.11it/s]

{'loss': 0.6329, 'learning_rate': 4.958053691275168e-05, 'epoch': 0.08}


  1%|          | 60/5960 [00:22<31:57,  3.08it/s]

{'loss': 0.5324, 'learning_rate': 4.9496644295302015e-05, 'epoch': 0.1}


  1%|          | 70/5960 [00:25<31:45,  3.09it/s]

{'loss': 0.7176, 'learning_rate': 4.9412751677852355e-05, 'epoch': 0.12}


  1%|▏         | 80/5960 [00:29<31:47,  3.08it/s]

{'loss': 0.5999, 'learning_rate': 4.932885906040269e-05, 'epoch': 0.13}


  2%|▏         | 90/5960 [00:32<31:44,  3.08it/s]

{'loss': 0.6061, 'learning_rate': 4.924496644295302e-05, 'epoch': 0.15}


  2%|▏         | 100/5960 [00:35<32:21,  3.02it/s]

{'loss': 0.5078, 'learning_rate': 4.9161073825503354e-05, 'epoch': 0.17}


  2%|▏         | 110/5960 [00:38<31:22,  3.11it/s]

{'loss': 0.572, 'learning_rate': 4.9077181208053694e-05, 'epoch': 0.18}


  2%|▏         | 120/5960 [00:42<33:45,  2.88it/s]

{'loss': 0.5472, 'learning_rate': 4.8993288590604034e-05, 'epoch': 0.2}


  2%|▏         | 130/5960 [00:45<32:26,  2.99it/s]

{'loss': 0.5556, 'learning_rate': 4.890939597315437e-05, 'epoch': 0.22}


  2%|▏         | 140/5960 [00:49<31:48,  3.05it/s]

{'loss': 0.548, 'learning_rate': 4.88255033557047e-05, 'epoch': 0.23}


  3%|▎         | 150/5960 [00:52<31:27,  3.08it/s]

{'loss': 0.6494, 'learning_rate': 4.874161073825503e-05, 'epoch': 0.25}


  3%|▎         | 160/5960 [00:55<30:53,  3.13it/s]

{'loss': 0.4952, 'learning_rate': 4.865771812080537e-05, 'epoch': 0.27}


  3%|▎         | 170/5960 [00:58<31:43,  3.04it/s]

{'loss': 0.6289, 'learning_rate': 4.8573825503355706e-05, 'epoch': 0.29}


  3%|▎         | 180/5960 [01:02<31:20,  3.07it/s]

{'loss': 0.5789, 'learning_rate': 4.848993288590604e-05, 'epoch': 0.3}


  3%|▎         | 190/5960 [01:05<30:57,  3.11it/s]

{'loss': 0.6103, 'learning_rate': 4.840604026845638e-05, 'epoch': 0.32}


  3%|▎         | 200/5960 [01:08<30:56,  3.10it/s]

{'loss': 0.6041, 'learning_rate': 4.832214765100672e-05, 'epoch': 0.34}


  4%|▎         | 210/5960 [01:11<32:30,  2.95it/s]

{'loss': 0.5319, 'learning_rate': 4.823825503355705e-05, 'epoch': 0.35}


  4%|▎         | 220/5960 [01:15<31:30,  3.04it/s]

{'loss': 0.4965, 'learning_rate': 4.8154362416107385e-05, 'epoch': 0.37}


  4%|▍         | 230/5960 [01:18<31:12,  3.06it/s]

{'loss': 0.5534, 'learning_rate': 4.807046979865772e-05, 'epoch': 0.39}


  4%|▍         | 240/5960 [01:21<31:34,  3.02it/s]

{'loss': 0.6059, 'learning_rate': 4.798657718120805e-05, 'epoch': 0.4}


  4%|▍         | 250/5960 [01:25<32:06,  2.96it/s]

{'loss': 0.619, 'learning_rate': 4.790268456375839e-05, 'epoch': 0.42}


  4%|▍         | 260/5960 [01:28<32:10,  2.95it/s]

{'loss': 0.6074, 'learning_rate': 4.7818791946308725e-05, 'epoch': 0.44}


  5%|▍         | 270/5960 [01:31<31:17,  3.03it/s]

{'loss': 0.6065, 'learning_rate': 4.7734899328859064e-05, 'epoch': 0.45}


  5%|▍         | 280/5960 [01:35<31:51,  2.97it/s]

{'loss': 0.5788, 'learning_rate': 4.76510067114094e-05, 'epoch': 0.47}


  5%|▍         | 290/5960 [01:38<31:53,  2.96it/s]

{'loss': 0.4919, 'learning_rate': 4.756711409395974e-05, 'epoch': 0.49}


  5%|▌         | 300/5960 [01:41<31:18,  3.01it/s]

{'loss': 0.4392, 'learning_rate': 4.748322147651007e-05, 'epoch': 0.5}


  5%|▌         | 310/5960 [01:45<31:22,  3.00it/s]

{'loss': 0.5987, 'learning_rate': 4.7399328859060404e-05, 'epoch': 0.52}


  5%|▌         | 320/5960 [01:48<31:50,  2.95it/s]

{'loss': 0.456, 'learning_rate': 4.731543624161074e-05, 'epoch': 0.54}


  6%|▌         | 330/5960 [01:51<30:29,  3.08it/s]

{'loss': 0.5492, 'learning_rate': 4.723154362416108e-05, 'epoch': 0.55}


  6%|▌         | 340/5960 [01:55<30:09,  3.11it/s]

{'loss': 0.6306, 'learning_rate': 4.714765100671141e-05, 'epoch': 0.57}


  6%|▌         | 350/5960 [01:58<30:58,  3.02it/s]

{'loss': 0.5797, 'learning_rate': 4.706375838926175e-05, 'epoch': 0.59}


  6%|▌         | 360/5960 [02:01<30:45,  3.03it/s]

{'loss': 0.6555, 'learning_rate': 4.697986577181208e-05, 'epoch': 0.6}


  6%|▌         | 370/5960 [02:05<31:00,  3.00it/s]

{'loss': 0.5619, 'learning_rate': 4.6895973154362416e-05, 'epoch': 0.62}


  6%|▋         | 380/5960 [02:08<31:22,  2.96it/s]

{'loss': 0.558, 'learning_rate': 4.6812080536912756e-05, 'epoch': 0.64}


  7%|▋         | 390/5960 [02:11<30:43,  3.02it/s]

{'loss': 0.5382, 'learning_rate': 4.672818791946309e-05, 'epoch': 0.65}


  7%|▋         | 400/5960 [02:15<30:40,  3.02it/s]

{'loss': 0.3904, 'learning_rate': 4.664429530201342e-05, 'epoch': 0.67}


  7%|▋         | 410/5960 [02:18<30:09,  3.07it/s]

{'loss': 0.7382, 'learning_rate': 4.6560402684563755e-05, 'epoch': 0.69}


  7%|▋         | 420/5960 [02:21<31:06,  2.97it/s]

{'loss': 0.668, 'learning_rate': 4.6476510067114095e-05, 'epoch': 0.7}


  7%|▋         | 430/5960 [02:25<30:44,  3.00it/s]

{'loss': 0.4654, 'learning_rate': 4.6392617449664435e-05, 'epoch': 0.72}


  7%|▋         | 440/5960 [02:28<30:25,  3.02it/s]

{'loss': 0.4999, 'learning_rate': 4.630872483221477e-05, 'epoch': 0.74}


  8%|▊         | 450/5960 [02:31<30:47,  2.98it/s]

{'loss': 0.6326, 'learning_rate': 4.62248322147651e-05, 'epoch': 0.76}


  8%|▊         | 460/5960 [02:35<30:05,  3.05it/s]

{'loss': 0.548, 'learning_rate': 4.6140939597315434e-05, 'epoch': 0.77}


  8%|▊         | 470/5960 [02:38<29:31,  3.10it/s]

{'loss': 0.5447, 'learning_rate': 4.6057046979865774e-05, 'epoch': 0.79}


  8%|▊         | 480/5960 [02:41<30:49,  2.96it/s]

{'loss': 0.5808, 'learning_rate': 4.597315436241611e-05, 'epoch': 0.81}


  8%|▊         | 490/5960 [02:45<30:50,  2.96it/s]

{'loss': 0.4453, 'learning_rate': 4.588926174496645e-05, 'epoch': 0.82}


  8%|▊         | 500/5960 [02:48<30:16,  3.01it/s]

{'loss': 0.5175, 'learning_rate': 4.580536912751678e-05, 'epoch': 0.84}


  9%|▊         | 510/5960 [02:51<30:03,  3.02it/s]

{'loss': 0.7127, 'learning_rate': 4.572147651006712e-05, 'epoch': 0.86}


  9%|▊         | 520/5960 [02:55<30:07,  3.01it/s]

{'loss': 0.4962, 'learning_rate': 4.5637583892617453e-05, 'epoch': 0.87}


  9%|▉         | 530/5960 [02:58<30:10,  3.00it/s]

{'loss': 0.4592, 'learning_rate': 4.5553691275167787e-05, 'epoch': 0.89}


  9%|▉         | 540/5960 [03:01<30:08,  3.00it/s]

{'loss': 0.6203, 'learning_rate': 4.546979865771812e-05, 'epoch': 0.91}


  9%|▉         | 550/5960 [03:05<30:08,  2.99it/s]

{'loss': 0.4632, 'learning_rate': 4.538590604026846e-05, 'epoch': 0.92}


  9%|▉         | 560/5960 [03:08<30:00,  3.00it/s]

{'loss': 0.5598, 'learning_rate': 4.530201342281879e-05, 'epoch': 0.94}


 10%|▉         | 570/5960 [03:11<29:56,  3.00it/s]

{'loss': 0.5667, 'learning_rate': 4.521812080536913e-05, 'epoch': 0.96}


 10%|▉         | 580/5960 [03:15<29:54,  3.00it/s]

{'loss': 0.526, 'learning_rate': 4.5134228187919466e-05, 'epoch': 0.97}


 10%|▉         | 590/5960 [03:18<29:51,  3.00it/s]

{'loss': 0.6093, 'learning_rate': 4.50503355704698e-05, 'epoch': 0.99}


                                                  
 10%|█         | 596/5960 [03:34<29:52,  2.99it/s]

{'eval_loss': 0.5204587578773499, 'eval_runtime': 14.1836, 'eval_samples_per_second': 84.041, 'eval_steps_per_second': 10.505, 'epoch': 1.0}


 10%|█         | 600/5960 [03:47<4:22:52,  2.94s/it] 

{'loss': 0.3943, 'learning_rate': 4.496644295302014e-05, 'epoch': 1.01}


 10%|█         | 610/5960 [03:50<35:50,  2.49it/s]  

{'loss': 0.4076, 'learning_rate': 4.488255033557047e-05, 'epoch': 1.02}


 10%|█         | 620/5960 [03:53<29:28,  3.02it/s]

{'loss': 0.5285, 'learning_rate': 4.4798657718120805e-05, 'epoch': 1.04}


 11%|█         | 630/5960 [03:56<29:15,  3.04it/s]

{'loss': 0.4672, 'learning_rate': 4.471476510067114e-05, 'epoch': 1.06}


 11%|█         | 640/5960 [04:00<29:47,  2.98it/s]

{'loss': 0.6582, 'learning_rate': 4.463087248322148e-05, 'epoch': 1.07}


 11%|█         | 650/5960 [04:03<29:33,  2.99it/s]

{'loss': 0.588, 'learning_rate': 4.454697986577182e-05, 'epoch': 1.09}


 11%|█         | 660/5960 [04:07<29:27,  3.00it/s]

{'loss': 0.4562, 'learning_rate': 4.446308724832215e-05, 'epoch': 1.11}


 11%|█         | 670/5960 [04:10<29:24,  3.00it/s]

{'loss': 0.4189, 'learning_rate': 4.4379194630872484e-05, 'epoch': 1.12}


 11%|█▏        | 680/5960 [04:13<29:21,  3.00it/s]

{'loss': 0.4331, 'learning_rate': 4.4295302013422824e-05, 'epoch': 1.14}


 12%|█▏        | 690/5960 [04:17<29:18,  3.00it/s]

{'loss': 0.3277, 'learning_rate': 4.421140939597316e-05, 'epoch': 1.16}


 12%|█▏        | 700/5960 [04:20<29:15,  3.00it/s]

{'loss': 0.4241, 'learning_rate': 4.412751677852349e-05, 'epoch': 1.17}


 12%|█▏        | 710/5960 [04:23<29:10,  3.00it/s]

{'loss': 0.4664, 'learning_rate': 4.4043624161073823e-05, 'epoch': 1.19}


 12%|█▏        | 720/5960 [04:27<29:11,  2.99it/s]

{'loss': 0.4213, 'learning_rate': 4.395973154362416e-05, 'epoch': 1.21}


 12%|█▏        | 730/5960 [04:30<29:05,  3.00it/s]

{'loss': 0.4764, 'learning_rate': 4.38758389261745e-05, 'epoch': 1.22}


 12%|█▏        | 740/5960 [04:33<29:00,  3.00it/s]

{'loss': 0.4214, 'learning_rate': 4.3791946308724836e-05, 'epoch': 1.24}


 13%|█▎        | 750/5960 [04:37<29:01,  2.99it/s]

{'loss': 0.5531, 'learning_rate': 4.370805369127517e-05, 'epoch': 1.26}


 13%|█▎        | 760/5960 [04:40<28:57,  2.99it/s]

{'loss': 0.454, 'learning_rate': 4.36241610738255e-05, 'epoch': 1.28}


 13%|█▎        | 770/5960 [04:43<28:51,  3.00it/s]

{'loss': 0.5185, 'learning_rate': 4.354026845637584e-05, 'epoch': 1.29}


 13%|█▎        | 780/5960 [04:47<28:54,  2.99it/s]

{'loss': 0.5266, 'learning_rate': 4.3456375838926176e-05, 'epoch': 1.31}


 13%|█▎        | 790/5960 [04:50<28:47,  2.99it/s]

{'loss': 0.4075, 'learning_rate': 4.337248322147651e-05, 'epoch': 1.33}


 13%|█▎        | 800/5960 [04:53<28:46,  2.99it/s]

{'loss': 0.3848, 'learning_rate': 4.328859060402685e-05, 'epoch': 1.34}


 14%|█▎        | 810/5960 [04:57<28:31,  3.01it/s]

{'loss': 0.5196, 'learning_rate': 4.320469798657718e-05, 'epoch': 1.36}


 14%|█▍        | 820/5960 [05:00<28:16,  3.03it/s]

{'loss': 0.3876, 'learning_rate': 4.312080536912752e-05, 'epoch': 1.38}


 14%|█▍        | 830/5960 [05:03<28:15,  3.03it/s]

{'loss': 0.5198, 'learning_rate': 4.3036912751677855e-05, 'epoch': 1.39}


 14%|█▍        | 840/5960 [05:07<28:11,  3.03it/s]

{'loss': 0.3543, 'learning_rate': 4.295302013422819e-05, 'epoch': 1.41}


 14%|█▍        | 850/5960 [05:10<28:11,  3.02it/s]

{'loss': 0.3707, 'learning_rate': 4.286912751677852e-05, 'epoch': 1.43}


 14%|█▍        | 860/5960 [05:13<28:02,  3.03it/s]

{'loss': 0.2984, 'learning_rate': 4.278523489932886e-05, 'epoch': 1.44}


 15%|█▍        | 870/5960 [05:17<28:04,  3.02it/s]

{'loss': 0.3569, 'learning_rate': 4.27013422818792e-05, 'epoch': 1.46}


 15%|█▍        | 880/5960 [05:20<28:00,  3.02it/s]

{'loss': 0.563, 'learning_rate': 4.2617449664429534e-05, 'epoch': 1.48}


 15%|█▍        | 890/5960 [05:23<27:55,  3.03it/s]

{'loss': 0.4669, 'learning_rate': 4.253355704697987e-05, 'epoch': 1.49}


 15%|█▌        | 900/5960 [05:26<27:53,  3.02it/s]

{'loss': 0.5945, 'learning_rate': 4.244966442953021e-05, 'epoch': 1.51}


 15%|█▌        | 910/5960 [05:30<27:52,  3.02it/s]

{'loss': 0.492, 'learning_rate': 4.236577181208054e-05, 'epoch': 1.53}


 15%|█▌        | 920/5960 [05:33<27:48,  3.02it/s]

{'loss': 0.4548, 'learning_rate': 4.228187919463087e-05, 'epoch': 1.54}


 16%|█▌        | 930/5960 [05:36<27:46,  3.02it/s]

{'loss': 0.4448, 'learning_rate': 4.2197986577181206e-05, 'epoch': 1.56}


 16%|█▌        | 940/5960 [05:40<27:42,  3.02it/s]

{'loss': 0.5355, 'learning_rate': 4.2114093959731546e-05, 'epoch': 1.58}


 16%|█▌        | 950/5960 [05:43<27:39,  3.02it/s]

{'loss': 0.4331, 'learning_rate': 4.2030201342281886e-05, 'epoch': 1.59}


 16%|█▌        | 960/5960 [05:46<27:36,  3.02it/s]

{'loss': 0.5382, 'learning_rate': 4.194630872483222e-05, 'epoch': 1.61}


 16%|█▋        | 970/5960 [05:50<27:34,  3.02it/s]

{'loss': 0.3868, 'learning_rate': 4.186241610738255e-05, 'epoch': 1.63}


 16%|█▋        | 980/5960 [05:53<27:30,  3.02it/s]

{'loss': 0.3598, 'learning_rate': 4.1778523489932886e-05, 'epoch': 1.64}


 17%|█▋        | 990/5960 [05:56<27:24,  3.02it/s]

{'loss': 0.4966, 'learning_rate': 4.1694630872483225e-05, 'epoch': 1.66}


 17%|█▋        | 1000/5960 [06:00<27:23,  3.02it/s]

{'loss': 0.5458, 'learning_rate': 4.161073825503356e-05, 'epoch': 1.68}


 17%|█▋        | 1010/5960 [06:03<27:18,  3.02it/s]

{'loss': 0.3771, 'learning_rate': 4.152684563758389e-05, 'epoch': 1.69}


 17%|█▋        | 1020/5960 [06:06<27:15,  3.02it/s]

{'loss': 0.4944, 'learning_rate': 4.144295302013423e-05, 'epoch': 1.71}


 17%|█▋        | 1030/5960 [06:10<27:13,  3.02it/s]

{'loss': 0.342, 'learning_rate': 4.135906040268457e-05, 'epoch': 1.73}


 17%|█▋        | 1040/5960 [06:13<27:07,  3.02it/s]

{'loss': 0.52, 'learning_rate': 4.1275167785234905e-05, 'epoch': 1.74}


 18%|█▊        | 1050/5960 [06:16<27:05,  3.02it/s]

{'loss': 0.5246, 'learning_rate': 4.119127516778524e-05, 'epoch': 1.76}


 18%|█▊        | 1060/5960 [06:20<27:03,  3.02it/s]

{'loss': 0.4402, 'learning_rate': 4.110738255033557e-05, 'epoch': 1.78}


 18%|█▊        | 1070/5960 [06:23<26:57,  3.02it/s]

{'loss': 0.4402, 'learning_rate': 4.1023489932885904e-05, 'epoch': 1.8}


 18%|█▊        | 1080/5960 [06:26<26:56,  3.02it/s]

{'loss': 0.4821, 'learning_rate': 4.0939597315436244e-05, 'epoch': 1.81}


 18%|█▊        | 1090/5960 [06:29<26:52,  3.02it/s]

{'loss': 0.4883, 'learning_rate': 4.085570469798658e-05, 'epoch': 1.83}


 18%|█▊        | 1100/5960 [06:33<26:49,  3.02it/s]

{'loss': 0.6476, 'learning_rate': 4.077181208053692e-05, 'epoch': 1.85}


 19%|█▊        | 1110/5960 [06:36<26:40,  3.03it/s]

{'loss': 0.5467, 'learning_rate': 4.068791946308725e-05, 'epoch': 1.86}


 19%|█▉        | 1120/5960 [06:39<26:38,  3.03it/s]

{'loss': 0.5761, 'learning_rate': 4.060402684563759e-05, 'epoch': 1.88}


 19%|█▉        | 1130/5960 [06:43<26:33,  3.03it/s]

{'loss': 0.4069, 'learning_rate': 4.052013422818792e-05, 'epoch': 1.9}


 19%|█▉        | 1140/5960 [06:46<26:30,  3.03it/s]

{'loss': 0.4784, 'learning_rate': 4.0436241610738256e-05, 'epoch': 1.91}


 19%|█▉        | 1150/5960 [06:49<26:29,  3.03it/s]

{'loss': 0.3883, 'learning_rate': 4.035234899328859e-05, 'epoch': 1.93}


 19%|█▉        | 1160/5960 [06:53<26:25,  3.03it/s]

{'loss': 0.3761, 'learning_rate': 4.026845637583892e-05, 'epoch': 1.95}


 20%|█▉        | 1170/5960 [06:56<26:23,  3.03it/s]

{'loss': 0.2679, 'learning_rate': 4.018456375838926e-05, 'epoch': 1.96}


 20%|█▉        | 1180/5960 [06:59<26:15,  3.03it/s]

{'loss': 0.4814, 'learning_rate': 4.01006711409396e-05, 'epoch': 1.98}


 20%|█▉        | 1190/5960 [07:03<26:14,  3.03it/s]

{'loss': 0.4325, 'learning_rate': 4.0016778523489935e-05, 'epoch': 2.0}


                                                   
 20%|██        | 1192/5960 [07:17<26:20,  3.02it/s]

{'eval_loss': 0.45159804821014404, 'eval_runtime': 14.0208, 'eval_samples_per_second': 85.017, 'eval_steps_per_second': 10.627, 'epoch': 2.0}


 20%|██        | 1200/5960 [07:28<1:09:45,  1.14it/s]

{'loss': 0.4156, 'learning_rate': 3.993288590604027e-05, 'epoch': 2.01}


 20%|██        | 1210/5960 [07:31<27:19,  2.90it/s]  

{'loss': 0.3671, 'learning_rate': 3.984899328859061e-05, 'epoch': 2.03}


 20%|██        | 1220/5960 [07:35<26:07,  3.02it/s]

{'loss': 0.3493, 'learning_rate': 3.976510067114094e-05, 'epoch': 2.05}


 21%|██        | 1230/5960 [07:38<26:05,  3.02it/s]

{'loss': 0.4182, 'learning_rate': 3.9681208053691275e-05, 'epoch': 2.06}


 21%|██        | 1240/5960 [07:41<26:12,  3.00it/s]

{'loss': 0.3443, 'learning_rate': 3.959731543624161e-05, 'epoch': 2.08}


 21%|██        | 1250/5960 [07:45<26:13,  2.99it/s]

{'loss': 0.3665, 'learning_rate': 3.951342281879195e-05, 'epoch': 2.1}


 21%|██        | 1260/5960 [07:48<26:10,  2.99it/s]

{'loss': 0.3812, 'learning_rate': 3.942953020134229e-05, 'epoch': 2.11}


 21%|██▏       | 1270/5960 [07:51<26:05,  3.00it/s]

{'loss': 0.4428, 'learning_rate': 3.934563758389262e-05, 'epoch': 2.13}


 21%|██▏       | 1280/5960 [07:55<26:03,  2.99it/s]

{'loss': 0.4686, 'learning_rate': 3.9261744966442954e-05, 'epoch': 2.15}


 22%|██▏       | 1290/5960 [07:58<25:45,  3.02it/s]

{'loss': 0.3337, 'learning_rate': 3.917785234899329e-05, 'epoch': 2.16}


 22%|██▏       | 1300/5960 [08:01<25:42,  3.02it/s]

{'loss': 0.3147, 'learning_rate': 3.909395973154363e-05, 'epoch': 2.18}


 22%|██▏       | 1310/5960 [08:05<25:40,  3.02it/s]

{'loss': 0.3487, 'learning_rate': 3.901006711409396e-05, 'epoch': 2.2}


 22%|██▏       | 1320/5960 [08:08<25:38,  3.02it/s]

{'loss': 0.2655, 'learning_rate': 3.89261744966443e-05, 'epoch': 2.21}


 22%|██▏       | 1330/5960 [08:11<25:33,  3.02it/s]

{'loss': 0.4278, 'learning_rate': 3.884228187919463e-05, 'epoch': 2.23}


 22%|██▏       | 1340/5960 [08:15<25:30,  3.02it/s]

{'loss': 0.4372, 'learning_rate': 3.875838926174497e-05, 'epoch': 2.25}


 23%|██▎       | 1350/5960 [08:18<25:22,  3.03it/s]

{'loss': 0.2489, 'learning_rate': 3.8674496644295306e-05, 'epoch': 2.27}


 23%|██▎       | 1360/5960 [08:21<25:19,  3.03it/s]

{'loss': 0.4273, 'learning_rate': 3.859060402684564e-05, 'epoch': 2.28}


 23%|██▎       | 1370/5960 [08:24<25:16,  3.03it/s]

{'loss': 0.3609, 'learning_rate': 3.850671140939597e-05, 'epoch': 2.3}


 23%|██▎       | 1380/5960 [08:28<25:13,  3.03it/s]

{'loss': 0.2569, 'learning_rate': 3.8422818791946305e-05, 'epoch': 2.32}


 23%|██▎       | 1390/5960 [08:31<25:08,  3.03it/s]

{'loss': 0.3744, 'learning_rate': 3.8338926174496645e-05, 'epoch': 2.33}


 23%|██▎       | 1400/5960 [08:34<25:08,  3.02it/s]

{'loss': 0.3686, 'learning_rate': 3.8255033557046985e-05, 'epoch': 2.35}


 24%|██▎       | 1410/5960 [08:38<25:04,  3.02it/s]

{'loss': 0.2736, 'learning_rate': 3.817114093959732e-05, 'epoch': 2.37}


 24%|██▍       | 1420/5960 [08:41<25:01,  3.02it/s]

{'loss': 0.3327, 'learning_rate': 3.808724832214765e-05, 'epoch': 2.38}


 24%|██▍       | 1430/5960 [08:44<24:59,  3.02it/s]

{'loss': 0.3897, 'learning_rate': 3.800335570469799e-05, 'epoch': 2.4}


 24%|██▍       | 1440/5960 [08:48<24:53,  3.03it/s]

{'loss': 0.2053, 'learning_rate': 3.7919463087248324e-05, 'epoch': 2.42}


 24%|██▍       | 1450/5960 [08:51<24:51,  3.02it/s]

{'loss': 0.3828, 'learning_rate': 3.783557046979866e-05, 'epoch': 2.43}


 24%|██▍       | 1460/5960 [08:54<24:49,  3.02it/s]

{'loss': 0.2424, 'learning_rate': 3.775167785234899e-05, 'epoch': 2.45}


 25%|██▍       | 1470/5960 [08:58<24:47,  3.02it/s]

{'loss': 0.3149, 'learning_rate': 3.766778523489933e-05, 'epoch': 2.47}


 25%|██▍       | 1480/5960 [09:01<24:43,  3.02it/s]

{'loss': 0.3835, 'learning_rate': 3.758389261744967e-05, 'epoch': 2.48}


 25%|██▌       | 1490/5960 [09:04<24:39,  3.02it/s]

{'loss': 0.2995, 'learning_rate': 3.7500000000000003e-05, 'epoch': 2.5}


 25%|██▌       | 1500/5960 [09:08<24:33,  3.03it/s]

{'loss': 0.1643, 'learning_rate': 3.741610738255034e-05, 'epoch': 2.52}


 25%|██▌       | 1510/5960 [09:11<24:31,  3.02it/s]

{'loss': 0.2933, 'learning_rate': 3.733221476510067e-05, 'epoch': 2.53}


 26%|██▌       | 1520/5960 [09:14<24:28,  3.02it/s]

{'loss': 0.4803, 'learning_rate': 3.724832214765101e-05, 'epoch': 2.55}


 26%|██▌       | 1530/5960 [09:17<24:27,  3.02it/s]

{'loss': 0.6758, 'learning_rate': 3.716442953020134e-05, 'epoch': 2.57}


 26%|██▌       | 1540/5960 [09:21<24:21,  3.03it/s]

{'loss': 0.3428, 'learning_rate': 3.7080536912751676e-05, 'epoch': 2.58}


 26%|██▌       | 1550/5960 [09:24<24:17,  3.03it/s]

{'loss': 0.3534, 'learning_rate': 3.6996644295302016e-05, 'epoch': 2.6}


 26%|██▌       | 1560/5960 [09:27<24:16,  3.02it/s]

{'loss': 0.3565, 'learning_rate': 3.6912751677852356e-05, 'epoch': 2.62}


 26%|██▋       | 1570/5960 [09:31<24:13,  3.02it/s]

{'loss': 0.3278, 'learning_rate': 3.682885906040269e-05, 'epoch': 2.63}


 27%|██▋       | 1580/5960 [09:34<24:10,  3.02it/s]

{'loss': 0.3465, 'learning_rate': 3.674496644295302e-05, 'epoch': 2.65}


 27%|██▋       | 1590/5960 [09:37<24:05,  3.02it/s]

{'loss': 0.4149, 'learning_rate': 3.6661073825503355e-05, 'epoch': 2.67}


 27%|██▋       | 1600/5960 [09:41<24:01,  3.02it/s]

{'loss': 0.3662, 'learning_rate': 3.6577181208053695e-05, 'epoch': 2.68}


 27%|██▋       | 1610/5960 [09:44<24:01,  3.02it/s]

{'loss': 0.4186, 'learning_rate': 3.649328859060403e-05, 'epoch': 2.7}


 27%|██▋       | 1620/5960 [09:47<23:57,  3.02it/s]

{'loss': 0.3205, 'learning_rate': 3.640939597315436e-05, 'epoch': 2.72}


 27%|██▋       | 1630/5960 [09:51<23:54,  3.02it/s]

{'loss': 0.2837, 'learning_rate': 3.63255033557047e-05, 'epoch': 2.73}


 28%|██▊       | 1640/5960 [09:54<23:51,  3.02it/s]

{'loss': 0.2574, 'learning_rate': 3.6241610738255034e-05, 'epoch': 2.75}


 28%|██▊       | 1650/5960 [09:57<23:48,  3.02it/s]

{'loss': 0.3659, 'learning_rate': 3.6157718120805374e-05, 'epoch': 2.77}


 28%|██▊       | 1660/5960 [10:01<23:43,  3.02it/s]

{'loss': 0.2937, 'learning_rate': 3.607382550335571e-05, 'epoch': 2.79}


 28%|██▊       | 1670/5960 [10:04<23:39,  3.02it/s]

{'loss': 0.3461, 'learning_rate': 3.598993288590604e-05, 'epoch': 2.8}


 28%|██▊       | 1680/5960 [10:07<23:35,  3.02it/s]

{'loss': 0.1869, 'learning_rate': 3.5906040268456373e-05, 'epoch': 2.82}


 28%|██▊       | 1690/5960 [10:11<23:38,  3.01it/s]

{'loss': 0.28, 'learning_rate': 3.582214765100671e-05, 'epoch': 2.84}


 29%|██▊       | 1700/5960 [10:14<23:37,  3.01it/s]

{'loss': 0.5199, 'learning_rate': 3.5738255033557046e-05, 'epoch': 2.85}


 29%|██▊       | 1710/5960 [10:17<23:34,  3.01it/s]

{'loss': 0.4237, 'learning_rate': 3.5654362416107386e-05, 'epoch': 2.87}


 29%|██▉       | 1720/5960 [10:21<23:31,  3.00it/s]

{'loss': 0.4343, 'learning_rate': 3.557046979865772e-05, 'epoch': 2.89}


 29%|██▉       | 1730/5960 [10:24<23:25,  3.01it/s]

{'loss': 0.3826, 'learning_rate': 3.548657718120805e-05, 'epoch': 2.9}


 29%|██▉       | 1740/5960 [10:27<23:25,  3.00it/s]

{'loss': 0.3038, 'learning_rate': 3.540268456375839e-05, 'epoch': 2.92}


 29%|██▉       | 1750/5960 [10:31<23:22,  3.00it/s]

{'loss': 0.4024, 'learning_rate': 3.5318791946308726e-05, 'epoch': 2.94}


 30%|██▉       | 1760/5960 [10:34<23:35,  2.97it/s]

{'loss': 0.2992, 'learning_rate': 3.523489932885906e-05, 'epoch': 2.95}


 30%|██▉       | 1770/5960 [10:37<23:24,  2.98it/s]

{'loss': 0.37, 'learning_rate': 3.51510067114094e-05, 'epoch': 2.97}


 30%|██▉       | 1780/5960 [10:41<24:10,  2.88it/s]

{'loss': 0.3434, 'learning_rate': 3.506711409395974e-05, 'epoch': 2.99}


                                                   
 30%|███       | 1788/5960 [10:58<24:38,  2.82it/s]

{'eval_loss': 0.4070431888103485, 'eval_runtime': 14.5002, 'eval_samples_per_second': 82.206, 'eval_steps_per_second': 10.276, 'epoch': 3.0}


 30%|███       | 1790/5960 [11:06<5:36:48,  4.85s/it]

{'loss': 0.4231, 'learning_rate': 3.498322147651007e-05, 'epoch': 3.0}


 30%|███       | 1800/5960 [11:09<31:59,  2.17it/s]  

{'loss': 0.3243, 'learning_rate': 3.4899328859060405e-05, 'epoch': 3.02}


 30%|███       | 1810/5960 [11:12<23:06,  2.99it/s]

{'loss': 0.3007, 'learning_rate': 3.481543624161074e-05, 'epoch': 3.04}


 31%|███       | 1820/5960 [11:16<22:44,  3.03it/s]

{'loss': 0.2945, 'learning_rate': 3.473154362416108e-05, 'epoch': 3.05}


 31%|███       | 1830/5960 [11:19<22:42,  3.03it/s]

{'loss': 0.3584, 'learning_rate': 3.464765100671141e-05, 'epoch': 3.07}


 31%|███       | 1840/5960 [11:22<22:40,  3.03it/s]

{'loss': 0.3417, 'learning_rate': 3.4563758389261744e-05, 'epoch': 3.09}


 31%|███       | 1850/5960 [11:26<22:37,  3.03it/s]

{'loss': 0.1368, 'learning_rate': 3.4479865771812084e-05, 'epoch': 3.1}


 31%|███       | 1860/5960 [11:29<22:35,  3.03it/s]

{'loss': 0.3651, 'learning_rate': 3.439597315436242e-05, 'epoch': 3.12}


 31%|███▏      | 1870/5960 [11:32<22:32,  3.02it/s]

{'loss': 0.2797, 'learning_rate': 3.431208053691276e-05, 'epoch': 3.14}


 32%|███▏      | 1880/5960 [11:35<22:27,  3.03it/s]

{'loss': 0.1995, 'learning_rate': 3.422818791946309e-05, 'epoch': 3.15}


 32%|███▏      | 1890/5960 [11:39<22:26,  3.02it/s]

{'loss': 0.3715, 'learning_rate': 3.414429530201342e-05, 'epoch': 3.17}


 32%|███▏      | 1900/5960 [11:42<22:22,  3.03it/s]

{'loss': 0.249, 'learning_rate': 3.4060402684563756e-05, 'epoch': 3.19}


 32%|███▏      | 1910/5960 [11:45<22:18,  3.03it/s]

{'loss': 0.2175, 'learning_rate': 3.3976510067114096e-05, 'epoch': 3.2}


 32%|███▏      | 1920/5960 [11:49<22:16,  3.02it/s]

{'loss': 0.2589, 'learning_rate': 3.389261744966443e-05, 'epoch': 3.22}


 32%|███▏      | 1930/5960 [11:52<22:10,  3.03it/s]

{'loss': 0.4388, 'learning_rate': 3.380872483221477e-05, 'epoch': 3.24}


 33%|███▎      | 1940/5960 [11:55<22:10,  3.02it/s]

{'loss': 0.3701, 'learning_rate': 3.37248322147651e-05, 'epoch': 3.26}


 33%|███▎      | 1950/5960 [11:59<22:05,  3.03it/s]

{'loss': 0.1924, 'learning_rate': 3.3640939597315436e-05, 'epoch': 3.27}


 33%|███▎      | 1960/5960 [12:02<22:00,  3.03it/s]

{'loss': 0.2505, 'learning_rate': 3.3557046979865775e-05, 'epoch': 3.29}


 33%|███▎      | 1970/5960 [12:05<22:11,  3.00it/s]

{'loss': 0.2157, 'learning_rate': 3.347315436241611e-05, 'epoch': 3.31}


 33%|███▎      | 1980/5960 [12:09<22:38,  2.93it/s]

{'loss': 0.2629, 'learning_rate': 3.338926174496644e-05, 'epoch': 3.32}


 33%|███▎      | 1990/5960 [12:12<22:37,  2.92it/s]

{'loss': 0.3606, 'learning_rate': 3.3305369127516775e-05, 'epoch': 3.34}


 34%|███▎      | 2000/5960 [12:15<21:31,  3.07it/s]

{'loss': 0.2484, 'learning_rate': 3.3221476510067115e-05, 'epoch': 3.36}


 34%|███▎      | 2010/5960 [12:19<22:33,  2.92it/s]

{'loss': 0.3412, 'learning_rate': 3.3137583892617455e-05, 'epoch': 3.37}


 34%|███▍      | 2020/5960 [12:22<21:39,  3.03it/s]

{'loss': 0.2873, 'learning_rate': 3.305369127516779e-05, 'epoch': 3.39}


 34%|███▍      | 2030/5960 [12:25<21:19,  3.07it/s]

{'loss': 0.1481, 'learning_rate': 3.296979865771812e-05, 'epoch': 3.41}


 34%|███▍      | 2040/5960 [12:29<21:16,  3.07it/s]

{'loss': 0.3721, 'learning_rate': 3.288590604026846e-05, 'epoch': 3.42}


 34%|███▍      | 2050/5960 [12:32<21:12,  3.07it/s]

{'loss': 0.1875, 'learning_rate': 3.2802013422818794e-05, 'epoch': 3.44}


 35%|███▍      | 2060/5960 [12:35<21:09,  3.07it/s]

{'loss': 0.2813, 'learning_rate': 3.271812080536913e-05, 'epoch': 3.46}


 35%|███▍      | 2070/5960 [12:38<21:06,  3.07it/s]

{'loss': 0.1769, 'learning_rate': 3.263422818791946e-05, 'epoch': 3.47}


 35%|███▍      | 2080/5960 [12:42<21:04,  3.07it/s]

{'loss': 0.3946, 'learning_rate': 3.25503355704698e-05, 'epoch': 3.49}


 35%|███▌      | 2090/5960 [12:45<20:58,  3.08it/s]

{'loss': 0.2349, 'learning_rate': 3.246644295302014e-05, 'epoch': 3.51}


 35%|███▌      | 2100/5960 [12:48<21:01,  3.06it/s]

{'loss': 0.375, 'learning_rate': 3.238255033557047e-05, 'epoch': 3.52}


 35%|███▌      | 2110/5960 [12:51<20:54,  3.07it/s]

{'loss': 0.4696, 'learning_rate': 3.2298657718120806e-05, 'epoch': 3.54}


 36%|███▌      | 2120/5960 [12:55<20:52,  3.07it/s]

{'loss': 0.2809, 'learning_rate': 3.221476510067114e-05, 'epoch': 3.56}


 36%|███▌      | 2130/5960 [12:58<20:50,  3.06it/s]

{'loss': 0.3537, 'learning_rate': 3.213087248322148e-05, 'epoch': 3.57}


 36%|███▌      | 2140/5960 [13:01<20:47,  3.06it/s]

{'loss': 0.2038, 'learning_rate': 3.204697986577181e-05, 'epoch': 3.59}


 36%|███▌      | 2150/5960 [13:05<20:45,  3.06it/s]

{'loss': 0.5674, 'learning_rate': 3.196308724832215e-05, 'epoch': 3.61}


 36%|███▌      | 2160/5960 [13:08<20:39,  3.07it/s]

{'loss': 0.2833, 'learning_rate': 3.1879194630872485e-05, 'epoch': 3.62}


 36%|███▋      | 2170/5960 [13:11<20:36,  3.07it/s]

{'loss': 0.4061, 'learning_rate': 3.1795302013422825e-05, 'epoch': 3.64}


 37%|███▋      | 2180/5960 [13:14<20:47,  3.03it/s]

{'loss': 0.1938, 'learning_rate': 3.171140939597316e-05, 'epoch': 3.66}


 37%|███▋      | 2190/5960 [13:18<21:22,  2.94it/s]

{'loss': 0.1412, 'learning_rate': 3.162751677852349e-05, 'epoch': 3.67}


 37%|███▋      | 2200/5960 [13:21<21:09,  2.96it/s]

{'loss': 0.285, 'learning_rate': 3.1543624161073825e-05, 'epoch': 3.69}


 37%|███▋      | 2210/5960 [13:25<20:30,  3.05it/s]

{'loss': 0.3488, 'learning_rate': 3.145973154362416e-05, 'epoch': 3.71}


 37%|███▋      | 2220/5960 [13:28<20:21,  3.06it/s]

{'loss': 0.286, 'learning_rate': 3.13758389261745e-05, 'epoch': 3.72}


 37%|███▋      | 2230/5960 [13:31<20:17,  3.06it/s]

{'loss': 0.2317, 'learning_rate': 3.129194630872484e-05, 'epoch': 3.74}


 38%|███▊      | 2240/5960 [13:34<20:14,  3.06it/s]

{'loss': 0.3048, 'learning_rate': 3.120805369127517e-05, 'epoch': 3.76}


 38%|███▊      | 2250/5960 [13:38<20:11,  3.06it/s]

{'loss': 0.2226, 'learning_rate': 3.1124161073825504e-05, 'epoch': 3.78}


 38%|███▊      | 2260/5960 [13:41<20:08,  3.06it/s]

{'loss': 0.3711, 'learning_rate': 3.1040268456375844e-05, 'epoch': 3.79}


 38%|███▊      | 2270/5960 [13:44<20:04,  3.06it/s]

{'loss': 0.3091, 'learning_rate': 3.095637583892618e-05, 'epoch': 3.81}


 38%|███▊      | 2280/5960 [13:47<20:00,  3.07it/s]

{'loss': 0.6103, 'learning_rate': 3.087248322147651e-05, 'epoch': 3.83}


 38%|███▊      | 2290/5960 [13:51<19:56,  3.07it/s]

{'loss': 0.344, 'learning_rate': 3.078859060402684e-05, 'epoch': 3.84}


 39%|███▊      | 2300/5960 [13:54<19:52,  3.07it/s]

{'loss': 0.2818, 'learning_rate': 3.070469798657718e-05, 'epoch': 3.86}


 39%|███▉      | 2310/5960 [13:57<19:51,  3.06it/s]

{'loss': 0.2969, 'learning_rate': 3.062080536912752e-05, 'epoch': 3.88}


 39%|███▉      | 2320/5960 [14:00<19:46,  3.07it/s]

{'loss': 0.2235, 'learning_rate': 3.0536912751677856e-05, 'epoch': 3.89}


 39%|███▉      | 2330/5960 [14:04<19:44,  3.06it/s]

{'loss': 0.2471, 'learning_rate': 3.045302013422819e-05, 'epoch': 3.91}


 39%|███▉      | 2340/5960 [14:07<19:42,  3.06it/s]

{'loss': 0.2835, 'learning_rate': 3.0369127516778522e-05, 'epoch': 3.93}


 39%|███▉      | 2350/5960 [14:10<19:39,  3.06it/s]

{'loss': 0.2882, 'learning_rate': 3.0285234899328862e-05, 'epoch': 3.94}


 40%|███▉      | 2360/5960 [14:14<19:35,  3.06it/s]

{'loss': 0.3373, 'learning_rate': 3.02013422818792e-05, 'epoch': 3.96}


 40%|███▉      | 2370/5960 [14:17<19:31,  3.07it/s]

{'loss': 0.1586, 'learning_rate': 3.011744966442953e-05, 'epoch': 3.98}


 40%|███▉      | 2380/5960 [14:20<19:28,  3.06it/s]

{'loss': 0.3997, 'learning_rate': 3.0033557046979865e-05, 'epoch': 3.99}


                                                   
 40%|████      | 2384/5960 [14:35<19:28,  3.06it/s]

{'eval_loss': 0.44053035974502563, 'eval_runtime': 13.8584, 'eval_samples_per_second': 86.013, 'eval_steps_per_second': 10.752, 'epoch': 4.0}


 40%|████      | 2390/5960 [14:44<1:21:44,  1.37s/it]

{'loss': 0.1326, 'learning_rate': 2.9949664429530205e-05, 'epoch': 4.01}


 40%|████      | 2400/5960 [14:47<21:00,  2.82it/s]  

{'loss': 0.2125, 'learning_rate': 2.986577181208054e-05, 'epoch': 4.03}


 40%|████      | 2410/5960 [14:51<19:15,  3.07it/s]

{'loss': 0.1881, 'learning_rate': 2.9781879194630874e-05, 'epoch': 4.04}


 41%|████      | 2420/5960 [14:54<19:10,  3.08it/s]

{'loss': 0.4726, 'learning_rate': 2.9697986577181207e-05, 'epoch': 4.06}


 41%|████      | 2430/5960 [14:57<19:06,  3.08it/s]

{'loss': 0.2426, 'learning_rate': 2.9614093959731544e-05, 'epoch': 4.08}


 41%|████      | 2440/5960 [15:00<19:03,  3.08it/s]

{'loss': 0.1199, 'learning_rate': 2.9530201342281884e-05, 'epoch': 4.09}


 41%|████      | 2450/5960 [15:04<19:01,  3.07it/s]

{'loss': 0.2449, 'learning_rate': 2.9446308724832217e-05, 'epoch': 4.11}


 41%|████▏     | 2460/5960 [15:07<18:59,  3.07it/s]

{'loss': 0.1087, 'learning_rate': 2.936241610738255e-05, 'epoch': 4.13}


 41%|████▏     | 2470/5960 [15:10<18:54,  3.08it/s]

{'loss': 0.2609, 'learning_rate': 2.9278523489932887e-05, 'epoch': 4.14}


 42%|████▏     | 2480/5960 [15:13<18:51,  3.08it/s]

{'loss': 0.2728, 'learning_rate': 2.9194630872483227e-05, 'epoch': 4.16}


 42%|████▏     | 2490/5960 [15:17<18:48,  3.07it/s]

{'loss': 0.0527, 'learning_rate': 2.911073825503356e-05, 'epoch': 4.18}


 42%|████▏     | 2500/5960 [15:20<18:44,  3.08it/s]

{'loss': 0.1815, 'learning_rate': 2.9026845637583893e-05, 'epoch': 4.19}


 42%|████▏     | 2510/5960 [15:23<18:41,  3.08it/s]

{'loss': 0.2239, 'learning_rate': 2.894295302013423e-05, 'epoch': 4.21}


 42%|████▏     | 2520/5960 [15:26<18:37,  3.08it/s]

{'loss': 0.1054, 'learning_rate': 2.885906040268457e-05, 'epoch': 4.23}


 42%|████▏     | 2530/5960 [15:30<18:34,  3.08it/s]

{'loss': 0.2509, 'learning_rate': 2.8775167785234902e-05, 'epoch': 4.24}


 43%|████▎     | 2540/5960 [15:33<18:32,  3.07it/s]

{'loss': 0.2348, 'learning_rate': 2.8691275167785235e-05, 'epoch': 4.26}


 43%|████▎     | 2550/5960 [15:36<18:27,  3.08it/s]

{'loss': 0.1495, 'learning_rate': 2.8607382550335572e-05, 'epoch': 4.28}


 43%|████▎     | 2560/5960 [15:39<18:28,  3.07it/s]

{'loss': 0.1523, 'learning_rate': 2.8523489932885905e-05, 'epoch': 4.3}


 43%|████▎     | 2570/5960 [15:43<19:00,  2.97it/s]

{'loss': 0.1778, 'learning_rate': 2.8439597315436245e-05, 'epoch': 4.31}


 43%|████▎     | 2580/5960 [15:46<19:06,  2.95it/s]

{'loss': 0.0826, 'learning_rate': 2.8355704697986578e-05, 'epoch': 4.33}


 43%|████▎     | 2590/5960 [15:50<18:51,  2.98it/s]

{'loss': 0.1079, 'learning_rate': 2.8271812080536915e-05, 'epoch': 4.35}


 44%|████▎     | 2600/5960 [15:53<19:12,  2.92it/s]

{'loss': 0.2165, 'learning_rate': 2.8187919463087248e-05, 'epoch': 4.36}


 44%|████▍     | 2610/5960 [15:56<19:03,  2.93it/s]

{'loss': 0.3719, 'learning_rate': 2.8104026845637588e-05, 'epoch': 4.38}


 44%|████▍     | 2620/5960 [16:00<19:45,  2.82it/s]

{'loss': 0.2143, 'learning_rate': 2.802013422818792e-05, 'epoch': 4.4}


 44%|████▍     | 2630/5960 [16:03<19:40,  2.82it/s]

{'loss': 0.1472, 'learning_rate': 2.7936241610738257e-05, 'epoch': 4.41}


 44%|████▍     | 2640/5960 [16:07<18:44,  2.95it/s]

{'loss': 0.1184, 'learning_rate': 2.785234899328859e-05, 'epoch': 4.43}


 44%|████▍     | 2650/5960 [16:10<18:29,  2.98it/s]

{'loss': 0.3219, 'learning_rate': 2.7768456375838923e-05, 'epoch': 4.45}


 45%|████▍     | 2660/5960 [16:14<17:59,  3.06it/s]

{'loss': 0.1662, 'learning_rate': 2.7684563758389263e-05, 'epoch': 4.46}


 45%|████▍     | 2670/5960 [16:17<18:45,  2.92it/s]

{'loss': 0.2373, 'learning_rate': 2.76006711409396e-05, 'epoch': 4.48}


 45%|████▍     | 2680/5960 [16:20<18:40,  2.93it/s]

{'loss': 0.2269, 'learning_rate': 2.7516778523489933e-05, 'epoch': 4.5}


 45%|████▌     | 2690/5960 [16:24<18:35,  2.93it/s]

{'loss': 0.3042, 'learning_rate': 2.7432885906040266e-05, 'epoch': 4.51}


 45%|████▌     | 2700/5960 [16:27<18:29,  2.94it/s]

{'loss': 0.1892, 'learning_rate': 2.7348993288590606e-05, 'epoch': 4.53}


 45%|████▌     | 2710/5960 [16:31<18:29,  2.93it/s]

{'loss': 0.1514, 'learning_rate': 2.7265100671140943e-05, 'epoch': 4.55}


 46%|████▌     | 2720/5960 [16:34<18:39,  2.89it/s]

{'loss': 0.2772, 'learning_rate': 2.7181208053691276e-05, 'epoch': 4.56}


 46%|████▌     | 2730/5960 [16:38<18:32,  2.90it/s]

{'loss': 0.1716, 'learning_rate': 2.709731543624161e-05, 'epoch': 4.58}


 46%|████▌     | 2740/5960 [16:41<18:23,  2.92it/s]

{'loss': 0.204, 'learning_rate': 2.701342281879195e-05, 'epoch': 4.6}


 46%|████▌     | 2750/5960 [16:44<18:16,  2.93it/s]

{'loss': 0.1083, 'learning_rate': 2.6929530201342285e-05, 'epoch': 4.61}


 46%|████▋     | 2760/5960 [16:48<18:19,  2.91it/s]

{'loss': 0.1709, 'learning_rate': 2.6845637583892618e-05, 'epoch': 4.63}


 46%|████▋     | 2770/5960 [16:51<18:06,  2.93it/s]

{'loss': 0.2016, 'learning_rate': 2.6761744966442955e-05, 'epoch': 4.65}


 47%|████▋     | 2780/5960 [16:55<18:12,  2.91it/s]

{'loss': 0.3827, 'learning_rate': 2.6677852348993288e-05, 'epoch': 4.66}


 47%|████▋     | 2790/5960 [16:58<18:00,  2.93it/s]

{'loss': 0.2324, 'learning_rate': 2.6593959731543628e-05, 'epoch': 4.68}


 47%|████▋     | 2800/5960 [17:02<17:55,  2.94it/s]

{'loss': 0.1081, 'learning_rate': 2.651006711409396e-05, 'epoch': 4.7}


 47%|████▋     | 2810/5960 [17:05<17:51,  2.94it/s]

{'loss': 0.0664, 'learning_rate': 2.6426174496644297e-05, 'epoch': 4.71}


 47%|████▋     | 2820/5960 [17:08<17:55,  2.92it/s]

{'loss': 0.2986, 'learning_rate': 2.634228187919463e-05, 'epoch': 4.73}


 47%|████▋     | 2830/5960 [17:12<17:51,  2.92it/s]

{'loss': 0.1148, 'learning_rate': 2.625838926174497e-05, 'epoch': 4.75}


 48%|████▊     | 2840/5960 [17:15<17:49,  2.92it/s]

{'loss': 0.1253, 'learning_rate': 2.6174496644295304e-05, 'epoch': 4.77}


 48%|████▊     | 2850/5960 [17:19<17:45,  2.92it/s]

{'loss': 0.333, 'learning_rate': 2.609060402684564e-05, 'epoch': 4.78}


 48%|████▊     | 2860/5960 [17:22<17:41,  2.92it/s]

{'loss': 0.4513, 'learning_rate': 2.6006711409395973e-05, 'epoch': 4.8}


 48%|████▊     | 2870/5960 [17:26<17:38,  2.92it/s]

{'loss': 0.227, 'learning_rate': 2.5922818791946306e-05, 'epoch': 4.82}


 48%|████▊     | 2880/5960 [17:29<17:41,  2.90it/s]

{'loss': 0.2047, 'learning_rate': 2.5838926174496646e-05, 'epoch': 4.83}


 48%|████▊     | 2890/5960 [17:32<17:30,  2.92it/s]

{'loss': 0.1836, 'learning_rate': 2.5755033557046983e-05, 'epoch': 4.85}


 49%|████▊     | 2900/5960 [17:36<17:25,  2.93it/s]

{'loss': 0.2296, 'learning_rate': 2.5671140939597316e-05, 'epoch': 4.87}


 49%|████▉     | 2910/5960 [17:39<17:24,  2.92it/s]

{'loss': 0.3173, 'learning_rate': 2.558724832214765e-05, 'epoch': 4.88}


 49%|████▉     | 2920/5960 [17:43<17:22,  2.92it/s]

{'loss': 0.1844, 'learning_rate': 2.550335570469799e-05, 'epoch': 4.9}


 49%|████▉     | 2930/5960 [17:46<17:14,  2.93it/s]

{'loss': 0.1488, 'learning_rate': 2.5419463087248325e-05, 'epoch': 4.92}


 49%|████▉     | 2940/5960 [17:49<17:10,  2.93it/s]

{'loss': 0.2031, 'learning_rate': 2.533557046979866e-05, 'epoch': 4.93}


 49%|████▉     | 2950/5960 [17:53<17:08,  2.93it/s]

{'loss': 0.4115, 'learning_rate': 2.525167785234899e-05, 'epoch': 4.95}


 50%|████▉     | 2960/5960 [17:56<17:04,  2.93it/s]

{'loss': 0.1341, 'learning_rate': 2.516778523489933e-05, 'epoch': 4.97}


 50%|████▉     | 2970/5960 [18:00<16:59,  2.93it/s]

{'loss': 0.2193, 'learning_rate': 2.5083892617449668e-05, 'epoch': 4.98}


 50%|█████     | 2980/5960 [18:03<16:57,  2.93it/s]

{'loss': 0.2089, 'learning_rate': 2.5e-05, 'epoch': 5.0}


                                                   
 50%|█████     | 2980/5960 [18:18<16:57,  2.93it/s]

{'eval_loss': 0.4918535351753235, 'eval_runtime': 14.5509, 'eval_samples_per_second': 81.92, 'eval_steps_per_second': 10.24, 'epoch': 5.0}


 50%|█████     | 2990/5960 [18:29<30:10,  1.64it/s]  

{'loss': 0.1412, 'learning_rate': 2.4916107382550334e-05, 'epoch': 5.02}


 50%|█████     | 3000/5960 [18:32<17:15,  2.86it/s]

{'loss': 0.1065, 'learning_rate': 2.4832214765100674e-05, 'epoch': 5.03}


 51%|█████     | 3010/5960 [18:36<16:54,  2.91it/s]

{'loss': 0.1084, 'learning_rate': 2.4748322147651007e-05, 'epoch': 5.05}


 51%|█████     | 3020/5960 [18:39<16:51,  2.91it/s]

{'loss': 0.1829, 'learning_rate': 2.4664429530201344e-05, 'epoch': 5.07}


 51%|█████     | 3030/5960 [18:43<16:56,  2.88it/s]

{'loss': 0.4423, 'learning_rate': 2.4580536912751677e-05, 'epoch': 5.08}


 51%|█████     | 3040/5960 [18:46<16:43,  2.91it/s]

{'loss': 0.1201, 'learning_rate': 2.4496644295302017e-05, 'epoch': 5.1}


 51%|█████     | 3050/5960 [18:50<16:36,  2.92it/s]

{'loss': 0.1549, 'learning_rate': 2.441275167785235e-05, 'epoch': 5.12}


 51%|█████▏    | 3060/5960 [18:53<16:34,  2.92it/s]

{'loss': 0.0765, 'learning_rate': 2.4328859060402687e-05, 'epoch': 5.13}


 52%|█████▏    | 3070/5960 [18:56<16:22,  2.94it/s]

{'loss': 0.1187, 'learning_rate': 2.424496644295302e-05, 'epoch': 5.15}


 52%|█████▏    | 3080/5960 [19:00<16:19,  2.94it/s]

{'loss': 0.2186, 'learning_rate': 2.416107382550336e-05, 'epoch': 5.17}


 52%|█████▏    | 3090/5960 [19:03<16:15,  2.94it/s]

{'loss': 0.2095, 'learning_rate': 2.4077181208053693e-05, 'epoch': 5.18}


 52%|█████▏    | 3100/5960 [19:07<16:14,  2.94it/s]

{'loss': 0.0047, 'learning_rate': 2.3993288590604026e-05, 'epoch': 5.2}


 52%|█████▏    | 3110/5960 [19:10<16:10,  2.94it/s]

{'loss': 0.0598, 'learning_rate': 2.3909395973154362e-05, 'epoch': 5.22}


 52%|█████▏    | 3120/5960 [19:13<16:09,  2.93it/s]

{'loss': 0.1358, 'learning_rate': 2.38255033557047e-05, 'epoch': 5.23}


 53%|█████▎    | 3130/5960 [19:17<16:08,  2.92it/s]

{'loss': 0.1187, 'learning_rate': 2.3741610738255035e-05, 'epoch': 5.25}


 53%|█████▎    | 3140/5960 [19:20<16:06,  2.92it/s]

{'loss': 0.1679, 'learning_rate': 2.365771812080537e-05, 'epoch': 5.27}


 53%|█████▎    | 3150/5960 [19:24<15:54,  2.94it/s]

{'loss': 0.0671, 'learning_rate': 2.3573825503355705e-05, 'epoch': 5.29}


 53%|█████▎    | 3160/5960 [19:27<15:52,  2.94it/s]

{'loss': 0.3012, 'learning_rate': 2.348993288590604e-05, 'epoch': 5.3}


 53%|█████▎    | 3170/5960 [19:30<15:49,  2.94it/s]

{'loss': 0.1888, 'learning_rate': 2.3406040268456378e-05, 'epoch': 5.32}


 53%|█████▎    | 3180/5960 [19:34<15:51,  2.92it/s]

{'loss': 0.325, 'learning_rate': 2.332214765100671e-05, 'epoch': 5.34}


 54%|█████▎    | 3190/5960 [19:37<15:44,  2.93it/s]

{'loss': 0.069, 'learning_rate': 2.3238255033557048e-05, 'epoch': 5.35}


 54%|█████▎    | 3200/5960 [19:41<15:39,  2.94it/s]

{'loss': 0.1955, 'learning_rate': 2.3154362416107384e-05, 'epoch': 5.37}


 54%|█████▍    | 3210/5960 [19:44<15:35,  2.94it/s]

{'loss': 0.3256, 'learning_rate': 2.3070469798657717e-05, 'epoch': 5.39}


 54%|█████▍    | 3220/5960 [19:48<15:32,  2.94it/s]

{'loss': 0.118, 'learning_rate': 2.2986577181208054e-05, 'epoch': 5.4}


 54%|█████▍    | 3230/5960 [19:51<15:27,  2.94it/s]

{'loss': 0.2785, 'learning_rate': 2.290268456375839e-05, 'epoch': 5.42}


 54%|█████▍    | 3240/5960 [19:54<15:25,  2.94it/s]

{'loss': 0.1111, 'learning_rate': 2.2818791946308727e-05, 'epoch': 5.44}


 55%|█████▍    | 3250/5960 [19:58<15:19,  2.95it/s]

{'loss': 0.01, 'learning_rate': 2.273489932885906e-05, 'epoch': 5.45}


 55%|█████▍    | 3260/5960 [20:01<15:27,  2.91it/s]

{'loss': 0.3986, 'learning_rate': 2.2651006711409396e-05, 'epoch': 5.47}


 55%|█████▍    | 3270/5960 [20:05<15:13,  2.94it/s]

{'loss': 0.1462, 'learning_rate': 2.2567114093959733e-05, 'epoch': 5.49}


 55%|█████▌    | 3280/5960 [20:08<14:59,  2.98it/s]

{'loss': 0.0706, 'learning_rate': 2.248322147651007e-05, 'epoch': 5.5}


 55%|█████▌    | 3290/5960 [20:11<15:20,  2.90it/s]

{'loss': 0.1389, 'learning_rate': 2.2399328859060403e-05, 'epoch': 5.52}


 55%|█████▌    | 3300/5960 [20:15<14:42,  3.02it/s]

{'loss': 0.1504, 'learning_rate': 2.231543624161074e-05, 'epoch': 5.54}


 56%|█████▌    | 3310/5960 [20:18<14:25,  3.06it/s]

{'loss': 0.1477, 'learning_rate': 2.2231543624161076e-05, 'epoch': 5.55}


 56%|█████▌    | 3320/5960 [20:21<14:21,  3.06it/s]

{'loss': 0.1021, 'learning_rate': 2.2147651006711412e-05, 'epoch': 5.57}


 56%|█████▌    | 3330/5960 [20:25<14:18,  3.06it/s]

{'loss': 0.0948, 'learning_rate': 2.2063758389261745e-05, 'epoch': 5.59}


 56%|█████▌    | 3340/5960 [20:28<14:15,  3.06it/s]

{'loss': 0.2744, 'learning_rate': 2.197986577181208e-05, 'epoch': 5.6}


 56%|█████▌    | 3350/5960 [20:31<14:11,  3.07it/s]

{'loss': 0.1766, 'learning_rate': 2.1895973154362418e-05, 'epoch': 5.62}


 56%|█████▋    | 3360/5960 [20:34<14:08,  3.07it/s]

{'loss': 0.2349, 'learning_rate': 2.181208053691275e-05, 'epoch': 5.64}


 57%|█████▋    | 3370/5960 [20:38<14:02,  3.07it/s]

{'loss': 0.2055, 'learning_rate': 2.1728187919463088e-05, 'epoch': 5.65}


 57%|█████▋    | 3380/5960 [20:41<14:00,  3.07it/s]

{'loss': 0.1076, 'learning_rate': 2.1644295302013424e-05, 'epoch': 5.67}


 57%|█████▋    | 3390/5960 [20:44<13:55,  3.08it/s]

{'loss': 0.1791, 'learning_rate': 2.156040268456376e-05, 'epoch': 5.69}


 57%|█████▋    | 3400/5960 [20:47<13:53,  3.07it/s]

{'loss': 0.3097, 'learning_rate': 2.1476510067114094e-05, 'epoch': 5.7}


 57%|█████▋    | 3410/5960 [20:51<13:49,  3.07it/s]

{'loss': 0.1103, 'learning_rate': 2.139261744966443e-05, 'epoch': 5.72}


 57%|█████▋    | 3420/5960 [20:54<13:49,  3.06it/s]

{'loss': 0.1181, 'learning_rate': 2.1308724832214767e-05, 'epoch': 5.74}


 58%|█████▊    | 3430/5960 [20:57<13:42,  3.08it/s]

{'loss': 0.0085, 'learning_rate': 2.1224832214765103e-05, 'epoch': 5.76}


 58%|█████▊    | 3440/5960 [21:00<13:39,  3.07it/s]

{'loss': 0.3067, 'learning_rate': 2.1140939597315437e-05, 'epoch': 5.77}


 58%|█████▊    | 3450/5960 [21:04<14:05,  2.97it/s]

{'loss': 0.1103, 'learning_rate': 2.1057046979865773e-05, 'epoch': 5.79}


 58%|█████▊    | 3460/5960 [21:07<14:05,  2.96it/s]

{'loss': 0.1116, 'learning_rate': 2.097315436241611e-05, 'epoch': 5.81}


 58%|█████▊    | 3470/5960 [21:11<13:53,  2.99it/s]

{'loss': 0.129, 'learning_rate': 2.0889261744966443e-05, 'epoch': 5.82}


 58%|█████▊    | 3480/5960 [21:14<13:57,  2.96it/s]

{'loss': 0.2855, 'learning_rate': 2.080536912751678e-05, 'epoch': 5.84}


 59%|█████▊    | 3490/5960 [21:17<13:42,  3.00it/s]

{'loss': 0.1188, 'learning_rate': 2.0721476510067116e-05, 'epoch': 5.86}


 59%|█████▊    | 3500/5960 [21:21<13:37,  3.01it/s]

{'loss': 0.2431, 'learning_rate': 2.0637583892617452e-05, 'epoch': 5.87}


 59%|█████▉    | 3510/5960 [21:24<13:32,  3.02it/s]

{'loss': 0.349, 'learning_rate': 2.0553691275167785e-05, 'epoch': 5.89}


 59%|█████▉    | 3520/5960 [21:27<13:39,  2.98it/s]

{'loss': 0.1387, 'learning_rate': 2.0469798657718122e-05, 'epoch': 5.91}


 59%|█████▉    | 3530/5960 [21:31<13:28,  3.01it/s]

{'loss': 0.1743, 'learning_rate': 2.038590604026846e-05, 'epoch': 5.92}


 59%|█████▉    | 3540/5960 [21:34<13:23,  3.01it/s]

{'loss': 0.0651, 'learning_rate': 2.0302013422818795e-05, 'epoch': 5.94}


 60%|█████▉    | 3550/5960 [21:37<13:05,  3.07it/s]

{'loss': 0.3302, 'learning_rate': 2.0218120805369128e-05, 'epoch': 5.96}


 60%|█████▉    | 3560/5960 [21:41<13:15,  3.02it/s]

{'loss': 0.2175, 'learning_rate': 2.013422818791946e-05, 'epoch': 5.97}


 60%|█████▉    | 3570/5960 [21:44<13:14,  3.01it/s]

{'loss': 0.1184, 'learning_rate': 2.00503355704698e-05, 'epoch': 5.99}


                                                   
 60%|██████    | 3576/5960 [22:00<13:11,  3.01it/s]

{'eval_loss': 0.47791990637779236, 'eval_runtime': 14.0446, 'eval_samples_per_second': 84.872, 'eval_steps_per_second': 10.609, 'epoch': 6.0}


 60%|██████    | 3580/5960 [22:07<1:34:03,  2.37s/it]

{'loss': 0.1326, 'learning_rate': 1.9966442953020134e-05, 'epoch': 6.01}


 60%|██████    | 3590/5960 [22:10<15:16,  2.58it/s]  

{'loss': 0.1293, 'learning_rate': 1.988255033557047e-05, 'epoch': 6.02}


 60%|██████    | 3600/5960 [22:14<13:00,  3.02it/s]

{'loss': 0.0063, 'learning_rate': 1.9798657718120804e-05, 'epoch': 6.04}


 61%|██████    | 3610/5960 [22:17<12:54,  3.03it/s]

{'loss': 0.1112, 'learning_rate': 1.9714765100671144e-05, 'epoch': 6.06}


 61%|██████    | 3620/5960 [22:20<12:55,  3.02it/s]

{'loss': 0.2088, 'learning_rate': 1.9630872483221477e-05, 'epoch': 6.07}


 61%|██████    | 3630/5960 [22:23<12:50,  3.02it/s]

{'loss': 0.0729, 'learning_rate': 1.9546979865771813e-05, 'epoch': 6.09}


 61%|██████    | 3640/5960 [22:27<12:48,  3.02it/s]

{'loss': 0.2295, 'learning_rate': 1.946308724832215e-05, 'epoch': 6.11}


 61%|██████    | 3650/5960 [22:30<12:46,  3.01it/s]

{'loss': 0.0503, 'learning_rate': 1.9379194630872486e-05, 'epoch': 6.12}


 61%|██████▏   | 3660/5960 [22:33<12:42,  3.02it/s]

{'loss': 0.0641, 'learning_rate': 1.929530201342282e-05, 'epoch': 6.14}


 62%|██████▏   | 3670/5960 [22:37<12:40,  3.01it/s]

{'loss': 0.0736, 'learning_rate': 1.9211409395973153e-05, 'epoch': 6.16}


 62%|██████▏   | 3680/5960 [22:40<12:38,  3.01it/s]

{'loss': 0.0759, 'learning_rate': 1.9127516778523493e-05, 'epoch': 6.17}


 62%|██████▏   | 3690/5960 [22:43<12:04,  3.14it/s]

{'loss': 0.0741, 'learning_rate': 1.9043624161073826e-05, 'epoch': 6.19}


 62%|██████▏   | 3700/5960 [22:47<12:26,  3.03it/s]

{'loss': 0.0743, 'learning_rate': 1.8959731543624162e-05, 'epoch': 6.21}


 62%|██████▏   | 3710/5960 [22:50<12:32,  2.99it/s]

{'loss': 0.0642, 'learning_rate': 1.8875838926174495e-05, 'epoch': 6.22}


 62%|██████▏   | 3720/5960 [22:53<12:29,  2.99it/s]

{'loss': 0.0883, 'learning_rate': 1.8791946308724835e-05, 'epoch': 6.24}


 63%|██████▎   | 3730/5960 [22:57<12:26,  2.99it/s]

{'loss': 0.2608, 'learning_rate': 1.870805369127517e-05, 'epoch': 6.26}


 63%|██████▎   | 3740/5960 [23:00<12:13,  3.03it/s]

{'loss': 0.1081, 'learning_rate': 1.8624161073825505e-05, 'epoch': 6.28}


 63%|██████▎   | 3750/5960 [23:03<12:09,  3.03it/s]

{'loss': 0.0061, 'learning_rate': 1.8540268456375838e-05, 'epoch': 6.29}


 63%|██████▎   | 3760/5960 [23:07<12:07,  3.03it/s]

{'loss': 0.1226, 'learning_rate': 1.8456375838926178e-05, 'epoch': 6.31}


 63%|██████▎   | 3770/5960 [23:10<12:03,  3.03it/s]

{'loss': 0.2813, 'learning_rate': 1.837248322147651e-05, 'epoch': 6.33}


 63%|██████▎   | 3780/5960 [23:13<11:59,  3.03it/s]

{'loss': 0.1374, 'learning_rate': 1.8288590604026847e-05, 'epoch': 6.34}


 64%|██████▎   | 3790/5960 [23:16<11:56,  3.03it/s]

{'loss': 0.1948, 'learning_rate': 1.820469798657718e-05, 'epoch': 6.36}


 64%|██████▍   | 3800/5960 [23:20<11:52,  3.03it/s]

{'loss': 0.0134, 'learning_rate': 1.8120805369127517e-05, 'epoch': 6.38}


 64%|██████▍   | 3810/5960 [23:23<11:49,  3.03it/s]

{'loss': 0.1067, 'learning_rate': 1.8036912751677854e-05, 'epoch': 6.39}


 64%|██████▍   | 3820/5960 [23:26<11:45,  3.03it/s]

{'loss': 0.1152, 'learning_rate': 1.7953020134228187e-05, 'epoch': 6.41}


 64%|██████▍   | 3830/5960 [23:30<11:43,  3.03it/s]

{'loss': 0.0701, 'learning_rate': 1.7869127516778523e-05, 'epoch': 6.43}


 64%|██████▍   | 3840/5960 [23:33<11:39,  3.03it/s]

{'loss': 0.1973, 'learning_rate': 1.778523489932886e-05, 'epoch': 6.44}


 65%|██████▍   | 3850/5960 [23:36<11:33,  3.04it/s]

{'loss': 0.1382, 'learning_rate': 1.7701342281879196e-05, 'epoch': 6.46}


 65%|██████▍   | 3860/5960 [23:40<11:29,  3.04it/s]

{'loss': 0.0205, 'learning_rate': 1.761744966442953e-05, 'epoch': 6.48}


 65%|██████▍   | 3870/5960 [23:43<11:26,  3.04it/s]

{'loss': 0.1112, 'learning_rate': 1.753355704697987e-05, 'epoch': 6.49}


 65%|██████▌   | 3880/5960 [23:46<11:23,  3.04it/s]

{'loss': 0.0613, 'learning_rate': 1.7449664429530202e-05, 'epoch': 6.51}


 65%|██████▌   | 3890/5960 [23:49<11:21,  3.04it/s]

{'loss': 0.1629, 'learning_rate': 1.736577181208054e-05, 'epoch': 6.53}


 65%|██████▌   | 3900/5960 [23:53<11:17,  3.04it/s]

{'loss': 0.1516, 'learning_rate': 1.7281879194630872e-05, 'epoch': 6.54}


 66%|██████▌   | 3910/5960 [23:56<11:14,  3.04it/s]

{'loss': 0.0031, 'learning_rate': 1.719798657718121e-05, 'epoch': 6.56}


 66%|██████▌   | 3920/5960 [23:59<11:11,  3.04it/s]

{'loss': 0.1987, 'learning_rate': 1.7114093959731545e-05, 'epoch': 6.58}


 66%|██████▌   | 3930/5960 [24:03<11:12,  3.02it/s]

{'loss': 0.0037, 'learning_rate': 1.7030201342281878e-05, 'epoch': 6.59}


 66%|██████▌   | 3940/5960 [24:06<11:08,  3.02it/s]

{'loss': 0.0747, 'learning_rate': 1.6946308724832215e-05, 'epoch': 6.61}


 66%|██████▋   | 3950/5960 [24:09<11:03,  3.03it/s]

{'loss': 0.1676, 'learning_rate': 1.686241610738255e-05, 'epoch': 6.63}


 66%|██████▋   | 3960/5960 [24:13<10:59,  3.03it/s]

{'loss': 0.0438, 'learning_rate': 1.6778523489932888e-05, 'epoch': 6.64}


 67%|██████▋   | 3970/5960 [24:16<10:55,  3.04it/s]

{'loss': 0.0735, 'learning_rate': 1.669463087248322e-05, 'epoch': 6.66}


 67%|██████▋   | 3980/5960 [24:19<10:52,  3.03it/s]

{'loss': 0.2416, 'learning_rate': 1.6610738255033557e-05, 'epoch': 6.68}


 67%|██████▋   | 3990/5960 [24:22<10:49,  3.03it/s]

{'loss': 0.0734, 'learning_rate': 1.6526845637583894e-05, 'epoch': 6.69}


 67%|██████▋   | 4000/5960 [24:26<10:44,  3.04it/s]

{'loss': 0.2026, 'learning_rate': 1.644295302013423e-05, 'epoch': 6.71}


 67%|██████▋   | 4010/5960 [24:29<10:40,  3.04it/s]

{'loss': 0.0783, 'learning_rate': 1.6359060402684563e-05, 'epoch': 6.73}


 67%|██████▋   | 4020/5960 [24:32<10:37,  3.04it/s]

{'loss': 0.0099, 'learning_rate': 1.62751677852349e-05, 'epoch': 6.74}


 68%|██████▊   | 4030/5960 [24:36<10:33,  3.04it/s]

{'loss': 0.2179, 'learning_rate': 1.6191275167785237e-05, 'epoch': 6.76}


 68%|██████▊   | 4040/5960 [24:39<10:30,  3.05it/s]

{'loss': 0.0688, 'learning_rate': 1.610738255033557e-05, 'epoch': 6.78}


 68%|██████▊   | 4050/5960 [24:42<10:27,  3.04it/s]

{'loss': 0.0955, 'learning_rate': 1.6023489932885906e-05, 'epoch': 6.8}


 68%|██████▊   | 4060/5960 [24:46<10:26,  3.03it/s]

{'loss': 0.2082, 'learning_rate': 1.5939597315436243e-05, 'epoch': 6.81}


 68%|██████▊   | 4070/5960 [24:49<10:20,  3.04it/s]

{'loss': 0.0763, 'learning_rate': 1.585570469798658e-05, 'epoch': 6.83}


 68%|██████▊   | 4080/5960 [24:52<10:20,  3.03it/s]

{'loss': 0.0734, 'learning_rate': 1.5771812080536912e-05, 'epoch': 6.85}


 69%|██████▊   | 4090/5960 [24:55<10:17,  3.03it/s]

{'loss': 0.105, 'learning_rate': 1.568791946308725e-05, 'epoch': 6.86}


 69%|██████▉   | 4100/5960 [24:59<10:14,  3.02it/s]

{'loss': 0.1055, 'learning_rate': 1.5604026845637585e-05, 'epoch': 6.88}


 69%|██████▉   | 4110/5960 [25:02<10:11,  3.02it/s]

{'loss': 0.1051, 'learning_rate': 1.5520134228187922e-05, 'epoch': 6.9}


 69%|██████▉   | 4120/5960 [25:05<10:05,  3.04it/s]

{'loss': 0.0701, 'learning_rate': 1.5436241610738255e-05, 'epoch': 6.91}


 69%|██████▉   | 4130/5960 [25:09<10:02,  3.04it/s]

{'loss': 0.0035, 'learning_rate': 1.535234899328859e-05, 'epoch': 6.93}


 69%|██████▉   | 4140/5960 [25:12<09:58,  3.04it/s]

{'loss': 0.226, 'learning_rate': 1.5268456375838928e-05, 'epoch': 6.95}


 70%|██████▉   | 4150/5960 [25:15<09:55,  3.04it/s]

{'loss': 0.1254, 'learning_rate': 1.5184563758389261e-05, 'epoch': 6.96}


 70%|██████▉   | 4160/5960 [25:18<09:51,  3.04it/s]

{'loss': 0.0578, 'learning_rate': 1.51006711409396e-05, 'epoch': 6.98}


 70%|██████▉   | 4170/5960 [25:22<09:49,  3.04it/s]

{'loss': 0.0217, 'learning_rate': 1.5016778523489932e-05, 'epoch': 7.0}


                                                   
 70%|███████   | 4172/5960 [25:37<09:51,  3.03it/s]

{'eval_loss': 0.5309024453163147, 'eval_runtime': 14.3413, 'eval_samples_per_second': 83.117, 'eval_steps_per_second': 10.39, 'epoch': 7.0}


 70%|███████   | 4180/5960 [25:46<24:59,  1.19it/s]  

{'loss': 0.0041, 'learning_rate': 1.493288590604027e-05, 'epoch': 7.01}


 70%|███████   | 4190/5960 [25:49<10:08,  2.91it/s]

{'loss': 0.1391, 'learning_rate': 1.4848993288590604e-05, 'epoch': 7.03}


 70%|███████   | 4200/5960 [25:52<09:39,  3.04it/s]

{'loss': 0.0036, 'learning_rate': 1.4765100671140942e-05, 'epoch': 7.05}


 71%|███████   | 4210/5960 [25:56<09:36,  3.04it/s]

{'loss': 0.0305, 'learning_rate': 1.4681208053691275e-05, 'epoch': 7.06}


 71%|███████   | 4220/5960 [25:59<09:33,  3.03it/s]

{'loss': 0.0979, 'learning_rate': 1.4597315436241613e-05, 'epoch': 7.08}


 71%|███████   | 4230/5960 [26:02<09:31,  3.03it/s]

{'loss': 0.0027, 'learning_rate': 1.4513422818791946e-05, 'epoch': 7.1}


 71%|███████   | 4240/5960 [26:06<09:31,  3.01it/s]

{'loss': 0.1435, 'learning_rate': 1.4429530201342285e-05, 'epoch': 7.11}


 71%|███████▏  | 4250/5960 [26:09<09:28,  3.01it/s]

{'loss': 0.005, 'learning_rate': 1.4345637583892618e-05, 'epoch': 7.13}


 71%|███████▏  | 4260/5960 [26:12<09:26,  3.00it/s]

{'loss': 0.0025, 'learning_rate': 1.4261744966442953e-05, 'epoch': 7.15}


 72%|███████▏  | 4270/5960 [26:16<09:22,  3.00it/s]

{'loss': 0.1773, 'learning_rate': 1.4177852348993289e-05, 'epoch': 7.16}


 72%|███████▏  | 4280/5960 [26:19<09:18,  3.01it/s]

{'loss': 0.0225, 'learning_rate': 1.4093959731543624e-05, 'epoch': 7.18}


 72%|███████▏  | 4290/5960 [26:22<09:16,  3.00it/s]

{'loss': 0.0372, 'learning_rate': 1.401006711409396e-05, 'epoch': 7.2}


 72%|███████▏  | 4300/5960 [26:26<09:13,  3.00it/s]

{'loss': 0.1194, 'learning_rate': 1.3926174496644295e-05, 'epoch': 7.21}


 72%|███████▏  | 4310/5960 [26:29<09:09,  3.00it/s]

{'loss': 0.1968, 'learning_rate': 1.3842281879194632e-05, 'epoch': 7.23}


 72%|███████▏  | 4320/5960 [26:32<09:05,  3.00it/s]

{'loss': 0.1349, 'learning_rate': 1.3758389261744966e-05, 'epoch': 7.25}


 73%|███████▎  | 4330/5960 [26:36<09:03,  3.00it/s]

{'loss': 0.0609, 'learning_rate': 1.3674496644295303e-05, 'epoch': 7.27}


 73%|███████▎  | 4340/5960 [26:39<09:00,  3.00it/s]

{'loss': 0.087, 'learning_rate': 1.3590604026845638e-05, 'epoch': 7.28}


 73%|███████▎  | 4350/5960 [26:42<08:56,  3.00it/s]

{'loss': 0.0798, 'learning_rate': 1.3506711409395974e-05, 'epoch': 7.3}


 73%|███████▎  | 4360/5960 [26:46<08:52,  3.00it/s]

{'loss': 0.1088, 'learning_rate': 1.3422818791946309e-05, 'epoch': 7.32}


 73%|███████▎  | 4370/5960 [26:49<08:50,  3.00it/s]

{'loss': 0.1259, 'learning_rate': 1.3338926174496644e-05, 'epoch': 7.33}


 73%|███████▎  | 4380/5960 [26:52<08:48,  2.99it/s]

{'loss': 0.0029, 'learning_rate': 1.325503355704698e-05, 'epoch': 7.35}


 74%|███████▎  | 4390/5960 [26:56<08:46,  2.98it/s]

{'loss': 0.0658, 'learning_rate': 1.3171140939597315e-05, 'epoch': 7.37}


 74%|███████▍  | 4400/5960 [26:59<08:47,  2.96it/s]

{'loss': 0.1298, 'learning_rate': 1.3087248322147652e-05, 'epoch': 7.38}


 74%|███████▍  | 4410/5960 [27:03<09:05,  2.84it/s]

{'loss': 0.1551, 'learning_rate': 1.3003355704697987e-05, 'epoch': 7.4}


 74%|███████▍  | 4420/5960 [27:06<08:53,  2.89it/s]

{'loss': 0.1535, 'learning_rate': 1.2919463087248323e-05, 'epoch': 7.42}


 74%|███████▍  | 4430/5960 [27:10<08:48,  2.90it/s]

{'loss': 0.0104, 'learning_rate': 1.2835570469798658e-05, 'epoch': 7.43}


 74%|███████▍  | 4440/5960 [27:13<08:47,  2.88it/s]

{'loss': 0.0046, 'learning_rate': 1.2751677852348994e-05, 'epoch': 7.45}


 75%|███████▍  | 4450/5960 [27:16<08:38,  2.91it/s]

{'loss': 0.0722, 'learning_rate': 1.266778523489933e-05, 'epoch': 7.47}


 75%|███████▍  | 4460/5960 [27:20<08:43,  2.86it/s]

{'loss': 0.0718, 'learning_rate': 1.2583892617449666e-05, 'epoch': 7.48}


 75%|███████▌  | 4470/5960 [27:23<08:34,  2.90it/s]

{'loss': 0.1695, 'learning_rate': 1.25e-05, 'epoch': 7.5}


 75%|███████▌  | 4480/5960 [27:27<08:33,  2.88it/s]

{'loss': 0.0029, 'learning_rate': 1.2416107382550337e-05, 'epoch': 7.52}


 75%|███████▌  | 4490/5960 [27:30<08:28,  2.89it/s]

{'loss': 0.0609, 'learning_rate': 1.2332214765100672e-05, 'epoch': 7.53}


 76%|███████▌  | 4500/5960 [27:34<08:23,  2.90it/s]

{'loss': 0.2068, 'learning_rate': 1.2248322147651008e-05, 'epoch': 7.55}


 76%|███████▌  | 4510/5960 [27:37<08:28,  2.85it/s]

{'loss': 0.0023, 'learning_rate': 1.2164429530201343e-05, 'epoch': 7.57}


 76%|███████▌  | 4520/5960 [27:41<08:13,  2.92it/s]

{'loss': 0.0072, 'learning_rate': 1.208053691275168e-05, 'epoch': 7.58}


 76%|███████▌  | 4530/5960 [27:44<08:17,  2.87it/s]

{'loss': 0.0105, 'learning_rate': 1.1996644295302013e-05, 'epoch': 7.6}


 76%|███████▌  | 4540/5960 [27:48<08:10,  2.90it/s]

{'loss': 0.0669, 'learning_rate': 1.191275167785235e-05, 'epoch': 7.62}


 76%|███████▋  | 4550/5960 [27:51<08:08,  2.89it/s]

{'loss': 0.1252, 'learning_rate': 1.1828859060402684e-05, 'epoch': 7.63}


 77%|███████▋  | 4560/5960 [27:55<08:05,  2.89it/s]

{'loss': 0.0254, 'learning_rate': 1.174496644295302e-05, 'epoch': 7.65}


 77%|███████▋  | 4570/5960 [27:58<07:56,  2.92it/s]

{'loss': 0.0106, 'learning_rate': 1.1661073825503356e-05, 'epoch': 7.67}


 77%|███████▋  | 4580/5960 [28:01<07:58,  2.88it/s]

{'loss': 0.0026, 'learning_rate': 1.1577181208053692e-05, 'epoch': 7.68}


 77%|███████▋  | 4590/5960 [28:05<07:48,  2.93it/s]

{'loss': 0.0019, 'learning_rate': 1.1493288590604027e-05, 'epoch': 7.7}


 77%|███████▋  | 4600/5960 [28:08<07:50,  2.89it/s]

{'loss': 0.0764, 'learning_rate': 1.1409395973154363e-05, 'epoch': 7.72}


 77%|███████▋  | 4610/5960 [28:12<07:44,  2.91it/s]

{'loss': 0.0544, 'learning_rate': 1.1325503355704698e-05, 'epoch': 7.73}


 78%|███████▊  | 4620/5960 [28:15<07:42,  2.90it/s]

{'loss': 0.0125, 'learning_rate': 1.1241610738255035e-05, 'epoch': 7.75}


 78%|███████▊  | 4630/5960 [28:19<07:39,  2.89it/s]

{'loss': 0.0055, 'learning_rate': 1.115771812080537e-05, 'epoch': 7.77}


 78%|███████▊  | 4640/5960 [28:22<07:32,  2.91it/s]

{'loss': 0.0597, 'learning_rate': 1.1073825503355706e-05, 'epoch': 7.79}


 78%|███████▊  | 4650/5960 [28:26<07:33,  2.89it/s]

{'loss': 0.0783, 'learning_rate': 1.098993288590604e-05, 'epoch': 7.8}


 78%|███████▊  | 4660/5960 [28:29<07:24,  2.93it/s]

{'loss': 0.0732, 'learning_rate': 1.0906040268456376e-05, 'epoch': 7.82}


 78%|███████▊  | 4670/5960 [28:33<07:24,  2.90it/s]

{'loss': 0.0178, 'learning_rate': 1.0822147651006712e-05, 'epoch': 7.84}


 79%|███████▊  | 4680/5960 [28:36<07:19,  2.91it/s]

{'loss': 0.1257, 'learning_rate': 1.0738255033557047e-05, 'epoch': 7.85}


 79%|███████▊  | 4690/5960 [28:39<07:17,  2.90it/s]

{'loss': 0.1364, 'learning_rate': 1.0654362416107383e-05, 'epoch': 7.87}


 79%|███████▉  | 4700/5960 [28:43<07:15,  2.89it/s]

{'loss': 0.24, 'learning_rate': 1.0570469798657718e-05, 'epoch': 7.89}


 79%|███████▉  | 4710/5960 [28:46<07:08,  2.92it/s]

{'loss': 0.0196, 'learning_rate': 1.0486577181208055e-05, 'epoch': 7.9}


 79%|███████▉  | 4720/5960 [28:50<07:08,  2.89it/s]

{'loss': 0.0025, 'learning_rate': 1.040268456375839e-05, 'epoch': 7.92}


 79%|███████▉  | 4730/5960 [28:53<06:58,  2.94it/s]

{'loss': 0.1518, 'learning_rate': 1.0318791946308726e-05, 'epoch': 7.94}


 80%|███████▉  | 4740/5960 [28:57<07:02,  2.89it/s]

{'loss': 0.2448, 'learning_rate': 1.0234899328859061e-05, 'epoch': 7.95}


 80%|███████▉  | 4750/5960 [29:00<06:56,  2.90it/s]

{'loss': 0.1325, 'learning_rate': 1.0151006711409397e-05, 'epoch': 7.97}


 80%|███████▉  | 4760/5960 [29:04<06:52,  2.91it/s]

{'loss': 0.2674, 'learning_rate': 1.006711409395973e-05, 'epoch': 7.99}


                                                   
 80%|████████  | 4768/5960 [29:21<06:49,  2.91it/s]

{'eval_loss': 0.532723069190979, 'eval_runtime': 14.6784, 'eval_samples_per_second': 81.208, 'eval_steps_per_second': 10.151, 'epoch': 8.0}


 80%|████████  | 4770/5960 [29:28<1:32:37,  4.67s/it]

{'loss': 0.016, 'learning_rate': 9.983221476510067e-06, 'epoch': 8.0}


 80%|████████  | 4780/5960 [29:31<09:08,  2.15it/s]  

{'loss': 0.0035, 'learning_rate': 9.899328859060402e-06, 'epoch': 8.02}


 80%|████████  | 4790/5960 [29:34<06:43,  2.90it/s]

{'loss': 0.0733, 'learning_rate': 9.815436241610738e-06, 'epoch': 8.04}


 81%|████████  | 4800/5960 [29:38<06:38,  2.91it/s]

{'loss': 0.0025, 'learning_rate': 9.731543624161075e-06, 'epoch': 8.05}


 81%|████████  | 4810/5960 [29:41<06:31,  2.94it/s]

{'loss': 0.0763, 'learning_rate': 9.64765100671141e-06, 'epoch': 8.07}


 81%|████████  | 4820/5960 [29:45<06:31,  2.91it/s]

{'loss': 0.0752, 'learning_rate': 9.563758389261746e-06, 'epoch': 8.09}


 81%|████████  | 4830/5960 [29:48<06:26,  2.92it/s]

{'loss': 0.0299, 'learning_rate': 9.479865771812081e-06, 'epoch': 8.1}


 81%|████████  | 4840/5960 [29:52<06:24,  2.92it/s]

{'loss': 0.0264, 'learning_rate': 9.395973154362418e-06, 'epoch': 8.12}


 81%|████████▏ | 4850/5960 [29:55<06:23,  2.90it/s]

{'loss': 0.1294, 'learning_rate': 9.312080536912752e-06, 'epoch': 8.14}


 82%|████████▏ | 4860/5960 [29:58<06:16,  2.92it/s]

{'loss': 0.0018, 'learning_rate': 9.228187919463089e-06, 'epoch': 8.15}


 82%|████████▏ | 4870/5960 [30:02<06:18,  2.88it/s]

{'loss': 0.0094, 'learning_rate': 9.144295302013424e-06, 'epoch': 8.17}


 82%|████████▏ | 4880/5960 [30:05<06:12,  2.90it/s]

{'loss': 0.0606, 'learning_rate': 9.060402684563759e-06, 'epoch': 8.19}


 82%|████████▏ | 4890/5960 [30:09<06:11,  2.88it/s]

{'loss': 0.1289, 'learning_rate': 8.976510067114093e-06, 'epoch': 8.2}


 82%|████████▏ | 4900/5960 [30:12<06:07,  2.89it/s]

{'loss': 0.1491, 'learning_rate': 8.89261744966443e-06, 'epoch': 8.22}


 82%|████████▏ | 4910/5960 [30:16<06:03,  2.89it/s]

{'loss': 0.0811, 'learning_rate': 8.808724832214765e-06, 'epoch': 8.24}


 83%|████████▎ | 4920/5960 [30:19<06:00,  2.88it/s]

{'loss': 0.0016, 'learning_rate': 8.724832214765101e-06, 'epoch': 8.26}


 83%|████████▎ | 4930/5960 [30:23<05:57,  2.88it/s]

{'loss': 0.0017, 'learning_rate': 8.640939597315436e-06, 'epoch': 8.27}


 83%|████████▎ | 4940/5960 [30:26<05:56,  2.86it/s]

{'loss': 0.0666, 'learning_rate': 8.557046979865773e-06, 'epoch': 8.29}


 83%|████████▎ | 4950/5960 [30:30<05:50,  2.88it/s]

{'loss': 0.1228, 'learning_rate': 8.473154362416107e-06, 'epoch': 8.31}


 83%|████████▎ | 4960/5960 [30:33<05:45,  2.89it/s]

{'loss': 0.1508, 'learning_rate': 8.389261744966444e-06, 'epoch': 8.32}


 83%|████████▎ | 4970/5960 [30:37<05:43,  2.88it/s]

{'loss': 0.0022, 'learning_rate': 8.305369127516779e-06, 'epoch': 8.34}


 84%|████████▎ | 4980/5960 [30:40<05:38,  2.89it/s]

{'loss': 0.0024, 'learning_rate': 8.221476510067115e-06, 'epoch': 8.36}


 84%|████████▎ | 4990/5960 [30:44<05:36,  2.88it/s]

{'loss': 0.0016, 'learning_rate': 8.13758389261745e-06, 'epoch': 8.37}


 84%|████████▍ | 5000/5960 [30:47<05:29,  2.91it/s]

{'loss': 0.01, 'learning_rate': 8.053691275167785e-06, 'epoch': 8.39}


 84%|████████▍ | 5010/5960 [30:50<05:28,  2.89it/s]

{'loss': 0.1396, 'learning_rate': 7.969798657718121e-06, 'epoch': 8.41}


 84%|████████▍ | 5020/5960 [30:54<05:24,  2.90it/s]

{'loss': 0.0793, 'learning_rate': 7.885906040268456e-06, 'epoch': 8.42}


 84%|████████▍ | 5030/5960 [30:57<05:22,  2.88it/s]

{'loss': 0.0907, 'learning_rate': 7.802013422818793e-06, 'epoch': 8.44}


 85%|████████▍ | 5040/5960 [31:01<05:20,  2.87it/s]

{'loss': 0.0019, 'learning_rate': 7.718120805369127e-06, 'epoch': 8.46}


 85%|████████▍ | 5050/5960 [31:04<05:15,  2.88it/s]

{'loss': 0.0015, 'learning_rate': 7.634228187919464e-06, 'epoch': 8.47}


 85%|████████▍ | 5060/5960 [31:08<05:03,  2.97it/s]

{'loss': 0.045, 'learning_rate': 7.5503355704698e-06, 'epoch': 8.49}


 85%|████████▌ | 5070/5960 [31:11<05:00,  2.96it/s]

{'loss': 0.0991, 'learning_rate': 7.466442953020135e-06, 'epoch': 8.51}


 85%|████████▌ | 5080/5960 [31:15<04:58,  2.95it/s]

{'loss': 0.0025, 'learning_rate': 7.382550335570471e-06, 'epoch': 8.52}


 85%|████████▌ | 5090/5960 [31:18<05:02,  2.87it/s]

{'loss': 0.0014, 'learning_rate': 7.298657718120807e-06, 'epoch': 8.54}


 86%|████████▌ | 5100/5960 [31:21<04:55,  2.91it/s]

{'loss': 0.077, 'learning_rate': 7.214765100671142e-06, 'epoch': 8.56}


 86%|████████▌ | 5110/5960 [31:25<04:52,  2.91it/s]

{'loss': 0.0727, 'learning_rate': 7.130872483221476e-06, 'epoch': 8.57}


 86%|████████▌ | 5120/5960 [31:28<04:37,  3.02it/s]

{'loss': 0.1409, 'learning_rate': 7.046979865771812e-06, 'epoch': 8.59}


 86%|████████▌ | 5130/5960 [31:32<04:35,  3.02it/s]

{'loss': 0.0851, 'learning_rate': 6.963087248322148e-06, 'epoch': 8.61}


 86%|████████▌ | 5140/5960 [31:35<04:34,  2.99it/s]

{'loss': 0.1139, 'learning_rate': 6.879194630872483e-06, 'epoch': 8.62}


 86%|████████▋ | 5150/5960 [31:38<04:21,  3.10it/s]

{'loss': 0.0582, 'learning_rate': 6.795302013422819e-06, 'epoch': 8.64}


 87%|████████▋ | 5160/5960 [31:41<04:17,  3.10it/s]

{'loss': 0.0595, 'learning_rate': 6.7114093959731546e-06, 'epoch': 8.66}


 87%|████████▋ | 5170/5960 [31:45<04:14,  3.10it/s]

{'loss': 0.1228, 'learning_rate': 6.62751677852349e-06, 'epoch': 8.67}


 87%|████████▋ | 5180/5960 [31:48<04:10,  3.11it/s]

{'loss': 0.0585, 'learning_rate': 6.543624161073826e-06, 'epoch': 8.69}


 87%|████████▋ | 5190/5960 [31:51<04:07,  3.11it/s]

{'loss': 0.166, 'learning_rate': 6.4597315436241616e-06, 'epoch': 8.71}


 87%|████████▋ | 5200/5960 [31:54<04:04,  3.11it/s]

{'loss': 0.2091, 'learning_rate': 6.375838926174497e-06, 'epoch': 8.72}


 87%|████████▋ | 5210/5960 [31:58<04:01,  3.11it/s]

{'loss': 0.004, 'learning_rate': 6.291946308724833e-06, 'epoch': 8.74}


 88%|████████▊ | 5220/5960 [32:01<03:58,  3.11it/s]

{'loss': 0.0688, 'learning_rate': 6.2080536912751686e-06, 'epoch': 8.76}


 88%|████████▊ | 5230/5960 [32:04<03:54,  3.11it/s]

{'loss': 0.0291, 'learning_rate': 6.124161073825504e-06, 'epoch': 8.78}


 88%|████████▊ | 5240/5960 [32:07<03:51,  3.11it/s]

{'loss': 0.0016, 'learning_rate': 6.04026845637584e-06, 'epoch': 8.79}


 88%|████████▊ | 5250/5960 [32:10<03:48,  3.11it/s]

{'loss': 0.0014, 'learning_rate': 5.956375838926175e-06, 'epoch': 8.81}


 88%|████████▊ | 5260/5960 [32:14<03:45,  3.11it/s]

{'loss': 0.0157, 'learning_rate': 5.87248322147651e-06, 'epoch': 8.83}


 88%|████████▊ | 5270/5960 [32:17<03:41,  3.11it/s]

{'loss': 0.0371, 'learning_rate': 5.788590604026846e-06, 'epoch': 8.84}


 89%|████████▊ | 5280/5960 [32:20<03:38,  3.11it/s]

{'loss': 0.026, 'learning_rate': 5.704697986577182e-06, 'epoch': 8.86}


 89%|████████▉ | 5290/5960 [32:23<03:35,  3.11it/s]

{'loss': 0.127, 'learning_rate': 5.620805369127517e-06, 'epoch': 8.88}


 89%|████████▉ | 5300/5960 [32:26<03:32,  3.10it/s]

{'loss': 0.0473, 'learning_rate': 5.536912751677853e-06, 'epoch': 8.89}


 89%|████████▉ | 5310/5960 [32:30<03:29,  3.10it/s]

{'loss': 0.0842, 'learning_rate': 5.453020134228188e-06, 'epoch': 8.91}


 89%|████████▉ | 5320/5960 [32:33<03:27,  3.09it/s]

{'loss': 0.0011, 'learning_rate': 5.3691275167785235e-06, 'epoch': 8.93}


 89%|████████▉ | 5330/5960 [32:36<03:37,  2.89it/s]

{'loss': 0.0868, 'learning_rate': 5.285234899328859e-06, 'epoch': 8.94}


 90%|████████▉ | 5340/5960 [32:40<03:37,  2.85it/s]

{'loss': 0.0959, 'learning_rate': 5.201342281879195e-06, 'epoch': 8.96}


 90%|████████▉ | 5350/5960 [32:43<03:18,  3.07it/s]

{'loss': 0.0028, 'learning_rate': 5.1174496644295305e-06, 'epoch': 8.98}


 90%|████████▉ | 5360/5960 [32:46<03:14,  3.09it/s]

{'loss': 0.0715, 'learning_rate': 5.033557046979865e-06, 'epoch': 8.99}


                                                   
 90%|█████████ | 5364/5960 [33:02<03:12,  3.09it/s]

{'eval_loss': 0.6558883786201477, 'eval_runtime': 13.9585, 'eval_samples_per_second': 85.396, 'eval_steps_per_second': 10.675, 'epoch': 9.0}


 90%|█████████ | 5370/5960 [33:10<13:01,  1.32s/it]  

{'loss': 0.0013, 'learning_rate': 4.949664429530201e-06, 'epoch': 9.01}


 90%|█████████ | 5380/5960 [33:13<03:23,  2.85it/s]

{'loss': 0.0013, 'learning_rate': 4.8657718120805375e-06, 'epoch': 9.03}


 90%|█████████ | 5390/5960 [33:16<03:04,  3.09it/s]

{'loss': 0.0148, 'learning_rate': 4.781879194630873e-06, 'epoch': 9.04}


 91%|█████████ | 5400/5960 [33:19<03:00,  3.10it/s]

{'loss': 0.0047, 'learning_rate': 4.697986577181209e-06, 'epoch': 9.06}


 91%|█████████ | 5410/5960 [33:22<02:57,  3.10it/s]

{'loss': 0.0011, 'learning_rate': 4.6140939597315445e-06, 'epoch': 9.08}


 91%|█████████ | 5420/5960 [33:26<02:54,  3.10it/s]

{'loss': 0.0565, 'learning_rate': 4.530201342281879e-06, 'epoch': 9.09}


 91%|█████████ | 5430/5960 [33:29<02:51,  3.10it/s]

{'loss': 0.001, 'learning_rate': 4.446308724832215e-06, 'epoch': 9.11}


 91%|█████████▏| 5440/5960 [33:32<02:47,  3.10it/s]

{'loss': 0.0013, 'learning_rate': 4.362416107382551e-06, 'epoch': 9.13}


 91%|█████████▏| 5450/5960 [33:35<02:49,  3.01it/s]

{'loss': 0.0713, 'learning_rate': 4.278523489932886e-06, 'epoch': 9.14}


 92%|█████████▏| 5460/5960 [33:39<02:49,  2.94it/s]

{'loss': 0.0061, 'learning_rate': 4.194630872483222e-06, 'epoch': 9.16}


 92%|█████████▏| 5470/5960 [33:42<02:47,  2.92it/s]

{'loss': 0.0692, 'learning_rate': 4.110738255033558e-06, 'epoch': 9.18}


 92%|█████████▏| 5480/5960 [33:46<02:42,  2.95it/s]

{'loss': 0.0799, 'learning_rate': 4.026845637583892e-06, 'epoch': 9.19}


 92%|█████████▏| 5490/5960 [33:49<02:38,  2.96it/s]

{'loss': 0.0009, 'learning_rate': 3.942953020134228e-06, 'epoch': 9.21}


 92%|█████████▏| 5500/5960 [33:52<02:35,  2.95it/s]

{'loss': 0.001, 'learning_rate': 3.859060402684564e-06, 'epoch': 9.23}


 92%|█████████▏| 5510/5960 [33:56<02:31,  2.97it/s]

{'loss': 0.001, 'learning_rate': 3.7751677852349e-06, 'epoch': 9.24}


 93%|█████████▎| 5520/5960 [33:59<02:28,  2.97it/s]

{'loss': 0.0011, 'learning_rate': 3.6912751677852355e-06, 'epoch': 9.26}


 93%|█████████▎| 5530/5960 [34:03<02:24,  2.97it/s]

{'loss': 0.001, 'learning_rate': 3.607382550335571e-06, 'epoch': 9.28}


 93%|█████████▎| 5540/5960 [34:06<02:21,  2.97it/s]

{'loss': 0.1044, 'learning_rate': 3.523489932885906e-06, 'epoch': 9.3}


 93%|█████████▎| 5550/5960 [34:09<02:18,  2.97it/s]

{'loss': 0.0011, 'learning_rate': 3.4395973154362416e-06, 'epoch': 9.31}


 93%|█████████▎| 5560/5960 [34:13<02:15,  2.96it/s]

{'loss': 0.0009, 'learning_rate': 3.3557046979865773e-06, 'epoch': 9.33}


 93%|█████████▎| 5570/5960 [34:16<02:11,  2.97it/s]

{'loss': 0.0574, 'learning_rate': 3.271812080536913e-06, 'epoch': 9.35}


 94%|█████████▎| 5580/5960 [34:19<02:08,  2.96it/s]

{'loss': 0.0062, 'learning_rate': 3.1879194630872486e-06, 'epoch': 9.36}


 94%|█████████▍| 5590/5960 [34:23<02:04,  2.97it/s]

{'loss': 0.0008, 'learning_rate': 3.1040268456375843e-06, 'epoch': 9.38}


 94%|█████████▍| 5600/5960 [34:26<02:01,  2.96it/s]

{'loss': 0.0335, 'learning_rate': 3.02013422818792e-06, 'epoch': 9.4}


 94%|█████████▍| 5610/5960 [34:30<01:58,  2.96it/s]

{'loss': 0.0869, 'learning_rate': 2.936241610738255e-06, 'epoch': 9.41}


 94%|█████████▍| 5620/5960 [34:33<01:55,  2.95it/s]

{'loss': 0.0012, 'learning_rate': 2.852348993288591e-06, 'epoch': 9.43}


 94%|█████████▍| 5630/5960 [34:36<01:51,  2.95it/s]

{'loss': 0.001, 'learning_rate': 2.7684563758389265e-06, 'epoch': 9.45}


 95%|█████████▍| 5640/5960 [34:40<01:48,  2.96it/s]

{'loss': 0.2522, 'learning_rate': 2.6845637583892617e-06, 'epoch': 9.46}


 95%|█████████▍| 5650/5960 [34:43<01:45,  2.95it/s]

{'loss': 0.1129, 'learning_rate': 2.6006711409395974e-06, 'epoch': 9.48}


 95%|█████████▍| 5660/5960 [34:47<01:41,  2.95it/s]

{'loss': 0.0009, 'learning_rate': 2.5167785234899326e-06, 'epoch': 9.5}


 95%|█████████▌| 5670/5960 [34:50<01:42,  2.83it/s]

{'loss': 0.0779, 'learning_rate': 2.4328859060402687e-06, 'epoch': 9.51}


 95%|█████████▌| 5680/5960 [34:53<01:37,  2.86it/s]

{'loss': 0.0008, 'learning_rate': 2.3489932885906044e-06, 'epoch': 9.53}


 95%|█████████▌| 5690/5960 [34:57<01:33,  2.89it/s]

{'loss': 0.0747, 'learning_rate': 2.2651006711409396e-06, 'epoch': 9.55}


 96%|█████████▌| 5700/5960 [35:00<01:30,  2.88it/s]

{'loss': 0.0857, 'learning_rate': 2.1812080536912753e-06, 'epoch': 9.56}


 96%|█████████▌| 5710/5960 [35:04<01:26,  2.88it/s]

{'loss': 0.0008, 'learning_rate': 2.097315436241611e-06, 'epoch': 9.58}


 96%|█████████▌| 5720/5960 [35:07<01:23,  2.88it/s]

{'loss': 0.0008, 'learning_rate': 2.013422818791946e-06, 'epoch': 9.6}


 96%|█████████▌| 5730/5960 [35:11<01:20,  2.86it/s]

{'loss': 0.0769, 'learning_rate': 1.929530201342282e-06, 'epoch': 9.61}


 96%|█████████▋| 5740/5960 [35:14<01:16,  2.86it/s]

{'loss': 0.0016, 'learning_rate': 1.8456375838926177e-06, 'epoch': 9.63}


 96%|█████████▋| 5750/5960 [35:18<01:13,  2.86it/s]

{'loss': 0.0105, 'learning_rate': 1.761744966442953e-06, 'epoch': 9.65}


 97%|█████████▋| 5760/5960 [35:21<01:09,  2.86it/s]

{'loss': 0.0956, 'learning_rate': 1.6778523489932886e-06, 'epoch': 9.66}


 97%|█████████▋| 5770/5960 [35:25<01:06,  2.87it/s]

{'loss': 0.0008, 'learning_rate': 1.5939597315436243e-06, 'epoch': 9.68}


 97%|█████████▋| 5780/5960 [35:28<01:02,  2.87it/s]

{'loss': 0.001, 'learning_rate': 1.51006711409396e-06, 'epoch': 9.7}


 97%|█████████▋| 5790/5960 [35:32<00:59,  2.88it/s]

{'loss': 0.0488, 'learning_rate': 1.4261744966442954e-06, 'epoch': 9.71}


 97%|█████████▋| 5800/5960 [35:35<00:55,  2.87it/s]

{'loss': 0.0008, 'learning_rate': 1.3422818791946309e-06, 'epoch': 9.73}


 97%|█████████▋| 5810/5960 [35:39<00:52,  2.88it/s]

{'loss': 0.0921, 'learning_rate': 1.2583892617449663e-06, 'epoch': 9.75}


 98%|█████████▊| 5820/5960 [35:42<00:49,  2.85it/s]

{'loss': 0.001, 'learning_rate': 1.1744966442953022e-06, 'epoch': 9.77}


 98%|█████████▊| 5830/5960 [35:46<00:45,  2.85it/s]

{'loss': 0.0008, 'learning_rate': 1.0906040268456377e-06, 'epoch': 9.78}


 98%|█████████▊| 5840/5960 [35:49<00:42,  2.86it/s]

{'loss': 0.128, 'learning_rate': 1.006711409395973e-06, 'epoch': 9.8}


 98%|█████████▊| 5850/5960 [35:53<00:38,  2.85it/s]

{'loss': 0.0501, 'learning_rate': 9.228187919463089e-07, 'epoch': 9.82}


 98%|█████████▊| 5860/5960 [35:56<00:35,  2.84it/s]

{'loss': 0.0014, 'learning_rate': 8.389261744966443e-07, 'epoch': 9.83}


 98%|█████████▊| 5870/5960 [36:00<00:31,  2.85it/s]

{'loss': 0.0008, 'learning_rate': 7.5503355704698e-07, 'epoch': 9.85}


 99%|█████████▊| 5880/5960 [36:03<00:28,  2.86it/s]

{'loss': 0.0009, 'learning_rate': 6.711409395973154e-07, 'epoch': 9.87}


 99%|█████████▉| 5890/5960 [36:07<00:24,  2.85it/s]

{'loss': 0.0008, 'learning_rate': 5.872483221476511e-07, 'epoch': 9.88}


 99%|█████████▉| 5900/5960 [36:10<00:21,  2.85it/s]

{'loss': 0.001, 'learning_rate': 5.033557046979866e-07, 'epoch': 9.9}


 99%|█████████▉| 5910/5960 [36:14<00:17,  2.87it/s]

{'loss': 0.0823, 'learning_rate': 4.1946308724832216e-07, 'epoch': 9.92}


 99%|█████████▉| 5920/5960 [36:17<00:13,  2.86it/s]

{'loss': 0.001, 'learning_rate': 3.355704697986577e-07, 'epoch': 9.93}


 99%|█████████▉| 5930/5960 [36:21<00:10,  2.86it/s]

{'loss': 0.0569, 'learning_rate': 2.516778523489933e-07, 'epoch': 9.95}


100%|█████████▉| 5940/5960 [36:24<00:06,  2.87it/s]

{'loss': 0.0021, 'learning_rate': 1.6778523489932886e-07, 'epoch': 9.97}


100%|█████████▉| 5950/5960 [36:28<00:03,  2.87it/s]

{'loss': 0.0009, 'learning_rate': 8.389261744966443e-08, 'epoch': 9.98}


100%|██████████| 5960/5960 [36:31<00:00,  2.87it/s]

{'loss': 0.053, 'learning_rate': 0.0, 'epoch': 10.0}


                                                   
100%|██████████| 5960/5960 [36:46<00:00,  2.87it/s]

{'eval_loss': 0.666321337223053, 'eval_runtime': 14.9707, 'eval_samples_per_second': 79.622, 'eval_steps_per_second': 9.953, 'epoch': 10.0}


100%|██████████| 5960/5960 [36:53<00:00,  2.69it/s]

{'train_runtime': 2213.1349, 'train_samples_per_second': 21.544, 'train_steps_per_second': 2.693, 'train_loss': 0.2339770257262038, 'epoch': 10.0}





TrainOutput(global_step=5960, training_loss=0.2339770257262038, metrics={'train_runtime': 2213.1349, 'train_samples_per_second': 21.544, 'train_steps_per_second': 2.693, 'train_loss': 0.2339770257262038, 'epoch': 10.0})

In [8]:
import numpy as np
from sklearn.metrics import classification_report

predictions = trainer.predict(encoded_dataset["test"])
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

print(classification_report(y_true, y_pred, target_names=["no", "yes"]))


100%|██████████| 149/149 [00:13<00:00, 10.70it/s]

              precision    recall  f1-score   support

          no       0.91      0.91      0.91       611
         yes       0.90      0.90      0.90       581

    accuracy                           0.90      1192
   macro avg       0.90      0.90      0.90      1192
weighted avg       0.90      0.90      0.90      1192






In [9]:
trainer.save_model("phobert_duration_model")


In [10]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model đã fine-tune
model_path = "./phobert_duration_model"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()  # chuyển sang chế độ dự đoán

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Hàm chuyển nhãn
def id_to_label(pred):
    return "yes" if pred == 1 else "no"

# Hàm dự đoán nhãn cho từng option
def predict_labels(context, question, options):
    labels = []
    for option in options:
        text = context + " " + question + " " + option
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()
            labels.append(id_to_label(pred))
    return labels

# Đọc file test
test_data = []
with open("public_test.txt", "r", encoding="utf-8") as f:
    for line in f:
        sample = json.loads(line.strip())
        test_data.append(sample)

# Dự đoán và ghi kết quả
with open("submission.txt", "w", encoding="utf-8") as f:
    for sample in test_data:
        predicted_labels = predict_labels(sample["context"], sample["question"], sample["options"])
        sample["labels"] = predicted_labels
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

print("✅ Dự đoán hoàn tất! Kết quả đã lưu vào file: submission.txt")


✅ Dự đoán hoàn tất! Kết quả đã lưu vào file: submission.txt


In [27]:
import random

# Đọc toàn bộ submission
with open("submission.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Chọn ngẫu nhiên 3 dòng
samples = random.sample(lines, k=min(3, len(lines)))

# In nội dung từng dòng
for idx, line in enumerate(samples, 1):
    sample = json.loads(line)
    print(f"\n🧩 Sample {idx} — QID: {sample['qid']}")
    print(f"📘 Context: {sample['context']}")
    print(f"❓ Question: {sample['question']}")
    print("🧠 Options + Labels:")
    for opt, label in zip(sample["options"], sample["labels"]):
        print(f"   - {opt:<15} → {label}")


🧩 Sample 1 — QID: 116
📘 Context: Một nhóm phóng viên đang điều tra một vụ bê bối lớn trong ngành công nghiệp thực phẩm. Họ đã thu thập rất nhiều tài liệu và chứng cứ từ các nguồn khác nhau để làm rõ sự thật.
❓ Question: Mất bao lâu để một nhóm phóng viên hoàn thành điều tra vụ bê bối trong ngành công nghiệp thực phẩm?
🧠 Options + Labels:
   - 3 tuần          → yes
   - 6 tuần          → yes
   - 1 tháng         → yes
   - 12 giờ          → yes

🧩 Sample 2 — QID: 77
📘 Context: Trong một ngôi làng nhỏ, có một nhóm cư dân quyết định tổ chức một lễ hội để kỷ niệm sự đoàn kết của họ. Họ đã lên kế hoạch cho các hoạt động như thi nấu ăn, biểu diễn văn nghệ và nhiều trò chơi dân gian khác.
❓ Question: Mất bao lâu để tổ chức lễ hội này?
🧠 Options + Labels:
   - 3 tuần          → yes
   - 1 ngày          → yes
   - 2 tháng         → yes
   - 5 tuần          → yes

🧩 Sample 3 — QID: 108
📘 Context: Trong một buổi họp báo, phóng viên đã đặt câu hỏi về một sự kiện quan trọng sắp diễn ra, nơi mà nhữ

In [8]:
import json
import re
import numpy as np
from sklearn.metrics import classification_report

def load_labels(path):
    """
    Đọc file JSON (mỗi dòng một object hoặc nối tiếp) và trả về dict: {qid: [0/1,...]}
    với 0 = "no", 1 = "yes".
    """
    text = open(path, 'r', encoding='utf-8').read()
    raw_objs = re.findall(r'\{.*?\}', text, re.DOTALL)
    data = {}
    for obj in raw_objs:
        entry = json.loads(obj)
        # Chuyển "yes"/"no" thành 1/0
        bin_labels = [1 if lab=="yes" else 0 for lab in entry['labels']]
        data[entry['qid']] = bin_labels
    return data

# Đường dẫn tới hai file
gt_path = 'public_test_full_labeled.json'
sub_path = 'submission.json'

# Load nhãn
gt = load_labels(gt_path)
sub = load_labels(sub_path)

# Xây dựng y_true, y_pred theo thứ tự qid nhất định (vd sort theo qid)
y_true, y_pred = [], []
for qid in sorted(gt.keys()):
    true = gt[qid]
    pred = sub.get(qid, [0]*len(true))
    y_true.extend(true)
    y_pred.extend(pred)

y_true = np.array(y_true)
y_pred = np.array(y_pred)

# In classification report
print(classification_report(y_true, y_pred, target_names=["no", "yes"]))


              precision    recall  f1-score   support

          no       0.26      0.48      0.34       206
         yes       0.85      0.69      0.76       906

    accuracy                           0.65      1112
   macro avg       0.56      0.59      0.55      1112
weighted avg       0.74      0.65      0.69      1112



In [9]:
import json
import re
from collections import Counter

def load_json_objects(path):
    text = open(path, 'r', encoding='utf-8').read()
    # \{.*?\} với DOTALL để bắt cả xuống dòng
    raw_objs = re.findall(r'\{.*?\}', text, re.DOTALL)
    return [json.loads(obj) for obj in raw_objs]

# 1. Load ground truth và submission
gt_list = load_json_objects('public_test_full_labeled.json')
sub_list = load_json_objects('submission.json')

# Convert sang dict {qid: labels}
gt = {e['qid']: e['labels'] for e in gt_list}
sub = {e['qid']: e['labels'] for e in sub_list}

# 2. So sánh và build comparison_results
comparison = []
for qid, true_labels in gt.items():
    pred_labels = sub.get(qid, [""] * len(true_labels))
    correct_flags = [t == p for t, p in zip(true_labels, pred_labels)]
    comparison.append({
        "qid": qid,
        "true_labels": true_labels,
        "pred_labels": pred_labels,
        "correct": correct_flags
    })

with open('comparison_results.json', 'w', encoding='utf-8') as f:
    json.dump(comparison, f, ensure_ascii=False, indent=2)

# 3. Tính precision, recall, F1 cho 'yes' và 'no'
counters = {
    'yes': Counter({'tp':0,'fp':0,'fn':0}),
    'no':  Counter({'tp':0,'fp':0,'fn':0})
}

for entry in comparison:
    for true, pred in zip(entry['true_labels'], entry['pred_labels']):
        # yes
        if pred == 'yes':
            counters['yes']['tp' if true=='yes' else 'fp'] += 1
        elif true == 'yes':
            counters['yes']['fn'] += 1
        # no
        if pred == 'no':
            counters['no']['tp' if true=='no' else 'fp'] += 1
        elif true == 'no':
            counters['no']['fn'] += 1

metrics = {}
for lbl, cnt in counters.items():
    tp, fp, fn = cnt['tp'], cnt['fp'], cnt['fn']
    precision = tp/(tp+fp) if tp+fp else 0
    recall    = tp/(tp+fn) if tp+fn else 0
    f1        = 2*precision*recall/(precision+recall) if precision+recall else 0
    metrics[lbl] = {
        'precision': precision,
        'recall':    recall,
        'f1':        f1
    }

# 4. In kết quả
for lbl, m in metrics.items():
    print(f"Label '{lbl}': "
          f"Precision: {m['precision']:.2%}, "
          f"Recall: {m['recall']:.2%}, "
          f"F1: {m['f1']:.2%}")


Label 'yes': Precision: 85.40%, Recall: 69.09%, F1: 76.39%
Label 'no': Precision: 26.12%, Recall: 48.06%, F1: 33.85%
