In [1]:
import json
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

# Đọc dữ liệu từ file
train_data = []
with open("duration_training_dataset_normalize.json", "r", encoding="utf-8") as f:  # ← đổi tên file cho đúng với file bạn đang dùng
    for line in f:
        if line.strip():  # Bỏ qua dòng trống
            train_data.append(json.loads(line.strip()))

print(f"Số lượng mẫu: {len(train_data)}")
print("Ví dụ một mẫu đầu tiên:")
print(json.dumps(train_data[0], indent=2, ensure_ascii=False))


Số lượng mẫu: 1490
Ví dụ một mẫu đầu tiên:
{
  "context": "Trong một lớp học, các học sinh đang học về các chủ đề khác nhau. Một số em rất chăm chỉ và thường xuyên hoàn thành bài tập về nhà, trong khi những em khác lại bỏ lỡ nhiều bài. Cô giáo luôn cố gắng khuyến khích cả lớp bằng cách tổ chức các buổi thảo luận thú vị.",
  "labels": [
    "yes",
    "no",
    "yes",
    "no"
  ],
  "options": [
    "168.0 giờ",
    "43800.0 giờ",
    "240.0 giờ",
    "0.003 giờ"
  ],
  "qid": 650,
  "question": "Mất bao lâu để hoàn thành tất cả bài tập về nhà của lớp học?"
}


In [3]:
from collections import Counter

label_counter = Counter()
option_lengths = []

for sample in train_data:
    label_counter.update(sample["labels"])
    option_lengths.append(len(sample["options"]))

print("Tổng số nhãn:")
print(label_counter)
print(f"Số option trung bình mỗi câu hỏi: {sum(option_lengths) / len(option_lengths):.2f}")

Tổng số nhãn:
Counter({'no': 3003, 'yes': 2957})
Số option trung bình mỗi câu hỏi: 4.00


In [4]:
import re

# Từ khóa phân loại context
SHORT_CTX   = ["biểu diễn", "sửa", "trình diễn", "hòa nhạc", "vẽ", "chơi", "làm bài", "bài tập"]
MEDIUM_CTX  = ["tổ chức", "startup", "công ty", "lớp học", "học", "nhóm"]
LONG_CTX    = ["chuẩn bị", "triển lãm", "phát triển", "hoàn thành", "năm", "sự thật", "cuộc sống", "đời sống"]

# Ngưỡng (giờ)
T_SHORT_MED = 168    # 7 ngày
T_MED_LONG  = 720    # 30 ngày

def extract_hours(opt_str):
    m = re.search(r"([\d\.]+)", opt_str)
    return float(m.group(1)) if m else 0.0

def rule_based_predict(sample):
    ctx = sample["context"].lower()
    outs = []
    for opt in sample["options"]:
        h = extract_hours(opt)
        label = "no"
        if h < T_SHORT_MED:
            if any(w in ctx for w in SHORT_CTX):
                label = "yes"
        elif h <= T_MED_LONG:
            if any(w in ctx for w in MEDIUM_CTX):
                label = "yes"
        else:  # h > T_MED_LONG
            if any(w in ctx for w in LONG_CTX):
                label = "yes"
        outs.append(label)
    return outs

# BƯỚC 3: Đánh giá
from sklearn.metrics import classification_report

y_true = []
y_pred = []

for sample in train_data:
    pred = rule_based_predict(sample)
    gold = sample["labels"]
    y_true.extend(gold)
    y_pred.extend(pred)

print(classification_report(y_true, y_pred, target_names=["no", "yes"]))


              precision    recall  f1-score   support

          no       0.51      0.87      0.65      3003
         yes       0.55      0.17      0.26      2957

    accuracy                           0.52      5960
   macro avg       0.53      0.52      0.45      5960
weighted avg       0.53      0.52      0.45      5960



In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSeq

In [6]:
import json

examples = []
with open("duration_training_dataset_normalize.json", "r", encoding="utf-8") as f:
    for line in f:
        sample = json.loads(line.strip())
        for opt, label in zip(sample["options"], sample["labels"]):
            text = sample["context"] + " " + sample["question"] + " " + opt
            label_id = 1 if label == "yes" else 0
            examples.append({
                "text": text,
                "label": label_id
            })

print(f"Tổng số sample: {len(examples)}")

Tổng số sample: 5960


In [7]:
from datasets import Dataset

dataset = Dataset.from_list(examples)

def preprocess(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

encoded_dataset = dataset.map(preprocess, batched=True)
encoded_dataset = encoded_dataset.train_test_split(test_size=0.2)

Map: 100%|██████████| 5960/5960 [00:00<00:00, 7053.31 examples/s]


In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./phobert_duration_hour",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
)

trainer.train()

  0%|          | 10/5960 [00:03<32:18,  3.07it/s] 

{'loss': 0.7343, 'learning_rate': 4.9916107382550336e-05, 'epoch': 0.02}


  0%|          | 20/5960 [00:06<32:12,  3.07it/s]

{'loss': 0.6583, 'learning_rate': 4.983221476510067e-05, 'epoch': 0.03}


  1%|          | 30/5960 [00:10<31:59,  3.09it/s]

{'loss': 0.6704, 'learning_rate': 4.974832214765101e-05, 'epoch': 0.05}


  1%|          | 40/5960 [00:13<31:42,  3.11it/s]

{'loss': 0.6388, 'learning_rate': 4.966442953020135e-05, 'epoch': 0.07}


  1%|          | 50/5960 [00:16<30:26,  3.24it/s]

{'loss': 0.6177, 'learning_rate': 4.958053691275168e-05, 'epoch': 0.08}


  1%|          | 60/5960 [00:19<29:41,  3.31it/s]

{'loss': 0.4887, 'learning_rate': 4.9496644295302015e-05, 'epoch': 0.1}


  1%|          | 70/5960 [00:22<29:46,  3.30it/s]

{'loss': 0.5624, 'learning_rate': 4.9412751677852355e-05, 'epoch': 0.12}


  1%|▏         | 80/5960 [00:25<29:49,  3.29it/s]

{'loss': 0.5736, 'learning_rate': 4.932885906040269e-05, 'epoch': 0.13}


  2%|▏         | 90/5960 [00:28<30:05,  3.25it/s]

{'loss': 0.5535, 'learning_rate': 4.924496644295302e-05, 'epoch': 0.15}


  2%|▏         | 100/5960 [00:31<31:08,  3.14it/s]

{'loss': 0.6454, 'learning_rate': 4.9161073825503354e-05, 'epoch': 0.17}


  2%|▏         | 110/5960 [00:34<31:53,  3.06it/s]

{'loss': 0.6026, 'learning_rate': 4.9077181208053694e-05, 'epoch': 0.18}


  2%|▏         | 120/5960 [00:38<31:42,  3.07it/s]

{'loss': 0.6529, 'learning_rate': 4.8993288590604034e-05, 'epoch': 0.2}


  2%|▏         | 130/5960 [00:41<31:05,  3.13it/s]

{'loss': 0.5841, 'learning_rate': 4.890939597315437e-05, 'epoch': 0.22}


  2%|▏         | 140/5960 [00:44<30:58,  3.13it/s]

{'loss': 0.4768, 'learning_rate': 4.88255033557047e-05, 'epoch': 0.23}


  3%|▎         | 150/5960 [00:47<30:13,  3.20it/s]

{'loss': 0.4959, 'learning_rate': 4.874161073825503e-05, 'epoch': 0.25}


  3%|▎         | 160/5960 [00:50<30:34,  3.16it/s]

{'loss': 0.568, 'learning_rate': 4.865771812080537e-05, 'epoch': 0.27}


  3%|▎         | 170/5960 [00:54<31:31,  3.06it/s]

{'loss': 0.5716, 'learning_rate': 4.8573825503355706e-05, 'epoch': 0.29}


  3%|▎         | 180/5960 [00:57<31:33,  3.05it/s]

{'loss': 0.5694, 'learning_rate': 4.848993288590604e-05, 'epoch': 0.3}


  3%|▎         | 190/5960 [01:00<29:52,  3.22it/s]

{'loss': 0.5533, 'learning_rate': 4.840604026845638e-05, 'epoch': 0.32}


  3%|▎         | 200/5960 [01:03<29:21,  3.27it/s]

{'loss': 0.6109, 'learning_rate': 4.832214765100672e-05, 'epoch': 0.34}


  4%|▎         | 210/5960 [01:06<29:16,  3.27it/s]

{'loss': 0.5115, 'learning_rate': 4.823825503355705e-05, 'epoch': 0.35}


  4%|▎         | 220/5960 [01:10<30:54,  3.10it/s]

{'loss': 0.786, 'learning_rate': 4.8154362416107385e-05, 'epoch': 0.37}


  4%|▍         | 230/5960 [01:13<31:27,  3.04it/s]

{'loss': 0.6197, 'learning_rate': 4.807046979865772e-05, 'epoch': 0.39}


  4%|▍         | 240/5960 [01:16<29:50,  3.19it/s]

{'loss': 0.6535, 'learning_rate': 4.798657718120805e-05, 'epoch': 0.4}


  4%|▍         | 250/5960 [01:19<31:06,  3.06it/s]

{'loss': 0.613, 'learning_rate': 4.790268456375839e-05, 'epoch': 0.42}


  4%|▍         | 260/5960 [01:23<31:20,  3.03it/s]

{'loss': 0.5387, 'learning_rate': 4.7818791946308725e-05, 'epoch': 0.44}


  5%|▍         | 270/5960 [01:26<31:14,  3.04it/s]

{'loss': 0.5648, 'learning_rate': 4.7734899328859064e-05, 'epoch': 0.45}


  5%|▍         | 280/5960 [01:29<31:07,  3.04it/s]

{'loss': 0.6283, 'learning_rate': 4.76510067114094e-05, 'epoch': 0.47}


  5%|▍         | 290/5960 [01:32<30:14,  3.12it/s]

{'loss': 0.4992, 'learning_rate': 4.756711409395974e-05, 'epoch': 0.49}


  5%|▌         | 300/5960 [01:36<29:52,  3.16it/s]

{'loss': 0.5101, 'learning_rate': 4.748322147651007e-05, 'epoch': 0.5}


  5%|▌         | 310/5960 [01:39<31:04,  3.03it/s]

{'loss': 0.5369, 'learning_rate': 4.7399328859060404e-05, 'epoch': 0.52}


  5%|▌         | 320/5960 [01:42<30:51,  3.05it/s]

{'loss': 0.5704, 'learning_rate': 4.731543624161074e-05, 'epoch': 0.54}


  6%|▌         | 330/5960 [01:45<30:25,  3.08it/s]

{'loss': 0.6025, 'learning_rate': 4.723154362416108e-05, 'epoch': 0.55}


  6%|▌         | 340/5960 [01:49<29:13,  3.21it/s]

{'loss': 0.5757, 'learning_rate': 4.714765100671141e-05, 'epoch': 0.57}


  6%|▌         | 350/5960 [01:52<30:08,  3.10it/s]

{'loss': 0.591, 'learning_rate': 4.706375838926175e-05, 'epoch': 0.59}


  6%|▌         | 360/5960 [01:55<29:14,  3.19it/s]

{'loss': 0.5887, 'learning_rate': 4.697986577181208e-05, 'epoch': 0.6}


  6%|▌         | 370/5960 [01:58<28:59,  3.21it/s]

{'loss': 0.4741, 'learning_rate': 4.6895973154362416e-05, 'epoch': 0.62}


  6%|▋         | 380/5960 [02:01<29:09,  3.19it/s]

{'loss': 0.4502, 'learning_rate': 4.6812080536912756e-05, 'epoch': 0.64}


  7%|▋         | 390/5960 [02:04<29:39,  3.13it/s]

{'loss': 0.5511, 'learning_rate': 4.672818791946309e-05, 'epoch': 0.65}


  7%|▋         | 400/5960 [02:08<28:46,  3.22it/s]

{'loss': 0.4169, 'learning_rate': 4.664429530201342e-05, 'epoch': 0.67}


  7%|▋         | 410/5960 [02:11<28:57,  3.19it/s]

{'loss': 0.4623, 'learning_rate': 4.6560402684563755e-05, 'epoch': 0.69}


  7%|▋         | 420/5960 [02:14<29:51,  3.09it/s]

{'loss': 0.5671, 'learning_rate': 4.6476510067114095e-05, 'epoch': 0.7}


  7%|▋         | 430/5960 [02:17<29:31,  3.12it/s]

{'loss': 0.605, 'learning_rate': 4.6392617449664435e-05, 'epoch': 0.72}


  7%|▋         | 440/5960 [02:20<28:41,  3.21it/s]

{'loss': 0.5411, 'learning_rate': 4.630872483221477e-05, 'epoch': 0.74}


  8%|▊         | 450/5960 [02:23<29:04,  3.16it/s]

{'loss': 0.474, 'learning_rate': 4.62248322147651e-05, 'epoch': 0.76}


  8%|▊         | 460/5960 [02:27<30:06,  3.04it/s]

{'loss': 0.6796, 'learning_rate': 4.6140939597315434e-05, 'epoch': 0.77}


  8%|▊         | 470/5960 [02:30<29:42,  3.08it/s]

{'loss': 0.5664, 'learning_rate': 4.6057046979865774e-05, 'epoch': 0.79}


  8%|▊         | 480/5960 [02:33<28:39,  3.19it/s]

{'loss': 0.4027, 'learning_rate': 4.597315436241611e-05, 'epoch': 0.81}


  8%|▊         | 490/5960 [02:36<29:53,  3.05it/s]

{'loss': 0.4504, 'learning_rate': 4.588926174496645e-05, 'epoch': 0.82}


  8%|▊         | 500/5960 [02:40<29:14,  3.11it/s]

{'loss': 0.6414, 'learning_rate': 4.580536912751678e-05, 'epoch': 0.84}


  9%|▊         | 510/5960 [02:43<29:09,  3.12it/s]

{'loss': 0.4702, 'learning_rate': 4.572147651006712e-05, 'epoch': 0.86}


  9%|▊         | 520/5960 [02:46<29:07,  3.11it/s]

{'loss': 0.4899, 'learning_rate': 4.5637583892617453e-05, 'epoch': 0.87}


  9%|▉         | 530/5960 [02:49<29:36,  3.06it/s]

{'loss': 0.5339, 'learning_rate': 4.5553691275167787e-05, 'epoch': 0.89}


  9%|▉         | 540/5960 [02:52<29:11,  3.09it/s]

{'loss': 0.6457, 'learning_rate': 4.546979865771812e-05, 'epoch': 0.91}


  9%|▉         | 550/5960 [02:56<29:05,  3.10it/s]

{'loss': 0.5031, 'learning_rate': 4.538590604026846e-05, 'epoch': 0.92}


  9%|▉         | 560/5960 [02:59<29:40,  3.03it/s]

{'loss': 0.6293, 'learning_rate': 4.530201342281879e-05, 'epoch': 0.94}


 10%|▉         | 570/5960 [03:02<29:16,  3.07it/s]

{'loss': 0.4409, 'learning_rate': 4.521812080536913e-05, 'epoch': 0.96}


 10%|▉         | 580/5960 [03:05<29:05,  3.08it/s]

{'loss': 0.536, 'learning_rate': 4.5134228187919466e-05, 'epoch': 0.97}


 10%|▉         | 590/5960 [03:09<28:30,  3.14it/s]

{'loss': 0.5427, 'learning_rate': 4.50503355704698e-05, 'epoch': 0.99}


                                                  
 10%|█         | 596/5960 [03:25<29:24,  3.04it/s]

{'eval_loss': 0.5518612265586853, 'eval_runtime': 13.8665, 'eval_samples_per_second': 85.963, 'eval_steps_per_second': 10.745, 'epoch': 1.0}


 10%|█         | 600/5960 [03:29<3:09:08,  2.12s/it]

{'loss': 0.4701, 'learning_rate': 4.496644295302014e-05, 'epoch': 1.01}


 10%|█         | 610/5960 [03:33<32:14,  2.77it/s]  

{'loss': 0.4396, 'learning_rate': 4.488255033557047e-05, 'epoch': 1.02}


 10%|█         | 620/5960 [03:36<27:35,  3.23it/s]

{'loss': 0.628, 'learning_rate': 4.4798657718120805e-05, 'epoch': 1.04}


 11%|█         | 630/5960 [03:39<27:38,  3.21it/s]

{'loss': 0.5666, 'learning_rate': 4.471476510067114e-05, 'epoch': 1.06}


 11%|█         | 640/5960 [03:42<28:09,  3.15it/s]

{'loss': 0.4418, 'learning_rate': 4.463087248322148e-05, 'epoch': 1.07}


 11%|█         | 650/5960 [03:45<27:27,  3.22it/s]

{'loss': 0.7461, 'learning_rate': 4.454697986577182e-05, 'epoch': 1.09}


 11%|█         | 660/5960 [03:48<27:18,  3.24it/s]

{'loss': 0.5744, 'learning_rate': 4.446308724832215e-05, 'epoch': 1.11}


 11%|█         | 670/5960 [03:51<27:33,  3.20it/s]

{'loss': 0.5849, 'learning_rate': 4.4379194630872484e-05, 'epoch': 1.12}


 11%|█▏        | 680/5960 [03:54<28:09,  3.13it/s]

{'loss': 0.5238, 'learning_rate': 4.4295302013422824e-05, 'epoch': 1.14}


 12%|█▏        | 690/5960 [03:58<27:41,  3.17it/s]

{'loss': 0.4284, 'learning_rate': 4.421140939597316e-05, 'epoch': 1.16}


 12%|█▏        | 700/5960 [04:01<28:33,  3.07it/s]

{'loss': 0.4503, 'learning_rate': 4.412751677852349e-05, 'epoch': 1.17}


 12%|█▏        | 710/5960 [04:04<29:03,  3.01it/s]

{'loss': 0.4562, 'learning_rate': 4.4043624161073823e-05, 'epoch': 1.19}


 12%|█▏        | 720/5960 [04:07<28:47,  3.03it/s]

{'loss': 0.4243, 'learning_rate': 4.395973154362416e-05, 'epoch': 1.21}


 12%|█▏        | 730/5960 [04:11<28:55,  3.01it/s]

{'loss': 0.4788, 'learning_rate': 4.38758389261745e-05, 'epoch': 1.22}


 12%|█▏        | 740/5960 [04:14<28:57,  3.00it/s]

{'loss': 0.5178, 'learning_rate': 4.3791946308724836e-05, 'epoch': 1.24}


 13%|█▎        | 750/5960 [04:17<28:54,  3.00it/s]

{'loss': 0.3691, 'learning_rate': 4.370805369127517e-05, 'epoch': 1.26}


 13%|█▎        | 760/5960 [04:21<28:57,  2.99it/s]

{'loss': 0.4083, 'learning_rate': 4.36241610738255e-05, 'epoch': 1.28}


 13%|█▎        | 770/5960 [04:24<28:49,  3.00it/s]

{'loss': 0.3302, 'learning_rate': 4.354026845637584e-05, 'epoch': 1.29}


 13%|█▎        | 780/5960 [04:27<28:59,  2.98it/s]

{'loss': 0.5167, 'learning_rate': 4.3456375838926176e-05, 'epoch': 1.31}


 13%|█▎        | 790/5960 [04:31<29:03,  2.96it/s]

{'loss': 0.6256, 'learning_rate': 4.337248322147651e-05, 'epoch': 1.33}


 13%|█▎        | 800/5960 [04:34<28:25,  3.03it/s]

{'loss': 0.444, 'learning_rate': 4.328859060402685e-05, 'epoch': 1.34}


 14%|█▎        | 810/5960 [04:37<28:24,  3.02it/s]

{'loss': 0.5249, 'learning_rate': 4.320469798657718e-05, 'epoch': 1.36}


 14%|█▍        | 820/5960 [04:41<28:14,  3.03it/s]

{'loss': 0.4345, 'learning_rate': 4.312080536912752e-05, 'epoch': 1.38}


 14%|█▍        | 830/5960 [04:44<28:14,  3.03it/s]

{'loss': 0.3404, 'learning_rate': 4.3036912751677855e-05, 'epoch': 1.39}


 14%|█▍        | 840/5960 [04:47<28:26,  3.00it/s]

{'loss': 0.5129, 'learning_rate': 4.295302013422819e-05, 'epoch': 1.41}


 14%|█▍        | 850/5960 [04:51<28:22,  3.00it/s]

{'loss': 0.427, 'learning_rate': 4.286912751677852e-05, 'epoch': 1.43}


 14%|█▍        | 860/5960 [04:54<28:06,  3.02it/s]

{'loss': 0.5601, 'learning_rate': 4.278523489932886e-05, 'epoch': 1.44}


 15%|█▍        | 870/5960 [04:57<27:45,  3.06it/s]

{'loss': 0.5598, 'learning_rate': 4.27013422818792e-05, 'epoch': 1.46}


 15%|█▍        | 880/5960 [05:01<27:48,  3.04it/s]

{'loss': 0.6696, 'learning_rate': 4.2617449664429534e-05, 'epoch': 1.48}


 15%|█▍        | 890/5960 [05:04<26:22,  3.20it/s]

{'loss': 0.6556, 'learning_rate': 4.253355704697987e-05, 'epoch': 1.49}


 15%|█▌        | 900/5960 [05:07<27:48,  3.03it/s]

{'loss': 0.5814, 'learning_rate': 4.244966442953021e-05, 'epoch': 1.51}


 15%|█▌        | 910/5960 [05:10<27:56,  3.01it/s]

{'loss': 0.5029, 'learning_rate': 4.236577181208054e-05, 'epoch': 1.53}


 15%|█▌        | 920/5960 [05:14<27:59,  3.00it/s]

{'loss': 0.6422, 'learning_rate': 4.228187919463087e-05, 'epoch': 1.54}


 16%|█▌        | 930/5960 [05:17<27:57,  3.00it/s]

{'loss': 0.624, 'learning_rate': 4.2197986577181206e-05, 'epoch': 1.56}


 16%|█▌        | 940/5960 [05:20<26:34,  3.15it/s]

{'loss': 0.5419, 'learning_rate': 4.2114093959731546e-05, 'epoch': 1.58}


 16%|█▌        | 950/5960 [05:23<27:39,  3.02it/s]

{'loss': 0.5628, 'learning_rate': 4.2030201342281886e-05, 'epoch': 1.59}


 16%|█▌        | 960/5960 [05:27<27:43,  3.01it/s]

{'loss': 0.5636, 'learning_rate': 4.194630872483222e-05, 'epoch': 1.61}


 16%|█▋        | 970/5960 [05:30<27:24,  3.03it/s]

{'loss': 0.4524, 'learning_rate': 4.186241610738255e-05, 'epoch': 1.63}


 16%|█▋        | 980/5960 [05:33<27:16,  3.04it/s]

{'loss': 0.4277, 'learning_rate': 4.1778523489932886e-05, 'epoch': 1.64}


 17%|█▋        | 990/5960 [05:37<27:22,  3.03it/s]

{'loss': 0.4636, 'learning_rate': 4.1694630872483225e-05, 'epoch': 1.66}


 17%|█▋        | 1000/5960 [05:40<27:04,  3.05it/s]

{'loss': 0.4557, 'learning_rate': 4.161073825503356e-05, 'epoch': 1.68}


 17%|█▋        | 1010/5960 [05:43<26:55,  3.06it/s]

{'loss': 0.4058, 'learning_rate': 4.152684563758389e-05, 'epoch': 1.69}


 17%|█▋        | 1020/5960 [05:47<26:50,  3.07it/s]

{'loss': 0.479, 'learning_rate': 4.144295302013423e-05, 'epoch': 1.71}


 17%|█▋        | 1030/5960 [05:50<27:04,  3.03it/s]

{'loss': 0.5438, 'learning_rate': 4.135906040268457e-05, 'epoch': 1.73}


 17%|█▋        | 1040/5960 [05:53<26:55,  3.05it/s]

{'loss': 0.4826, 'learning_rate': 4.1275167785234905e-05, 'epoch': 1.74}


 18%|█▊        | 1050/5960 [05:56<27:02,  3.03it/s]

{'loss': 0.4263, 'learning_rate': 4.119127516778524e-05, 'epoch': 1.76}


 18%|█▊        | 1060/5960 [06:00<26:53,  3.04it/s]

{'loss': 0.5263, 'learning_rate': 4.110738255033557e-05, 'epoch': 1.78}


 18%|█▊        | 1070/5960 [06:03<26:55,  3.03it/s]

{'loss': 0.4006, 'learning_rate': 4.1023489932885904e-05, 'epoch': 1.8}


 18%|█▊        | 1080/5960 [06:06<26:57,  3.02it/s]

{'loss': 0.4665, 'learning_rate': 4.0939597315436244e-05, 'epoch': 1.81}


 18%|█▊        | 1090/5960 [06:10<26:55,  3.02it/s]

{'loss': 0.5307, 'learning_rate': 4.085570469798658e-05, 'epoch': 1.83}


 18%|█▊        | 1100/5960 [06:13<26:38,  3.04it/s]

{'loss': 0.4531, 'learning_rate': 4.077181208053692e-05, 'epoch': 1.85}


 19%|█▊        | 1110/5960 [06:16<26:45,  3.02it/s]

{'loss': 0.4558, 'learning_rate': 4.068791946308725e-05, 'epoch': 1.86}


 19%|█▉        | 1120/5960 [06:20<26:48,  3.01it/s]

{'loss': 0.4723, 'learning_rate': 4.060402684563759e-05, 'epoch': 1.88}


 19%|█▉        | 1130/5960 [06:23<26:38,  3.02it/s]

{'loss': 0.439, 'learning_rate': 4.052013422818792e-05, 'epoch': 1.9}


 19%|█▉        | 1140/5960 [06:26<26:37,  3.02it/s]

{'loss': 0.5163, 'learning_rate': 4.0436241610738256e-05, 'epoch': 1.91}


 19%|█▉        | 1150/5960 [06:30<26:42,  3.00it/s]

{'loss': 0.5468, 'learning_rate': 4.035234899328859e-05, 'epoch': 1.93}


 19%|█▉        | 1160/5960 [06:33<26:23,  3.03it/s]

{'loss': 0.5022, 'learning_rate': 4.026845637583892e-05, 'epoch': 1.95}


 20%|█▉        | 1170/5960 [06:36<26:19,  3.03it/s]

{'loss': 0.5599, 'learning_rate': 4.018456375838926e-05, 'epoch': 1.96}


 20%|█▉        | 1180/5960 [06:39<26:10,  3.04it/s]

{'loss': 0.5647, 'learning_rate': 4.01006711409396e-05, 'epoch': 1.98}


 20%|█▉        | 1190/5960 [06:43<26:10,  3.04it/s]

{'loss': 0.4991, 'learning_rate': 4.0016778523489935e-05, 'epoch': 2.0}


                                                   
 20%|██        | 1192/5960 [06:58<26:14,  3.03it/s]

{'eval_loss': 0.5054112672805786, 'eval_runtime': 14.1426, 'eval_samples_per_second': 84.284, 'eval_steps_per_second': 10.536, 'epoch': 2.0}


 20%|██        | 1200/5960 [07:05<1:02:58,  1.26it/s]

{'loss': 0.5633, 'learning_rate': 3.993288590604027e-05, 'epoch': 2.01}


 20%|██        | 1210/5960 [07:08<27:18,  2.90it/s]  

{'loss': 0.429, 'learning_rate': 3.984899328859061e-05, 'epoch': 2.03}


 20%|██        | 1220/5960 [07:11<26:20,  3.00it/s]

{'loss': 0.5082, 'learning_rate': 3.976510067114094e-05, 'epoch': 2.05}


 21%|██        | 1230/5960 [07:15<26:23,  2.99it/s]

{'loss': 0.428, 'learning_rate': 3.9681208053691275e-05, 'epoch': 2.06}


 21%|██        | 1240/5960 [07:18<26:16,  2.99it/s]

{'loss': 0.4478, 'learning_rate': 3.959731543624161e-05, 'epoch': 2.08}


 21%|██        | 1250/5960 [07:21<26:11,  3.00it/s]

{'loss': 0.4002, 'learning_rate': 3.951342281879195e-05, 'epoch': 2.1}


 21%|██        | 1260/5960 [07:25<26:07,  3.00it/s]

{'loss': 0.5228, 'learning_rate': 3.942953020134229e-05, 'epoch': 2.11}


 21%|██▏       | 1270/5960 [07:28<26:05,  3.00it/s]

{'loss': 0.5514, 'learning_rate': 3.934563758389262e-05, 'epoch': 2.13}


 21%|██▏       | 1280/5960 [07:31<26:00,  3.00it/s]

{'loss': 0.5112, 'learning_rate': 3.9261744966442954e-05, 'epoch': 2.15}


 22%|██▏       | 1290/5960 [07:35<26:03,  2.99it/s]

{'loss': 0.4952, 'learning_rate': 3.917785234899329e-05, 'epoch': 2.16}


 22%|██▏       | 1300/5960 [07:38<25:58,  2.99it/s]

{'loss': 0.5384, 'learning_rate': 3.909395973154363e-05, 'epoch': 2.18}


 22%|██▏       | 1310/5960 [07:42<25:51,  3.00it/s]

{'loss': 0.5333, 'learning_rate': 3.901006711409396e-05, 'epoch': 2.2}


 22%|██▏       | 1320/5960 [07:45<25:54,  2.99it/s]

{'loss': 0.7004, 'learning_rate': 3.89261744966443e-05, 'epoch': 2.21}


 22%|██▏       | 1330/5960 [07:48<25:43,  3.00it/s]

{'loss': 0.5684, 'learning_rate': 3.884228187919463e-05, 'epoch': 2.23}


 22%|██▏       | 1340/5960 [07:52<25:39,  3.00it/s]

{'loss': 0.4729, 'learning_rate': 3.875838926174497e-05, 'epoch': 2.25}


 23%|██▎       | 1350/5960 [07:55<25:37,  3.00it/s]

{'loss': 0.4505, 'learning_rate': 3.8674496644295306e-05, 'epoch': 2.27}


 23%|██▎       | 1360/5960 [07:58<25:21,  3.02it/s]

{'loss': 0.4861, 'learning_rate': 3.859060402684564e-05, 'epoch': 2.28}


 23%|██▎       | 1370/5960 [08:02<25:13,  3.03it/s]

{'loss': 0.4803, 'learning_rate': 3.850671140939597e-05, 'epoch': 2.3}


 23%|██▎       | 1380/5960 [08:05<25:09,  3.03it/s]

{'loss': 0.4825, 'learning_rate': 3.8422818791946305e-05, 'epoch': 2.32}


 23%|██▎       | 1390/5960 [08:08<25:26,  2.99it/s]

{'loss': 0.3912, 'learning_rate': 3.8338926174496645e-05, 'epoch': 2.33}


 23%|██▎       | 1400/5960 [08:11<25:18,  3.00it/s]

{'loss': 0.5064, 'learning_rate': 3.8255033557046985e-05, 'epoch': 2.35}


 24%|██▎       | 1410/5960 [08:15<25:14,  3.00it/s]

{'loss': 0.3825, 'learning_rate': 3.817114093959732e-05, 'epoch': 2.37}


 24%|██▍       | 1420/5960 [08:18<25:04,  3.02it/s]

{'loss': 0.5861, 'learning_rate': 3.808724832214765e-05, 'epoch': 2.38}


 24%|██▍       | 1430/5960 [08:21<25:12,  3.00it/s]

{'loss': 0.5412, 'learning_rate': 3.800335570469799e-05, 'epoch': 2.4}


 24%|██▍       | 1440/5960 [08:25<25:05,  3.00it/s]

{'loss': 0.6559, 'learning_rate': 3.7919463087248324e-05, 'epoch': 2.42}


 24%|██▍       | 1450/5960 [08:28<25:00,  3.00it/s]

{'loss': 0.7022, 'learning_rate': 3.783557046979866e-05, 'epoch': 2.43}


 24%|██▍       | 1460/5960 [08:31<24:57,  3.01it/s]

{'loss': 0.7106, 'learning_rate': 3.775167785234899e-05, 'epoch': 2.45}


 25%|██▍       | 1470/5960 [08:35<24:40,  3.03it/s]

{'loss': 0.6693, 'learning_rate': 3.766778523489933e-05, 'epoch': 2.47}


 25%|██▍       | 1480/5960 [08:38<24:40,  3.03it/s]

{'loss': 0.5949, 'learning_rate': 3.758389261744967e-05, 'epoch': 2.48}


 25%|██▌       | 1490/5960 [08:41<24:45,  3.01it/s]

{'loss': 0.5448, 'learning_rate': 3.7500000000000003e-05, 'epoch': 2.5}


 25%|██▌       | 1500/5960 [08:45<24:47,  3.00it/s]

{'loss': 0.5407, 'learning_rate': 3.741610738255034e-05, 'epoch': 2.52}


 25%|██▌       | 1510/5960 [08:48<24:45,  3.00it/s]

{'loss': 0.5209, 'learning_rate': 3.733221476510067e-05, 'epoch': 2.53}


 26%|██▌       | 1520/5960 [08:51<24:39,  3.00it/s]

{'loss': 0.5594, 'learning_rate': 3.724832214765101e-05, 'epoch': 2.55}


 26%|██▌       | 1530/5960 [08:55<24:38,  3.00it/s]

{'loss': 0.5471, 'learning_rate': 3.716442953020134e-05, 'epoch': 2.57}


 26%|██▌       | 1540/5960 [08:58<24:34,  3.00it/s]

{'loss': 0.3852, 'learning_rate': 3.7080536912751676e-05, 'epoch': 2.58}


 26%|██▌       | 1550/5960 [09:01<24:31,  3.00it/s]

{'loss': 0.5876, 'learning_rate': 3.6996644295302016e-05, 'epoch': 2.6}


 26%|██▌       | 1560/5960 [09:05<24:28,  3.00it/s]

{'loss': 0.5527, 'learning_rate': 3.6912751677852356e-05, 'epoch': 2.62}


 26%|██▋       | 1570/5960 [09:08<24:23,  3.00it/s]

{'loss': 0.364, 'learning_rate': 3.682885906040269e-05, 'epoch': 2.63}


 27%|██▋       | 1580/5960 [09:11<24:19,  3.00it/s]

{'loss': 0.4516, 'learning_rate': 3.674496644295302e-05, 'epoch': 2.65}


 27%|██▋       | 1590/5960 [09:15<24:19,  2.99it/s]

{'loss': 0.5696, 'learning_rate': 3.6661073825503355e-05, 'epoch': 2.67}


 27%|██▋       | 1600/5960 [09:18<24:15,  3.00it/s]

{'loss': 0.3978, 'learning_rate': 3.6577181208053695e-05, 'epoch': 2.68}


 27%|██▋       | 1610/5960 [09:21<24:11,  3.00it/s]

{'loss': 0.5484, 'learning_rate': 3.649328859060403e-05, 'epoch': 2.7}


 27%|██▋       | 1620/5960 [09:25<24:09,  2.99it/s]

{'loss': 0.3483, 'learning_rate': 3.640939597315436e-05, 'epoch': 2.72}


 27%|██▋       | 1630/5960 [09:28<24:04,  3.00it/s]

{'loss': 0.6406, 'learning_rate': 3.63255033557047e-05, 'epoch': 2.73}


 28%|██▊       | 1640/5960 [09:31<23:48,  3.02it/s]

{'loss': 0.3953, 'learning_rate': 3.6241610738255034e-05, 'epoch': 2.75}


 28%|██▊       | 1650/5960 [09:35<23:34,  3.05it/s]

{'loss': 0.3465, 'learning_rate': 3.6157718120805374e-05, 'epoch': 2.77}


 28%|██▊       | 1660/5960 [09:38<23:28,  3.05it/s]

{'loss': 0.6337, 'learning_rate': 3.607382550335571e-05, 'epoch': 2.79}


 28%|██▊       | 1670/5960 [09:41<23:25,  3.05it/s]

{'loss': 0.4039, 'learning_rate': 3.598993288590604e-05, 'epoch': 2.8}


 28%|██▊       | 1680/5960 [09:45<23:22,  3.05it/s]

{'loss': 0.5895, 'learning_rate': 3.5906040268456373e-05, 'epoch': 2.82}


 28%|██▊       | 1690/5960 [09:48<23:17,  3.05it/s]

{'loss': 0.5134, 'learning_rate': 3.582214765100671e-05, 'epoch': 2.84}


 29%|██▊       | 1700/5960 [09:51<23:16,  3.05it/s]

{'loss': 0.3004, 'learning_rate': 3.5738255033557046e-05, 'epoch': 2.85}


 29%|██▊       | 1710/5960 [09:55<23:32,  3.01it/s]

{'loss': 0.5003, 'learning_rate': 3.5654362416107386e-05, 'epoch': 2.87}


 29%|██▉       | 1720/5960 [09:58<23:27,  3.01it/s]

{'loss': 0.5645, 'learning_rate': 3.557046979865772e-05, 'epoch': 2.89}


 29%|██▉       | 1730/5960 [10:01<23:22,  3.02it/s]

{'loss': 0.51, 'learning_rate': 3.548657718120805e-05, 'epoch': 2.9}


 29%|██▉       | 1740/5960 [10:04<23:15,  3.02it/s]

{'loss': 0.6183, 'learning_rate': 3.540268456375839e-05, 'epoch': 2.92}


 29%|██▉       | 1750/5960 [10:08<23:02,  3.05it/s]

{'loss': 0.5841, 'learning_rate': 3.5318791946308726e-05, 'epoch': 2.94}


 30%|██▉       | 1760/5960 [10:11<22:59,  3.04it/s]

{'loss': 0.3362, 'learning_rate': 3.523489932885906e-05, 'epoch': 2.95}


 30%|██▉       | 1770/5960 [10:14<22:59,  3.04it/s]

{'loss': 0.3773, 'learning_rate': 3.51510067114094e-05, 'epoch': 2.97}


 30%|██▉       | 1780/5960 [10:18<22:51,  3.05it/s]

{'loss': 0.4961, 'learning_rate': 3.506711409395974e-05, 'epoch': 2.99}


                                                   
 30%|███       | 1788/5960 [10:34<22:52,  3.04it/s]

{'eval_loss': 0.5713002681732178, 'eval_runtime': 13.9738, 'eval_samples_per_second': 85.302, 'eval_steps_per_second': 10.663, 'epoch': 3.0}


 30%|███       | 1790/5960 [10:39<4:41:16,  4.05s/it]

{'loss': 0.3846, 'learning_rate': 3.498322147651007e-05, 'epoch': 3.0}


 30%|███       | 1800/5960 [10:42<30:08,  2.30it/s]  

{'loss': 0.3724, 'learning_rate': 3.4899328859060405e-05, 'epoch': 3.02}


 30%|███       | 1810/5960 [10:45<23:02,  3.00it/s]

{'loss': 0.4497, 'learning_rate': 3.481543624161074e-05, 'epoch': 3.04}


 31%|███       | 1820/5960 [10:49<22:48,  3.03it/s]

{'loss': 0.3105, 'learning_rate': 3.473154362416108e-05, 'epoch': 3.05}


 31%|███       | 1830/5960 [10:52<22:43,  3.03it/s]

{'loss': 0.4727, 'learning_rate': 3.464765100671141e-05, 'epoch': 3.07}


 31%|███       | 1840/5960 [10:55<22:40,  3.03it/s]

{'loss': 0.4865, 'learning_rate': 3.4563758389261744e-05, 'epoch': 3.09}


 31%|███       | 1850/5960 [10:58<22:34,  3.03it/s]

{'loss': 0.5854, 'learning_rate': 3.4479865771812084e-05, 'epoch': 3.1}


 31%|███       | 1860/5960 [11:02<22:33,  3.03it/s]

{'loss': 0.5263, 'learning_rate': 3.439597315436242e-05, 'epoch': 3.12}


 31%|███▏      | 1870/5960 [11:05<22:31,  3.03it/s]

{'loss': 0.5122, 'learning_rate': 3.431208053691276e-05, 'epoch': 3.14}


 32%|███▏      | 1880/5960 [11:08<22:25,  3.03it/s]

{'loss': 0.5227, 'learning_rate': 3.422818791946309e-05, 'epoch': 3.15}


 32%|███▏      | 1890/5960 [11:12<22:24,  3.03it/s]

{'loss': 0.5557, 'learning_rate': 3.414429530201342e-05, 'epoch': 3.17}


 32%|███▏      | 1900/5960 [11:15<22:23,  3.02it/s]

{'loss': 0.6405, 'learning_rate': 3.4060402684563756e-05, 'epoch': 3.19}


 32%|███▏      | 1910/5960 [11:18<22:15,  3.03it/s]

{'loss': 0.4244, 'learning_rate': 3.3976510067114096e-05, 'epoch': 3.2}


 32%|███▏      | 1920/5960 [11:22<22:13,  3.03it/s]

{'loss': 0.5127, 'learning_rate': 3.389261744966443e-05, 'epoch': 3.22}


 32%|███▏      | 1930/5960 [11:25<22:11,  3.03it/s]

{'loss': 0.4149, 'learning_rate': 3.380872483221477e-05, 'epoch': 3.24}


 33%|███▎      | 1940/5960 [11:28<22:06,  3.03it/s]

{'loss': 0.4302, 'learning_rate': 3.37248322147651e-05, 'epoch': 3.26}


 33%|███▎      | 1950/5960 [11:32<22:05,  3.02it/s]

{'loss': 0.3989, 'learning_rate': 3.3640939597315436e-05, 'epoch': 3.27}


 33%|███▎      | 1960/5960 [11:35<22:02,  3.02it/s]

{'loss': 0.4536, 'learning_rate': 3.3557046979865775e-05, 'epoch': 3.29}


 33%|███▎      | 1970/5960 [11:38<22:00,  3.02it/s]

{'loss': 0.4028, 'learning_rate': 3.347315436241611e-05, 'epoch': 3.31}


 33%|███▎      | 1980/5960 [11:41<21:52,  3.03it/s]

{'loss': 0.5425, 'learning_rate': 3.338926174496644e-05, 'epoch': 3.32}


 33%|███▎      | 1990/5960 [11:45<21:50,  3.03it/s]

{'loss': 0.421, 'learning_rate': 3.3305369127516775e-05, 'epoch': 3.34}


 34%|███▎      | 2000/5960 [11:48<21:47,  3.03it/s]

{'loss': 0.4368, 'learning_rate': 3.3221476510067115e-05, 'epoch': 3.36}


 34%|███▎      | 2010/5960 [11:51<21:44,  3.03it/s]

{'loss': 0.5047, 'learning_rate': 3.3137583892617455e-05, 'epoch': 3.37}


 34%|███▍      | 2020/5960 [11:55<21:39,  3.03it/s]

{'loss': 0.4455, 'learning_rate': 3.305369127516779e-05, 'epoch': 3.39}


 34%|███▍      | 2030/5960 [11:58<21:34,  3.04it/s]

{'loss': 0.4789, 'learning_rate': 3.296979865771812e-05, 'epoch': 3.41}


 34%|███▍      | 2040/5960 [12:01<21:32,  3.03it/s]

{'loss': 0.45, 'learning_rate': 3.288590604026846e-05, 'epoch': 3.42}


 34%|███▍      | 2050/5960 [12:05<21:31,  3.03it/s]

{'loss': 0.4623, 'learning_rate': 3.2802013422818794e-05, 'epoch': 3.44}


 35%|███▍      | 2060/5960 [12:08<21:26,  3.03it/s]

{'loss': 0.4324, 'learning_rate': 3.271812080536913e-05, 'epoch': 3.46}


 35%|███▍      | 2070/5960 [12:11<21:23,  3.03it/s]

{'loss': 0.4673, 'learning_rate': 3.263422818791946e-05, 'epoch': 3.47}


 35%|███▍      | 2080/5960 [12:15<21:23,  3.02it/s]

{'loss': 0.4243, 'learning_rate': 3.25503355704698e-05, 'epoch': 3.49}


 35%|███▌      | 2090/5960 [12:18<21:17,  3.03it/s]

{'loss': 0.475, 'learning_rate': 3.246644295302014e-05, 'epoch': 3.51}


 35%|███▌      | 2100/5960 [12:21<21:24,  3.01it/s]

{'loss': 0.4269, 'learning_rate': 3.238255033557047e-05, 'epoch': 3.52}


 35%|███▌      | 2110/5960 [12:24<21:27,  2.99it/s]

{'loss': 0.3134, 'learning_rate': 3.2298657718120806e-05, 'epoch': 3.54}


 36%|███▌      | 2120/5960 [12:28<21:24,  2.99it/s]

{'loss': 0.3528, 'learning_rate': 3.221476510067114e-05, 'epoch': 3.56}


 36%|███▌      | 2130/5960 [12:31<21:20,  2.99it/s]

{'loss': 0.3394, 'learning_rate': 3.213087248322148e-05, 'epoch': 3.57}


 36%|███▌      | 2140/5960 [12:35<21:15,  2.99it/s]

{'loss': 0.3891, 'learning_rate': 3.204697986577181e-05, 'epoch': 3.59}


 36%|███▌      | 2150/5960 [12:38<21:13,  2.99it/s]

{'loss': 0.5316, 'learning_rate': 3.196308724832215e-05, 'epoch': 3.61}


 36%|███▌      | 2160/5960 [12:41<21:07,  3.00it/s]

{'loss': 0.4191, 'learning_rate': 3.1879194630872485e-05, 'epoch': 3.62}


 36%|███▋      | 2170/5960 [12:45<21:02,  3.00it/s]

{'loss': 0.5635, 'learning_rate': 3.1795302013422825e-05, 'epoch': 3.64}


 37%|███▋      | 2180/5960 [12:48<20:48,  3.03it/s]

{'loss': 0.3958, 'learning_rate': 3.171140939597316e-05, 'epoch': 3.66}


 37%|███▋      | 2190/5960 [12:51<20:45,  3.03it/s]

{'loss': 0.3754, 'learning_rate': 3.162751677852349e-05, 'epoch': 3.67}


 37%|███▋      | 2200/5960 [12:54<20:50,  3.01it/s]

{'loss': 0.4162, 'learning_rate': 3.1543624161073825e-05, 'epoch': 3.69}


 37%|███▋      | 2210/5960 [12:58<20:51,  3.00it/s]

{'loss': 0.3695, 'learning_rate': 3.145973154362416e-05, 'epoch': 3.71}


 37%|███▋      | 2220/5960 [13:01<20:50,  2.99it/s]

{'loss': 0.3455, 'learning_rate': 3.13758389261745e-05, 'epoch': 3.72}


 37%|███▋      | 2230/5960 [13:04<20:33,  3.02it/s]

{'loss': 0.6435, 'learning_rate': 3.129194630872484e-05, 'epoch': 3.74}


 38%|███▊      | 2240/5960 [13:08<20:30,  3.02it/s]

{'loss': 0.375, 'learning_rate': 3.120805369127517e-05, 'epoch': 3.76}


 38%|███▊      | 2250/5960 [13:11<20:27,  3.02it/s]

{'loss': 0.4319, 'learning_rate': 3.1124161073825504e-05, 'epoch': 3.78}


 38%|███▊      | 2260/5960 [13:14<20:23,  3.02it/s]

{'loss': 0.5188, 'learning_rate': 3.1040268456375844e-05, 'epoch': 3.79}


 38%|███▊      | 2270/5960 [13:18<20:18,  3.03it/s]

{'loss': 0.4597, 'learning_rate': 3.095637583892618e-05, 'epoch': 3.81}


 38%|███▊      | 2280/5960 [13:21<20:16,  3.02it/s]

{'loss': 0.4223, 'learning_rate': 3.087248322147651e-05, 'epoch': 3.83}


 38%|███▊      | 2290/5960 [13:24<20:11,  3.03it/s]

{'loss': 0.569, 'learning_rate': 3.078859060402684e-05, 'epoch': 3.84}


 39%|███▊      | 2300/5960 [13:28<20:11,  3.02it/s]

{'loss': 0.4635, 'learning_rate': 3.070469798657718e-05, 'epoch': 3.86}


 39%|███▉      | 2310/5960 [13:31<20:07,  3.02it/s]

{'loss': 0.4239, 'learning_rate': 3.062080536912752e-05, 'epoch': 3.88}


 39%|███▉      | 2320/5960 [13:34<20:02,  3.03it/s]

{'loss': 0.46, 'learning_rate': 3.0536912751677856e-05, 'epoch': 3.89}


 39%|███▉      | 2330/5960 [13:38<20:02,  3.02it/s]

{'loss': 0.3568, 'learning_rate': 3.045302013422819e-05, 'epoch': 3.91}


 39%|███▉      | 2340/5960 [13:41<19:55,  3.03it/s]

{'loss': 0.3456, 'learning_rate': 3.0369127516778522e-05, 'epoch': 3.93}


 39%|███▉      | 2350/5960 [13:44<19:52,  3.03it/s]

{'loss': 0.6272, 'learning_rate': 3.0285234899328862e-05, 'epoch': 3.94}


 40%|███▉      | 2360/5960 [13:47<19:41,  3.05it/s]

{'loss': 0.4149, 'learning_rate': 3.02013422818792e-05, 'epoch': 3.96}


 40%|███▉      | 2370/5960 [13:51<19:38,  3.05it/s]

{'loss': 0.5397, 'learning_rate': 3.011744966442953e-05, 'epoch': 3.98}


 40%|███▉      | 2380/5960 [13:54<19:35,  3.05it/s]

{'loss': 0.5468, 'learning_rate': 3.0033557046979865e-05, 'epoch': 3.99}


                                                   
 40%|████      | 2384/5960 [14:09<19:35,  3.04it/s]

{'eval_loss': 0.537239134311676, 'eval_runtime': 13.9328, 'eval_samples_per_second': 85.553, 'eval_steps_per_second': 10.694, 'epoch': 4.0}


 40%|████      | 2390/5960 [14:16<1:14:14,  1.25s/it]

{'loss': 0.2624, 'learning_rate': 2.9949664429530205e-05, 'epoch': 4.01}


 40%|████      | 2400/5960 [14:19<20:56,  2.83it/s]  

{'loss': 0.4461, 'learning_rate': 2.986577181208054e-05, 'epoch': 4.03}


 40%|████      | 2410/5960 [14:22<19:23,  3.05it/s]

{'loss': 0.2938, 'learning_rate': 2.9781879194630874e-05, 'epoch': 4.04}


 41%|████      | 2420/5960 [14:25<19:26,  3.03it/s]

{'loss': 0.4811, 'learning_rate': 2.9697986577181207e-05, 'epoch': 4.06}


 41%|████      | 2430/5960 [14:29<19:20,  3.04it/s]

{'loss': 0.4364, 'learning_rate': 2.9614093959731544e-05, 'epoch': 4.08}


 41%|████      | 2440/5960 [14:32<19:18,  3.04it/s]

{'loss': 0.4982, 'learning_rate': 2.9530201342281884e-05, 'epoch': 4.09}


 41%|████      | 2450/5960 [14:35<19:15,  3.04it/s]

{'loss': 0.4459, 'learning_rate': 2.9446308724832217e-05, 'epoch': 4.11}


 41%|████▏     | 2460/5960 [14:39<19:14,  3.03it/s]

{'loss': 0.6092, 'learning_rate': 2.936241610738255e-05, 'epoch': 4.13}


 41%|████▏     | 2470/5960 [14:42<19:10,  3.03it/s]

{'loss': 0.4862, 'learning_rate': 2.9278523489932887e-05, 'epoch': 4.14}


 42%|████▏     | 2480/5960 [14:45<19:06,  3.04it/s]

{'loss': 0.5802, 'learning_rate': 2.9194630872483227e-05, 'epoch': 4.16}


 42%|████▏     | 2490/5960 [14:49<19:03,  3.03it/s]

{'loss': 0.3782, 'learning_rate': 2.911073825503356e-05, 'epoch': 4.18}


 42%|████▏     | 2500/5960 [14:52<18:58,  3.04it/s]

{'loss': 0.5213, 'learning_rate': 2.9026845637583893e-05, 'epoch': 4.19}


 42%|████▏     | 2510/5960 [14:55<18:56,  3.04it/s]

{'loss': 0.4089, 'learning_rate': 2.894295302013423e-05, 'epoch': 4.21}


 42%|████▏     | 2520/5960 [14:58<18:56,  3.03it/s]

{'loss': 0.3947, 'learning_rate': 2.885906040268457e-05, 'epoch': 4.23}


 42%|████▏     | 2530/5960 [15:02<18:51,  3.03it/s]

{'loss': 0.3916, 'learning_rate': 2.8775167785234902e-05, 'epoch': 4.24}


 43%|████▎     | 2540/5960 [15:05<18:48,  3.03it/s]

{'loss': 0.4546, 'learning_rate': 2.8691275167785235e-05, 'epoch': 4.26}


 43%|████▎     | 2550/5960 [15:08<18:54,  3.01it/s]

{'loss': 0.3907, 'learning_rate': 2.8607382550335572e-05, 'epoch': 4.28}


 43%|████▎     | 2560/5960 [15:12<18:53,  3.00it/s]

{'loss': 0.3571, 'learning_rate': 2.8523489932885905e-05, 'epoch': 4.3}


 43%|████▎     | 2570/5960 [15:15<18:53,  2.99it/s]

{'loss': 0.5364, 'learning_rate': 2.8439597315436245e-05, 'epoch': 4.31}


 43%|████▎     | 2580/5960 [15:18<18:47,  3.00it/s]

{'loss': 0.4645, 'learning_rate': 2.8355704697986578e-05, 'epoch': 4.33}


 43%|████▎     | 2590/5960 [15:22<18:34,  3.02it/s]

{'loss': 0.5624, 'learning_rate': 2.8271812080536915e-05, 'epoch': 4.35}


 44%|████▎     | 2600/5960 [15:25<18:31,  3.02it/s]

{'loss': 0.3114, 'learning_rate': 2.8187919463087248e-05, 'epoch': 4.36}


 44%|████▍     | 2610/5960 [15:28<18:28,  3.02it/s]

{'loss': 0.4272, 'learning_rate': 2.8104026845637588e-05, 'epoch': 4.38}


 44%|████▍     | 2620/5960 [15:32<18:24,  3.02it/s]

{'loss': 0.3439, 'learning_rate': 2.802013422818792e-05, 'epoch': 4.4}


 44%|████▍     | 2630/5960 [15:35<18:18,  3.03it/s]

{'loss': 0.4106, 'learning_rate': 2.7936241610738257e-05, 'epoch': 4.41}


 44%|████▍     | 2640/5960 [15:38<18:16,  3.03it/s]

{'loss': 0.3722, 'learning_rate': 2.785234899328859e-05, 'epoch': 4.43}


 44%|████▍     | 2650/5960 [15:42<18:13,  3.03it/s]

{'loss': 0.252, 'learning_rate': 2.7768456375838923e-05, 'epoch': 4.45}


 45%|████▍     | 2660/5960 [15:45<18:08,  3.03it/s]

{'loss': 0.39, 'learning_rate': 2.7684563758389263e-05, 'epoch': 4.46}


 45%|████▍     | 2670/5960 [15:48<18:05,  3.03it/s]

{'loss': 0.482, 'learning_rate': 2.76006711409396e-05, 'epoch': 4.48}


 45%|████▍     | 2680/5960 [15:51<18:03,  3.03it/s]

{'loss': 0.6073, 'learning_rate': 2.7516778523489933e-05, 'epoch': 4.5}


 45%|████▌     | 2690/5960 [15:55<18:00,  3.03it/s]

{'loss': 0.4707, 'learning_rate': 2.7432885906040266e-05, 'epoch': 4.51}


 45%|████▌     | 2700/5960 [15:58<17:56,  3.03it/s]

{'loss': 0.4336, 'learning_rate': 2.7348993288590606e-05, 'epoch': 4.53}


 45%|████▌     | 2710/5960 [16:01<17:53,  3.03it/s]

{'loss': 0.5277, 'learning_rate': 2.7265100671140943e-05, 'epoch': 4.55}


 46%|████▌     | 2720/5960 [16:05<17:50,  3.03it/s]

{'loss': 0.4406, 'learning_rate': 2.7181208053691276e-05, 'epoch': 4.56}


 46%|████▌     | 2730/5960 [16:08<17:46,  3.03it/s]

{'loss': 0.2944, 'learning_rate': 2.709731543624161e-05, 'epoch': 4.58}


 46%|████▌     | 2740/5960 [16:11<17:43,  3.03it/s]

{'loss': 0.537, 'learning_rate': 2.701342281879195e-05, 'epoch': 4.6}


 46%|████▌     | 2750/5960 [16:15<17:42,  3.02it/s]

{'loss': 0.3495, 'learning_rate': 2.6929530201342285e-05, 'epoch': 4.61}


 46%|████▋     | 2760/5960 [16:18<17:36,  3.03it/s]

{'loss': 0.496, 'learning_rate': 2.6845637583892618e-05, 'epoch': 4.63}


 46%|████▋     | 2770/5960 [16:21<17:33,  3.03it/s]

{'loss': 0.5214, 'learning_rate': 2.6761744966442955e-05, 'epoch': 4.65}


 47%|████▋     | 2780/5960 [16:24<17:31,  3.02it/s]

{'loss': 0.4392, 'learning_rate': 2.6677852348993288e-05, 'epoch': 4.66}


 47%|████▋     | 2790/5960 [16:28<17:31,  3.02it/s]

{'loss': 0.4123, 'learning_rate': 2.6593959731543628e-05, 'epoch': 4.68}


 47%|████▋     | 2800/5960 [16:31<17:24,  3.02it/s]

{'loss': 0.3551, 'learning_rate': 2.651006711409396e-05, 'epoch': 4.7}


 47%|████▋     | 2810/5960 [16:34<17:19,  3.03it/s]

{'loss': 0.3919, 'learning_rate': 2.6426174496644297e-05, 'epoch': 4.71}


 47%|████▋     | 2820/5960 [16:38<17:16,  3.03it/s]

{'loss': 0.4009, 'learning_rate': 2.634228187919463e-05, 'epoch': 4.73}


 47%|████▋     | 2830/5960 [16:41<17:14,  3.03it/s]

{'loss': 0.3609, 'learning_rate': 2.625838926174497e-05, 'epoch': 4.75}


 48%|████▊     | 2840/5960 [16:44<17:10,  3.03it/s]

{'loss': 0.54, 'learning_rate': 2.6174496644295304e-05, 'epoch': 4.77}


 48%|████▊     | 2850/5960 [16:48<17:05,  3.03it/s]

{'loss': 0.5031, 'learning_rate': 2.609060402684564e-05, 'epoch': 4.78}


 48%|████▊     | 2860/5960 [16:51<17:02,  3.03it/s]

{'loss': 0.4489, 'learning_rate': 2.6006711409395973e-05, 'epoch': 4.8}


 48%|████▊     | 2870/5960 [16:54<16:57,  3.04it/s]

{'loss': 0.336, 'learning_rate': 2.5922818791946306e-05, 'epoch': 4.82}


 48%|████▊     | 2880/5960 [16:58<16:56,  3.03it/s]

{'loss': 0.4779, 'learning_rate': 2.5838926174496646e-05, 'epoch': 4.83}


 48%|████▊     | 2890/5960 [17:01<16:53,  3.03it/s]

{'loss': 0.4839, 'learning_rate': 2.5755033557046983e-05, 'epoch': 4.85}


 49%|████▊     | 2900/5960 [17:04<16:46,  3.04it/s]

{'loss': 0.4441, 'learning_rate': 2.5671140939597316e-05, 'epoch': 4.87}


 49%|████▉     | 2910/5960 [17:07<16:44,  3.04it/s]

{'loss': 0.4764, 'learning_rate': 2.558724832214765e-05, 'epoch': 4.88}


 49%|████▉     | 2920/5960 [17:11<16:41,  3.04it/s]

{'loss': 0.4247, 'learning_rate': 2.550335570469799e-05, 'epoch': 4.9}


 49%|████▉     | 2930/5960 [17:14<16:35,  3.04it/s]

{'loss': 0.5405, 'learning_rate': 2.5419463087248325e-05, 'epoch': 4.92}


 49%|████▉     | 2940/5960 [17:17<16:34,  3.04it/s]

{'loss': 0.2988, 'learning_rate': 2.533557046979866e-05, 'epoch': 4.93}


 49%|████▉     | 2950/5960 [17:21<16:28,  3.05it/s]

{'loss': 0.4454, 'learning_rate': 2.525167785234899e-05, 'epoch': 4.95}


 50%|████▉     | 2960/5960 [17:24<16:26,  3.04it/s]

{'loss': 0.6859, 'learning_rate': 2.516778523489933e-05, 'epoch': 4.97}


 50%|████▉     | 2970/5960 [17:27<16:23,  3.04it/s]

{'loss': 0.4813, 'learning_rate': 2.5083892617449668e-05, 'epoch': 4.98}


 50%|█████     | 2980/5960 [17:30<16:19,  3.04it/s]

{'loss': 0.5382, 'learning_rate': 2.5e-05, 'epoch': 5.0}


                                                   
 50%|█████     | 2980/5960 [17:44<16:19,  3.04it/s]

{'eval_loss': 0.5802309513092041, 'eval_runtime': 13.9536, 'eval_samples_per_second': 85.426, 'eval_steps_per_second': 10.678, 'epoch': 5.0}


 50%|█████     | 2990/5960 [17:52<26:59,  1.83it/s]  

{'loss': 0.4446, 'learning_rate': 2.4916107382550334e-05, 'epoch': 5.02}


 50%|█████     | 3000/5960 [17:55<16:30,  2.99it/s]

{'loss': 0.2971, 'learning_rate': 2.4832214765100674e-05, 'epoch': 5.03}


 51%|█████     | 3010/5960 [17:58<16:05,  3.06it/s]

{'loss': 0.5916, 'learning_rate': 2.4748322147651007e-05, 'epoch': 5.05}


 51%|█████     | 3020/5960 [18:02<16:04,  3.05it/s]

{'loss': 0.529, 'learning_rate': 2.4664429530201344e-05, 'epoch': 5.07}


 51%|█████     | 3030/5960 [18:05<15:58,  3.06it/s]

{'loss': 0.3323, 'learning_rate': 2.4580536912751677e-05, 'epoch': 5.08}


 51%|█████     | 3040/5960 [18:08<15:55,  3.06it/s]

{'loss': 0.3215, 'learning_rate': 2.4496644295302017e-05, 'epoch': 5.1}


 51%|█████     | 3050/5960 [18:11<15:54,  3.05it/s]

{'loss': 0.4215, 'learning_rate': 2.441275167785235e-05, 'epoch': 5.12}


 51%|█████▏    | 3060/5960 [18:15<15:50,  3.05it/s]

{'loss': 0.3806, 'learning_rate': 2.4328859060402687e-05, 'epoch': 5.13}


 52%|█████▏    | 3070/5960 [18:18<15:47,  3.05it/s]

{'loss': 0.415, 'learning_rate': 2.424496644295302e-05, 'epoch': 5.15}


 52%|█████▏    | 3080/5960 [18:21<15:44,  3.05it/s]

{'loss': 0.4984, 'learning_rate': 2.416107382550336e-05, 'epoch': 5.17}


 52%|█████▏    | 3090/5960 [18:25<15:40,  3.05it/s]

{'loss': 0.3649, 'learning_rate': 2.4077181208053693e-05, 'epoch': 5.18}


 52%|█████▏    | 3100/5960 [18:28<15:38,  3.05it/s]

{'loss': 0.3089, 'learning_rate': 2.3993288590604026e-05, 'epoch': 5.2}


 52%|█████▏    | 3110/5960 [18:31<15:40,  3.03it/s]

{'loss': 0.6444, 'learning_rate': 2.3909395973154362e-05, 'epoch': 5.22}


 52%|█████▏    | 3120/5960 [18:34<15:34,  3.04it/s]

{'loss': 0.3655, 'learning_rate': 2.38255033557047e-05, 'epoch': 5.23}


 53%|█████▎    | 3130/5960 [18:38<15:32,  3.04it/s]

{'loss': 0.4338, 'learning_rate': 2.3741610738255035e-05, 'epoch': 5.25}


 53%|█████▎    | 3140/5960 [18:41<15:33,  3.02it/s]

{'loss': 0.3056, 'learning_rate': 2.365771812080537e-05, 'epoch': 5.27}


 53%|█████▎    | 3150/5960 [18:44<15:26,  3.03it/s]

{'loss': 0.3423, 'learning_rate': 2.3573825503355705e-05, 'epoch': 5.29}


 53%|█████▎    | 3160/5960 [18:48<15:21,  3.04it/s]

{'loss': 0.2672, 'learning_rate': 2.348993288590604e-05, 'epoch': 5.3}


 53%|█████▎    | 3170/5960 [18:51<15:17,  3.04it/s]

{'loss': 0.4579, 'learning_rate': 2.3406040268456378e-05, 'epoch': 5.32}


 53%|█████▎    | 3180/5960 [18:54<15:16,  3.03it/s]

{'loss': 0.4813, 'learning_rate': 2.332214765100671e-05, 'epoch': 5.34}


 54%|█████▎    | 3190/5960 [18:58<15:11,  3.04it/s]

{'loss': 0.3958, 'learning_rate': 2.3238255033557048e-05, 'epoch': 5.35}


 54%|█████▎    | 3200/5960 [19:01<15:08,  3.04it/s]

{'loss': 0.5318, 'learning_rate': 2.3154362416107384e-05, 'epoch': 5.37}


 54%|█████▍    | 3210/5960 [19:04<15:06,  3.03it/s]

{'loss': 0.4147, 'learning_rate': 2.3070469798657717e-05, 'epoch': 5.39}


 54%|█████▍    | 3220/5960 [19:07<15:05,  3.03it/s]

{'loss': 0.4649, 'learning_rate': 2.2986577181208054e-05, 'epoch': 5.4}


 54%|█████▍    | 3230/5960 [19:11<15:00,  3.03it/s]

{'loss': 0.3308, 'learning_rate': 2.290268456375839e-05, 'epoch': 5.42}


 54%|█████▍    | 3240/5960 [19:14<14:57,  3.03it/s]

{'loss': 0.5491, 'learning_rate': 2.2818791946308727e-05, 'epoch': 5.44}


 55%|█████▍    | 3250/5960 [19:17<14:55,  3.03it/s]

{'loss': 0.3418, 'learning_rate': 2.273489932885906e-05, 'epoch': 5.45}


 55%|█████▍    | 3260/5960 [19:21<14:57,  3.01it/s]

{'loss': 0.3312, 'learning_rate': 2.2651006711409396e-05, 'epoch': 5.47}


 55%|█████▍    | 3270/5960 [19:24<14:55,  3.00it/s]

{'loss': 0.4222, 'learning_rate': 2.2567114093959733e-05, 'epoch': 5.49}


 55%|█████▌    | 3280/5960 [19:27<14:53,  3.00it/s]

{'loss': 0.2927, 'learning_rate': 2.248322147651007e-05, 'epoch': 5.5}


 55%|█████▌    | 3290/5960 [19:31<14:49,  3.00it/s]

{'loss': 0.4053, 'learning_rate': 2.2399328859060403e-05, 'epoch': 5.52}


 55%|█████▌    | 3300/5960 [19:34<14:40,  3.02it/s]

{'loss': 0.2968, 'learning_rate': 2.231543624161074e-05, 'epoch': 5.54}


 56%|█████▌    | 3310/5960 [19:37<14:35,  3.03it/s]

{'loss': 0.3736, 'learning_rate': 2.2231543624161076e-05, 'epoch': 5.55}


 56%|█████▌    | 3320/5960 [19:41<14:33,  3.02it/s]

{'loss': 0.2783, 'learning_rate': 2.2147651006711412e-05, 'epoch': 5.57}


 56%|█████▌    | 3330/5960 [19:44<14:29,  3.02it/s]

{'loss': 0.3701, 'learning_rate': 2.2063758389261745e-05, 'epoch': 5.59}


 56%|█████▌    | 3340/5960 [19:47<14:27,  3.02it/s]

{'loss': 0.3153, 'learning_rate': 2.197986577181208e-05, 'epoch': 5.6}


 56%|█████▌    | 3350/5960 [19:51<14:24,  3.02it/s]

{'loss': 0.3474, 'learning_rate': 2.1895973154362418e-05, 'epoch': 5.62}


 56%|█████▋    | 3360/5960 [19:54<14:21,  3.02it/s]

{'loss': 0.4666, 'learning_rate': 2.181208053691275e-05, 'epoch': 5.64}


 57%|█████▋    | 3370/5960 [19:57<14:18,  3.02it/s]

{'loss': 0.4154, 'learning_rate': 2.1728187919463088e-05, 'epoch': 5.65}


 57%|█████▋    | 3380/5960 [20:01<14:14,  3.02it/s]

{'loss': 0.4117, 'learning_rate': 2.1644295302013424e-05, 'epoch': 5.67}


 57%|█████▋    | 3390/5960 [20:04<14:09,  3.02it/s]

{'loss': 0.261, 'learning_rate': 2.156040268456376e-05, 'epoch': 5.69}


 57%|█████▋    | 3400/5960 [20:07<14:07,  3.02it/s]

{'loss': 0.4131, 'learning_rate': 2.1476510067114094e-05, 'epoch': 5.7}


 57%|█████▋    | 3410/5960 [20:10<14:04,  3.02it/s]

{'loss': 0.2941, 'learning_rate': 2.139261744966443e-05, 'epoch': 5.72}


 57%|█████▋    | 3420/5960 [20:14<14:00,  3.02it/s]

{'loss': 0.447, 'learning_rate': 2.1308724832214767e-05, 'epoch': 5.74}


 58%|█████▊    | 3430/5960 [20:17<13:56,  3.03it/s]

{'loss': 0.3175, 'learning_rate': 2.1224832214765103e-05, 'epoch': 5.76}


 58%|█████▊    | 3440/5960 [20:20<13:54,  3.02it/s]

{'loss': 0.3279, 'learning_rate': 2.1140939597315437e-05, 'epoch': 5.77}


 58%|█████▊    | 3450/5960 [20:24<13:50,  3.02it/s]

{'loss': 0.4128, 'learning_rate': 2.1057046979865773e-05, 'epoch': 5.79}


 58%|█████▊    | 3460/5960 [20:27<13:46,  3.02it/s]

{'loss': 0.4065, 'learning_rate': 2.097315436241611e-05, 'epoch': 5.81}


 58%|█████▊    | 3470/5960 [20:30<13:44,  3.02it/s]

{'loss': 0.3763, 'learning_rate': 2.0889261744966443e-05, 'epoch': 5.82}


 58%|█████▊    | 3480/5960 [20:34<13:41,  3.02it/s]

{'loss': 0.2671, 'learning_rate': 2.080536912751678e-05, 'epoch': 5.84}


 59%|█████▊    | 3490/5960 [20:37<13:38,  3.02it/s]

{'loss': 0.4516, 'learning_rate': 2.0721476510067116e-05, 'epoch': 5.86}


 59%|█████▊    | 3500/5960 [20:40<13:34,  3.02it/s]

{'loss': 0.3331, 'learning_rate': 2.0637583892617452e-05, 'epoch': 5.87}


 59%|█████▉    | 3510/5960 [20:44<13:31,  3.02it/s]

{'loss': 0.2854, 'learning_rate': 2.0553691275167785e-05, 'epoch': 5.89}


 59%|█████▉    | 3520/5960 [20:47<13:27,  3.02it/s]

{'loss': 0.3836, 'learning_rate': 2.0469798657718122e-05, 'epoch': 5.91}


 59%|█████▉    | 3530/5960 [20:50<13:23,  3.02it/s]

{'loss': 0.4045, 'learning_rate': 2.038590604026846e-05, 'epoch': 5.92}


 59%|█████▉    | 3540/5960 [20:54<13:19,  3.03it/s]

{'loss': 0.3067, 'learning_rate': 2.0302013422818795e-05, 'epoch': 5.94}


 60%|█████▉    | 3550/5960 [20:57<13:17,  3.02it/s]

{'loss': 0.2765, 'learning_rate': 2.0218120805369128e-05, 'epoch': 5.96}


 60%|█████▉    | 3560/5960 [21:00<13:13,  3.02it/s]

{'loss': 0.4874, 'learning_rate': 2.013422818791946e-05, 'epoch': 5.97}


 60%|█████▉    | 3570/5960 [21:03<13:09,  3.03it/s]

{'loss': 0.4324, 'learning_rate': 2.00503355704698e-05, 'epoch': 5.99}


                                                   
 60%|██████    | 3576/5960 [21:19<13:09,  3.02it/s]

{'eval_loss': 0.5191736817359924, 'eval_runtime': 14.0025, 'eval_samples_per_second': 85.128, 'eval_steps_per_second': 10.641, 'epoch': 6.0}


 60%|██████    | 3580/5960 [21:25<1:27:26,  2.20s/it]

{'loss': 0.397, 'learning_rate': 1.9966442953020134e-05, 'epoch': 6.01}


 60%|██████    | 3590/5960 [21:28<15:03,  2.62it/s]  

{'loss': 0.3555, 'learning_rate': 1.988255033557047e-05, 'epoch': 6.02}


 60%|██████    | 3600/5960 [21:32<12:56,  3.04it/s]

{'loss': 0.4539, 'learning_rate': 1.9798657718120804e-05, 'epoch': 6.04}


 61%|██████    | 3610/5960 [21:35<12:50,  3.05it/s]

{'loss': 0.3266, 'learning_rate': 1.9714765100671144e-05, 'epoch': 6.06}


 61%|██████    | 3620/5960 [21:38<12:47,  3.05it/s]

{'loss': 0.38, 'learning_rate': 1.9630872483221477e-05, 'epoch': 6.07}


 61%|██████    | 3630/5960 [21:41<12:43,  3.05it/s]

{'loss': 0.2315, 'learning_rate': 1.9546979865771813e-05, 'epoch': 6.09}


 61%|██████    | 3640/5960 [21:45<12:41,  3.05it/s]

{'loss': 0.3211, 'learning_rate': 1.946308724832215e-05, 'epoch': 6.11}


 61%|██████    | 3650/5960 [21:48<12:37,  3.05it/s]

{'loss': 0.2991, 'learning_rate': 1.9379194630872486e-05, 'epoch': 6.12}


 61%|██████▏   | 3660/5960 [21:51<12:36,  3.04it/s]

{'loss': 0.1558, 'learning_rate': 1.929530201342282e-05, 'epoch': 6.14}


 62%|██████▏   | 3670/5960 [21:55<12:32,  3.04it/s]

{'loss': 0.5391, 'learning_rate': 1.9211409395973153e-05, 'epoch': 6.16}


 62%|██████▏   | 3680/5960 [21:58<12:27,  3.05it/s]

{'loss': 0.2431, 'learning_rate': 1.9127516778523493e-05, 'epoch': 6.17}


 62%|██████▏   | 3690/5960 [22:01<12:25,  3.04it/s]

{'loss': 0.2629, 'learning_rate': 1.9043624161073826e-05, 'epoch': 6.19}


 62%|██████▏   | 3700/5960 [22:04<12:24,  3.03it/s]

{'loss': 0.3803, 'learning_rate': 1.8959731543624162e-05, 'epoch': 6.21}


 62%|██████▏   | 3710/5960 [22:08<12:27,  3.01it/s]

{'loss': 0.301, 'learning_rate': 1.8875838926174495e-05, 'epoch': 6.22}


 62%|██████▏   | 3720/5960 [22:11<12:22,  3.02it/s]

{'loss': 0.3876, 'learning_rate': 1.8791946308724835e-05, 'epoch': 6.24}


 63%|██████▎   | 3730/5960 [22:14<12:20,  3.01it/s]

{'loss': 0.4822, 'learning_rate': 1.870805369127517e-05, 'epoch': 6.26}


 63%|██████▎   | 3740/5960 [22:18<12:13,  3.03it/s]

{'loss': 0.2788, 'learning_rate': 1.8624161073825505e-05, 'epoch': 6.28}


 63%|██████▎   | 3750/5960 [22:21<12:11,  3.02it/s]

{'loss': 0.4421, 'learning_rate': 1.8540268456375838e-05, 'epoch': 6.29}


 63%|██████▎   | 3760/5960 [22:24<12:08,  3.02it/s]

{'loss': 0.3053, 'learning_rate': 1.8456375838926178e-05, 'epoch': 6.31}


 63%|██████▎   | 3770/5960 [22:28<12:04,  3.02it/s]

{'loss': 0.3705, 'learning_rate': 1.837248322147651e-05, 'epoch': 6.33}


 63%|██████▎   | 3780/5960 [22:31<12:01,  3.02it/s]

{'loss': 0.2223, 'learning_rate': 1.8288590604026847e-05, 'epoch': 6.34}


 64%|██████▎   | 3790/5960 [22:34<11:58,  3.02it/s]

{'loss': 0.3828, 'learning_rate': 1.820469798657718e-05, 'epoch': 6.36}


 64%|██████▍   | 3800/5960 [22:38<11:56,  3.02it/s]

{'loss': 0.4703, 'learning_rate': 1.8120805369127517e-05, 'epoch': 6.38}


 64%|██████▍   | 3810/5960 [22:41<11:52,  3.02it/s]

{'loss': 0.4021, 'learning_rate': 1.8036912751677854e-05, 'epoch': 6.39}


 64%|██████▍   | 3820/5960 [22:44<11:47,  3.03it/s]

{'loss': 0.3358, 'learning_rate': 1.7953020134228187e-05, 'epoch': 6.41}


 64%|██████▍   | 3830/5960 [22:48<11:44,  3.02it/s]

{'loss': 0.222, 'learning_rate': 1.7869127516778523e-05, 'epoch': 6.43}


 64%|██████▍   | 3840/5960 [22:51<11:42,  3.02it/s]

{'loss': 0.3087, 'learning_rate': 1.778523489932886e-05, 'epoch': 6.44}


 65%|██████▍   | 3850/5960 [22:54<11:38,  3.02it/s]

{'loss': 0.3507, 'learning_rate': 1.7701342281879196e-05, 'epoch': 6.46}


 65%|██████▍   | 3860/5960 [22:57<11:35,  3.02it/s]

{'loss': 0.348, 'learning_rate': 1.761744966442953e-05, 'epoch': 6.48}


 65%|██████▍   | 3870/5960 [23:01<11:32,  3.02it/s]

{'loss': 0.2791, 'learning_rate': 1.753355704697987e-05, 'epoch': 6.49}


 65%|██████▌   | 3880/5960 [23:04<11:28,  3.02it/s]

{'loss': 0.2871, 'learning_rate': 1.7449664429530202e-05, 'epoch': 6.51}


 65%|██████▌   | 3890/5960 [23:07<11:25,  3.02it/s]

{'loss': 0.2885, 'learning_rate': 1.736577181208054e-05, 'epoch': 6.53}


 65%|██████▌   | 3900/5960 [23:11<11:22,  3.02it/s]

{'loss': 0.3355, 'learning_rate': 1.7281879194630872e-05, 'epoch': 6.54}


 66%|██████▌   | 3910/5960 [23:14<11:19,  3.02it/s]

{'loss': 0.3409, 'learning_rate': 1.719798657718121e-05, 'epoch': 6.56}


 66%|██████▌   | 3920/5960 [23:17<11:15,  3.02it/s]

{'loss': 0.445, 'learning_rate': 1.7114093959731545e-05, 'epoch': 6.58}


 66%|██████▌   | 3930/5960 [23:21<11:12,  3.02it/s]

{'loss': 0.1642, 'learning_rate': 1.7030201342281878e-05, 'epoch': 6.59}


 66%|██████▌   | 3940/5960 [23:24<11:08,  3.02it/s]

{'loss': 0.4061, 'learning_rate': 1.6946308724832215e-05, 'epoch': 6.61}


 66%|██████▋   | 3950/5960 [23:27<11:05,  3.02it/s]

{'loss': 0.5563, 'learning_rate': 1.686241610738255e-05, 'epoch': 6.63}


 66%|██████▋   | 3960/5960 [23:31<11:02,  3.02it/s]

{'loss': 0.3686, 'learning_rate': 1.6778523489932888e-05, 'epoch': 6.64}


 67%|██████▋   | 3970/5960 [23:34<10:58,  3.02it/s]

{'loss': 0.2394, 'learning_rate': 1.669463087248322e-05, 'epoch': 6.66}


 67%|██████▋   | 3980/5960 [23:37<10:56,  3.02it/s]

{'loss': 0.3127, 'learning_rate': 1.6610738255033557e-05, 'epoch': 6.68}


 67%|██████▋   | 3990/5960 [23:41<10:52,  3.02it/s]

{'loss': 0.3138, 'learning_rate': 1.6526845637583894e-05, 'epoch': 6.69}


 67%|██████▋   | 4000/5960 [23:44<10:49,  3.02it/s]

{'loss': 0.3632, 'learning_rate': 1.644295302013423e-05, 'epoch': 6.71}


 67%|██████▋   | 4010/5960 [23:47<10:45,  3.02it/s]

{'loss': 0.258, 'learning_rate': 1.6359060402684563e-05, 'epoch': 6.73}


 67%|██████▋   | 4020/5960 [23:51<10:42,  3.02it/s]

{'loss': 0.3012, 'learning_rate': 1.62751677852349e-05, 'epoch': 6.74}


 68%|██████▊   | 4030/5960 [23:54<10:39,  3.02it/s]

{'loss': 0.3461, 'learning_rate': 1.6191275167785237e-05, 'epoch': 6.76}


 68%|██████▊   | 4040/5960 [23:57<10:36,  3.02it/s]

{'loss': 0.2166, 'learning_rate': 1.610738255033557e-05, 'epoch': 6.78}


 68%|██████▊   | 4050/5960 [24:00<10:33,  3.02it/s]

{'loss': 0.3312, 'learning_rate': 1.6023489932885906e-05, 'epoch': 6.8}


 68%|██████▊   | 4060/5960 [24:04<10:30,  3.02it/s]

{'loss': 0.4831, 'learning_rate': 1.5939597315436243e-05, 'epoch': 6.81}


 68%|██████▊   | 4070/5960 [24:07<10:26,  3.02it/s]

{'loss': 0.2245, 'learning_rate': 1.585570469798658e-05, 'epoch': 6.83}


 68%|██████▊   | 4080/5960 [24:10<10:21,  3.02it/s]

{'loss': 0.3674, 'learning_rate': 1.5771812080536912e-05, 'epoch': 6.85}


 69%|██████▊   | 4090/5960 [24:14<10:18,  3.02it/s]

{'loss': 0.3171, 'learning_rate': 1.568791946308725e-05, 'epoch': 6.86}


 69%|██████▉   | 4100/5960 [24:17<10:16,  3.02it/s]

{'loss': 0.3967, 'learning_rate': 1.5604026845637585e-05, 'epoch': 6.88}


 69%|██████▉   | 4110/5960 [24:20<10:15,  3.01it/s]

{'loss': 0.305, 'learning_rate': 1.5520134228187922e-05, 'epoch': 6.9}


 69%|██████▉   | 4120/5960 [24:24<10:10,  3.01it/s]

{'loss': 0.4571, 'learning_rate': 1.5436241610738255e-05, 'epoch': 6.91}


 69%|██████▉   | 4130/5960 [24:27<10:07,  3.01it/s]

{'loss': 0.1836, 'learning_rate': 1.535234899328859e-05, 'epoch': 6.93}


 69%|██████▉   | 4140/5960 [24:30<10:10,  2.98it/s]

{'loss': 0.2879, 'learning_rate': 1.5268456375838928e-05, 'epoch': 6.95}


 70%|██████▉   | 4150/5960 [24:34<10:07,  2.98it/s]

{'loss': 0.3946, 'learning_rate': 1.5184563758389261e-05, 'epoch': 6.96}


 70%|██████▉   | 4160/5960 [24:37<10:02,  2.99it/s]

{'loss': 0.3796, 'learning_rate': 1.51006711409396e-05, 'epoch': 6.98}


 70%|██████▉   | 4170/5960 [24:40<09:58,  2.99it/s]

{'loss': 0.3196, 'learning_rate': 1.5016778523489932e-05, 'epoch': 7.0}


                                                   
 70%|███████   | 4172/5960 [24:55<09:56,  3.00it/s]

{'eval_loss': 0.4979461133480072, 'eval_runtime': 14.1202, 'eval_samples_per_second': 84.418, 'eval_steps_per_second': 10.552, 'epoch': 7.0}


 70%|███████   | 4180/5960 [25:02<23:14,  1.28it/s]  

{'loss': 0.1688, 'learning_rate': 1.493288590604027e-05, 'epoch': 7.01}


 70%|███████   | 4190/5960 [25:05<10:06,  2.92it/s]

{'loss': 0.1912, 'learning_rate': 1.4848993288590604e-05, 'epoch': 7.03}


 70%|███████   | 4200/5960 [25:09<09:42,  3.02it/s]

{'loss': 0.3045, 'learning_rate': 1.4765100671140942e-05, 'epoch': 7.05}


 71%|███████   | 4210/5960 [25:12<09:38,  3.02it/s]

{'loss': 0.1834, 'learning_rate': 1.4681208053691275e-05, 'epoch': 7.06}


 71%|███████   | 4220/5960 [25:15<09:36,  3.02it/s]

{'loss': 0.3369, 'learning_rate': 1.4597315436241613e-05, 'epoch': 7.08}


 71%|███████   | 4230/5960 [25:19<09:32,  3.02it/s]

{'loss': 0.3072, 'learning_rate': 1.4513422818791946e-05, 'epoch': 7.1}


 71%|███████   | 4240/5960 [25:22<09:28,  3.02it/s]

{'loss': 0.3562, 'learning_rate': 1.4429530201342285e-05, 'epoch': 7.11}


 71%|███████▏  | 4250/5960 [25:25<09:26,  3.02it/s]

{'loss': 0.3013, 'learning_rate': 1.4345637583892618e-05, 'epoch': 7.13}


 71%|███████▏  | 4260/5960 [25:29<09:22,  3.02it/s]

{'loss': 0.3414, 'learning_rate': 1.4261744966442953e-05, 'epoch': 7.15}


 72%|███████▏  | 4270/5960 [25:32<09:18,  3.02it/s]

{'loss': 0.2807, 'learning_rate': 1.4177852348993289e-05, 'epoch': 7.16}


 72%|███████▏  | 4280/5960 [25:35<09:16,  3.02it/s]

{'loss': 0.286, 'learning_rate': 1.4093959731543624e-05, 'epoch': 7.18}


 72%|███████▏  | 4290/5960 [25:38<09:13,  3.02it/s]

{'loss': 0.17, 'learning_rate': 1.401006711409396e-05, 'epoch': 7.2}


 72%|███████▏  | 4300/5960 [25:42<09:09,  3.02it/s]

{'loss': 0.2512, 'learning_rate': 1.3926174496644295e-05, 'epoch': 7.21}


 72%|███████▏  | 4310/5960 [25:45<09:06,  3.02it/s]

{'loss': 0.3873, 'learning_rate': 1.3842281879194632e-05, 'epoch': 7.23}


 72%|███████▏  | 4320/5960 [25:48<09:03,  3.02it/s]

{'loss': 0.3438, 'learning_rate': 1.3758389261744966e-05, 'epoch': 7.25}


 73%|███████▎  | 4330/5960 [25:52<08:58,  3.03it/s]

{'loss': 0.1178, 'learning_rate': 1.3674496644295303e-05, 'epoch': 7.27}


 73%|███████▎  | 4340/5960 [25:55<08:56,  3.02it/s]

{'loss': 0.1776, 'learning_rate': 1.3590604026845638e-05, 'epoch': 7.28}


 73%|███████▎  | 4350/5960 [25:58<08:52,  3.02it/s]

{'loss': 0.3391, 'learning_rate': 1.3506711409395974e-05, 'epoch': 7.3}


 73%|███████▎  | 4360/5960 [26:02<08:50,  3.02it/s]

{'loss': 0.1695, 'learning_rate': 1.3422818791946309e-05, 'epoch': 7.32}


 73%|███████▎  | 4370/5960 [26:05<08:48,  3.01it/s]

{'loss': 0.3497, 'learning_rate': 1.3338926174496644e-05, 'epoch': 7.33}


 73%|███████▎  | 4380/5960 [26:08<08:43,  3.02it/s]

{'loss': 0.3744, 'learning_rate': 1.325503355704698e-05, 'epoch': 7.35}


 74%|███████▎  | 4390/5960 [26:12<08:40,  3.02it/s]

{'loss': 0.1962, 'learning_rate': 1.3171140939597315e-05, 'epoch': 7.37}


 74%|███████▍  | 4400/5960 [26:15<08:37,  3.01it/s]

{'loss': 0.2375, 'learning_rate': 1.3087248322147652e-05, 'epoch': 7.38}


 74%|███████▍  | 4410/5960 [26:18<08:33,  3.02it/s]

{'loss': 0.1219, 'learning_rate': 1.3003355704697987e-05, 'epoch': 7.4}


 74%|███████▍  | 4420/5960 [26:22<08:30,  3.01it/s]

{'loss': 0.2644, 'learning_rate': 1.2919463087248323e-05, 'epoch': 7.42}


 74%|███████▍  | 4430/5960 [26:25<08:26,  3.02it/s]

{'loss': 0.2778, 'learning_rate': 1.2835570469798658e-05, 'epoch': 7.43}


 74%|███████▍  | 4440/5960 [26:28<08:23,  3.02it/s]

{'loss': 0.3147, 'learning_rate': 1.2751677852348994e-05, 'epoch': 7.45}


 75%|███████▍  | 4450/5960 [26:32<08:20,  3.02it/s]

{'loss': 0.503, 'learning_rate': 1.266778523489933e-05, 'epoch': 7.47}


 75%|███████▍  | 4460/5960 [26:35<08:16,  3.02it/s]

{'loss': 0.2782, 'learning_rate': 1.2583892617449666e-05, 'epoch': 7.48}


 75%|███████▌  | 4470/5960 [26:38<08:13,  3.02it/s]

{'loss': 0.0872, 'learning_rate': 1.25e-05, 'epoch': 7.5}


 75%|███████▌  | 4480/5960 [26:41<08:12,  3.01it/s]

{'loss': 0.273, 'learning_rate': 1.2416107382550337e-05, 'epoch': 7.52}


 75%|███████▌  | 4490/5960 [26:45<08:06,  3.02it/s]

{'loss': 0.2056, 'learning_rate': 1.2332214765100672e-05, 'epoch': 7.53}


 76%|███████▌  | 4500/5960 [26:48<08:03,  3.02it/s]

{'loss': 0.3044, 'learning_rate': 1.2248322147651008e-05, 'epoch': 7.55}


 76%|███████▌  | 4510/5960 [26:51<08:00,  3.02it/s]

{'loss': 0.1991, 'learning_rate': 1.2164429530201343e-05, 'epoch': 7.57}


 76%|███████▌  | 4520/5960 [26:55<07:56,  3.02it/s]

{'loss': 0.1629, 'learning_rate': 1.208053691275168e-05, 'epoch': 7.58}


 76%|███████▌  | 4530/5960 [26:58<07:54,  3.02it/s]

{'loss': 0.4406, 'learning_rate': 1.1996644295302013e-05, 'epoch': 7.6}


 76%|███████▌  | 4540/5960 [27:01<07:51,  3.01it/s]

{'loss': 0.37, 'learning_rate': 1.191275167785235e-05, 'epoch': 7.62}


 76%|███████▋  | 4550/5960 [27:05<07:46,  3.02it/s]

{'loss': 0.2162, 'learning_rate': 1.1828859060402684e-05, 'epoch': 7.63}


 77%|███████▋  | 4560/5960 [27:08<07:43,  3.02it/s]

{'loss': 0.3812, 'learning_rate': 1.174496644295302e-05, 'epoch': 7.65}


 77%|███████▋  | 4570/5960 [27:11<07:41,  3.01it/s]

{'loss': 0.2708, 'learning_rate': 1.1661073825503356e-05, 'epoch': 7.67}


 77%|███████▋  | 4580/5960 [27:15<07:36,  3.02it/s]

{'loss': 0.4097, 'learning_rate': 1.1577181208053692e-05, 'epoch': 7.68}


 77%|███████▋  | 4590/5960 [27:18<07:33,  3.02it/s]

{'loss': 0.1903, 'learning_rate': 1.1493288590604027e-05, 'epoch': 7.7}


 77%|███████▋  | 4600/5960 [27:21<07:30,  3.02it/s]

{'loss': 0.244, 'learning_rate': 1.1409395973154363e-05, 'epoch': 7.72}


 77%|███████▋  | 4610/5960 [27:25<07:26,  3.02it/s]

{'loss': 0.4632, 'learning_rate': 1.1325503355704698e-05, 'epoch': 7.73}


 78%|███████▊  | 4620/5960 [27:28<07:23,  3.02it/s]

{'loss': 0.3799, 'learning_rate': 1.1241610738255035e-05, 'epoch': 7.75}


 78%|███████▊  | 4630/5960 [27:31<07:21,  3.01it/s]

{'loss': 0.2711, 'learning_rate': 1.115771812080537e-05, 'epoch': 7.77}


 78%|███████▊  | 4640/5960 [27:35<07:16,  3.02it/s]

{'loss': 0.2275, 'learning_rate': 1.1073825503355706e-05, 'epoch': 7.79}


 78%|███████▊  | 4650/5960 [27:38<07:13,  3.02it/s]

{'loss': 0.2114, 'learning_rate': 1.098993288590604e-05, 'epoch': 7.8}


 78%|███████▊  | 4660/5960 [27:41<07:10,  3.02it/s]

{'loss': 0.2699, 'learning_rate': 1.0906040268456376e-05, 'epoch': 7.82}


 78%|███████▊  | 4670/5960 [27:45<07:11,  2.99it/s]

{'loss': 0.22, 'learning_rate': 1.0822147651006712e-05, 'epoch': 7.84}


 79%|███████▊  | 4680/5960 [27:48<07:06,  3.00it/s]

{'loss': 0.2164, 'learning_rate': 1.0738255033557047e-05, 'epoch': 7.85}


 79%|███████▊  | 4690/5960 [27:51<07:01,  3.02it/s]

{'loss': 0.246, 'learning_rate': 1.0654362416107383e-05, 'epoch': 7.87}


 79%|███████▉  | 4700/5960 [27:54<06:58,  3.01it/s]

{'loss': 0.2504, 'learning_rate': 1.0570469798657718e-05, 'epoch': 7.89}


 79%|███████▉  | 4710/5960 [27:58<06:54,  3.01it/s]

{'loss': 0.2902, 'learning_rate': 1.0486577181208055e-05, 'epoch': 7.9}


 79%|███████▉  | 4720/5960 [28:01<06:50,  3.02it/s]

{'loss': 0.2441, 'learning_rate': 1.040268456375839e-05, 'epoch': 7.92}


 79%|███████▉  | 4730/5960 [28:04<06:47,  3.02it/s]

{'loss': 0.1416, 'learning_rate': 1.0318791946308726e-05, 'epoch': 7.94}


 80%|███████▉  | 4740/5960 [28:08<06:44,  3.02it/s]

{'loss': 0.2348, 'learning_rate': 1.0234899328859061e-05, 'epoch': 7.95}


 80%|███████▉  | 4750/5960 [28:11<06:40,  3.02it/s]

{'loss': 0.1315, 'learning_rate': 1.0151006711409397e-05, 'epoch': 7.97}


 80%|███████▉  | 4760/5960 [28:14<06:36,  3.03it/s]

{'loss': 0.5138, 'learning_rate': 1.006711409395973e-05, 'epoch': 7.99}


                                                   
 80%|████████  | 4768/5960 [28:31<06:33,  3.03it/s]

{'eval_loss': 0.612006425857544, 'eval_runtime': 14.0205, 'eval_samples_per_second': 85.018, 'eval_steps_per_second': 10.627, 'epoch': 8.0}


 80%|████████  | 4770/5960 [28:36<1:23:37,  4.22s/it]

{'loss': 0.2218, 'learning_rate': 9.983221476510067e-06, 'epoch': 8.0}


 80%|████████  | 4780/5960 [28:39<08:39,  2.27it/s]  

{'loss': 0.1522, 'learning_rate': 9.899328859060402e-06, 'epoch': 8.02}


 80%|████████  | 4790/5960 [28:43<06:29,  3.00it/s]

{'loss': 0.2122, 'learning_rate': 9.815436241610738e-06, 'epoch': 8.04}


 81%|████████  | 4800/5960 [28:46<06:26,  3.01it/s]

{'loss': 0.192, 'learning_rate': 9.731543624161075e-06, 'epoch': 8.05}


 81%|████████  | 4810/5960 [28:49<06:23,  3.00it/s]

{'loss': 0.1022, 'learning_rate': 9.64765100671141e-06, 'epoch': 8.07}


 81%|████████  | 4820/5960 [28:53<06:19,  3.01it/s]

{'loss': 0.149, 'learning_rate': 9.563758389261746e-06, 'epoch': 8.09}


 81%|████████  | 4830/5960 [28:56<06:13,  3.02it/s]

{'loss': 0.2788, 'learning_rate': 9.479865771812081e-06, 'epoch': 8.1}


 81%|████████  | 4840/5960 [28:59<06:08,  3.04it/s]

{'loss': 0.1227, 'learning_rate': 9.395973154362418e-06, 'epoch': 8.12}


 81%|████████▏ | 4850/5960 [29:03<06:09,  3.01it/s]

{'loss': 0.5285, 'learning_rate': 9.312080536912752e-06, 'epoch': 8.14}


 82%|████████▏ | 4860/5960 [29:06<06:06,  3.00it/s]

{'loss': 0.152, 'learning_rate': 9.228187919463089e-06, 'epoch': 8.15}


 82%|████████▏ | 4870/5960 [29:09<06:03,  3.00it/s]

{'loss': 0.2014, 'learning_rate': 9.144295302013424e-06, 'epoch': 8.17}


 82%|████████▏ | 4880/5960 [29:13<06:00,  3.00it/s]

{'loss': 0.2192, 'learning_rate': 9.060402684563759e-06, 'epoch': 8.19}


 82%|████████▏ | 4890/5960 [29:16<05:57,  2.99it/s]

{'loss': 0.1508, 'learning_rate': 8.976510067114093e-06, 'epoch': 8.2}


 82%|████████▏ | 4900/5960 [29:19<05:53,  3.00it/s]

{'loss': 0.2746, 'learning_rate': 8.89261744966443e-06, 'epoch': 8.22}


 82%|████████▏ | 4910/5960 [29:23<05:49,  3.00it/s]

{'loss': 0.253, 'learning_rate': 8.808724832214765e-06, 'epoch': 8.24}


 83%|████████▎ | 4920/5960 [29:26<05:49,  2.97it/s]

{'loss': 0.2606, 'learning_rate': 8.724832214765101e-06, 'epoch': 8.26}


 83%|████████▎ | 4930/5960 [29:30<05:46,  2.98it/s]

{'loss': 0.1847, 'learning_rate': 8.640939597315436e-06, 'epoch': 8.27}


 83%|████████▎ | 4940/5960 [29:33<05:42,  2.98it/s]

{'loss': 0.461, 'learning_rate': 8.557046979865773e-06, 'epoch': 8.29}


 83%|████████▎ | 4950/5960 [29:36<05:39,  2.98it/s]

{'loss': 0.1919, 'learning_rate': 8.473154362416107e-06, 'epoch': 8.31}


 83%|████████▎ | 4960/5960 [29:40<05:34,  2.99it/s]

{'loss': 0.1449, 'learning_rate': 8.389261744966444e-06, 'epoch': 8.32}


 83%|████████▎ | 4970/5960 [29:43<05:31,  2.98it/s]

{'loss': 0.1508, 'learning_rate': 8.305369127516779e-06, 'epoch': 8.34}


 84%|████████▎ | 4980/5960 [29:46<05:28,  2.98it/s]

{'loss': 0.3692, 'learning_rate': 8.221476510067115e-06, 'epoch': 8.36}


 84%|████████▎ | 4990/5960 [29:50<05:25,  2.98it/s]

{'loss': 0.1311, 'learning_rate': 8.13758389261745e-06, 'epoch': 8.37}


 84%|████████▍ | 5000/5960 [29:53<05:21,  2.98it/s]

{'loss': 0.1868, 'learning_rate': 8.053691275167785e-06, 'epoch': 8.39}


 84%|████████▍ | 5010/5960 [29:56<05:18,  2.98it/s]

{'loss': 0.1936, 'learning_rate': 7.969798657718121e-06, 'epoch': 8.41}


 84%|████████▍ | 5020/5960 [30:00<05:14,  2.99it/s]

{'loss': 0.1971, 'learning_rate': 7.885906040268456e-06, 'epoch': 8.42}


 84%|████████▍ | 5030/5960 [30:03<05:11,  2.98it/s]

{'loss': 0.3081, 'learning_rate': 7.802013422818793e-06, 'epoch': 8.44}


 85%|████████▍ | 5040/5960 [30:06<05:08,  2.98it/s]

{'loss': 0.1768, 'learning_rate': 7.718120805369127e-06, 'epoch': 8.46}


 85%|████████▍ | 5050/5960 [30:10<05:05,  2.98it/s]

{'loss': 0.2013, 'learning_rate': 7.634228187919464e-06, 'epoch': 8.47}


 85%|████████▍ | 5060/5960 [30:13<05:00,  2.99it/s]

{'loss': 0.1296, 'learning_rate': 7.5503355704698e-06, 'epoch': 8.49}


 85%|████████▌ | 5070/5960 [30:17<04:58,  2.99it/s]

{'loss': 0.2118, 'learning_rate': 7.466442953020135e-06, 'epoch': 8.51}


 85%|████████▌ | 5080/5960 [30:20<04:55,  2.98it/s]

{'loss': 0.0845, 'learning_rate': 7.382550335570471e-06, 'epoch': 8.52}


 85%|████████▌ | 5090/5960 [30:23<04:51,  2.98it/s]

{'loss': 0.1631, 'learning_rate': 7.298657718120807e-06, 'epoch': 8.54}


 86%|████████▌ | 5100/5960 [30:27<04:44,  3.02it/s]

{'loss': 0.2385, 'learning_rate': 7.214765100671142e-06, 'epoch': 8.56}


 86%|████████▌ | 5110/5960 [30:30<04:39,  3.04it/s]

{'loss': 0.2563, 'learning_rate': 7.130872483221476e-06, 'epoch': 8.57}


 86%|████████▌ | 5120/5960 [30:33<04:36,  3.04it/s]

{'loss': 0.2674, 'learning_rate': 7.046979865771812e-06, 'epoch': 8.59}


 86%|████████▌ | 5130/5960 [30:36<04:33,  3.03it/s]

{'loss': 0.315, 'learning_rate': 6.963087248322148e-06, 'epoch': 8.61}


 86%|████████▌ | 5140/5960 [30:40<04:30,  3.04it/s]

{'loss': 0.0787, 'learning_rate': 6.879194630872483e-06, 'epoch': 8.62}


 86%|████████▋ | 5150/5960 [30:43<04:26,  3.04it/s]

{'loss': 0.253, 'learning_rate': 6.795302013422819e-06, 'epoch': 8.64}


 87%|████████▋ | 5160/5960 [30:46<04:23,  3.03it/s]

{'loss': 0.341, 'learning_rate': 6.7114093959731546e-06, 'epoch': 8.66}


 87%|████████▋ | 5170/5960 [30:50<04:20,  3.04it/s]

{'loss': 0.2187, 'learning_rate': 6.62751677852349e-06, 'epoch': 8.67}


 87%|████████▋ | 5180/5960 [30:53<04:17,  3.03it/s]

{'loss': 0.149, 'learning_rate': 6.543624161073826e-06, 'epoch': 8.69}


 87%|████████▋ | 5190/5960 [30:56<04:14,  3.03it/s]

{'loss': 0.3206, 'learning_rate': 6.4597315436241616e-06, 'epoch': 8.71}


 87%|████████▋ | 5200/5960 [31:00<04:11,  3.03it/s]

{'loss': 0.3014, 'learning_rate': 6.375838926174497e-06, 'epoch': 8.72}


 87%|████████▋ | 5210/5960 [31:03<04:07,  3.03it/s]

{'loss': 0.2833, 'learning_rate': 6.291946308724833e-06, 'epoch': 8.74}


 88%|████████▊ | 5220/5960 [31:06<04:04,  3.02it/s]

{'loss': 0.2564, 'learning_rate': 6.2080536912751686e-06, 'epoch': 8.76}


 88%|████████▊ | 5230/5960 [31:09<04:01,  3.02it/s]

{'loss': 0.1265, 'learning_rate': 6.124161073825504e-06, 'epoch': 8.78}


 88%|████████▊ | 5240/5960 [31:13<03:57,  3.03it/s]

{'loss': 0.204, 'learning_rate': 6.04026845637584e-06, 'epoch': 8.79}


 88%|████████▊ | 5250/5960 [31:16<03:54,  3.03it/s]

{'loss': 0.2073, 'learning_rate': 5.956375838926175e-06, 'epoch': 8.81}


 88%|████████▊ | 5260/5960 [31:19<03:51,  3.02it/s]

{'loss': 0.3219, 'learning_rate': 5.87248322147651e-06, 'epoch': 8.83}


 88%|████████▊ | 5270/5960 [31:23<03:48,  3.02it/s]

{'loss': 0.2673, 'learning_rate': 5.788590604026846e-06, 'epoch': 8.84}


 89%|████████▊ | 5280/5960 [31:26<03:45,  3.02it/s]

{'loss': 0.2351, 'learning_rate': 5.704697986577182e-06, 'epoch': 8.86}


 89%|████████▉ | 5290/5960 [31:29<03:41,  3.02it/s]

{'loss': 0.1802, 'learning_rate': 5.620805369127517e-06, 'epoch': 8.88}


 89%|████████▉ | 5300/5960 [31:33<03:38,  3.02it/s]

{'loss': 0.5277, 'learning_rate': 5.536912751677853e-06, 'epoch': 8.89}


 89%|████████▉ | 5310/5960 [31:36<03:35,  3.02it/s]

{'loss': 0.1913, 'learning_rate': 5.453020134228188e-06, 'epoch': 8.91}


 89%|████████▉ | 5320/5960 [31:39<03:32,  3.02it/s]

{'loss': 0.0931, 'learning_rate': 5.3691275167785235e-06, 'epoch': 8.93}


 89%|████████▉ | 5330/5960 [31:43<03:28,  3.02it/s]

{'loss': 0.216, 'learning_rate': 5.285234899328859e-06, 'epoch': 8.94}


 90%|████████▉ | 5340/5960 [31:46<03:25,  3.02it/s]

{'loss': 0.2033, 'learning_rate': 5.201342281879195e-06, 'epoch': 8.96}


 90%|████████▉ | 5350/5960 [31:49<03:24,  2.98it/s]

{'loss': 0.2488, 'learning_rate': 5.1174496644295305e-06, 'epoch': 8.98}


 90%|████████▉ | 5360/5960 [31:53<03:20,  2.99it/s]

{'loss': 0.2357, 'learning_rate': 5.033557046979865e-06, 'epoch': 8.99}


                                                   
 90%|█████████ | 5364/5960 [32:08<03:18,  3.00it/s]

{'eval_loss': 0.6066649556159973, 'eval_runtime': 14.0822, 'eval_samples_per_second': 84.646, 'eval_steps_per_second': 10.581, 'epoch': 9.0}


 90%|█████████ | 5370/5960 [32:14<12:27,  1.27s/it]

{'loss': 0.1796, 'learning_rate': 4.949664429530201e-06, 'epoch': 9.01}


 90%|█████████ | 5380/5960 [32:18<03:29,  2.77it/s]

{'loss': 0.0745, 'learning_rate': 4.8657718120805375e-06, 'epoch': 9.03}


 90%|█████████ | 5390/5960 [32:21<03:11,  2.98it/s]

{'loss': 0.1144, 'learning_rate': 4.781879194630873e-06, 'epoch': 9.04}


 91%|█████████ | 5400/5960 [32:25<03:07,  2.98it/s]

{'loss': 0.152, 'learning_rate': 4.697986577181209e-06, 'epoch': 9.06}


 91%|█████████ | 5410/5960 [32:28<03:04,  2.99it/s]

{'loss': 0.1171, 'learning_rate': 4.6140939597315445e-06, 'epoch': 9.08}


 91%|█████████ | 5420/5960 [32:31<03:00,  2.99it/s]

{'loss': 0.1238, 'learning_rate': 4.530201342281879e-06, 'epoch': 9.09}


 91%|█████████ | 5430/5960 [32:35<02:57,  2.99it/s]

{'loss': 0.0931, 'learning_rate': 4.446308724832215e-06, 'epoch': 9.11}


 91%|█████████▏| 5440/5960 [32:38<02:53,  2.99it/s]

{'loss': 0.1984, 'learning_rate': 4.362416107382551e-06, 'epoch': 9.13}


 91%|█████████▏| 5450/5960 [32:41<02:50,  2.99it/s]

{'loss': 0.0766, 'learning_rate': 4.278523489932886e-06, 'epoch': 9.14}


 92%|█████████▏| 5460/5960 [32:45<02:47,  2.99it/s]

{'loss': 0.1879, 'learning_rate': 4.194630872483222e-06, 'epoch': 9.16}


 92%|█████████▏| 5470/5960 [32:48<02:43,  2.99it/s]

{'loss': 0.372, 'learning_rate': 4.110738255033558e-06, 'epoch': 9.18}


 92%|█████████▏| 5480/5960 [32:51<02:40,  2.98it/s]

{'loss': 0.1683, 'learning_rate': 4.026845637583892e-06, 'epoch': 9.19}


 92%|█████████▏| 5490/5960 [32:55<02:37,  2.99it/s]

{'loss': 0.2257, 'learning_rate': 3.942953020134228e-06, 'epoch': 9.21}


 92%|█████████▏| 5500/5960 [32:58<02:33,  2.99it/s]

{'loss': 0.2552, 'learning_rate': 3.859060402684564e-06, 'epoch': 9.23}


 92%|█████████▏| 5510/5960 [33:01<02:31,  2.98it/s]

{'loss': 0.1818, 'learning_rate': 3.7751677852349e-06, 'epoch': 9.24}


 93%|█████████▎| 5520/5960 [33:05<02:27,  2.99it/s]

{'loss': 0.1292, 'learning_rate': 3.6912751677852355e-06, 'epoch': 9.26}


 93%|█████████▎| 5530/5960 [33:08<02:23,  2.99it/s]

{'loss': 0.065, 'learning_rate': 3.607382550335571e-06, 'epoch': 9.28}


 93%|█████████▎| 5540/5960 [33:11<02:20,  2.99it/s]

{'loss': 0.2156, 'learning_rate': 3.523489932885906e-06, 'epoch': 9.3}


 93%|█████████▎| 5550/5960 [33:15<02:17,  2.99it/s]

{'loss': 0.2939, 'learning_rate': 3.4395973154362416e-06, 'epoch': 9.31}


 93%|█████████▎| 5560/5960 [33:18<02:14,  2.98it/s]

{'loss': 0.1014, 'learning_rate': 3.3557046979865773e-06, 'epoch': 9.33}


 93%|█████████▎| 5570/5960 [33:21<02:10,  2.99it/s]

{'loss': 0.1242, 'learning_rate': 3.271812080536913e-06, 'epoch': 9.35}


 94%|█████████▎| 5580/5960 [33:25<02:06,  2.99it/s]

{'loss': 0.1932, 'learning_rate': 3.1879194630872486e-06, 'epoch': 9.36}


 94%|█████████▍| 5590/5960 [33:28<02:02,  3.01it/s]

{'loss': 0.1248, 'learning_rate': 3.1040268456375843e-06, 'epoch': 9.38}


 94%|█████████▍| 5600/5960 [33:31<01:59,  3.02it/s]

{'loss': 0.2797, 'learning_rate': 3.02013422818792e-06, 'epoch': 9.4}


 94%|█████████▍| 5610/5960 [33:35<01:55,  3.02it/s]

{'loss': 0.2365, 'learning_rate': 2.936241610738255e-06, 'epoch': 9.41}


 94%|█████████▍| 5620/5960 [33:38<01:52,  3.02it/s]

{'loss': 0.1907, 'learning_rate': 2.852348993288591e-06, 'epoch': 9.43}


 94%|█████████▍| 5630/5960 [33:41<01:49,  3.01it/s]

{'loss': 0.1991, 'learning_rate': 2.7684563758389265e-06, 'epoch': 9.45}


 95%|█████████▍| 5640/5960 [33:45<01:46,  3.01it/s]

{'loss': 0.4448, 'learning_rate': 2.6845637583892617e-06, 'epoch': 9.46}


 95%|█████████▍| 5650/5960 [33:48<01:43,  3.01it/s]

{'loss': 0.1939, 'learning_rate': 2.6006711409395974e-06, 'epoch': 9.48}


 95%|█████████▍| 5660/5960 [33:51<01:39,  3.01it/s]

{'loss': 0.0447, 'learning_rate': 2.5167785234899326e-06, 'epoch': 9.5}


 95%|█████████▌| 5670/5960 [33:55<01:36,  3.01it/s]

{'loss': 0.062, 'learning_rate': 2.4328859060402687e-06, 'epoch': 9.51}


 95%|█████████▌| 5680/5960 [33:58<01:33,  3.01it/s]

{'loss': 0.2365, 'learning_rate': 2.3489932885906044e-06, 'epoch': 9.53}


 95%|█████████▌| 5690/5960 [34:01<01:29,  3.01it/s]

{'loss': 0.1173, 'learning_rate': 2.2651006711409396e-06, 'epoch': 9.55}


 96%|█████████▌| 5700/5960 [34:05<01:26,  3.01it/s]

{'loss': 0.0327, 'learning_rate': 2.1812080536912753e-06, 'epoch': 9.56}


 96%|█████████▌| 5710/5960 [34:08<01:23,  3.01it/s]

{'loss': 0.0419, 'learning_rate': 2.097315436241611e-06, 'epoch': 9.58}


 96%|█████████▌| 5720/5960 [34:11<01:19,  3.01it/s]

{'loss': 0.0945, 'learning_rate': 2.013422818791946e-06, 'epoch': 9.6}


 96%|█████████▌| 5730/5960 [34:15<01:16,  3.01it/s]

{'loss': 0.2057, 'learning_rate': 1.929530201342282e-06, 'epoch': 9.61}


 96%|█████████▋| 5740/5960 [34:18<01:12,  3.02it/s]

{'loss': 0.1368, 'learning_rate': 1.8456375838926177e-06, 'epoch': 9.63}


 96%|█████████▋| 5750/5960 [34:21<01:09,  3.01it/s]

{'loss': 0.3272, 'learning_rate': 1.761744966442953e-06, 'epoch': 9.65}


 97%|█████████▋| 5760/5960 [34:25<01:06,  3.01it/s]

{'loss': 0.1031, 'learning_rate': 1.6778523489932886e-06, 'epoch': 9.66}


 97%|█████████▋| 5770/5960 [34:28<01:02,  3.02it/s]

{'loss': 0.1718, 'learning_rate': 1.5939597315436243e-06, 'epoch': 9.68}


 97%|█████████▋| 5780/5960 [34:31<00:59,  3.01it/s]

{'loss': 0.0219, 'learning_rate': 1.51006711409396e-06, 'epoch': 9.7}


 97%|█████████▋| 5790/5960 [34:35<00:56,  3.03it/s]

{'loss': 0.1587, 'learning_rate': 1.4261744966442954e-06, 'epoch': 9.71}


 97%|█████████▋| 5800/5960 [34:38<00:52,  3.04it/s]

{'loss': 0.377, 'learning_rate': 1.3422818791946309e-06, 'epoch': 9.73}


 97%|█████████▋| 5810/5960 [34:41<00:49,  3.04it/s]

{'loss': 0.2213, 'learning_rate': 1.2583892617449663e-06, 'epoch': 9.75}


 98%|█████████▊| 5820/5960 [34:45<00:46,  3.04it/s]

{'loss': 0.2736, 'learning_rate': 1.1744966442953022e-06, 'epoch': 9.77}


 98%|█████████▊| 5830/5960 [34:48<00:42,  3.03it/s]

{'loss': 0.3192, 'learning_rate': 1.0906040268456377e-06, 'epoch': 9.78}


 98%|█████████▊| 5840/5960 [34:51<00:39,  3.03it/s]

{'loss': 0.0537, 'learning_rate': 1.006711409395973e-06, 'epoch': 9.8}


 98%|█████████▊| 5850/5960 [34:54<00:36,  3.02it/s]

{'loss': 0.3204, 'learning_rate': 9.228187919463089e-07, 'epoch': 9.82}


 98%|█████████▊| 5860/5960 [34:58<00:33,  3.03it/s]

{'loss': 0.1941, 'learning_rate': 8.389261744966443e-07, 'epoch': 9.83}


 98%|█████████▊| 5870/5960 [35:01<00:29,  3.02it/s]

{'loss': 0.2051, 'learning_rate': 7.5503355704698e-07, 'epoch': 9.85}


 99%|█████████▊| 5880/5960 [35:04<00:26,  3.02it/s]

{'loss': 0.3094, 'learning_rate': 6.711409395973154e-07, 'epoch': 9.87}


 99%|█████████▉| 5890/5960 [35:08<00:23,  3.02it/s]

{'loss': 0.4387, 'learning_rate': 5.872483221476511e-07, 'epoch': 9.88}


 99%|█████████▉| 5900/5960 [35:11<00:19,  3.02it/s]

{'loss': 0.1926, 'learning_rate': 5.033557046979866e-07, 'epoch': 9.9}


 99%|█████████▉| 5910/5960 [35:14<00:16,  3.02it/s]

{'loss': 0.1, 'learning_rate': 4.1946308724832216e-07, 'epoch': 9.92}


 99%|█████████▉| 5920/5960 [35:18<00:13,  3.01it/s]

{'loss': 0.2259, 'learning_rate': 3.355704697986577e-07, 'epoch': 9.93}


 99%|█████████▉| 5930/5960 [35:21<00:09,  3.01it/s]

{'loss': 0.1194, 'learning_rate': 2.516778523489933e-07, 'epoch': 9.95}


100%|█████████▉| 5940/5960 [35:24<00:06,  3.02it/s]

{'loss': 0.2844, 'learning_rate': 1.6778523489932886e-07, 'epoch': 9.97}


100%|█████████▉| 5950/5960 [35:28<00:03,  3.02it/s]

{'loss': 0.1788, 'learning_rate': 8.389261744966443e-08, 'epoch': 9.98}


100%|██████████| 5960/5960 [35:31<00:00,  3.03it/s]

{'loss': 0.3228, 'learning_rate': 0.0, 'epoch': 10.0}


                                                   
100%|██████████| 5960/5960 [35:45<00:00,  3.03it/s]

{'eval_loss': 0.6484203934669495, 'eval_runtime': 14.0664, 'eval_samples_per_second': 84.741, 'eval_steps_per_second': 10.593, 'epoch': 10.0}


100%|██████████| 5960/5960 [35:49<00:00,  2.77it/s]

{'train_runtime': 2149.5857, 'train_samples_per_second': 22.181, 'train_steps_per_second': 2.773, 'train_loss': 0.38772331927246695, 'epoch': 10.0}





TrainOutput(global_step=5960, training_loss=0.38772331927246695, metrics={'train_runtime': 2149.5857, 'train_samples_per_second': 22.181, 'train_steps_per_second': 2.773, 'train_loss': 0.38772331927246695, 'epoch': 10.0})

In [9]:
import numpy as np
from sklearn.metrics import classification_report

predictions = trainer.predict(encoded_dataset["test"])
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

print(classification_report(y_true, y_pred, target_names=["no", "yes"]))


100%|██████████| 149/149 [00:14<00:00, 10.55it/s]

              precision    recall  f1-score   support

          no       0.85      0.86      0.86       613
         yes       0.85      0.84      0.85       579

    accuracy                           0.85      1192
   macro avg       0.85      0.85      0.85      1192
weighted avg       0.85      0.85      0.85      1192






In [10]:
trainer.save_model("phobert_duration_hour_model")

# Test with dateset non hour

In [11]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model đã fine-tune
model_path = "./phobert_duration_hour_model"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()  # chuyển sang chế độ dự đoán

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Hàm chuyển nhãn
def id_to_label(pred):
    return "yes" if pred == 1 else "no"

# Hàm dự đoán nhãn cho từng option
def predict_labels(context, question, options):
    labels = []
    for option in options:
        text = context + " " + question + " " + option
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()
            labels.append(id_to_label(pred))
    return labels

# Đọc file test
test_data = []
with open("public_test.json", "r", encoding="utf-8") as f:
    for line in f:
        sample = json.loads(line.strip())
        test_data.append(sample)

# Dự đoán và ghi kết quả
with open("non_hour.txt", "w", encoding="utf-8") as f:
    for sample in test_data:
        predicted_labels = predict_labels(sample["context"], sample["question"], sample["options"])
        sample["labels"] = predicted_labels
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

print("✅ Dự đoán hoàn tất! Kết quả đã lưu vào file: non_hour.txt")


✅ Dự đoán hoàn tất! Kết quả đã lưu vào file: non_hour.txt


In [12]:
import random

# Đọc toàn bộ submission
with open("non_hour.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Chọn ngẫu nhiên 3 dòng
samples = random.sample(lines, k=min(3, len(lines)))

# In nội dung từng dòng
for idx, line in enumerate(samples, 1):
    sample = json.loads(line)
    print(f"\n🧩 Sample {idx} — QID: {sample['qid']}")
    print(f"📘 Context: {sample['context']}")
    print(f"❓ Question: {sample['question']}")
    print("🧠 Options + Labels:")
    for opt, label in zip(sample["options"], sample["labels"]):
        print(f"   - {opt:<15} → {label}")


🧩 Sample 1 — QID: 72
📘 Context: Trong một cuộc họp báo, một phóng viên đã hỏi về các nỗ lực cứu trợ sau một thảm họa thiên nhiên. Các lãnh đạo đã trình bày về công việc của họ, nhấn mạnh tầm quan trọng của việc hỗ trợ cộng đồng bị ảnh hưởng.
❓ Question: Mất bao lâu để hoàn thành các nỗ lực cứu trợ sau thảm họa thiên nhiên?
🧠 Options + Labels:
   - 3 tháng         → no
   - 1 tháng         → no
   - 2 ngày          → yes
   - 5 tháng         → no

🧩 Sample 2 — QID: 66
📘 Context: Trong một buổi biểu diễn nghệ thuật, các nghệ sĩ đã tạo ra những tác phẩm tuyệt đẹp bằng màu sắc và ánh sáng, khiến khán giả không khỏi trầm trồ. Mọi người đều cảm nhận được sự gắn kết và sáng tạo đến từ những tài năng trẻ tuổi.
❓ Question: Mất bao lâu để hoàn thành một buổi biểu diễn nghệ thuật như vậy?
🧠 Options + Labels:
   - 3 giờ           → no
   - 6 giờ           → no
   - 2 tuần          → no
   - 5 tháng         → no

🧩 Sample 3 — QID: 15
📘 Context: Trong một thành phố nhỏ, có một nhóm nghệ sĩ trẻ đang

In [15]:
import json
import re
import numpy as np
from sklearn.metrics import classification_report

def load_labels(path):
    """
    Đọc file JSON (mỗi dòng một object hoặc nối tiếp) và trả về dict: {qid: [0/1,...]}
    với 0 = "no", 1 = "yes".
    """
    text = open(path, 'r', encoding='utf-8').read()
    raw_objs = re.findall(r'\{.*?\}', text, re.DOTALL)
    data = {}
    for obj in raw_objs:
        entry = json.loads(obj)
        # Chuyển "yes"/"no" thành 1/0
        bin_labels = [1 if lab=="yes" else 0 for lab in entry['labels']]
        data[entry['qid']] = bin_labels
    return data

# Đường dẫn tới hai file
gt_path = 'public_test_full_labeled.txt'
sub_path = 'non_hour.txt'

# Load nhãn
gt = load_labels(gt_path)
sub = load_labels(sub_path)

# Xây dựng y_true, y_pred theo thứ tự qid nhất định (vd sort theo qid)
y_true, y_pred = [], []
for qid in sorted(gt.keys()):
    true = gt[qid]
    pred = sub.get(qid, [0]*len(true))
    y_true.extend(true)
    y_pred.extend(pred)

y_true = np.array(y_true)
y_pred = np.array(y_pred)

# In classification report
print(classification_report(y_true, y_pred, target_names=["no", "yes"]))


              precision    recall  f1-score   support

          no       0.19      0.71      0.30       206
         yes       0.83      0.32      0.46       906

    accuracy                           0.39      1112
   macro avg       0.51      0.52      0.38      1112
weighted avg       0.71      0.39      0.43      1112



# Dataset Hour

In [16]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model đã fine-tune
model_path = "./phobert_duration_hour_model"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()  # chuyển sang chế độ dự đoán

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Hàm chuyển nhãn
def id_to_label(pred):
    return "yes" if pred == 1 else "no"

# Hàm dự đoán nhãn cho từng option
def predict_labels(context, question, options):
    labels = []
    for option in options:
        text = context + " " + question + " " + option
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()
            labels.append(id_to_label(pred))
    return labels

# Đọc file test
test_data = []
with open("public_test_normalize.json", "r", encoding="utf-8") as f:
    for line in f:
        sample = json.loads(line.strip())
        test_data.append(sample)

# Dự đoán và ghi kết quả
with open("hour.txt", "w", encoding="utf-8") as f:
    for sample in test_data:
        predicted_labels = predict_labels(sample["context"], sample["question"], sample["options"])
        sample["labels"] = predicted_labels
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

print("✅ Dự đoán hoàn tất! Kết quả đã lưu vào file: hour.txt")


✅ Dự đoán hoàn tất! Kết quả đã lưu vào file: hour.txt


In [17]:
import random

# Đọc toàn bộ submission
with open("hour.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Chọn ngẫu nhiên 3 dòng
samples = random.sample(lines, k=min(3, len(lines)))

# In nội dung từng dòng
for idx, line in enumerate(samples, 1):
    sample = json.loads(line)
    print(f"\n🧩 Sample {idx} — QID: {sample['qid']}")
    print(f"📘 Context: {sample['context']}")
    print(f"❓ Question: {sample['question']}")
    print("🧠 Options + Labels:")
    for opt, label in zip(sample["options"], sample["labels"]):
        print(f"   - {opt:<15} → {label}")


🧩 Sample 1 — QID: 131
📘 Context: Một nhóm phóng viên đang chuẩn bị cho một buổi phát sóng trực tiếp về một sự kiện quan trọng sắp diễn ra. Họ cần đảm bảo mọi thứ được sắp xếp hoàn hảo, từ thiết bị đến nội dung phỏng vấn.
❓ Question: Mất bao lâu để chuẩn bị cho một buổi phát sóng trực tiếp?
🧠 Options + Labels:
   - 2.0 giờ         → yes
   - 5.0 giờ         → yes
   - 504.0 giờ       → no
   - 240.0 giờ       → no

🧩 Sample 2 — QID: 163
📘 Context: Một ban nhạc đang chuẩn bị cho buổi trình diễn lớn nhất trong năm. Họ đã lên kế hoạch chi tiết cho từng tiết mục và tập luyện không ngừng nghỉ để mang đến một màn trình diễn xuất sắc.
❓ Question: Mất bao lâu để ban nhạc chuẩn bị cho buổi trình diễn lớn nhất trong năm?
🧠 Options + Labels:
   - 504.0 giờ       → yes
   - 168.0 giờ       → yes
   - 4320.0 giờ      → no
   - 336.0 giờ       → yes

🧩 Sample 3 — QID: 144
📘 Context: Trong một buổi hòa nhạc tại công viên, một nghệ sĩ nổi tiếng đã biểu diễn những bản nhạc quen thuộc khiến khán giả phấ

In [18]:
import json
import re
import numpy as np
from sklearn.metrics import classification_report

def load_labels(path):
    """
    Đọc file JSON (mỗi dòng một object hoặc nối tiếp) và trả về dict: {qid: [0/1,...]}
    với 0 = "no", 1 = "yes".
    """
    text = open(path, 'r', encoding='utf-8').read()
    raw_objs = re.findall(r'\{.*?\}', text, re.DOTALL)
    data = {}
    for obj in raw_objs:
        entry = json.loads(obj)
        # Chuyển "yes"/"no" thành 1/0
        bin_labels = [1 if lab=="yes" else 0 for lab in entry['labels']]
        data[entry['qid']] = bin_labels
    return data

# Đường dẫn tới hai file
gt_path = 'public_test_full_labeled.txt'
sub_path = 'hour.txt'

# Load nhãn
gt = load_labels(gt_path)
sub = load_labels(sub_path)

# Xây dựng y_true, y_pred theo thứ tự qid nhất định (vd sort theo qid)
y_true, y_pred = [], []
for qid in sorted(gt.keys()):
    true = gt[qid]
    pred = sub.get(qid, [0]*len(true))
    y_true.extend(true)
    y_pred.extend(pred)

y_true = np.array(y_true)
y_pred = np.array(y_pred)

# In classification report
print(classification_report(y_true, y_pred, target_names=["no", "yes"]))


              precision    recall  f1-score   support

          no       0.31      0.44      0.36       206
         yes       0.86      0.78      0.82       906

    accuracy                           0.72      1112
   macro avg       0.59      0.61      0.59      1112
weighted avg       0.76      0.72      0.73      1112

