In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset
import torch
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate
import wandb

In [9]:
# The 28 fine-grained emotion names. Order is significant: map_labels() uses the
# integer ids found in the data files as positions in this list.
emotion_list = [
    "admiration",
    "amusement",
    "anger",
    "annoyance",
    "approval",
    "caring",
    "confusion",
    "curiosity",
    "desire",
    "disappointment",
    "disapproval",
    "disgust",
    "embarrassment",
    "excitement",
    "fear",
    "gratitude",
    "grief",
    "joy",
    "love",
    "nervousness",
    "optimism",
    "pride",
    "realization",
    "relief",
    "remorse",
    "sadness",
    "surprise",
    "neutral",
]

# Coarse category -> the fine-grained emotions folded into it.
# Insertion order of the keys defines the coarse class ids below.
ekman_mapping = {
    "anger": ["anger", "annoyance", "disapproval"],
    "disgust": ["disgust"],
    "fear": ["fear", "nervousness", "confusion"],
    "joy": [
        "admiration", "amusement", "approval", "caring", "desire", "excitement",
        "gratitude", "joy", "love", "optimism", "pride", "relief",
    ],
    "sadness": ["sadness", "disappointment", "embarrassment", "grief", "remorse"],
    "surprise": ["surprise", "realization", "curiosity"],
    "neutral": ["neutral"],
}

# Coarse category name -> integer class id (0 .. number of categories - 1).
ekman_to_index = dict(zip(ekman_mapping, range(len(ekman_mapping))))

def map_labels(raw_labels):
    """Translate a comma-separated string of fine-grained emotion ids into coarse ids.

    Parameters
    ----------
    raw_labels : str or int
        One or more positions in ``emotion_list`` separated by commas,
        e.g. ``"3,27"``. A bare integer is accepted as well, because pandas
        infers an integer column when no row of a file carries more than one
        label.

    Returns
    -------
    list[int]
        The distinct ``ekman_to_index`` ids covering the given emotions, in
        ascending order.

    Raises
    ------
    ValueError
        If a token cannot be parsed as an integer.
    IndexError
        If an id falls outside ``emotion_list``. Negative ids are rejected
        too; plain list indexing would silently wrap them to the end of the
        list.
    """
    ekman_ids = set()
    for token in str(raw_labels).split(','):
        emotion_id = int(token)
        if not 0 <= emotion_id < len(emotion_list):
            raise IndexError(
                f"emotion id {emotion_id} is outside 0..{len(emotion_list) - 1}"
            )
        emotion = emotion_list[emotion_id]
        # Scan every coarse group so an emotion listed under several groups
        # contributes all of them.
        ekman_ids.update(
            ekman_to_index[group]
            for group, members in ekman_mapping.items()
            if emotion in members
        )
    # Sorted so the result does not depend on set iteration order.
    return sorted(ekman_ids)

def load_and_process(file):
    """Read one header-less, tab-separated split and convert its labels to coarse ids.

    The file is read as three columns named ``text``, ``labels`` and ``id``;
    the ``id`` column is discarded.

    Parameters
    ----------
    file : str or path-like
        Location of the TSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``labels``, where each ``labels`` cell is the
        list returned by ``map_labels`` for that row.
    """
    df = pd.read_csv(
        file,
        sep='\t',
        header=None,
        names=['text', 'labels', 'id'],
        # Keep the label field textual. Without this, a file in which every
        # row has a single label is inferred as int64 and cannot be split on commas.
        dtype={'labels': str},
    ).drop(columns='id')
    df['labels'] = df['labels'].apply(map_labels)
    return df

# Load the three splits; after this each `labels` cell is a list of coarse class ids.
train_df = load_and_process("train.tsv")
dev_df = load_and_process("dev.tsv")
test_df = load_and_process("test.tsv")

# Multi-hot encode the labels. The class list is derived from ekman_to_index rather
# than a literal count, so the encoding width always follows the mapping above.
mlb = MultiLabelBinarizer(classes=list(range(len(ekman_to_index))))
mlb.fit(train_df['labels'])
for split_df in (train_df, dev_df, test_df):
    # .tolist() stores each row as a plain Python list of 0/1 ints in the column.
    split_df['labels'] = mlb.transform(split_df['labels']).tolist()

In [9]:
# Wrap each split's DataFrame in a `Dataset`.
# NOTE(review): train_ds / dev_ds / test_ds are assigned again from
# df_to_dataset(...) in a later cell, so this cell looks superseded - confirm.
train_ds, dev_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

In [4]:
# Tokenizer for the "bert-base-uncased" checkpoint; the same checkpoint name is
# used when the model is created further down.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize(example, max_length=128):
    """Tokenize the ``text`` field of an example or batch with the module-level tokenizer.

    Parameters
    ----------
    example : mapping
        Must provide ``example["text"]``. The cells below call this through
        ``Dataset.map(..., batched=True)``.
    max_length : int, optional
        Length every sequence is truncated or padded to. Defaults to 128,
        the value this notebook has always used.

    Returns
    -------
    Whatever the tokenizer call returns for the given text.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Run the tokenizer over every split in batches; `map` returns a new dataset each
# time, so the three names are rebound to the tokenized versions.
train_ds, dev_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, dev_ds, test_ds)
)

Map: 100%|██████████| 43410/43410 [00:03<00:00, 13207.45 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 14475.42 examples/s]
Map: 100%|██████████| 5427/5427 [00:00<00:00, 14385.31 examples/s]


In [5]:
def convert_labels(example):
    """Cast every entry of ``example["labels"]`` to ``float`` and return the example.

    The mapping is modified in place and the same object is handed back, which
    is the shape ``Dataset.map`` is given in the cell below.
    """
    # presumably floats are needed by the multi-label loss - TODO confirm
    example["labels"] = list(map(float, example["labels"]))
    return example

# Cast the label vectors of every split to floats (one example at a time).
train_ds, dev_ds, test_ds = (
    split.map(convert_labels) for split in (train_ds, dev_ds, test_ds)
)

# Expose only the model inputs and the labels, as torch tensors.
for split in (train_ds, dev_ds, test_ds):
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 43410/43410 [00:02<00:00, 15132.78 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 9634.30 examples/s] 
Map: 100%|██████████| 5427/5427 [00:00<00:00, 15323.86 examples/s]


In [6]:
from datasets import Dataset
import pandas as pd

def df_to_dataset(df):
    """Build a `Dataset` from a frame's ``text`` column and float-valued ``labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``text`` column and a ``labels`` column whose cells are
        iterables of numbers.

    Returns
    -------
    Dataset
        Two columns: ``text`` (unchanged) and ``labels`` (each value cast to
        ``float``).
    """
    texts = df["text"].tolist()
    float_labels = [[float(flag) for flag in row] for row in df["labels"]]
    return Dataset.from_dict({"text": texts, "labels": float_labels})

# Rebuild all three datasets directly from the DataFrames; this replaces whatever
# train_ds / dev_ds / test_ds held before.
train_ds, dev_ds, test_ds = (
    df_to_dataset(frame) for frame in (train_df, dev_df, test_df)
)


In [7]:
# Tokenize the freshly built datasets in batches.
train_ds, dev_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, dev_ds, test_ds)
)

# Return torch tensors and hide every column except the model inputs and labels.
for split in (train_ds, dev_ds, test_ds):
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 43410/43410 [00:03<00:00, 13445.54 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 9193.47 examples/s] 
Map: 100%|██████████| 5427/5427 [00:00<00:00, 14473.67 examples/s]


In [8]:
# Sequence classifier on top of "bert-base-uncased" with one output per coarse
# category. The output width is taken from ekman_to_index so it cannot drift from
# the label encoding; problem_type selects the multi-label setup, where an example
# may carry several labels at once.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(ekman_to_index),
    problem_type="multi_label_classification"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Load the "f1" and "accuracy" metrics through the `evaluate` library.
# NOTE(review): compute_metrics below uses the scikit-learn scorers and never
# touches these two handles; they look unused - confirm no later cell needs
# them before removing.
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """Score multi-label predictions for the `Trainer` evaluation loop.

    Parameters
    ----------
    p : object with ``predictions`` and ``label_ids`` attributes
        ``predictions`` holds the raw logits (or a tuple whose first item is
        the logits); ``label_ids`` holds the multi-hot targets of the same
        shape.

    Returns
    -------
    dict
        ``f1_macro``, ``precision_macro`` and ``recall_macro`` (macro-averaged,
        with undefined cases counted as 0) plus ``accuracy`` as computed by
        ``accuracy_score`` on the full label matrices.
    """
    logits, labels = p.predictions, p.label_ids
    if isinstance(logits, tuple):
        # Assumes the logits come first when the model returns several outputs.
        logits = logits[0]
    logits = np.asarray(logits)
    # Targets arrive as floats; the scorers only need the 0/1 indicator values.
    labels = np.asarray(labels).astype(int)

    # Numerically stable sigmoid: exp() only ever sees non-positive arguments,
    # so very large |logit| values cannot overflow.
    decay = np.exp(-np.abs(logits))
    probs = np.where(logits >= 0, 1 / (1 + decay), decay / (1 + decay))

    # A label is predicted when its probability is at least 0.5.
    preds = (probs >= 0.5).astype(int)

    return {
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "precision_macro": precision_score(labels, preds, average="macro", zero_division=0),
        "recall_macro": recall_score(labels, preds, average="macro", zero_division=0),
        "accuracy": accuracy_score(labels, preds),
    }


In [10]:
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./bert-goemotions",
    # Optimisation schedule.
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the highest "f1_macro" (a key returned by compute_metrics).
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Log every 50 steps to Weights & Biases under a fixed run name.
    logging_steps=50,
    report_to="wandb",
    run_name="bert-goemotions-ekman",
)

In [11]:
# Tie everything together: the model, the run configuration, the train split for
# optimisation, the dev split for per-epoch evaluation, and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    compute_metrics=compute_metrics,
)

In [12]:
# Start fine-tuning with the configuration in training_args. The captured progress
# output below counts 43412 optimisation steps in total.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdaiphat2312[0m ([33mdaiphat2312-fpt-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 51/43412 [00:11<2:26:17,  4.94it/s]

{'loss': 0.4918, 'grad_norm': 1.6902152299880981, 'learning_rate': 1.9976964894499217e-05, 'epoch': 0.0}


  0%|          | 101/43412 [00:21<2:28:48,  4.85it/s]

{'loss': 0.3991, 'grad_norm': 2.3046875, 'learning_rate': 1.9953929788998436e-05, 'epoch': 0.01}


  0%|          | 151/43412 [00:31<2:29:45,  4.81it/s]

{'loss': 0.3866, 'grad_norm': 2.742119073867798, 'learning_rate': 1.993089468349765e-05, 'epoch': 0.01}


  0%|          | 201/43412 [00:42<2:28:58,  4.83it/s]

{'loss': 0.3576, 'grad_norm': 2.009446144104004, 'learning_rate': 1.990785957799687e-05, 'epoch': 0.02}


  1%|          | 251/43412 [00:52<2:29:38,  4.81it/s]

{'loss': 0.3323, 'grad_norm': 1.8190757036209106, 'learning_rate': 1.9884824472496085e-05, 'epoch': 0.02}


  1%|          | 301/43412 [01:02<2:29:15,  4.81it/s]

{'loss': 0.3262, 'grad_norm': 2.136814594268799, 'learning_rate': 1.9861789366995303e-05, 'epoch': 0.03}


  1%|          | 351/43412 [01:13<2:29:36,  4.80it/s]

{'loss': 0.2876, 'grad_norm': 1.080847144126892, 'learning_rate': 1.983875426149452e-05, 'epoch': 0.03}


  1%|          | 401/43412 [01:23<2:28:43,  4.82it/s]

{'loss': 0.3237, 'grad_norm': 3.5263726711273193, 'learning_rate': 1.9815719155993734e-05, 'epoch': 0.04}


  1%|          | 451/43412 [01:33<2:29:29,  4.79it/s]

{'loss': 0.2925, 'grad_norm': 2.8807578086853027, 'learning_rate': 1.9792684050492953e-05, 'epoch': 0.04}


  1%|          | 501/43412 [01:44<2:28:36,  4.81it/s]

{'loss': 0.2885, 'grad_norm': 0.8966577649116516, 'learning_rate': 1.976964894499217e-05, 'epoch': 0.05}


  1%|▏         | 551/43412 [01:54<2:28:54,  4.80it/s]

{'loss': 0.2825, 'grad_norm': 1.4907050132751465, 'learning_rate': 1.9746613839491387e-05, 'epoch': 0.05}


  1%|▏         | 601/43412 [02:05<2:28:31,  4.80it/s]

{'loss': 0.2473, 'grad_norm': 2.7129032611846924, 'learning_rate': 1.9723578733990602e-05, 'epoch': 0.06}


  1%|▏         | 651/43412 [02:15<2:28:13,  4.81it/s]

{'loss': 0.3002, 'grad_norm': 2.157209873199463, 'learning_rate': 1.970054362848982e-05, 'epoch': 0.06}


  2%|▏         | 701/43412 [02:25<2:27:31,  4.83it/s]

{'loss': 0.2833, 'grad_norm': 2.726534843444824, 'learning_rate': 1.9677508522989036e-05, 'epoch': 0.06}


  2%|▏         | 751/43412 [02:36<2:27:58,  4.80it/s]

{'loss': 0.2773, 'grad_norm': 3.9825339317321777, 'learning_rate': 1.9654473417488254e-05, 'epoch': 0.07}


  2%|▏         | 801/43412 [02:46<2:27:34,  4.81it/s]

{'loss': 0.2975, 'grad_norm': 2.14339542388916, 'learning_rate': 1.963143831198747e-05, 'epoch': 0.07}


  2%|▏         | 851/43412 [02:56<2:27:34,  4.81it/s]

{'loss': 0.2848, 'grad_norm': 2.4039363861083984, 'learning_rate': 1.960840320648669e-05, 'epoch': 0.08}


  2%|▏         | 901/43412 [03:07<2:27:08,  4.82it/s]

{'loss': 0.2589, 'grad_norm': 1.4000890254974365, 'learning_rate': 1.9585368100985904e-05, 'epoch': 0.08}


  2%|▏         | 951/43412 [03:17<2:27:57,  4.78it/s]

{'loss': 0.2551, 'grad_norm': 2.1252808570861816, 'learning_rate': 1.956233299548512e-05, 'epoch': 0.09}


  2%|▏         | 1001/43412 [03:28<2:26:17,  4.83it/s]

{'loss': 0.2499, 'grad_norm': 1.8473738431930542, 'learning_rate': 1.9539297889984338e-05, 'epoch': 0.09}


  2%|▏         | 1051/43412 [03:38<2:26:33,  4.82it/s]

{'loss': 0.2768, 'grad_norm': 4.655198097229004, 'learning_rate': 1.9516262784483556e-05, 'epoch': 0.1}


  3%|▎         | 1101/43412 [03:48<2:26:55,  4.80it/s]

{'loss': 0.2698, 'grad_norm': 2.638652801513672, 'learning_rate': 1.949322767898277e-05, 'epoch': 0.1}


  3%|▎         | 1151/43412 [03:59<2:27:01,  4.79it/s]

{'loss': 0.27, 'grad_norm': 2.463749408721924, 'learning_rate': 1.9470192573481987e-05, 'epoch': 0.11}


  3%|▎         | 1201/43412 [04:09<2:26:23,  4.81it/s]

{'loss': 0.2672, 'grad_norm': 1.6150062084197998, 'learning_rate': 1.9447157467981205e-05, 'epoch': 0.11}


  3%|▎         | 1251/43412 [04:20<2:26:05,  4.81it/s]

{'loss': 0.2967, 'grad_norm': 4.104585647583008, 'learning_rate': 1.942412236248042e-05, 'epoch': 0.12}


  3%|▎         | 1301/43412 [04:30<2:26:26,  4.79it/s]

{'loss': 0.2765, 'grad_norm': 1.2618186473846436, 'learning_rate': 1.940108725697964e-05, 'epoch': 0.12}


  3%|▎         | 1351/43412 [04:40<2:25:52,  4.81it/s]

{'loss': 0.3, 'grad_norm': 4.001315116882324, 'learning_rate': 1.9378052151478855e-05, 'epoch': 0.12}


  3%|▎         | 1401/43412 [04:51<2:26:19,  4.79it/s]

{'loss': 0.2607, 'grad_norm': 1.8682241439819336, 'learning_rate': 1.9355017045978073e-05, 'epoch': 0.13}


  3%|▎         | 1451/43412 [05:01<2:26:02,  4.79it/s]

{'loss': 0.2827, 'grad_norm': 3.7870302200317383, 'learning_rate': 1.933198194047729e-05, 'epoch': 0.13}


  3%|▎         | 1501/43412 [05:12<2:25:15,  4.81it/s]

{'loss': 0.2839, 'grad_norm': 2.511258602142334, 'learning_rate': 1.9308946834976504e-05, 'epoch': 0.14}


  4%|▎         | 1551/43412 [05:22<2:24:26,  4.83it/s]

{'loss': 0.2318, 'grad_norm': 2.724836587905884, 'learning_rate': 1.9285911729475722e-05, 'epoch': 0.14}


  4%|▎         | 1601/43412 [05:33<2:25:06,  4.80it/s]

{'loss': 0.258, 'grad_norm': 1.8693286180496216, 'learning_rate': 1.926287662397494e-05, 'epoch': 0.15}


  4%|▍         | 1651/43412 [05:43<2:25:43,  4.78it/s]

{'loss': 0.253, 'grad_norm': 1.0116519927978516, 'learning_rate': 1.9239841518474156e-05, 'epoch': 0.15}


  4%|▍         | 1701/43412 [05:53<2:25:06,  4.79it/s]

{'loss': 0.2489, 'grad_norm': 1.837225079536438, 'learning_rate': 1.921680641297337e-05, 'epoch': 0.16}


  4%|▍         | 1751/43412 [06:04<2:24:27,  4.81it/s]

{'loss': 0.2639, 'grad_norm': 3.271200180053711, 'learning_rate': 1.919377130747259e-05, 'epoch': 0.16}


  4%|▍         | 1801/43412 [06:14<2:24:48,  4.79it/s]

{'loss': 0.2521, 'grad_norm': 1.3633297681808472, 'learning_rate': 1.9170736201971806e-05, 'epoch': 0.17}


  4%|▍         | 1851/43412 [06:25<2:24:17,  4.80it/s]

{'loss': 0.2649, 'grad_norm': 1.9267644882202148, 'learning_rate': 1.9147701096471024e-05, 'epoch': 0.17}


  4%|▍         | 1901/43412 [06:35<2:24:00,  4.80it/s]

{'loss': 0.2533, 'grad_norm': 4.327198505401611, 'learning_rate': 1.912466599097024e-05, 'epoch': 0.18}


  4%|▍         | 1951/43412 [06:45<2:24:22,  4.79it/s]

{'loss': 0.2692, 'grad_norm': 1.68106210231781, 'learning_rate': 1.9101630885469458e-05, 'epoch': 0.18}


  5%|▍         | 2001/43412 [06:56<2:23:46,  4.80it/s]

{'loss': 0.2413, 'grad_norm': 1.6984962224960327, 'learning_rate': 1.9078595779968673e-05, 'epoch': 0.18}


  5%|▍         | 2051/43412 [07:06<2:23:28,  4.80it/s]

{'loss': 0.2588, 'grad_norm': 2.5691542625427246, 'learning_rate': 1.905556067446789e-05, 'epoch': 0.19}


  5%|▍         | 2101/43412 [07:17<2:23:48,  4.79it/s]

{'loss': 0.2418, 'grad_norm': 2.6985819339752197, 'learning_rate': 1.9032525568967107e-05, 'epoch': 0.19}


  5%|▍         | 2151/43412 [07:27<2:23:24,  4.80it/s]

{'loss': 0.2672, 'grad_norm': 2.0782086849212646, 'learning_rate': 1.9009490463466326e-05, 'epoch': 0.2}


  5%|▌         | 2201/43412 [07:37<2:23:03,  4.80it/s]

{'loss': 0.248, 'grad_norm': 1.5940122604370117, 'learning_rate': 1.898645535796554e-05, 'epoch': 0.2}


  5%|▌         | 2251/43412 [07:48<2:22:58,  4.80it/s]

{'loss': 0.2547, 'grad_norm': 1.8556535243988037, 'learning_rate': 1.8963420252464757e-05, 'epoch': 0.21}


  5%|▌         | 2301/43412 [07:58<2:22:40,  4.80it/s]

{'loss': 0.2222, 'grad_norm': 3.664294481277466, 'learning_rate': 1.8940385146963975e-05, 'epoch': 0.21}


  5%|▌         | 2351/43412 [08:09<2:23:04,  4.78it/s]

{'loss': 0.2621, 'grad_norm': 3.4073808193206787, 'learning_rate': 1.891735004146319e-05, 'epoch': 0.22}


  6%|▌         | 2401/43412 [08:19<2:22:26,  4.80it/s]

{'loss': 0.2483, 'grad_norm': 3.339101791381836, 'learning_rate': 1.889431493596241e-05, 'epoch': 0.22}


  6%|▌         | 2451/43412 [08:30<2:23:00,  4.77it/s]

{'loss': 0.2642, 'grad_norm': 2.7978146076202393, 'learning_rate': 1.8871279830461624e-05, 'epoch': 0.23}


  6%|▌         | 2501/43412 [08:40<2:21:52,  4.81it/s]

{'loss': 0.2627, 'grad_norm': 1.7868123054504395, 'learning_rate': 1.8848244724960843e-05, 'epoch': 0.23}


  6%|▌         | 2551/43412 [08:50<2:22:37,  4.77it/s]

{'loss': 0.2437, 'grad_norm': 1.6282106637954712, 'learning_rate': 1.882520961946006e-05, 'epoch': 0.23}


  6%|▌         | 2601/43412 [09:01<2:22:14,  4.78it/s]

{'loss': 0.2553, 'grad_norm': 2.6196980476379395, 'learning_rate': 1.8802174513959274e-05, 'epoch': 0.24}


  6%|▌         | 2651/43412 [09:11<2:21:21,  4.81it/s]

{'loss': 0.2338, 'grad_norm': 2.6504809856414795, 'learning_rate': 1.8779139408458492e-05, 'epoch': 0.24}


  6%|▌         | 2701/43412 [09:22<2:21:50,  4.78it/s]

{'loss': 0.2235, 'grad_norm': 1.5317893028259277, 'learning_rate': 1.875610430295771e-05, 'epoch': 0.25}


  6%|▋         | 2751/43412 [09:32<2:21:17,  4.80it/s]

{'loss': 0.2868, 'grad_norm': 4.503927707672119, 'learning_rate': 1.8733069197456926e-05, 'epoch': 0.25}


  6%|▋         | 2801/43412 [09:42<2:21:07,  4.80it/s]

{'loss': 0.2182, 'grad_norm': 1.0684596300125122, 'learning_rate': 1.871003409195614e-05, 'epoch': 0.26}


  7%|▋         | 2851/43412 [09:53<2:20:47,  4.80it/s]

{'loss': 0.2427, 'grad_norm': 4.740328788757324, 'learning_rate': 1.868699898645536e-05, 'epoch': 0.26}


  7%|▋         | 2901/43412 [10:03<2:21:00,  4.79it/s]

{'loss': 0.2665, 'grad_norm': 2.0959436893463135, 'learning_rate': 1.8663963880954575e-05, 'epoch': 0.27}


  7%|▋         | 2951/43412 [10:14<2:21:23,  4.77it/s]

{'loss': 0.2395, 'grad_norm': 3.1231019496917725, 'learning_rate': 1.8640928775453794e-05, 'epoch': 0.27}


  7%|▋         | 3001/43412 [10:24<2:20:19,  4.80it/s]

{'loss': 0.2538, 'grad_norm': 2.38501238822937, 'learning_rate': 1.861789366995301e-05, 'epoch': 0.28}


  7%|▋         | 3051/43412 [10:35<2:20:08,  4.80it/s]

{'loss': 0.2543, 'grad_norm': 1.6416022777557373, 'learning_rate': 1.8594858564452228e-05, 'epoch': 0.28}


  7%|▋         | 3101/43412 [10:45<2:20:09,  4.79it/s]

{'loss': 0.2189, 'grad_norm': 2.2831952571868896, 'learning_rate': 1.8571823458951443e-05, 'epoch': 0.29}


  7%|▋         | 3151/43412 [47:31<2:20:25,  4.78it/s]    

{'loss': 0.238, 'grad_norm': 2.2390637397766113, 'learning_rate': 1.854878835345066e-05, 'epoch': 0.29}


  7%|▋         | 3201/43412 [47:41<2:20:05,  4.78it/s]

{'loss': 0.2591, 'grad_norm': 2.2197976112365723, 'learning_rate': 1.8525753247949877e-05, 'epoch': 0.29}


  7%|▋         | 3251/43412 [47:52<2:19:21,  4.80it/s]

{'loss': 0.2689, 'grad_norm': 4.793027877807617, 'learning_rate': 1.8502718142449096e-05, 'epoch': 0.3}


  8%|▊         | 3301/43412 [48:02<2:19:26,  4.79it/s]

{'loss': 0.2336, 'grad_norm': 1.235954761505127, 'learning_rate': 1.847968303694831e-05, 'epoch': 0.3}


  8%|▊         | 3351/43412 [48:13<2:19:50,  4.77it/s]

{'loss': 0.2436, 'grad_norm': 4.5718278884887695, 'learning_rate': 1.8456647931447526e-05, 'epoch': 0.31}


  8%|▊         | 3401/43412 [48:23<2:19:25,  4.78it/s]

{'loss': 0.236, 'grad_norm': 1.3774398565292358, 'learning_rate': 1.8433612825946745e-05, 'epoch': 0.31}


  8%|▊         | 3451/43412 [48:34<2:18:51,  4.80it/s]

{'loss': 0.2412, 'grad_norm': 4.858974933624268, 'learning_rate': 1.841057772044596e-05, 'epoch': 0.32}


  8%|▊         | 3501/43412 [48:44<2:17:51,  4.83it/s]

{'loss': 0.2245, 'grad_norm': 1.0998872518539429, 'learning_rate': 1.838754261494518e-05, 'epoch': 0.32}


  8%|▊         | 3551/43412 [48:54<2:18:07,  4.81it/s]

{'loss': 0.2331, 'grad_norm': 2.237596273422241, 'learning_rate': 1.8364507509444394e-05, 'epoch': 0.33}


  8%|▊         | 3601/43412 [49:05<2:19:39,  4.75it/s]

{'loss': 0.2317, 'grad_norm': 3.084317445755005, 'learning_rate': 1.8341472403943613e-05, 'epoch': 0.33}


  8%|▊         | 3651/43412 [49:15<2:17:24,  4.82it/s]

{'loss': 0.2666, 'grad_norm': 2.34248423576355, 'learning_rate': 1.8318437298442828e-05, 'epoch': 0.34}


  9%|▊         | 3701/43412 [49:26<2:18:44,  4.77it/s]

{'loss': 0.2371, 'grad_norm': 3.3053390979766846, 'learning_rate': 1.8295402192942043e-05, 'epoch': 0.34}


  9%|▊         | 3751/43412 [49:36<2:18:23,  4.78it/s]

{'loss': 0.2423, 'grad_norm': 3.9976415634155273, 'learning_rate': 1.8272367087441262e-05, 'epoch': 0.35}


  9%|▉         | 3801/43412 [49:47<2:18:07,  4.78it/s]

{'loss': 0.2238, 'grad_norm': 2.249002456665039, 'learning_rate': 1.8249331981940477e-05, 'epoch': 0.35}


  9%|▉         | 3851/43412 [49:57<2:16:53,  4.82it/s]

{'loss': 0.2362, 'grad_norm': 2.512295961380005, 'learning_rate': 1.8226296876439696e-05, 'epoch': 0.35}


  9%|▉         | 3901/43412 [50:07<2:17:17,  4.80it/s]

{'loss': 0.2382, 'grad_norm': 2.9264137744903564, 'learning_rate': 1.820326177093891e-05, 'epoch': 0.36}


  9%|▉         | 3951/43412 [50:18<2:16:57,  4.80it/s]

{'loss': 0.241, 'grad_norm': 2.5316784381866455, 'learning_rate': 1.818022666543813e-05, 'epoch': 0.36}


  9%|▉         | 4001/43412 [50:28<2:17:07,  4.79it/s]

{'loss': 0.2084, 'grad_norm': 4.760222434997559, 'learning_rate': 1.8157191559937345e-05, 'epoch': 0.37}


  9%|▉         | 4051/43412 [50:39<2:16:13,  4.82it/s]

{'loss': 0.2399, 'grad_norm': 2.7673025131225586, 'learning_rate': 1.8134156454436564e-05, 'epoch': 0.37}


  9%|▉         | 4101/43412 [50:49<2:17:26,  4.77it/s]

{'loss': 0.2586, 'grad_norm': 2.7226674556732178, 'learning_rate': 1.811112134893578e-05, 'epoch': 0.38}


 10%|▉         | 4151/43412 [51:00<2:17:20,  4.76it/s]

{'loss': 0.2409, 'grad_norm': 3.4257469177246094, 'learning_rate': 1.8088086243434998e-05, 'epoch': 0.38}


 10%|▉         | 4201/43412 [51:10<2:15:52,  4.81it/s]

{'loss': 0.2306, 'grad_norm': 2.4903345108032227, 'learning_rate': 1.8065051137934213e-05, 'epoch': 0.39}


 10%|▉         | 4251/43412 [51:20<2:16:35,  4.78it/s]

{'loss': 0.2199, 'grad_norm': 1.2628098726272583, 'learning_rate': 1.804201603243343e-05, 'epoch': 0.39}


 10%|▉         | 4301/43412 [51:31<2:16:03,  4.79it/s]

{'loss': 0.2443, 'grad_norm': 2.0667290687561035, 'learning_rate': 1.8018980926932647e-05, 'epoch': 0.4}


 10%|█         | 4351/43412 [51:41<2:15:51,  4.79it/s]

{'loss': 0.218, 'grad_norm': 2.920431613922119, 'learning_rate': 1.7995945821431862e-05, 'epoch': 0.4}


 10%|█         | 4401/43412 [51:52<2:16:00,  4.78it/s]

{'loss': 0.2411, 'grad_norm': 0.5997795462608337, 'learning_rate': 1.797291071593108e-05, 'epoch': 0.41}


 10%|█         | 4451/43412 [52:02<2:15:14,  4.80it/s]

{'loss': 0.2264, 'grad_norm': 3.1593024730682373, 'learning_rate': 1.7949875610430296e-05, 'epoch': 0.41}


 10%|█         | 4501/43412 [52:13<2:15:32,  4.78it/s]

{'loss': 0.1992, 'grad_norm': 1.9720895290374756, 'learning_rate': 1.7926840504929515e-05, 'epoch': 0.41}


 10%|█         | 4551/43412 [52:23<2:14:04,  4.83it/s]

{'loss': 0.2766, 'grad_norm': 4.009130954742432, 'learning_rate': 1.790380539942873e-05, 'epoch': 0.42}


 11%|█         | 4601/43412 [52:33<2:12:12,  4.89it/s]

{'loss': 0.2222, 'grad_norm': 2.0839200019836426, 'learning_rate': 1.7880770293927945e-05, 'epoch': 0.42}


 11%|█         | 4651/43412 [52:44<2:12:01,  4.89it/s]

{'loss': 0.2122, 'grad_norm': 2.9916017055511475, 'learning_rate': 1.7857735188427164e-05, 'epoch': 0.43}


 11%|█         | 4701/43412 [52:54<2:11:24,  4.91it/s]

{'loss': 0.2449, 'grad_norm': 3.3250203132629395, 'learning_rate': 1.7834700082926383e-05, 'epoch': 0.43}


 11%|█         | 4751/43412 [53:04<2:11:45,  4.89it/s]

{'loss': 0.2495, 'grad_norm': 1.5192136764526367, 'learning_rate': 1.7811664977425598e-05, 'epoch': 0.44}


 11%|█         | 4801/43412 [53:14<2:11:08,  4.91it/s]

{'loss': 0.2247, 'grad_norm': 1.8955243825912476, 'learning_rate': 1.7788629871924813e-05, 'epoch': 0.44}


 11%|█         | 4851/43412 [53:24<2:11:59,  4.87it/s]

{'loss': 0.2694, 'grad_norm': 1.7628954648971558, 'learning_rate': 1.7765594766424032e-05, 'epoch': 0.45}


 11%|█▏        | 4901/43412 [53:34<2:11:44,  4.87it/s]

{'loss': 0.2698, 'grad_norm': 2.3391072750091553, 'learning_rate': 1.7742559660923247e-05, 'epoch': 0.45}


 11%|█▏        | 4951/43412 [53:45<2:11:23,  4.88it/s]

{'loss': 0.261, 'grad_norm': 1.0169237852096558, 'learning_rate': 1.7719524555422466e-05, 'epoch': 0.46}


 12%|█▏        | 5001/43412 [53:55<2:10:42,  4.90it/s]

{'loss': 0.2362, 'grad_norm': 6.000956058502197, 'learning_rate': 1.769648944992168e-05, 'epoch': 0.46}


 12%|█▏        | 5051/43412 [54:05<2:10:36,  4.90it/s]

{'loss': 0.2196, 'grad_norm': 4.470835208892822, 'learning_rate': 1.76734543444209e-05, 'epoch': 0.47}


 12%|█▏        | 5101/43412 [54:15<2:09:30,  4.93it/s]

{'loss': 0.2258, 'grad_norm': 3.1567137241363525, 'learning_rate': 1.7650419238920115e-05, 'epoch': 0.47}


 12%|█▏        | 5151/43412 [54:25<2:10:12,  4.90it/s]

{'loss': 0.2684, 'grad_norm': 2.3255786895751953, 'learning_rate': 1.762738413341933e-05, 'epoch': 0.47}


 12%|█▏        | 5201/43412 [54:35<2:10:15,  4.89it/s]

{'loss': 0.2145, 'grad_norm': 0.8067518472671509, 'learning_rate': 1.760434902791855e-05, 'epoch': 0.48}


 12%|█▏        | 5251/43412 [54:46<2:09:39,  4.91it/s]

{'loss': 0.2572, 'grad_norm': 3.198807954788208, 'learning_rate': 1.7581313922417768e-05, 'epoch': 0.48}


 12%|█▏        | 5301/43412 [54:56<2:09:29,  4.91it/s]

{'loss': 0.2392, 'grad_norm': 2.9545249938964844, 'learning_rate': 1.7558278816916983e-05, 'epoch': 0.49}


 12%|█▏        | 5351/43412 [55:06<2:09:47,  4.89it/s]

{'loss': 0.2433, 'grad_norm': 2.6412558555603027, 'learning_rate': 1.7535243711416198e-05, 'epoch': 0.49}


 12%|█▏        | 5401/43412 [55:16<2:08:54,  4.91it/s]

{'loss': 0.254, 'grad_norm': 3.2617108821868896, 'learning_rate': 1.7512208605915417e-05, 'epoch': 0.5}


 13%|█▎        | 5451/43412 [55:26<2:09:24,  4.89it/s]

{'loss': 0.2486, 'grad_norm': 4.016153335571289, 'learning_rate': 1.7489173500414632e-05, 'epoch': 0.5}


 13%|█▎        | 5501/43412 [55:36<2:09:21,  4.88it/s]

{'loss': 0.2559, 'grad_norm': 4.142897129058838, 'learning_rate': 1.746613839491385e-05, 'epoch': 0.51}


 13%|█▎        | 5551/43412 [55:47<2:08:39,  4.90it/s]

{'loss': 0.2296, 'grad_norm': 3.356024980545044, 'learning_rate': 1.7443103289413066e-05, 'epoch': 0.51}


 13%|█▎        | 5601/43412 [55:57<2:08:48,  4.89it/s]

{'loss': 0.2192, 'grad_norm': 2.17366623878479, 'learning_rate': 1.7420068183912285e-05, 'epoch': 0.52}


 13%|█▎        | 5651/43412 [56:07<2:09:05,  4.88it/s]

{'loss': 0.25, 'grad_norm': 2.96246337890625, 'learning_rate': 1.73970330784115e-05, 'epoch': 0.52}


 13%|█▎        | 5701/43412 [56:17<2:08:20,  4.90it/s]

{'loss': 0.2386, 'grad_norm': 2.2792413234710693, 'learning_rate': 1.7373997972910715e-05, 'epoch': 0.53}


 13%|█▎        | 5751/43412 [56:27<2:08:08,  4.90it/s]

{'loss': 0.2306, 'grad_norm': 2.3850696086883545, 'learning_rate': 1.7350962867409934e-05, 'epoch': 0.53}


 13%|█▎        | 5801/43412 [56:37<2:07:50,  4.90it/s]

{'loss': 0.2428, 'grad_norm': 3.1436619758605957, 'learning_rate': 1.7327927761909153e-05, 'epoch': 0.53}


 13%|█▎        | 5851/43412 [56:48<2:07:26,  4.91it/s]

{'loss': 0.2319, 'grad_norm': 2.8351988792419434, 'learning_rate': 1.7304892656408368e-05, 'epoch': 0.54}


 14%|█▎        | 5901/43412 [56:58<2:07:46,  4.89it/s]

{'loss': 0.2363, 'grad_norm': 2.183912515640259, 'learning_rate': 1.7281857550907583e-05, 'epoch': 0.54}


 14%|█▎        | 5951/43412 [57:08<2:06:35,  4.93it/s]

{'loss': 0.2375, 'grad_norm': 3.9790868759155273, 'learning_rate': 1.7258822445406802e-05, 'epoch': 0.55}


 14%|█▍        | 6001/43412 [57:18<2:07:14,  4.90it/s]

{'loss': 0.2451, 'grad_norm': 3.3237411975860596, 'learning_rate': 1.7235787339906017e-05, 'epoch': 0.55}


 14%|█▍        | 6051/43412 [57:28<2:07:11,  4.90it/s]

{'loss': 0.2257, 'grad_norm': 1.9711472988128662, 'learning_rate': 1.7212752234405236e-05, 'epoch': 0.56}


 14%|█▍        | 6101/43412 [57:38<2:06:39,  4.91it/s]

{'loss': 0.2595, 'grad_norm': 1.5306071043014526, 'learning_rate': 1.7189717128904454e-05, 'epoch': 0.56}


 14%|█▍        | 6151/43412 [57:49<2:06:43,  4.90it/s]

{'loss': 0.2126, 'grad_norm': 2.8920958042144775, 'learning_rate': 1.716668202340367e-05, 'epoch': 0.57}


 14%|█▍        | 6201/43412 [57:59<2:06:04,  4.92it/s]

{'loss': 0.212, 'grad_norm': 5.1683831214904785, 'learning_rate': 1.7143646917902885e-05, 'epoch': 0.57}


 14%|█▍        | 6251/43412 [58:09<2:05:38,  4.93it/s]

{'loss': 0.2553, 'grad_norm': 2.672288656234741, 'learning_rate': 1.71206118124021e-05, 'epoch': 0.58}


 15%|█▍        | 6301/43412 [58:19<2:06:11,  4.90it/s]

{'loss': 0.2528, 'grad_norm': 2.154684066772461, 'learning_rate': 1.709757670690132e-05, 'epoch': 0.58}


 15%|█▍        | 6351/43412 [58:29<2:06:13,  4.89it/s]

{'loss': 0.2316, 'grad_norm': 1.8523527383804321, 'learning_rate': 1.7074541601400538e-05, 'epoch': 0.59}


 15%|█▍        | 6401/43412 [58:39<2:04:42,  4.95it/s]

{'loss': 0.2344, 'grad_norm': 2.074516534805298, 'learning_rate': 1.7051506495899753e-05, 'epoch': 0.59}


 15%|█▍        | 6451/43412 [58:49<2:05:24,  4.91it/s]

{'loss': 0.2297, 'grad_norm': 1.8223143815994263, 'learning_rate': 1.7028471390398968e-05, 'epoch': 0.59}


 15%|█▍        | 6501/43412 [59:00<2:05:45,  4.89it/s]

{'loss': 0.2495, 'grad_norm': 2.3172738552093506, 'learning_rate': 1.7005436284898187e-05, 'epoch': 0.6}


 15%|█▌        | 6551/43412 [59:10<2:05:31,  4.89it/s]

{'loss': 0.2273, 'grad_norm': 4.962860107421875, 'learning_rate': 1.6982401179397402e-05, 'epoch': 0.6}


 15%|█▌        | 6601/43412 [59:20<2:04:49,  4.91it/s]

{'loss': 0.2596, 'grad_norm': 2.270873785018921, 'learning_rate': 1.695936607389662e-05, 'epoch': 0.61}


 15%|█▌        | 6651/43412 [59:30<2:04:35,  4.92it/s]

{'loss': 0.2608, 'grad_norm': 4.242998123168945, 'learning_rate': 1.693633096839584e-05, 'epoch': 0.61}


 15%|█▌        | 6701/43412 [59:40<2:04:49,  4.90it/s]

{'loss': 0.2401, 'grad_norm': 2.415242910385132, 'learning_rate': 1.6913295862895055e-05, 'epoch': 0.62}


 16%|█▌        | 6751/43412 [59:50<2:04:46,  4.90it/s]

{'loss': 0.2529, 'grad_norm': 1.042907476425171, 'learning_rate': 1.689026075739427e-05, 'epoch': 0.62}


 16%|█▌        | 6801/43412 [1:00:01<2:04:34,  4.90it/s]

{'loss': 0.2284, 'grad_norm': 1.9694498777389526, 'learning_rate': 1.6867225651893485e-05, 'epoch': 0.63}


 16%|█▌        | 6851/43412 [1:00:11<2:03:02,  4.95it/s]

{'loss': 0.229, 'grad_norm': 4.274985313415527, 'learning_rate': 1.6844190546392704e-05, 'epoch': 0.63}


 16%|█▌        | 6901/43412 [1:00:21<2:04:07,  4.90it/s]

{'loss': 0.2389, 'grad_norm': 2.34611177444458, 'learning_rate': 1.6821155440891922e-05, 'epoch': 0.64}


 16%|█▌        | 6951/43412 [1:00:31<2:04:11,  4.89it/s]

{'loss': 0.2089, 'grad_norm': 3.7443337440490723, 'learning_rate': 1.6798120335391138e-05, 'epoch': 0.64}


 16%|█▌        | 7001/43412 [1:00:41<2:03:47,  4.90it/s]

{'loss': 0.2474, 'grad_norm': 2.226158857345581, 'learning_rate': 1.6775085229890353e-05, 'epoch': 0.64}


 16%|█▌        | 7051/43412 [1:00:51<2:03:33,  4.90it/s]

{'loss': 0.224, 'grad_norm': 1.9158769845962524, 'learning_rate': 1.675205012438957e-05, 'epoch': 0.65}


 16%|█▋        | 7101/43412 [1:01:02<2:03:51,  4.89it/s]

{'loss': 0.2378, 'grad_norm': 1.336637258529663, 'learning_rate': 1.6729015018888787e-05, 'epoch': 0.65}


 16%|█▋        | 7151/43412 [1:01:12<2:02:30,  4.93it/s]

{'loss': 0.2371, 'grad_norm': 2.421381950378418, 'learning_rate': 1.6705979913388006e-05, 'epoch': 0.66}


 17%|█▋        | 7201/43412 [1:01:22<2:02:46,  4.92it/s]

{'loss': 0.248, 'grad_norm': 8.307866096496582, 'learning_rate': 1.668294480788722e-05, 'epoch': 0.66}


 17%|█▋        | 7251/43412 [1:01:32<2:01:41,  4.95it/s]

{'loss': 0.2291, 'grad_norm': 3.369091272354126, 'learning_rate': 1.665990970238644e-05, 'epoch': 0.67}


 17%|█▋        | 7301/43412 [1:01:42<2:03:04,  4.89it/s]

{'loss': 0.2107, 'grad_norm': 1.8226317167282104, 'learning_rate': 1.6636874596885655e-05, 'epoch': 0.67}


 17%|█▋        | 7351/43412 [1:01:52<2:02:49,  4.89it/s]

{'loss': 0.2427, 'grad_norm': 1.6438877582550049, 'learning_rate': 1.661383949138487e-05, 'epoch': 0.68}


 17%|█▋        | 7401/43412 [1:02:02<2:02:44,  4.89it/s]

{'loss': 0.2389, 'grad_norm': 1.235880732536316, 'learning_rate': 1.659080438588409e-05, 'epoch': 0.68}


 17%|█▋        | 7451/43412 [1:02:13<2:02:41,  4.88it/s]

{'loss': 0.2167, 'grad_norm': 0.43669888377189636, 'learning_rate': 1.6567769280383307e-05, 'epoch': 0.69}


 17%|█▋        | 7501/43412 [1:02:23<2:02:06,  4.90it/s]

{'loss': 0.2333, 'grad_norm': 2.1393184661865234, 'learning_rate': 1.6544734174882523e-05, 'epoch': 0.69}


 17%|█▋        | 7551/43412 [1:02:33<2:02:07,  4.89it/s]

{'loss': 0.2213, 'grad_norm': 5.694092750549316, 'learning_rate': 1.6521699069381738e-05, 'epoch': 0.7}


 18%|█▊        | 7601/43412 [1:02:43<2:01:58,  4.89it/s]

{'loss': 0.2229, 'grad_norm': 2.5796892642974854, 'learning_rate': 1.6498663963880957e-05, 'epoch': 0.7}


 18%|█▊        | 7651/43412 [1:02:53<2:01:47,  4.89it/s]

{'loss': 0.2567, 'grad_norm': 4.359739303588867, 'learning_rate': 1.6475628858380172e-05, 'epoch': 0.7}


 18%|█▊        | 7701/43412 [1:03:03<2:01:36,  4.89it/s]

{'loss': 0.264, 'grad_norm': 5.455024242401123, 'learning_rate': 1.645259375287939e-05, 'epoch': 0.71}


 18%|█▊        | 7751/43412 [1:03:14<2:00:50,  4.92it/s]

{'loss': 0.2389, 'grad_norm': 6.800411701202393, 'learning_rate': 1.6429558647378606e-05, 'epoch': 0.71}


 18%|█▊        | 7801/43412 [1:03:24<2:00:31,  4.92it/s]

{'loss': 0.2238, 'grad_norm': 3.020657539367676, 'learning_rate': 1.6406523541877824e-05, 'epoch': 0.72}


 18%|█▊        | 7851/43412 [1:03:34<2:00:33,  4.92it/s]

{'loss': 0.232, 'grad_norm': 1.888205885887146, 'learning_rate': 1.638348843637704e-05, 'epoch': 0.72}


 18%|█▊        | 7901/43412 [1:03:44<2:00:53,  4.90it/s]

{'loss': 0.2137, 'grad_norm': 3.302611827850342, 'learning_rate': 1.6360453330876255e-05, 'epoch': 0.73}


 18%|█▊        | 7951/43412 [1:03:54<2:00:33,  4.90it/s]

{'loss': 0.1921, 'grad_norm': 1.1145063638687134, 'learning_rate': 1.6337418225375474e-05, 'epoch': 0.73}


 18%|█▊        | 8001/43412 [1:04:04<2:00:34,  4.89it/s]

{'loss': 0.2095, 'grad_norm': 3.560364007949829, 'learning_rate': 1.631438311987469e-05, 'epoch': 0.74}


 19%|█▊        | 8051/43412 [1:04:15<2:00:17,  4.90it/s]

{'loss': 0.2451, 'grad_norm': 3.340733051300049, 'learning_rate': 1.6291348014373908e-05, 'epoch': 0.74}


 19%|█▊        | 8101/43412 [1:04:25<1:59:39,  4.92it/s]

{'loss': 0.2476, 'grad_norm': 2.1823220252990723, 'learning_rate': 1.6268312908873123e-05, 'epoch': 0.75}


 19%|█▉        | 8151/43412 [1:04:35<1:59:03,  4.94it/s]

{'loss': 0.2119, 'grad_norm': 2.289562463760376, 'learning_rate': 1.624527780337234e-05, 'epoch': 0.75}


 19%|█▉        | 8201/43412 [1:04:45<1:59:32,  4.91it/s]

{'loss': 0.237, 'grad_norm': 3.9395382404327393, 'learning_rate': 1.6222242697871557e-05, 'epoch': 0.76}


 19%|█▉        | 8251/43412 [1:04:55<1:59:55,  4.89it/s]

{'loss': 0.216, 'grad_norm': 2.531402111053467, 'learning_rate': 1.6199207592370772e-05, 'epoch': 0.76}


 19%|█▉        | 8301/43412 [1:05:05<1:58:13,  4.95it/s]

{'loss': 0.2454, 'grad_norm': 1.4436770677566528, 'learning_rate': 1.617617248686999e-05, 'epoch': 0.76}


 19%|█▉        | 8351/43412 [1:05:15<1:59:19,  4.90it/s]

{'loss': 0.2275, 'grad_norm': 2.9155025482177734, 'learning_rate': 1.615313738136921e-05, 'epoch': 0.77}


 19%|█▉        | 8401/43412 [1:05:26<1:59:01,  4.90it/s]

{'loss': 0.2618, 'grad_norm': 1.458097219467163, 'learning_rate': 1.6130102275868425e-05, 'epoch': 0.77}


 19%|█▉        | 8451/43412 [1:05:36<1:58:58,  4.90it/s]

{'loss': 0.2364, 'grad_norm': 3.3636481761932373, 'learning_rate': 1.610706717036764e-05, 'epoch': 0.78}


 20%|█▉        | 8501/43412 [1:05:46<1:58:32,  4.91it/s]

{'loss': 0.2418, 'grad_norm': 0.6307826638221741, 'learning_rate': 1.608403206486686e-05, 'epoch': 0.78}


 20%|█▉        | 8551/43412 [1:05:56<1:58:55,  4.89it/s]

{'loss': 0.1971, 'grad_norm': 3.105567693710327, 'learning_rate': 1.6060996959366074e-05, 'epoch': 0.79}


 20%|█▉        | 8601/43412 [1:06:06<1:59:00,  4.87it/s]

{'loss': 0.2383, 'grad_norm': 1.882135033607483, 'learning_rate': 1.6037961853865292e-05, 'epoch': 0.79}


 20%|█▉        | 8651/43412 [1:06:16<1:57:20,  4.94it/s]

{'loss': 0.2203, 'grad_norm': 2.2976369857788086, 'learning_rate': 1.601492674836451e-05, 'epoch': 0.8}


 20%|██        | 8701/43412 [1:06:27<1:58:28,  4.88it/s]

{'loss': 0.2461, 'grad_norm': 5.214986324310303, 'learning_rate': 1.5991891642863726e-05, 'epoch': 0.8}


 20%|██        | 8751/43412 [1:06:37<1:57:09,  4.93it/s]

{'loss': 0.2265, 'grad_norm': 1.481217384338379, 'learning_rate': 1.596885653736294e-05, 'epoch': 0.81}


 20%|██        | 8801/43412 [1:06:47<1:56:37,  4.95it/s]

{'loss': 0.2034, 'grad_norm': 1.0445771217346191, 'learning_rate': 1.5945821431862157e-05, 'epoch': 0.81}


 20%|██        | 8851/43412 [1:06:57<1:56:37,  4.94it/s]

{'loss': 0.2315, 'grad_norm': 3.4376683235168457, 'learning_rate': 1.5922786326361376e-05, 'epoch': 0.82}


 21%|██        | 8901/43412 [1:07:07<1:58:06,  4.87it/s]

{'loss': 0.206, 'grad_norm': 2.1912312507629395, 'learning_rate': 1.5899751220860594e-05, 'epoch': 0.82}


 21%|██        | 8951/43412 [1:07:17<1:57:09,  4.90it/s]

{'loss': 0.246, 'grad_norm': 2.0351059436798096, 'learning_rate': 1.587671611535981e-05, 'epoch': 0.82}


 21%|██        | 9001/43412 [1:07:27<1:56:26,  4.93it/s]

{'loss': 0.216, 'grad_norm': 0.6552067995071411, 'learning_rate': 1.5853681009859025e-05, 'epoch': 0.83}


 21%|██        | 9051/43412 [1:07:38<1:56:21,  4.92it/s]

{'loss': 0.2304, 'grad_norm': 3.4069597721099854, 'learning_rate': 1.5830645904358243e-05, 'epoch': 0.83}


 21%|██        | 9101/43412 [1:07:48<1:56:44,  4.90it/s]

{'loss': 0.2219, 'grad_norm': 2.6805102825164795, 'learning_rate': 1.580761079885746e-05, 'epoch': 0.84}


 21%|██        | 9151/43412 [1:07:58<1:56:45,  4.89it/s]

{'loss': 0.2315, 'grad_norm': 3.4943885803222656, 'learning_rate': 1.5784575693356677e-05, 'epoch': 0.84}


 21%|██        | 9201/43412 [1:08:08<1:55:48,  4.92it/s]

{'loss': 0.2554, 'grad_norm': 1.0652549266815186, 'learning_rate': 1.5761540587855896e-05, 'epoch': 0.85}


 21%|██▏       | 9251/43412 [1:08:18<1:56:11,  4.90it/s]

{'loss': 0.2317, 'grad_norm': 2.03448486328125, 'learning_rate': 1.573850548235511e-05, 'epoch': 0.85}


 21%|██▏       | 9301/43412 [1:08:28<1:56:19,  4.89it/s]

{'loss': 0.2336, 'grad_norm': 3.844271659851074, 'learning_rate': 1.5715470376854327e-05, 'epoch': 0.86}


 22%|██▏       | 9351/43412 [1:08:39<1:55:48,  4.90it/s]

{'loss': 0.2134, 'grad_norm': 1.5015017986297607, 'learning_rate': 1.5692435271353542e-05, 'epoch': 0.86}


 22%|██▏       | 9401/43412 [1:08:49<1:54:20,  4.96it/s]

{'loss': 0.2289, 'grad_norm': 2.278475046157837, 'learning_rate': 1.566940016585276e-05, 'epoch': 0.87}


 22%|██▏       | 9451/43412 [1:08:59<1:55:03,  4.92it/s]

{'loss': 0.2353, 'grad_norm': 3.0160491466522217, 'learning_rate': 1.564636506035198e-05, 'epoch': 0.87}


 22%|██▏       | 9501/43412 [1:09:09<1:55:13,  4.90it/s]

{'loss': 0.2224, 'grad_norm': 1.1037689447402954, 'learning_rate': 1.5623329954851194e-05, 'epoch': 0.88}


 22%|██▏       | 9551/43412 [1:09:19<1:55:31,  4.89it/s]

{'loss': 0.2152, 'grad_norm': 2.29057240486145, 'learning_rate': 1.560029484935041e-05, 'epoch': 0.88}


 22%|██▏       | 9601/43412 [1:09:29<1:54:57,  4.90it/s]

{'loss': 0.2231, 'grad_norm': 2.2715165615081787, 'learning_rate': 1.557725974384963e-05, 'epoch': 0.88}


 22%|██▏       | 9651/43412 [1:09:39<1:54:45,  4.90it/s]

{'loss': 0.255, 'grad_norm': 3.4290575981140137, 'learning_rate': 1.5554224638348844e-05, 'epoch': 0.89}


 22%|██▏       | 9701/43412 [1:09:50<1:54:28,  4.91it/s]

{'loss': 0.2321, 'grad_norm': 3.801220178604126, 'learning_rate': 1.5531189532848062e-05, 'epoch': 0.89}


 22%|██▏       | 9751/43412 [1:10:00<1:53:51,  4.93it/s]

{'loss': 0.2468, 'grad_norm': 1.5751131772994995, 'learning_rate': 1.550815442734728e-05, 'epoch': 0.9}


 23%|██▎       | 9801/43412 [1:10:10<1:54:03,  4.91it/s]

{'loss': 0.2276, 'grad_norm': 2.758354425430298, 'learning_rate': 1.5485119321846496e-05, 'epoch': 0.9}


 23%|██▎       | 9851/43412 [1:10:20<1:53:49,  4.91it/s]

{'loss': 0.2258, 'grad_norm': 2.7947170734405518, 'learning_rate': 1.546208421634571e-05, 'epoch': 0.91}


 23%|██▎       | 9901/43412 [1:10:30<1:53:39,  4.91it/s]

{'loss': 0.2367, 'grad_norm': 1.1007342338562012, 'learning_rate': 1.5439049110844927e-05, 'epoch': 0.91}


 23%|██▎       | 9951/43412 [1:10:40<1:54:01,  4.89it/s]

{'loss': 0.279, 'grad_norm': 4.173100471496582, 'learning_rate': 1.5416014005344145e-05, 'epoch': 0.92}


 23%|██▎       | 10001/43412 [1:10:51<1:53:10,  4.92it/s]

{'loss': 0.2311, 'grad_norm': 2.5066897869110107, 'learning_rate': 1.5392978899843364e-05, 'epoch': 0.92}


 23%|██▎       | 10051/43412 [1:11:01<1:53:42,  4.89it/s]

{'loss': 0.209, 'grad_norm': 1.5182418823242188, 'learning_rate': 1.536994379434258e-05, 'epoch': 0.93}


 23%|██▎       | 10101/43412 [1:11:11<1:52:53,  4.92it/s]

{'loss': 0.2148, 'grad_norm': 2.2450485229492188, 'learning_rate': 1.5346908688841795e-05, 'epoch': 0.93}


 23%|██▎       | 10151/43412 [1:11:21<1:53:06,  4.90it/s]

{'loss': 0.2209, 'grad_norm': 1.8436545133590698, 'learning_rate': 1.5323873583341013e-05, 'epoch': 0.94}


 23%|██▎       | 10201/43412 [1:11:31<1:52:33,  4.92it/s]

{'loss': 0.2592, 'grad_norm': 1.6713329553604126, 'learning_rate': 1.530083847784023e-05, 'epoch': 0.94}


 24%|██▎       | 10251/43412 [1:11:41<1:53:01,  4.89it/s]

{'loss': 0.2168, 'grad_norm': 2.7412679195404053, 'learning_rate': 1.5277803372339447e-05, 'epoch': 0.94}


 24%|██▎       | 10301/43412 [1:11:51<1:52:37,  4.90it/s]

{'loss': 0.2417, 'grad_norm': 1.6050148010253906, 'learning_rate': 1.5254768266838664e-05, 'epoch': 0.95}


 24%|██▍       | 10351/43412 [1:12:02<1:51:05,  4.96it/s]

{'loss': 0.2179, 'grad_norm': 2.966545581817627, 'learning_rate': 1.5231733161337881e-05, 'epoch': 0.95}


 24%|██▍       | 10401/43412 [1:12:12<1:51:21,  4.94it/s]

{'loss': 0.2278, 'grad_norm': 3.6356163024902344, 'learning_rate': 1.5208698055837096e-05, 'epoch': 0.96}


 24%|██▍       | 10451/43412 [1:12:22<1:51:57,  4.91it/s]

{'loss': 0.2425, 'grad_norm': 1.4645545482635498, 'learning_rate': 1.5185662950336313e-05, 'epoch': 0.96}


 24%|██▍       | 10501/43412 [1:12:32<1:51:53,  4.90it/s]

{'loss': 0.2267, 'grad_norm': 1.9457252025604248, 'learning_rate': 1.5162627844835532e-05, 'epoch': 0.97}


 24%|██▍       | 10551/43412 [1:12:42<1:51:39,  4.90it/s]

{'loss': 0.2507, 'grad_norm': 2.912144660949707, 'learning_rate': 1.5139592739334747e-05, 'epoch': 0.97}


 24%|██▍       | 10601/43412 [1:12:52<1:50:50,  4.93it/s]

{'loss': 0.2164, 'grad_norm': 0.990433394908905, 'learning_rate': 1.5116557633833964e-05, 'epoch': 0.98}


 25%|██▍       | 10651/43412 [1:13:03<1:51:11,  4.91it/s]

{'loss': 0.2334, 'grad_norm': 2.371603488922119, 'learning_rate': 1.5093522528333183e-05, 'epoch': 0.98}


 25%|██▍       | 10701/43412 [1:13:13<1:51:17,  4.90it/s]

{'loss': 0.2, 'grad_norm': 2.195317268371582, 'learning_rate': 1.5070487422832398e-05, 'epoch': 0.99}


 25%|██▍       | 10751/43412 [1:13:23<1:50:58,  4.90it/s]

{'loss': 0.2285, 'grad_norm': 0.971666157245636, 'learning_rate': 1.5047452317331615e-05, 'epoch': 0.99}


 25%|██▍       | 10801/43412 [1:13:33<1:50:58,  4.90it/s]

{'loss': 0.2089, 'grad_norm': 3.7462081909179688, 'learning_rate': 1.502441721183083e-05, 'epoch': 1.0}


 25%|██▍       | 10851/43412 [1:13:43<1:50:26,  4.91it/s]

{'loss': 0.2221, 'grad_norm': 2.373548984527588, 'learning_rate': 1.5001382106330049e-05, 'epoch': 1.0}


                                                         
 25%|██▌       | 10853/43412 [1:15:13<1:45:52,  5.13it/s]

{'eval_loss': 0.22523243725299835, 'eval_f1_macro': 0.5856264184285974, 'eval_precision_macro': 0.6613206976829644, 'eval_recall_macro': 0.5303943182837044, 'eval_accuracy': 0.5820125322521195, 'eval_runtime': 89.0815, 'eval_samples_per_second': 60.91, 'eval_steps_per_second': 7.622, 'epoch': 1.0}


 25%|██▌       | 10901/43412 [1:15:24<1:49:45,  4.94it/s]  

{'loss': 0.1748, 'grad_norm': 2.9115395545959473, 'learning_rate': 1.4978347000829266e-05, 'epoch': 1.0}


 25%|██▌       | 10951/43412 [1:15:34<1:50:12,  4.91it/s]

{'loss': 0.1748, 'grad_norm': 1.6854310035705566, 'learning_rate': 1.4955311895328481e-05, 'epoch': 1.01}


 25%|██▌       | 11001/43412 [1:15:44<1:49:31,  4.93it/s]

{'loss': 0.201, 'grad_norm': 0.5506563782691956, 'learning_rate': 1.4932276789827698e-05, 'epoch': 1.01}


 25%|██▌       | 11051/43412 [1:15:54<1:49:55,  4.91it/s]

{'loss': 0.2165, 'grad_norm': 2.304295301437378, 'learning_rate': 1.4909241684326915e-05, 'epoch': 1.02}


 26%|██▌       | 11101/43412 [1:16:05<1:50:12,  4.89it/s]

{'loss': 0.1888, 'grad_norm': 2.115323305130005, 'learning_rate': 1.4886206578826132e-05, 'epoch': 1.02}


 26%|██▌       | 11151/43412 [1:16:15<1:49:42,  4.90it/s]

{'loss': 0.2334, 'grad_norm': 3.1458539962768555, 'learning_rate': 1.486317147332535e-05, 'epoch': 1.03}


 26%|██▌       | 11201/43412 [1:16:25<1:49:31,  4.90it/s]

{'loss': 0.197, 'grad_norm': 1.6162445545196533, 'learning_rate': 1.4840136367824566e-05, 'epoch': 1.03}


 26%|██▌       | 11251/43412 [1:16:35<1:49:21,  4.90it/s]

{'loss': 0.1773, 'grad_norm': 4.607941150665283, 'learning_rate': 1.4817101262323783e-05, 'epoch': 1.04}


 26%|██▌       | 11301/43412 [1:16:45<1:49:28,  4.89it/s]

{'loss': 0.1943, 'grad_norm': 4.647823810577393, 'learning_rate': 1.4794066156823e-05, 'epoch': 1.04}


 26%|██▌       | 11351/43412 [1:16:55<1:49:08,  4.90it/s]

{'loss': 0.2049, 'grad_norm': 1.5217442512512207, 'learning_rate': 1.4771031051322215e-05, 'epoch': 1.05}


 26%|██▋       | 11401/43412 [1:17:05<1:47:51,  4.95it/s]

{'loss': 0.175, 'grad_norm': 2.810051679611206, 'learning_rate': 1.4747995945821434e-05, 'epoch': 1.05}


 26%|██▋       | 11451/43412 [1:17:16<1:48:35,  4.91it/s]

{'loss': 0.191, 'grad_norm': 4.099118709564209, 'learning_rate': 1.472496084032065e-05, 'epoch': 1.06}


 26%|██▋       | 11501/43412 [1:17:26<1:48:28,  4.90it/s]

{'loss': 0.2011, 'grad_norm': 3.179434061050415, 'learning_rate': 1.4701925734819866e-05, 'epoch': 1.06}


 27%|██▋       | 11551/43412 [1:17:36<1:48:13,  4.91it/s]

{'loss': 0.1923, 'grad_norm': 2.0020947456359863, 'learning_rate': 1.4678890629319083e-05, 'epoch': 1.06}


 27%|██▋       | 11601/43412 [1:17:46<1:47:12,  4.95it/s]

{'loss': 0.2038, 'grad_norm': 2.816498041152954, 'learning_rate': 1.46558555238183e-05, 'epoch': 1.07}


 27%|██▋       | 11651/43412 [1:17:56<1:47:51,  4.91it/s]

{'loss': 0.1828, 'grad_norm': 0.8389655351638794, 'learning_rate': 1.4632820418317517e-05, 'epoch': 1.07}


 27%|██▋       | 11701/43412 [1:18:06<1:47:03,  4.94it/s]

{'loss': 0.2304, 'grad_norm': 4.703863620758057, 'learning_rate': 1.4609785312816734e-05, 'epoch': 1.08}


 27%|██▋       | 11751/43412 [1:18:17<1:47:16,  4.92it/s]

{'loss': 0.1865, 'grad_norm': 3.2018508911132812, 'learning_rate': 1.4586750207315951e-05, 'epoch': 1.08}


 27%|██▋       | 11801/43412 [1:18:27<1:47:22,  4.91it/s]

{'loss': 0.1921, 'grad_norm': 6.733506202697754, 'learning_rate': 1.4563715101815168e-05, 'epoch': 1.09}


 27%|██▋       | 11851/43412 [1:18:37<1:47:30,  4.89it/s]

{'loss': 0.1829, 'grad_norm': 4.792984962463379, 'learning_rate': 1.4540679996314383e-05, 'epoch': 1.09}


 27%|██▋       | 11901/43412 [1:18:47<1:47:33,  4.88it/s]

{'loss': 0.2014, 'grad_norm': 4.010373115539551, 'learning_rate': 1.45176448908136e-05, 'epoch': 1.1}


 28%|██▊       | 11951/43412 [1:18:57<1:46:59,  4.90it/s]

{'loss': 0.192, 'grad_norm': 3.0262694358825684, 'learning_rate': 1.4494609785312819e-05, 'epoch': 1.1}


 28%|██▊       | 12001/43412 [1:19:07<1:46:04,  4.94it/s]

{'loss': 0.1898, 'grad_norm': 5.881275177001953, 'learning_rate': 1.4471574679812034e-05, 'epoch': 1.11}


 28%|██▊       | 12051/43412 [1:19:18<1:46:19,  4.92it/s]

{'loss': 0.1946, 'grad_norm': 4.562563419342041, 'learning_rate': 1.4448539574311251e-05, 'epoch': 1.11}


 28%|██▊       | 12101/43412 [1:19:28<1:46:32,  4.90it/s]

{'loss': 0.1952, 'grad_norm': 2.571702241897583, 'learning_rate': 1.4425504468810468e-05, 'epoch': 1.11}


 28%|██▊       | 12151/43412 [1:19:38<1:45:30,  4.94it/s]

{'loss': 0.2062, 'grad_norm': 1.8499395847320557, 'learning_rate': 1.4402469363309685e-05, 'epoch': 1.12}


 28%|██▊       | 12201/43412 [1:19:48<1:45:12,  4.94it/s]

{'loss': 0.1974, 'grad_norm': 2.8708691596984863, 'learning_rate': 1.4379434257808902e-05, 'epoch': 1.12}


 28%|██▊       | 12251/43412 [1:19:58<1:45:33,  4.92it/s]

{'loss': 0.1938, 'grad_norm': 4.577641010284424, 'learning_rate': 1.4356399152308117e-05, 'epoch': 1.13}


 28%|██▊       | 12301/43412 [1:20:08<1:45:46,  4.90it/s]

{'loss': 0.1916, 'grad_norm': 1.3704235553741455, 'learning_rate': 1.4333364046807336e-05, 'epoch': 1.13}


 28%|██▊       | 12351/43412 [1:20:18<1:45:33,  4.90it/s]

{'loss': 0.1966, 'grad_norm': 5.10786247253418, 'learning_rate': 1.4310328941306553e-05, 'epoch': 1.14}


 29%|██▊       | 12401/43412 [1:20:29<1:45:04,  4.92it/s]

{'loss': 0.2191, 'grad_norm': 3.451268196105957, 'learning_rate': 1.4287293835805768e-05, 'epoch': 1.14}


 29%|██▊       | 12451/43412 [1:20:39<1:45:15,  4.90it/s]

{'loss': 0.1757, 'grad_norm': 2.7119178771972656, 'learning_rate': 1.4264258730304985e-05, 'epoch': 1.15}


 29%|██▉       | 12501/43412 [1:20:49<1:44:47,  4.92it/s]

{'loss': 0.1778, 'grad_norm': 1.9585586786270142, 'learning_rate': 1.4241223624804204e-05, 'epoch': 1.15}


 29%|██▉       | 12551/43412 [1:20:59<1:44:55,  4.90it/s]

{'loss': 0.1964, 'grad_norm': 4.5779547691345215, 'learning_rate': 1.4218188519303419e-05, 'epoch': 1.16}


 29%|██▉       | 12601/43412 [1:21:09<1:43:47,  4.95it/s]

{'loss': 0.2057, 'grad_norm': 2.464442253112793, 'learning_rate': 1.4195153413802636e-05, 'epoch': 1.16}


 29%|██▉       | 12651/43412 [1:21:19<1:45:02,  4.88it/s]

{'loss': 0.2043, 'grad_norm': 1.2963114976882935, 'learning_rate': 1.4172118308301851e-05, 'epoch': 1.17}


 29%|██▉       | 12701/43412 [1:21:30<1:43:56,  4.92it/s]

{'loss': 0.1567, 'grad_norm': 2.2964885234832764, 'learning_rate': 1.414908320280107e-05, 'epoch': 1.17}


 29%|██▉       | 12751/43412 [1:21:40<1:44:13,  4.90it/s]

{'loss': 0.2047, 'grad_norm': 0.7448352575302124, 'learning_rate': 1.4126048097300287e-05, 'epoch': 1.17}


 29%|██▉       | 12801/43412 [1:21:50<1:44:03,  4.90it/s]

{'loss': 0.2033, 'grad_norm': 4.43700647354126, 'learning_rate': 1.4103012991799502e-05, 'epoch': 1.18}


 30%|██▉       | 12851/43412 [1:22:00<1:44:09,  4.89it/s]

{'loss': 0.1959, 'grad_norm': 1.7228803634643555, 'learning_rate': 1.4079977886298721e-05, 'epoch': 1.18}


 30%|██▉       | 12901/43412 [1:22:10<1:42:55,  4.94it/s]

{'loss': 0.1786, 'grad_norm': 6.013265132904053, 'learning_rate': 1.4056942780797938e-05, 'epoch': 1.19}


 30%|██▉       | 12951/43412 [1:22:20<1:42:47,  4.94it/s]

{'loss': 0.1975, 'grad_norm': 4.473275184631348, 'learning_rate': 1.4033907675297153e-05, 'epoch': 1.19}


 30%|██▉       | 13001/43412 [1:22:30<1:43:01,  4.92it/s]

{'loss': 0.1737, 'grad_norm': 2.3170626163482666, 'learning_rate': 1.401087256979637e-05, 'epoch': 1.2}


 30%|███       | 13051/43412 [1:22:41<1:43:11,  4.90it/s]

{'loss': 0.2109, 'grad_norm': 0.3282393217086792, 'learning_rate': 1.3987837464295589e-05, 'epoch': 1.2}


 30%|███       | 13101/43412 [1:22:51<1:43:15,  4.89it/s]

{'loss': 0.1659, 'grad_norm': 3.1268763542175293, 'learning_rate': 1.3964802358794804e-05, 'epoch': 1.21}


 30%|███       | 13151/43412 [1:23:01<1:43:19,  4.88it/s]

{'loss': 0.2289, 'grad_norm': 0.8932040929794312, 'learning_rate': 1.3941767253294021e-05, 'epoch': 1.21}


 30%|███       | 13201/43412 [1:23:11<1:42:25,  4.92it/s]

{'loss': 0.1774, 'grad_norm': 1.3839948177337646, 'learning_rate': 1.391873214779324e-05, 'epoch': 1.22}


 31%|███       | 13251/43412 [1:23:21<1:42:26,  4.91it/s]

{'loss': 0.1912, 'grad_norm': 3.921354293823242, 'learning_rate': 1.3895697042292455e-05, 'epoch': 1.22}


 31%|███       | 13301/43412 [1:23:31<1:42:06,  4.91it/s]

{'loss': 0.2423, 'grad_norm': 0.730868935585022, 'learning_rate': 1.3872661936791672e-05, 'epoch': 1.23}


 31%|███       | 13351/43412 [1:23:41<1:42:22,  4.89it/s]

{'loss': 0.1762, 'grad_norm': 3.494786262512207, 'learning_rate': 1.3849626831290887e-05, 'epoch': 1.23}


 31%|███       | 13401/43412 [1:23:52<1:42:11,  4.89it/s]

{'loss': 0.1877, 'grad_norm': 1.0840650796890259, 'learning_rate': 1.3826591725790106e-05, 'epoch': 1.23}


 31%|███       | 13451/43412 [1:24:02<1:41:53,  4.90it/s]

{'loss': 0.1809, 'grad_norm': 0.7203525900840759, 'learning_rate': 1.3803556620289323e-05, 'epoch': 1.24}


 31%|███       | 13501/43412 [1:24:12<1:41:34,  4.91it/s]

{'loss': 0.2065, 'grad_norm': 3.3194210529327393, 'learning_rate': 1.3780521514788538e-05, 'epoch': 1.24}


 31%|███       | 13551/43412 [1:24:22<1:41:26,  4.91it/s]

{'loss': 0.2008, 'grad_norm': 1.693170189857483, 'learning_rate': 1.3757486409287755e-05, 'epoch': 1.25}


 31%|███▏      | 13601/43412 [1:24:32<1:40:39,  4.94it/s]

{'loss': 0.1992, 'grad_norm': 1.571789026260376, 'learning_rate': 1.3734451303786974e-05, 'epoch': 1.25}


 31%|███▏      | 13651/43412 [1:24:42<1:41:07,  4.90it/s]

{'loss': 0.1968, 'grad_norm': 3.8151183128356934, 'learning_rate': 1.3711416198286189e-05, 'epoch': 1.26}


 32%|███▏      | 13701/43412 [1:24:53<1:40:58,  4.90it/s]

{'loss': 0.1958, 'grad_norm': 2.810535430908203, 'learning_rate': 1.3688381092785406e-05, 'epoch': 1.26}


 32%|███▏      | 13751/43412 [1:25:03<1:40:21,  4.93it/s]

{'loss': 0.197, 'grad_norm': 1.528368353843689, 'learning_rate': 1.3665345987284625e-05, 'epoch': 1.27}


 32%|███▏      | 13801/43412 [1:25:13<1:40:39,  4.90it/s]

{'loss': 0.1881, 'grad_norm': 5.791752815246582, 'learning_rate': 1.364231088178384e-05, 'epoch': 1.27}


 32%|███▏      | 13851/43412 [1:25:23<1:40:26,  4.90it/s]

{'loss': 0.1908, 'grad_norm': 5.607288360595703, 'learning_rate': 1.3619275776283057e-05, 'epoch': 1.28}


 32%|███▏      | 13901/43412 [1:25:33<1:40:38,  4.89it/s]

{'loss': 0.2216, 'grad_norm': 3.9384963512420654, 'learning_rate': 1.3596240670782272e-05, 'epoch': 1.28}


 32%|███▏      | 13951/43412 [1:25:43<1:40:37,  4.88it/s]

{'loss': 0.2016, 'grad_norm': 4.9247050285339355, 'learning_rate': 1.357320556528149e-05, 'epoch': 1.29}


 32%|███▏      | 14001/43412 [1:25:53<1:39:33,  4.92it/s]

{'loss': 0.2113, 'grad_norm': 0.22875991463661194, 'learning_rate': 1.3550170459780708e-05, 'epoch': 1.29}


 32%|███▏      | 14051/43412 [1:26:04<1:40:15,  4.88it/s]

{'loss': 0.2026, 'grad_norm': 4.549035549163818, 'learning_rate': 1.3527135354279923e-05, 'epoch': 1.29}


 32%|███▏      | 14101/43412 [1:26:14<1:39:45,  4.90it/s]

{'loss': 0.1744, 'grad_norm': 1.7070362567901611, 'learning_rate': 1.350410024877914e-05, 'epoch': 1.3}


 33%|███▎      | 14151/43412 [1:26:24<1:39:34,  4.90it/s]

{'loss': 0.1793, 'grad_norm': 4.5198283195495605, 'learning_rate': 1.3481065143278359e-05, 'epoch': 1.3}


 33%|███▎      | 14201/43412 [1:26:34<1:39:14,  4.91it/s]

{'loss': 0.2183, 'grad_norm': 3.863882064819336, 'learning_rate': 1.3458030037777574e-05, 'epoch': 1.31}


 33%|███▎      | 14251/43412 [1:26:44<1:39:25,  4.89it/s]

{'loss': 0.1826, 'grad_norm': 2.0633203983306885, 'learning_rate': 1.343499493227679e-05, 'epoch': 1.31}


 33%|███▎      | 14301/43412 [1:26:54<1:38:17,  4.94it/s]

{'loss': 0.2137, 'grad_norm': 7.21061897277832, 'learning_rate': 1.341195982677601e-05, 'epoch': 1.32}


 33%|███▎      | 14351/43412 [1:27:04<1:38:32,  4.92it/s]

{'loss': 0.1848, 'grad_norm': 2.55047345161438, 'learning_rate': 1.3388924721275225e-05, 'epoch': 1.32}


 33%|███▎      | 14401/43412 [1:27:15<1:38:24,  4.91it/s]

{'loss': 0.2214, 'grad_norm': 3.7685387134552, 'learning_rate': 1.3365889615774442e-05, 'epoch': 1.33}


 33%|███▎      | 14451/43412 [1:27:25<1:38:06,  4.92it/s]

{'loss': 0.1806, 'grad_norm': 1.3184840679168701, 'learning_rate': 1.3342854510273657e-05, 'epoch': 1.33}


 33%|███▎      | 14501/43412 [1:27:35<1:38:38,  4.88it/s]

{'loss': 0.2204, 'grad_norm': 4.436152935028076, 'learning_rate': 1.3319819404772876e-05, 'epoch': 1.34}


 34%|███▎      | 14551/43412 [1:27:45<1:37:11,  4.95it/s]

{'loss': 0.1782, 'grad_norm': 2.368149995803833, 'learning_rate': 1.3296784299272093e-05, 'epoch': 1.34}


 34%|███▎      | 14601/43412 [1:27:55<1:38:09,  4.89it/s]

{'loss': 0.1848, 'grad_norm': 1.12394380569458, 'learning_rate': 1.3273749193771308e-05, 'epoch': 1.35}


 34%|███▎      | 14651/43412 [1:28:05<1:38:06,  4.89it/s]

{'loss': 0.201, 'grad_norm': 2.617398500442505, 'learning_rate': 1.3250714088270525e-05, 'epoch': 1.35}


 34%|███▍      | 14701/43412 [1:28:16<1:36:58,  4.93it/s]

{'loss': 0.216, 'grad_norm': 2.3023154735565186, 'learning_rate': 1.3227678982769744e-05, 'epoch': 1.35}


 34%|███▍      | 14751/43412 [1:28:26<1:37:01,  4.92it/s]

{'loss': 0.1833, 'grad_norm': 7.344982147216797, 'learning_rate': 1.3204643877268959e-05, 'epoch': 1.36}


 34%|███▍      | 14801/43412 [1:28:36<1:37:05,  4.91it/s]

{'loss': 0.2054, 'grad_norm': 1.6736689805984497, 'learning_rate': 1.3181608771768176e-05, 'epoch': 1.36}


 34%|███▍      | 14851/43412 [1:28:46<1:37:08,  4.90it/s]

{'loss': 0.2053, 'grad_norm': 0.7639241814613342, 'learning_rate': 1.3158573666267393e-05, 'epoch': 1.37}


 34%|███▍      | 14901/43412 [1:28:56<1:37:04,  4.90it/s]

{'loss': 0.2118, 'grad_norm': 1.9577009677886963, 'learning_rate': 1.313553856076661e-05, 'epoch': 1.37}


 34%|███▍      | 14951/43412 [1:29:06<1:36:05,  4.94it/s]

{'loss': 0.2, 'grad_norm': 1.6679821014404297, 'learning_rate': 1.3112503455265827e-05, 'epoch': 1.38}


 35%|███▍      | 15001/43412 [1:29:16<1:35:39,  4.95it/s]

{'loss': 0.1936, 'grad_norm': 1.9182184934616089, 'learning_rate': 1.3089468349765042e-05, 'epoch': 1.38}


 35%|███▍      | 15051/43412 [1:29:27<1:36:28,  4.90it/s]

{'loss': 0.1905, 'grad_norm': 6.360689640045166, 'learning_rate': 1.306643324426426e-05, 'epoch': 1.39}


 35%|███▍      | 15101/43412 [1:29:37<1:36:23,  4.89it/s]

{'loss': 0.1997, 'grad_norm': 5.810698986053467, 'learning_rate': 1.3043398138763478e-05, 'epoch': 1.39}


 35%|███▍      | 15151/43412 [1:29:47<1:35:39,  4.92it/s]

{'loss': 0.1992, 'grad_norm': 4.614441871643066, 'learning_rate': 1.3020363033262693e-05, 'epoch': 1.4}


 35%|███▌      | 15201/43412 [1:29:57<1:36:21,  4.88it/s]

{'loss': 0.2012, 'grad_norm': 3.443448305130005, 'learning_rate': 1.299732792776191e-05, 'epoch': 1.4}


 35%|███▌      | 15251/43412 [1:30:07<1:35:29,  4.91it/s]

{'loss': 0.1991, 'grad_norm': 3.5140938758850098, 'learning_rate': 1.2974292822261127e-05, 'epoch': 1.41}


 35%|███▌      | 15301/43412 [1:30:17<1:35:28,  4.91it/s]

{'loss': 0.1883, 'grad_norm': 5.4078874588012695, 'learning_rate': 1.2951257716760344e-05, 'epoch': 1.41}


 35%|███▌      | 15351/43412 [1:30:28<1:35:30,  4.90it/s]

{'loss': 0.1841, 'grad_norm': 2.058065176010132, 'learning_rate': 1.292822261125956e-05, 'epoch': 1.41}


 35%|███▌      | 15401/43412 [1:30:38<1:35:45,  4.88it/s]

{'loss': 0.2281, 'grad_norm': 4.159677505493164, 'learning_rate': 1.2905187505758778e-05, 'epoch': 1.42}


 36%|███▌      | 15451/43412 [1:30:48<1:34:56,  4.91it/s]

{'loss': 0.2262, 'grad_norm': 2.118015766143799, 'learning_rate': 1.2882152400257995e-05, 'epoch': 1.42}


 36%|███▌      | 15501/43412 [1:30:58<1:34:50,  4.90it/s]

{'loss': 0.2002, 'grad_norm': 2.196775197982788, 'learning_rate': 1.285911729475721e-05, 'epoch': 1.43}


 36%|███▌      | 15551/43412 [1:31:08<1:33:54,  4.94it/s]

{'loss': 0.2014, 'grad_norm': 5.497758388519287, 'learning_rate': 1.2836082189256427e-05, 'epoch': 1.43}


 36%|███▌      | 15601/43412 [1:31:18<1:34:28,  4.91it/s]

{'loss': 0.177, 'grad_norm': 13.198683738708496, 'learning_rate': 1.2813047083755645e-05, 'epoch': 1.44}


 36%|███▌      | 15651/43412 [1:31:29<1:34:24,  4.90it/s]

{'loss': 0.1492, 'grad_norm': 1.9078304767608643, 'learning_rate': 1.279001197825486e-05, 'epoch': 1.44}


 36%|███▌      | 15701/43412 [1:31:39<1:34:21,  4.89it/s]

{'loss': 0.2392, 'grad_norm': 1.0981967449188232, 'learning_rate': 1.2766976872754078e-05, 'epoch': 1.45}


 36%|███▋      | 15751/43412 [1:31:49<1:34:18,  4.89it/s]

{'loss': 0.1817, 'grad_norm': 4.002441883087158, 'learning_rate': 1.2743941767253296e-05, 'epoch': 1.45}


 36%|███▋      | 15801/43412 [1:31:59<1:33:06,  4.94it/s]

{'loss': 0.2209, 'grad_norm': 2.3486709594726562, 'learning_rate': 1.2720906661752512e-05, 'epoch': 1.46}


 37%|███▋      | 15851/43412 [1:32:09<1:33:47,  4.90it/s]

{'loss': 0.1963, 'grad_norm': 2.6935479640960693, 'learning_rate': 1.2697871556251729e-05, 'epoch': 1.46}


 37%|███▋      | 15901/43412 [1:32:19<1:34:04,  4.87it/s]

{'loss': 0.1861, 'grad_norm': 0.15928590297698975, 'learning_rate': 1.2674836450750944e-05, 'epoch': 1.47}


 37%|███▋      | 15951/43412 [1:32:29<1:33:33,  4.89it/s]

{'loss': 0.2375, 'grad_norm': 3.796588182449341, 'learning_rate': 1.2651801345250163e-05, 'epoch': 1.47}


 37%|███▋      | 16001/43412 [1:32:40<1:32:30,  4.94it/s]

{'loss': 0.202, 'grad_norm': 3.0368032455444336, 'learning_rate': 1.262876623974938e-05, 'epoch': 1.47}


 37%|███▋      | 16051/43412 [1:32:50<1:33:07,  4.90it/s]

{'loss': 0.1624, 'grad_norm': 0.3953187167644501, 'learning_rate': 1.2605731134248595e-05, 'epoch': 1.48}


 37%|███▋      | 16101/43412 [1:33:00<1:32:56,  4.90it/s]

{'loss': 0.2059, 'grad_norm': 4.459767818450928, 'learning_rate': 1.2582696028747812e-05, 'epoch': 1.48}


 37%|███▋      | 16151/43412 [1:33:10<1:32:55,  4.89it/s]

{'loss': 0.1973, 'grad_norm': 5.633009910583496, 'learning_rate': 1.255966092324703e-05, 'epoch': 1.49}


 37%|███▋      | 16201/43412 [1:33:20<1:32:45,  4.89it/s]

{'loss': 0.2192, 'grad_norm': 2.2316861152648926, 'learning_rate': 1.2536625817746246e-05, 'epoch': 1.49}


 37%|███▋      | 16251/43412 [1:33:30<1:32:41,  4.88it/s]

{'loss': 0.1907, 'grad_norm': 3.017327070236206, 'learning_rate': 1.2513590712245463e-05, 'epoch': 1.5}


 38%|███▊      | 16301/43412 [1:33:41<1:32:04,  4.91it/s]

{'loss': 0.1915, 'grad_norm': 3.7867226600646973, 'learning_rate': 1.2490555606744681e-05, 'epoch': 1.5}


 38%|███▊      | 16351/43412 [1:33:51<1:31:39,  4.92it/s]

{'loss': 0.2197, 'grad_norm': 4.592007160186768, 'learning_rate': 1.2467520501243897e-05, 'epoch': 1.51}


 38%|███▊      | 16401/43412 [1:34:01<1:31:20,  4.93it/s]

{'loss': 0.2071, 'grad_norm': 1.2304083108901978, 'learning_rate': 1.2444485395743114e-05, 'epoch': 1.51}


 38%|███▊      | 16451/43412 [1:34:11<1:32:09,  4.88it/s]

{'loss': 0.2212, 'grad_norm': 2.5797183513641357, 'learning_rate': 1.2421450290242329e-05, 'epoch': 1.52}


 38%|███▊      | 16501/43412 [1:34:21<1:31:52,  4.88it/s]

{'loss': 0.1912, 'grad_norm': 0.9561740159988403, 'learning_rate': 1.2398415184741547e-05, 'epoch': 1.52}


 38%|███▊      | 16551/43412 [1:34:31<1:31:09,  4.91it/s]

{'loss': 0.2008, 'grad_norm': 1.75593101978302, 'learning_rate': 1.2375380079240764e-05, 'epoch': 1.52}


 38%|███▊      | 16601/43412 [1:34:41<1:30:20,  4.95it/s]

{'loss': 0.2091, 'grad_norm': 5.942599296569824, 'learning_rate': 1.235234497373998e-05, 'epoch': 1.53}


 38%|███▊      | 16651/43412 [1:34:52<1:30:52,  4.91it/s]

{'loss': 0.1844, 'grad_norm': 1.2825267314910889, 'learning_rate': 1.2329309868239197e-05, 'epoch': 1.53}


 38%|███▊      | 16701/43412 [1:35:02<1:30:48,  4.90it/s]

{'loss': 0.1952, 'grad_norm': 2.5324785709381104, 'learning_rate': 1.2306274762738415e-05, 'epoch': 1.54}


 39%|███▊      | 16751/43412 [1:35:12<1:30:45,  4.90it/s]

{'loss': 0.1998, 'grad_norm': 6.307563781738281, 'learning_rate': 1.228323965723763e-05, 'epoch': 1.54}


 39%|███▊      | 16801/43412 [1:35:22<1:30:30,  4.90it/s]

{'loss': 0.1842, 'grad_norm': 8.318245887756348, 'learning_rate': 1.2260204551736848e-05, 'epoch': 1.55}


 39%|███▉      | 16851/43412 [1:35:32<1:30:06,  4.91it/s]

{'loss': 0.1965, 'grad_norm': 4.31054162979126, 'learning_rate': 1.2237169446236066e-05, 'epoch': 1.55}


 39%|███▉      | 16901/43412 [1:35:42<1:30:07,  4.90it/s]

{'loss': 0.1992, 'grad_norm': 0.7065297365188599, 'learning_rate': 1.2214134340735281e-05, 'epoch': 1.56}


 39%|███▉      | 16951/43412 [1:35:53<1:30:03,  4.90it/s]

{'loss': 0.2105, 'grad_norm': 4.262235641479492, 'learning_rate': 1.2191099235234498e-05, 'epoch': 1.56}


 39%|███▉      | 17001/43412 [1:36:03<1:29:02,  4.94it/s]

{'loss': 0.2193, 'grad_norm': 4.3501482009887695, 'learning_rate': 1.2168064129733714e-05, 'epoch': 1.57}


 39%|███▉      | 17051/43412 [1:36:13<1:29:08,  4.93it/s]

{'loss': 0.1612, 'grad_norm': 2.743947982788086, 'learning_rate': 1.2145029024232932e-05, 'epoch': 1.57}


 39%|███▉      | 17101/43412 [1:36:23<1:29:40,  4.89it/s]

{'loss': 0.1981, 'grad_norm': 2.2285916805267334, 'learning_rate': 1.212199391873215e-05, 'epoch': 1.58}


 40%|███▉      | 17151/43412 [1:36:33<1:29:33,  4.89it/s]

{'loss': 0.1891, 'grad_norm': 4.336328029632568, 'learning_rate': 1.2098958813231365e-05, 'epoch': 1.58}


 40%|███▉      | 17201/43412 [1:36:43<1:29:10,  4.90it/s]

{'loss': 0.1847, 'grad_norm': 5.953268051147461, 'learning_rate': 1.2075923707730582e-05, 'epoch': 1.58}


 40%|███▉      | 17251/43412 [1:36:54<1:28:47,  4.91it/s]

{'loss': 0.1883, 'grad_norm': 1.1962968111038208, 'learning_rate': 1.20528886022298e-05, 'epoch': 1.59}


 40%|███▉      | 17301/43412 [1:37:04<1:28:49,  4.90it/s]

{'loss': 0.1835, 'grad_norm': 4.958972454071045, 'learning_rate': 1.2029853496729016e-05, 'epoch': 1.59}


 40%|███▉      | 17351/43412 [1:37:14<1:28:45,  4.89it/s]

{'loss': 0.205, 'grad_norm': 3.371800184249878, 'learning_rate': 1.2006818391228232e-05, 'epoch': 1.6}


 40%|████      | 17401/43412 [1:37:24<1:28:21,  4.91it/s]

{'loss': 0.1792, 'grad_norm': 4.855918884277344, 'learning_rate': 1.1983783285727451e-05, 'epoch': 1.6}


 40%|████      | 17451/43412 [1:37:34<1:28:00,  4.92it/s]

{'loss': 0.2118, 'grad_norm': 3.74322772026062, 'learning_rate': 1.1960748180226666e-05, 'epoch': 1.61}


 40%|████      | 17501/43412 [1:37:44<1:28:01,  4.91it/s]

{'loss': 0.1821, 'grad_norm': 3.5530569553375244, 'learning_rate': 1.1937713074725883e-05, 'epoch': 1.61}


 40%|████      | 17551/43412 [1:37:54<1:27:21,  4.93it/s]

{'loss': 0.1952, 'grad_norm': 2.2140793800354004, 'learning_rate': 1.1914677969225099e-05, 'epoch': 1.62}


 41%|████      | 17601/43412 [1:38:05<1:27:59,  4.89it/s]

{'loss': 0.1882, 'grad_norm': 4.74271297454834, 'learning_rate': 1.1891642863724317e-05, 'epoch': 1.62}


 41%|████      | 17651/43412 [1:38:15<1:27:39,  4.90it/s]

{'loss': 0.2127, 'grad_norm': 3.86942982673645, 'learning_rate': 1.1868607758223534e-05, 'epoch': 1.63}


 41%|████      | 17701/43412 [1:38:25<1:27:21,  4.90it/s]

{'loss': 0.1976, 'grad_norm': 4.719777584075928, 'learning_rate': 1.184557265272275e-05, 'epoch': 1.63}


 41%|████      | 17751/43412 [1:38:35<1:27:22,  4.89it/s]

{'loss': 0.1938, 'grad_norm': 4.7506890296936035, 'learning_rate': 1.1822537547221968e-05, 'epoch': 1.64}


 41%|████      | 17801/43412 [1:38:45<1:26:50,  4.91it/s]

{'loss': 0.1649, 'grad_norm': 0.9909067749977112, 'learning_rate': 1.1799502441721185e-05, 'epoch': 1.64}


 41%|████      | 17851/43412 [1:38:55<1:26:50,  4.91it/s]

{'loss': 0.2275, 'grad_norm': 2.299159526824951, 'learning_rate': 1.17764673362204e-05, 'epoch': 1.64}


 41%|████      | 17901/43412 [1:39:06<1:26:53,  4.89it/s]

{'loss': 0.188, 'grad_norm': 2.751593828201294, 'learning_rate': 1.1753432230719617e-05, 'epoch': 1.65}


 41%|████▏     | 17951/43412 [1:39:16<1:26:38,  4.90it/s]

{'loss': 0.1809, 'grad_norm': 2.2278218269348145, 'learning_rate': 1.1730397125218836e-05, 'epoch': 1.65}


 41%|████▏     | 18001/43412 [1:39:26<1:26:21,  4.90it/s]

{'loss': 0.2261, 'grad_norm': 1.7470465898513794, 'learning_rate': 1.1707362019718051e-05, 'epoch': 1.66}


 42%|████▏     | 18051/43412 [1:39:36<1:26:23,  4.89it/s]

{'loss': 0.1829, 'grad_norm': 3.0897369384765625, 'learning_rate': 1.1684326914217268e-05, 'epoch': 1.66}


 42%|████▏     | 18101/43412 [1:39:46<1:25:59,  4.91it/s]

{'loss': 0.1614, 'grad_norm': 1.8788303136825562, 'learning_rate': 1.1661291808716484e-05, 'epoch': 1.67}


 42%|████▏     | 18151/43412 [1:39:56<1:26:15,  4.88it/s]

{'loss': 0.2158, 'grad_norm': 1.7932546138763428, 'learning_rate': 1.1638256703215702e-05, 'epoch': 1.67}


 42%|████▏     | 18201/43412 [1:40:07<1:25:18,  4.93it/s]

{'loss': 0.1687, 'grad_norm': 3.6314520835876465, 'learning_rate': 1.161522159771492e-05, 'epoch': 1.68}


 42%|████▏     | 18251/43412 [1:40:17<1:25:38,  4.90it/s]

{'loss': 0.1719, 'grad_norm': 2.7691757678985596, 'learning_rate': 1.1592186492214134e-05, 'epoch': 1.68}


 42%|████▏     | 18301/43412 [1:40:27<1:24:52,  4.93it/s]

{'loss': 0.2111, 'grad_norm': 2.539226531982422, 'learning_rate': 1.1569151386713353e-05, 'epoch': 1.69}


 42%|████▏     | 18351/43412 [1:40:37<1:25:07,  4.91it/s]

{'loss': 0.1703, 'grad_norm': 0.816163182258606, 'learning_rate': 1.154611628121257e-05, 'epoch': 1.69}


 42%|████▏     | 18401/43412 [1:40:47<1:24:58,  4.91it/s]

{'loss': 0.1972, 'grad_norm': 4.272106170654297, 'learning_rate': 1.1523081175711785e-05, 'epoch': 1.7}


 43%|████▎     | 18451/43412 [1:40:57<1:24:50,  4.90it/s]

{'loss': 0.1859, 'grad_norm': 2.6597161293029785, 'learning_rate': 1.1500046070211002e-05, 'epoch': 1.7}


 43%|████▎     | 18501/43412 [1:41:08<1:24:58,  4.89it/s]

{'loss': 0.1919, 'grad_norm': 2.3859188556671143, 'learning_rate': 1.1477010964710221e-05, 'epoch': 1.7}


 43%|████▎     | 18551/43412 [1:41:18<1:24:00,  4.93it/s]

{'loss': 0.1973, 'grad_norm': 3.7283759117126465, 'learning_rate': 1.1453975859209436e-05, 'epoch': 1.71}


 43%|████▎     | 18601/43412 [1:41:28<1:24:35,  4.89it/s]

{'loss': 0.2082, 'grad_norm': 2.57909893989563, 'learning_rate': 1.1430940753708653e-05, 'epoch': 1.71}


 43%|████▎     | 18651/43412 [1:41:38<1:24:13,  4.90it/s]

{'loss': 0.178, 'grad_norm': 6.745700359344482, 'learning_rate': 1.1407905648207868e-05, 'epoch': 1.72}


 43%|████▎     | 18701/43412 [1:41:48<1:24:25,  4.88it/s]

{'loss': 0.1869, 'grad_norm': 3.424677610397339, 'learning_rate': 1.1384870542707087e-05, 'epoch': 1.72}


 43%|████▎     | 18751/43412 [1:41:58<1:23:32,  4.92it/s]

{'loss': 0.1799, 'grad_norm': 5.8753252029418945, 'learning_rate': 1.1361835437206304e-05, 'epoch': 1.73}


 43%|████▎     | 18801/43412 [1:42:08<1:23:36,  4.91it/s]

{'loss': 0.1735, 'grad_norm': 2.1563966274261475, 'learning_rate': 1.133880033170552e-05, 'epoch': 1.73}


 43%|████▎     | 18851/43412 [1:42:19<1:23:27,  4.90it/s]

{'loss': 0.2003, 'grad_norm': 3.244096040725708, 'learning_rate': 1.1315765226204738e-05, 'epoch': 1.74}


 44%|████▎     | 18901/43412 [1:42:29<1:23:28,  4.89it/s]

{'loss': 0.175, 'grad_norm': 2.5641045570373535, 'learning_rate': 1.1292730120703953e-05, 'epoch': 1.74}


 44%|████▎     | 18951/43412 [1:42:39<1:23:07,  4.90it/s]

{'loss': 0.1921, 'grad_norm': 4.738533020019531, 'learning_rate': 1.126969501520317e-05, 'epoch': 1.75}


 44%|████▍     | 19001/43412 [1:42:49<1:22:53,  4.91it/s]

{'loss': 0.191, 'grad_norm': 4.256807327270508, 'learning_rate': 1.1246659909702387e-05, 'epoch': 1.75}


 44%|████▍     | 19051/43412 [1:42:59<1:22:56,  4.90it/s]

{'loss': 0.2035, 'grad_norm': 3.9792275428771973, 'learning_rate': 1.1223624804201604e-05, 'epoch': 1.76}


 44%|████▍     | 19101/43412 [1:43:09<1:22:58,  4.88it/s]

{'loss': 0.2023, 'grad_norm': 1.4060121774673462, 'learning_rate': 1.1200589698700821e-05, 'epoch': 1.76}


 44%|████▍     | 19151/43412 [1:43:20<1:22:13,  4.92it/s]

{'loss': 0.2244, 'grad_norm': 8.184252738952637, 'learning_rate': 1.1177554593200038e-05, 'epoch': 1.76}


 44%|████▍     | 19201/43412 [1:43:30<1:22:06,  4.91it/s]

{'loss': 0.156, 'grad_norm': 1.6384447813034058, 'learning_rate': 1.1154519487699253e-05, 'epoch': 1.77}


 44%|████▍     | 19251/43412 [1:43:40<1:22:09,  4.90it/s]

{'loss': 0.1998, 'grad_norm': 4.375560760498047, 'learning_rate': 1.1131484382198472e-05, 'epoch': 1.77}


 44%|████▍     | 19301/43412 [1:43:50<1:22:01,  4.90it/s]

{'loss': 0.2058, 'grad_norm': 2.372298240661621, 'learning_rate': 1.1108449276697687e-05, 'epoch': 1.78}


 45%|████▍     | 19351/43412 [1:44:00<1:21:53,  4.90it/s]

{'loss': 0.172, 'grad_norm': 3.6749985218048096, 'learning_rate': 1.1085414171196904e-05, 'epoch': 1.78}


 45%|████▍     | 19401/43412 [1:44:10<1:20:50,  4.95it/s]

{'loss': 0.1801, 'grad_norm': 0.5627486109733582, 'learning_rate': 1.1062379065696123e-05, 'epoch': 1.79}


 45%|████▍     | 19451/43412 [1:44:20<1:21:27,  4.90it/s]

{'loss': 0.2022, 'grad_norm': 7.352284908294678, 'learning_rate': 1.1039343960195338e-05, 'epoch': 1.79}


 45%|████▍     | 19501/43412 [1:44:31<1:21:10,  4.91it/s]

{'loss': 0.1863, 'grad_norm': 0.5762551426887512, 'learning_rate': 1.1016308854694555e-05, 'epoch': 1.8}


 45%|████▌     | 19551/43412 [1:44:41<1:20:51,  4.92it/s]

{'loss': 0.171, 'grad_norm': 6.317849636077881, 'learning_rate': 1.0993273749193772e-05, 'epoch': 1.8}


 45%|████▌     | 19601/43412 [1:44:51<1:20:54,  4.90it/s]

{'loss': 0.1668, 'grad_norm': 4.081766605377197, 'learning_rate': 1.0970238643692989e-05, 'epoch': 1.81}


 45%|████▌     | 19651/43412 [1:45:01<1:20:45,  4.90it/s]

{'loss': 0.1532, 'grad_norm': 0.39270398020744324, 'learning_rate': 1.0947203538192206e-05, 'epoch': 1.81}


 45%|████▌     | 19701/43412 [1:45:11<1:20:39,  4.90it/s]

{'loss': 0.1676, 'grad_norm': 3.413625478744507, 'learning_rate': 1.0924168432691421e-05, 'epoch': 1.82}


 45%|████▌     | 19751/43412 [1:45:21<1:20:24,  4.90it/s]

{'loss': 0.207, 'grad_norm': 4.671474933624268, 'learning_rate': 1.0901133327190638e-05, 'epoch': 1.82}


 46%|████▌     | 19801/43412 [1:45:31<1:20:20,  4.90it/s]

{'loss': 0.207, 'grad_norm': 3.7340476512908936, 'learning_rate': 1.0878098221689857e-05, 'epoch': 1.82}


 46%|████▌     | 19851/43412 [1:45:42<1:20:38,  4.87it/s]

{'loss': 0.1766, 'grad_norm': 2.9949047565460205, 'learning_rate': 1.0855063116189072e-05, 'epoch': 1.83}


 46%|████▌     | 19901/43412 [1:45:52<1:19:40,  4.92it/s]

{'loss': 0.1852, 'grad_norm': 8.371624946594238, 'learning_rate': 1.083202801068829e-05, 'epoch': 1.83}


 46%|████▌     | 19951/43412 [1:46:02<1:19:26,  4.92it/s]

{'loss': 0.1767, 'grad_norm': 2.7851977348327637, 'learning_rate': 1.0808992905187508e-05, 'epoch': 1.84}


 46%|████▌     | 20001/43412 [1:46:12<1:19:16,  4.92it/s]

{'loss': 0.2144, 'grad_norm': 6.1466546058654785, 'learning_rate': 1.0785957799686723e-05, 'epoch': 1.84}


 46%|████▌     | 20051/43412 [1:46:22<1:19:47,  4.88it/s]

{'loss': 0.2029, 'grad_norm': 3.498090982437134, 'learning_rate': 1.076292269418594e-05, 'epoch': 1.85}


 46%|████▋     | 20101/43412 [1:46:32<1:19:01,  4.92it/s]

{'loss': 0.1905, 'grad_norm': 3.157963275909424, 'learning_rate': 1.0739887588685155e-05, 'epoch': 1.85}


 46%|████▋     | 20151/43412 [1:46:43<1:19:05,  4.90it/s]

{'loss': 0.2079, 'grad_norm': 2.1001641750335693, 'learning_rate': 1.0716852483184374e-05, 'epoch': 1.86}


 47%|████▋     | 20201/43412 [1:46:53<1:18:55,  4.90it/s]

{'loss': 0.1774, 'grad_norm': 2.5282604694366455, 'learning_rate': 1.0693817377683591e-05, 'epoch': 1.86}


 47%|████▋     | 20251/43412 [1:47:03<1:18:24,  4.92it/s]

{'loss': 0.1768, 'grad_norm': 5.158560752868652, 'learning_rate': 1.0670782272182806e-05, 'epoch': 1.87}


 47%|████▋     | 20301/43412 [1:47:13<1:18:58,  4.88it/s]

{'loss': 0.1963, 'grad_norm': 3.3381872177124023, 'learning_rate': 1.0647747166682025e-05, 'epoch': 1.87}


 47%|████▋     | 20351/43412 [1:47:23<1:18:35,  4.89it/s]

{'loss': 0.2218, 'grad_norm': 1.960223913192749, 'learning_rate': 1.0624712061181242e-05, 'epoch': 1.88}


 47%|████▋     | 20401/43412 [1:47:33<1:18:19,  4.90it/s]

{'loss': 0.2158, 'grad_norm': 3.2612664699554443, 'learning_rate': 1.0601676955680457e-05, 'epoch': 1.88}


 47%|████▋     | 20451/43412 [1:47:44<1:18:20,  4.89it/s]

{'loss': 0.2229, 'grad_norm': 4.747956275939941, 'learning_rate': 1.0578641850179674e-05, 'epoch': 1.88}


 47%|████▋     | 20501/43412 [1:47:54<1:17:52,  4.90it/s]

{'loss': 0.1876, 'grad_norm': 1.080031156539917, 'learning_rate': 1.0555606744678893e-05, 'epoch': 1.89}


 47%|████▋     | 20551/43412 [1:48:04<1:17:41,  4.90it/s]

{'loss': 0.1842, 'grad_norm': 3.3065576553344727, 'learning_rate': 1.0532571639178108e-05, 'epoch': 1.89}


 47%|████▋     | 20601/43412 [1:48:14<1:17:34,  4.90it/s]

{'loss': 0.1638, 'grad_norm': 5.468786716461182, 'learning_rate': 1.0509536533677325e-05, 'epoch': 1.9}


 48%|████▊     | 20651/43412 [1:48:24<1:17:33,  4.89it/s]

{'loss': 0.184, 'grad_norm': 2.9604666233062744, 'learning_rate': 1.048650142817654e-05, 'epoch': 1.9}


 48%|████▊     | 20701/43412 [1:48:34<1:17:36,  4.88it/s]

{'loss': 0.1872, 'grad_norm': 3.497537136077881, 'learning_rate': 1.0463466322675759e-05, 'epoch': 1.91}


 48%|████▊     | 20751/43412 [1:48:45<1:17:12,  4.89it/s]

{'loss': 0.2219, 'grad_norm': 3.5372045040130615, 'learning_rate': 1.0440431217174976e-05, 'epoch': 1.91}


 48%|████▊     | 20801/43412 [1:48:55<1:17:14,  4.88it/s]

{'loss': 0.1779, 'grad_norm': 3.975267171859741, 'learning_rate': 1.0417396111674191e-05, 'epoch': 1.92}


 48%|████▊     | 20851/43412 [1:49:05<1:16:44,  4.90it/s]

{'loss': 0.1886, 'grad_norm': 1.8448467254638672, 'learning_rate': 1.039436100617341e-05, 'epoch': 1.92}


 48%|████▊     | 20901/43412 [1:49:15<1:16:34,  4.90it/s]

{'loss': 0.2018, 'grad_norm': 7.144069194793701, 'learning_rate': 1.0371325900672627e-05, 'epoch': 1.93}


 48%|████▊     | 20951/43412 [1:49:25<1:16:21,  4.90it/s]

{'loss': 0.1801, 'grad_norm': 3.510725736618042, 'learning_rate': 1.0348290795171842e-05, 'epoch': 1.93}


 48%|████▊     | 21001/43412 [1:49:35<1:16:07,  4.91it/s]

{'loss': 0.1813, 'grad_norm': 0.15736490488052368, 'learning_rate': 1.0325255689671059e-05, 'epoch': 1.93}


 48%|████▊     | 21051/43412 [1:49:45<1:16:05,  4.90it/s]

{'loss': 0.1912, 'grad_norm': 2.6708736419677734, 'learning_rate': 1.0302220584170278e-05, 'epoch': 1.94}


 49%|████▊     | 21101/43412 [1:49:56<1:15:43,  4.91it/s]

{'loss': 0.2095, 'grad_norm': 5.063161373138428, 'learning_rate': 1.0279185478669493e-05, 'epoch': 1.94}


 49%|████▊     | 21151/43412 [1:50:06<1:15:36,  4.91it/s]

{'loss': 0.1547, 'grad_norm': 2.7088212966918945, 'learning_rate': 1.025615037316871e-05, 'epoch': 1.95}


 49%|████▉     | 21201/43412 [1:50:16<1:15:41,  4.89it/s]

{'loss': 0.1883, 'grad_norm': 2.9208149909973145, 'learning_rate': 1.0233115267667925e-05, 'epoch': 1.95}


 49%|████▉     | 21251/43412 [1:50:26<1:14:54,  4.93it/s]

{'loss': 0.1932, 'grad_norm': 1.7979531288146973, 'learning_rate': 1.0210080162167144e-05, 'epoch': 1.96}


 49%|████▉     | 21301/43412 [1:50:36<1:15:01,  4.91it/s]

{'loss': 0.1647, 'grad_norm': 1.723827838897705, 'learning_rate': 1.018704505666636e-05, 'epoch': 1.96}


 49%|████▉     | 21351/43412 [1:50:46<1:15:14,  4.89it/s]

{'loss': 0.2081, 'grad_norm': 1.2868729829788208, 'learning_rate': 1.0164009951165576e-05, 'epoch': 1.97}


 49%|████▉     | 21401/43412 [1:50:57<1:14:50,  4.90it/s]

{'loss': 0.1921, 'grad_norm': 2.9331822395324707, 'learning_rate': 1.0140974845664795e-05, 'epoch': 1.97}


 49%|████▉     | 21451/43412 [1:51:07<1:14:38,  4.90it/s]

{'loss': 0.2096, 'grad_norm': 1.8382854461669922, 'learning_rate': 1.0117939740164012e-05, 'epoch': 1.98}


 50%|████▉     | 21501/43412 [1:51:17<1:14:38,  4.89it/s]

{'loss': 0.1708, 'grad_norm': 3.750246524810791, 'learning_rate': 1.0094904634663227e-05, 'epoch': 1.98}


 50%|████▉     | 21551/43412 [1:51:27<1:14:32,  4.89it/s]

{'loss': 0.2127, 'grad_norm': 3.5371692180633545, 'learning_rate': 1.0071869529162444e-05, 'epoch': 1.99}


 50%|████▉     | 21601/43412 [1:51:37<1:13:39,  4.93it/s]

{'loss': 0.1919, 'grad_norm': 3.73898983001709, 'learning_rate': 1.0048834423661663e-05, 'epoch': 1.99}


 50%|████▉     | 21651/43412 [1:51:47<1:13:55,  4.91it/s]

{'loss': 0.1732, 'grad_norm': 7.290979385375977, 'learning_rate': 1.0025799318160878e-05, 'epoch': 1.99}


 50%|████▉     | 21701/43412 [1:51:57<1:13:56,  4.89it/s]

{'loss': 0.1982, 'grad_norm': 5.035738945007324, 'learning_rate': 1.0002764212660095e-05, 'epoch': 2.0}


                                                         
 50%|█████     | 21706/43412 [1:53:28<1:07:38,  5.35it/s]

{'eval_loss': 0.23143357038497925, 'eval_f1_macro': 0.5979381825978577, 'eval_precision_macro': 0.6696825950206956, 'eval_recall_macro': 0.5450548875644221, 'eval_accuracy': 0.6135274603759676, 'eval_runtime': 89.2, 'eval_samples_per_second': 60.83, 'eval_steps_per_second': 7.612, 'epoch': 2.0}


 50%|█████     | 21751/43412 [1:53:38<1:14:09,  4.87it/s]  

{'loss': 0.1574, 'grad_norm': 1.6832317113876343, 'learning_rate': 9.979729107159312e-06, 'epoch': 2.0}


 50%|█████     | 21801/43412 [1:53:49<1:13:36,  4.89it/s]

{'loss': 0.1216, 'grad_norm': 0.6656749844551086, 'learning_rate': 9.956694001658529e-06, 'epoch': 2.01}


 50%|█████     | 21851/43412 [1:53:59<1:13:03,  4.92it/s]

{'loss': 0.1134, 'grad_norm': 0.6962255239486694, 'learning_rate': 9.933658896157746e-06, 'epoch': 2.01}


 50%|█████     | 21901/43412 [1:54:09<1:13:14,  4.89it/s]

{'loss': 0.1427, 'grad_norm': 2.548092842102051, 'learning_rate': 9.910623790656963e-06, 'epoch': 2.02}


 51%|█████     | 21951/43412 [1:54:19<1:12:14,  4.95it/s]

{'loss': 0.1631, 'grad_norm': 4.119977951049805, 'learning_rate': 9.887588685156178e-06, 'epoch': 2.02}


 51%|█████     | 22001/43412 [1:54:29<1:12:46,  4.90it/s]

{'loss': 0.1345, 'grad_norm': 1.1083934307098389, 'learning_rate': 9.864553579655397e-06, 'epoch': 2.03}


 51%|█████     | 22051/43412 [1:54:39<1:12:33,  4.91it/s]

{'loss': 0.1384, 'grad_norm': 1.059301495552063, 'learning_rate': 9.841518474154612e-06, 'epoch': 2.03}


 51%|█████     | 22101/43412 [1:54:50<1:12:36,  4.89it/s]

{'loss': 0.1359, 'grad_norm': 2.75880765914917, 'learning_rate': 9.818483368653829e-06, 'epoch': 2.04}


 51%|█████     | 22151/43412 [1:55:00<1:12:32,  4.88it/s]

{'loss': 0.1165, 'grad_norm': 4.633487224578857, 'learning_rate': 9.795448263153046e-06, 'epoch': 2.04}


 51%|█████     | 22201/43412 [1:55:10<1:11:53,  4.92it/s]

{'loss': 0.1622, 'grad_norm': 2.505427122116089, 'learning_rate': 9.772413157652263e-06, 'epoch': 2.05}


 51%|█████▏    | 22251/43412 [1:55:20<1:11:52,  4.91it/s]

{'loss': 0.1491, 'grad_norm': 0.12959708273410797, 'learning_rate': 9.74937805215148e-06, 'epoch': 2.05}


 51%|█████▏    | 22301/43412 [1:55:30<1:11:50,  4.90it/s]

{'loss': 0.129, 'grad_norm': 11.928836822509766, 'learning_rate': 9.726342946650697e-06, 'epoch': 2.05}


 51%|█████▏    | 22351/43412 [1:55:40<1:11:57,  4.88it/s]

{'loss': 0.138, 'grad_norm': 3.137711763381958, 'learning_rate': 9.703307841149914e-06, 'epoch': 2.06}


 52%|█████▏    | 22401/43412 [1:55:50<1:11:07,  4.92it/s]

{'loss': 0.168, 'grad_norm': 1.3350862264633179, 'learning_rate': 9.68027273564913e-06, 'epoch': 2.06}


 52%|█████▏    | 22451/43412 [1:56:01<1:11:16,  4.90it/s]

{'loss': 0.1193, 'grad_norm': 1.3244447708129883, 'learning_rate': 9.657237630148348e-06, 'epoch': 2.07}


 52%|█████▏    | 22501/43412 [1:56:11<1:10:47,  4.92it/s]

{'loss': 0.1227, 'grad_norm': 0.142120823264122, 'learning_rate': 9.634202524647563e-06, 'epoch': 2.07}


 52%|█████▏    | 22551/43412 [1:56:21<1:11:08,  4.89it/s]

{'loss': 0.1272, 'grad_norm': 4.968024253845215, 'learning_rate': 9.611167419146782e-06, 'epoch': 2.08}


 52%|█████▏    | 22601/43412 [1:56:31<1:10:45,  4.90it/s]

{'loss': 0.1195, 'grad_norm': 1.4538490772247314, 'learning_rate': 9.588132313645997e-06, 'epoch': 2.08}


 52%|█████▏    | 22651/43412 [1:56:41<1:10:06,  4.94it/s]

{'loss': 0.1215, 'grad_norm': 6.874516487121582, 'learning_rate': 9.565097208145214e-06, 'epoch': 2.09}


 52%|█████▏    | 22701/43412 [1:56:51<1:10:37,  4.89it/s]

{'loss': 0.1834, 'grad_norm': 5.485065937042236, 'learning_rate': 9.54206210264443e-06, 'epoch': 2.09}


 52%|█████▏    | 22751/43412 [1:57:02<1:09:50,  4.93it/s]

{'loss': 0.1563, 'grad_norm': 0.418700635433197, 'learning_rate': 9.519026997143648e-06, 'epoch': 2.1}


 53%|█████▎    | 22801/43412 [1:57:12<1:10:23,  4.88it/s]

{'loss': 0.1344, 'grad_norm': 4.725342273712158, 'learning_rate': 9.495991891642865e-06, 'epoch': 2.1}


 53%|█████▎    | 22851/43412 [1:57:22<1:10:00,  4.89it/s]

{'loss': 0.1382, 'grad_norm': 7.36399507522583, 'learning_rate': 9.472956786142082e-06, 'epoch': 2.11}


 53%|█████▎    | 22901/43412 [1:57:32<1:09:55,  4.89it/s]

{'loss': 0.1711, 'grad_norm': 5.195883750915527, 'learning_rate': 9.449921680641299e-06, 'epoch': 2.11}


 53%|█████▎    | 22951/43412 [1:57:42<1:09:46,  4.89it/s]

{'loss': 0.1523, 'grad_norm': 2.1475133895874023, 'learning_rate': 9.426886575140516e-06, 'epoch': 2.11}


 53%|█████▎    | 23001/43412 [1:57:52<1:09:16,  4.91it/s]

{'loss': 0.1473, 'grad_norm': 1.668877124786377, 'learning_rate': 9.403851469639733e-06, 'epoch': 2.12}


 53%|█████▎    | 23051/43412 [1:58:02<1:09:07,  4.91it/s]

{'loss': 0.1277, 'grad_norm': 10.31229305267334, 'learning_rate': 9.380816364138948e-06, 'epoch': 2.12}


 53%|█████▎    | 23101/43412 [1:58:13<1:09:02,  4.90it/s]

{'loss': 0.1631, 'grad_norm': 5.553696155548096, 'learning_rate': 9.357781258638165e-06, 'epoch': 2.13}


 53%|█████▎    | 23151/43412 [1:58:23<1:08:51,  4.90it/s]

{'loss': 0.1144, 'grad_norm': 0.510472297668457, 'learning_rate': 9.334746153137382e-06, 'epoch': 2.13}


 53%|█████▎    | 23201/43412 [1:58:33<1:08:39,  4.91it/s]

{'loss': 0.1358, 'grad_norm': 6.42477560043335, 'learning_rate': 9.311711047636599e-06, 'epoch': 2.14}


 54%|█████▎    | 23251/43412 [1:58:43<1:08:36,  4.90it/s]

{'loss': 0.1379, 'grad_norm': 14.195420265197754, 'learning_rate': 9.288675942135816e-06, 'epoch': 2.14}


 54%|█████▎    | 23301/43412 [1:58:53<1:08:19,  4.91it/s]

{'loss': 0.1167, 'grad_norm': 3.451159954071045, 'learning_rate': 9.265640836635033e-06, 'epoch': 2.15}


 54%|█████▍    | 23351/43412 [1:59:03<1:07:54,  4.92it/s]

{'loss': 0.1118, 'grad_norm': 10.550317764282227, 'learning_rate': 9.24260573113425e-06, 'epoch': 2.15}


 54%|█████▍    | 23401/43412 [1:59:14<1:07:42,  4.93it/s]

{'loss': 0.1331, 'grad_norm': 3.4085073471069336, 'learning_rate': 9.219570625633467e-06, 'epoch': 2.16}


 54%|█████▍    | 23451/43412 [1:59:24<1:07:36,  4.92it/s]

{'loss': 0.1394, 'grad_norm': 4.081443786621094, 'learning_rate': 9.196535520132684e-06, 'epoch': 2.16}


 54%|█████▍    | 23501/43412 [1:59:34<1:07:42,  4.90it/s]

{'loss': 0.1459, 'grad_norm': 0.44442424178123474, 'learning_rate': 9.173500414631899e-06, 'epoch': 2.17}


 54%|█████▍    | 23551/43412 [1:59:44<1:08:00,  4.87it/s]

{'loss': 0.1675, 'grad_norm': 1.056051254272461, 'learning_rate': 9.150465309131117e-06, 'epoch': 2.17}


 54%|█████▍    | 23601/43412 [1:59:54<1:07:26,  4.90it/s]

{'loss': 0.14, 'grad_norm': 9.33300495147705, 'learning_rate': 9.127430203630333e-06, 'epoch': 2.17}


 54%|█████▍    | 23651/43412 [2:00:04<1:06:38,  4.94it/s]

{'loss': 0.1459, 'grad_norm': 0.9269686341285706, 'learning_rate': 9.10439509812955e-06, 'epoch': 2.18}


 55%|█████▍    | 23701/43412 [2:00:14<1:07:06,  4.90it/s]

{'loss': 0.1663, 'grad_norm': 1.0004525184631348, 'learning_rate': 9.081359992628767e-06, 'epoch': 2.18}


 55%|█████▍    | 23751/43412 [2:00:25<1:06:35,  4.92it/s]

{'loss': 0.129, 'grad_norm': 0.1518564373254776, 'learning_rate': 9.058324887127984e-06, 'epoch': 2.19}


 55%|█████▍    | 23801/43412 [2:00:35<1:06:42,  4.90it/s]

{'loss': 0.1353, 'grad_norm': 4.550553321838379, 'learning_rate': 9.0352897816272e-06, 'epoch': 2.19}


 55%|█████▍    | 23851/43412 [2:00:45<1:06:38,  4.89it/s]

{'loss': 0.1406, 'grad_norm': 2.6692862510681152, 'learning_rate': 9.012254676126418e-06, 'epoch': 2.2}


 55%|█████▌    | 23901/43412 [2:00:55<1:06:20,  4.90it/s]

{'loss': 0.128, 'grad_norm': 4.689181327819824, 'learning_rate': 8.989219570625635e-06, 'epoch': 2.2}


 55%|█████▌    | 23951/43412 [2:01:05<1:05:51,  4.93it/s]

{'loss': 0.1625, 'grad_norm': 4.962934970855713, 'learning_rate': 8.966184465124851e-06, 'epoch': 2.21}


 55%|█████▌    | 24001/43412 [2:01:15<1:06:08,  4.89it/s]

{'loss': 0.1378, 'grad_norm': 9.42220401763916, 'learning_rate': 8.943149359624068e-06, 'epoch': 2.21}


 55%|█████▌    | 24051/43412 [2:01:25<1:06:03,  4.88it/s]

{'loss': 0.1402, 'grad_norm': 2.6576106548309326, 'learning_rate': 8.920114254123284e-06, 'epoch': 2.22}


 56%|█████▌    | 24101/43412 [2:01:36<1:05:40,  4.90it/s]

{'loss': 0.1532, 'grad_norm': 1.9636731147766113, 'learning_rate': 8.897079148622502e-06, 'epoch': 2.22}


 56%|█████▌    | 24151/43412 [2:01:46<1:05:18,  4.92it/s]

{'loss': 0.1351, 'grad_norm': 1.2257736921310425, 'learning_rate': 8.874044043121718e-06, 'epoch': 2.23}


 56%|█████▌    | 24201/43412 [2:01:56<1:04:53,  4.93it/s]

{'loss': 0.1208, 'grad_norm': 4.763906002044678, 'learning_rate': 8.851008937620935e-06, 'epoch': 2.23}


 56%|█████▌    | 24251/43412 [2:02:06<1:05:08,  4.90it/s]

{'loss': 0.1467, 'grad_norm': 3.7722675800323486, 'learning_rate': 8.827973832120152e-06, 'epoch': 2.23}


 56%|█████▌    | 24301/43412 [2:02:16<1:04:56,  4.90it/s]

{'loss': 0.136, 'grad_norm': 3.1529290676116943, 'learning_rate': 8.804938726619369e-06, 'epoch': 2.24}


 56%|█████▌    | 24351/43412 [2:02:26<1:04:48,  4.90it/s]

{'loss': 0.1458, 'grad_norm': 6.565517425537109, 'learning_rate': 8.781903621118586e-06, 'epoch': 2.24}


 56%|█████▌    | 24401/43412 [2:02:37<1:04:42,  4.90it/s]

{'loss': 0.1509, 'grad_norm': 8.815553665161133, 'learning_rate': 8.758868515617802e-06, 'epoch': 2.25}


 56%|█████▋    | 24451/43412 [2:02:47<1:04:11,  4.92it/s]

{'loss': 0.1224, 'grad_norm': 1.6192781925201416, 'learning_rate': 8.73583341011702e-06, 'epoch': 2.25}


 56%|█████▋    | 24501/43412 [2:02:57<1:04:03,  4.92it/s]

{'loss': 0.1684, 'grad_norm': 5.8355607986450195, 'learning_rate': 8.712798304616236e-06, 'epoch': 2.26}


 57%|█████▋    | 24551/43412 [2:03:07<1:04:07,  4.90it/s]

{'loss': 0.1754, 'grad_norm': 6.764474868774414, 'learning_rate': 8.689763199115453e-06, 'epoch': 2.26}


 57%|█████▋    | 24601/43412 [2:03:17<1:03:49,  4.91it/s]

{'loss': 0.1532, 'grad_norm': 6.1416754722595215, 'learning_rate': 8.666728093614669e-06, 'epoch': 2.27}


 57%|█████▋    | 24651/43412 [2:03:27<1:03:06,  4.95it/s]

{'loss': 0.1401, 'grad_norm': 2.1367504596710205, 'learning_rate': 8.643692988113887e-06, 'epoch': 2.27}


 57%|█████▋    | 24701/43412 [2:03:37<1:03:17,  4.93it/s]

{'loss': 0.1378, 'grad_norm': 7.600679874420166, 'learning_rate': 8.620657882613103e-06, 'epoch': 2.28}


 57%|█████▋    | 24751/43412 [2:03:48<1:03:13,  4.92it/s]

{'loss': 0.1529, 'grad_norm': 3.7029824256896973, 'learning_rate': 8.59762277711232e-06, 'epoch': 2.28}


 57%|█████▋    | 24801/43412 [2:03:58<1:03:11,  4.91it/s]

{'loss': 0.1304, 'grad_norm': 3.742795944213867, 'learning_rate': 8.574587671611536e-06, 'epoch': 2.29}


 57%|█████▋    | 24851/43412 [2:04:08<1:02:54,  4.92it/s]

{'loss': 0.1274, 'grad_norm': 3.9557607173919678, 'learning_rate': 8.551552566110753e-06, 'epoch': 2.29}


 57%|█████▋    | 24901/43412 [2:04:18<1:03:00,  4.90it/s]

{'loss': 0.167, 'grad_norm': 3.018594741821289, 'learning_rate': 8.52851746060997e-06, 'epoch': 2.29}


 57%|█████▋    | 24951/43412 [2:04:28<1:02:46,  4.90it/s]

{'loss': 0.1276, 'grad_norm': 6.851688385009766, 'learning_rate': 8.505482355109187e-06, 'epoch': 2.3}


 58%|█████▊    | 25001/43412 [2:04:38<1:02:39,  4.90it/s]

{'loss': 0.1583, 'grad_norm': 2.5862770080566406, 'learning_rate': 8.482447249608404e-06, 'epoch': 2.3}


 58%|█████▊    | 25051/43412 [2:04:48<1:01:53,  4.95it/s]

{'loss': 0.1446, 'grad_norm': 6.434348106384277, 'learning_rate': 8.45941214410762e-06, 'epoch': 2.31}


 58%|█████▊    | 25101/43412 [2:04:59<1:02:20,  4.89it/s]

{'loss': 0.1611, 'grad_norm': 2.0557546615600586, 'learning_rate': 8.436377038606838e-06, 'epoch': 2.31}


 58%|█████▊    | 25151/43412 [2:05:09<1:02:12,  4.89it/s]

{'loss': 0.1479, 'grad_norm': 6.579443454742432, 'learning_rate': 8.413341933106054e-06, 'epoch': 2.32}


 58%|█████▊    | 25201/43412 [2:05:19<1:01:54,  4.90it/s]

{'loss': 0.1675, 'grad_norm': 2.919447422027588, 'learning_rate': 8.39030682760527e-06, 'epoch': 2.32}


 58%|█████▊    | 25251/43412 [2:05:29<1:01:53,  4.89it/s]

{'loss': 0.1366, 'grad_norm': 0.4393904507160187, 'learning_rate': 8.367271722104487e-06, 'epoch': 2.33}


 58%|█████▊    | 25301/43412 [2:05:39<1:01:48,  4.88it/s]

{'loss': 0.1818, 'grad_norm': 0.9696736335754395, 'learning_rate': 8.344236616603704e-06, 'epoch': 2.33}


 58%|█████▊    | 25351/43412 [2:05:49<1:01:25,  4.90it/s]

{'loss': 0.1585, 'grad_norm': 2.3762264251708984, 'learning_rate': 8.321201511102921e-06, 'epoch': 2.34}


 59%|█████▊    | 25401/43412 [2:06:00<1:01:03,  4.92it/s]

{'loss': 0.143, 'grad_norm': 0.6616250872612, 'learning_rate': 8.298166405602138e-06, 'epoch': 2.34}


 59%|█████▊    | 25451/43412 [2:06:10<1:00:59,  4.91it/s]

{'loss': 0.1365, 'grad_norm': 3.1077146530151367, 'learning_rate': 8.275131300101355e-06, 'epoch': 2.34}


 59%|█████▊    | 25501/43412 [2:06:20<1:00:51,  4.90it/s]

{'loss': 0.1278, 'grad_norm': 3.020357608795166, 'learning_rate': 8.252096194600572e-06, 'epoch': 2.35}


 59%|█████▉    | 25551/43412 [2:06:30<1:00:24,  4.93it/s]

{'loss': 0.1354, 'grad_norm': 10.236324310302734, 'learning_rate': 8.22906108909979e-06, 'epoch': 2.35}


 59%|█████▉    | 25601/43412 [2:06:40<1:00:25,  4.91it/s]

{'loss': 0.1365, 'grad_norm': 14.905143737792969, 'learning_rate': 8.206025983599005e-06, 'epoch': 2.36}


 59%|█████▉    | 25651/43412 [2:06:50<1:00:16,  4.91it/s]

{'loss': 0.1396, 'grad_norm': 1.5484217405319214, 'learning_rate': 8.182990878098223e-06, 'epoch': 2.36}


 59%|█████▉    | 25701/43412 [2:07:00<1:00:06,  4.91it/s]

{'loss': 0.1335, 'grad_norm': 7.809057235717773, 'learning_rate': 8.159955772597438e-06, 'epoch': 2.37}


 59%|█████▉    | 25751/43412 [2:07:11<1:00:07,  4.89it/s]

{'loss': 0.1288, 'grad_norm': 0.29759395122528076, 'learning_rate': 8.136920667096655e-06, 'epoch': 2.37}


 59%|█████▉    | 25801/43412 [2:07:21<59:37,  4.92it/s]  

{'loss': 0.1012, 'grad_norm': 5.710853099822998, 'learning_rate': 8.113885561595872e-06, 'epoch': 2.38}


 60%|█████▉    | 25851/43412 [2:07:31<59:53,  4.89it/s]

{'loss': 0.1539, 'grad_norm': 0.13958792388439178, 'learning_rate': 8.09085045609509e-06, 'epoch': 2.38}


 60%|█████▉    | 25901/43412 [2:07:41<59:33,  4.90it/s]

{'loss': 0.1687, 'grad_norm': 2.4640767574310303, 'learning_rate': 8.067815350594306e-06, 'epoch': 2.39}


 60%|█████▉    | 25951/43412 [2:07:51<59:21,  4.90it/s]

{'loss': 0.1562, 'grad_norm': 15.652271270751953, 'learning_rate': 8.044780245093523e-06, 'epoch': 2.39}


 60%|█████▉    | 26001/43412 [2:08:01<58:46,  4.94it/s]

{'loss': 0.1633, 'grad_norm': 0.1447736918926239, 'learning_rate': 8.02174513959274e-06, 'epoch': 2.4}


 60%|██████    | 26051/43412 [2:08:12<58:48,  4.92it/s]

{'loss': 0.1446, 'grad_norm': 2.268129587173462, 'learning_rate': 7.998710034091957e-06, 'epoch': 2.4}


 60%|██████    | 26101/43412 [2:08:22<58:48,  4.91it/s]

{'loss': 0.16, 'grad_norm': 2.612213611602783, 'learning_rate': 7.975674928591174e-06, 'epoch': 2.4}


 60%|██████    | 26151/43412 [2:08:32<58:46,  4.89it/s]

{'loss': 0.1535, 'grad_norm': 7.0189409255981445, 'learning_rate': 7.95263982309039e-06, 'epoch': 2.41}


 60%|██████    | 26201/43412 [2:08:42<58:28,  4.91it/s]

{'loss': 0.1379, 'grad_norm': 1.1084829568862915, 'learning_rate': 7.929604717589608e-06, 'epoch': 2.41}


 60%|██████    | 26251/43412 [2:08:52<58:28,  4.89it/s]

{'loss': 0.178, 'grad_norm': 3.3345258235931396, 'learning_rate': 7.906569612088823e-06, 'epoch': 2.42}


 61%|██████    | 26301/43412 [2:09:02<58:00,  4.92it/s]

{'loss': 0.1143, 'grad_norm': 2.4004414081573486, 'learning_rate': 7.88353450658804e-06, 'epoch': 2.42}


 61%|██████    | 26351/43412 [2:09:12<58:04,  4.90it/s]

{'loss': 0.1414, 'grad_norm': 1.1220345497131348, 'learning_rate': 7.860499401087259e-06, 'epoch': 2.43}


 61%|██████    | 26401/43412 [2:09:23<58:02,  4.89it/s]

{'loss': 0.1439, 'grad_norm': 11.000067710876465, 'learning_rate': 7.837464295586474e-06, 'epoch': 2.43}


 61%|██████    | 26451/43412 [2:09:33<57:59,  4.87it/s]

{'loss': 0.1345, 'grad_norm': 2.1821115016937256, 'learning_rate': 7.814429190085691e-06, 'epoch': 2.44}


 61%|██████    | 26501/43412 [2:09:43<57:38,  4.89it/s]

{'loss': 0.1613, 'grad_norm': 0.1038414016366005, 'learning_rate': 7.791394084584908e-06, 'epoch': 2.44}


 61%|██████    | 26551/43412 [2:09:53<57:26,  4.89it/s]

{'loss': 0.1333, 'grad_norm': 4.919327735900879, 'learning_rate': 7.768358979084125e-06, 'epoch': 2.45}


 61%|██████▏   | 26601/43412 [2:10:03<57:22,  4.88it/s]

{'loss': 0.1467, 'grad_norm': 12.05640983581543, 'learning_rate': 7.745323873583342e-06, 'epoch': 2.45}


 61%|██████▏   | 26651/43412 [2:10:13<57:07,  4.89it/s]

{'loss': 0.1129, 'grad_norm': 2.0059714317321777, 'learning_rate': 7.722288768082559e-06, 'epoch': 2.46}


 62%|██████▏   | 26701/43412 [2:10:24<56:28,  4.93it/s]

{'loss': 0.1573, 'grad_norm': 13.849766731262207, 'learning_rate': 7.699253662581774e-06, 'epoch': 2.46}


 62%|██████▏   | 26751/43412 [2:10:34<56:34,  4.91it/s]

{'loss': 0.166, 'grad_norm': 8.999760627746582, 'learning_rate': 7.676218557080991e-06, 'epoch': 2.46}


 62%|██████▏   | 26801/43412 [2:10:44<56:12,  4.93it/s]

{'loss': 0.1879, 'grad_norm': 7.4819416999816895, 'learning_rate': 7.653183451580208e-06, 'epoch': 2.47}


 62%|██████▏   | 26851/43412 [2:10:54<56:11,  4.91it/s]

{'loss': 0.1579, 'grad_norm': 10.90843677520752, 'learning_rate': 7.630148346079425e-06, 'epoch': 2.47}


 62%|██████▏   | 26901/43412 [2:11:04<55:52,  4.93it/s]

{'loss': 0.1276, 'grad_norm': 3.8281476497650146, 'learning_rate': 7.607113240578643e-06, 'epoch': 2.48}


 62%|██████▏   | 26951/43412 [2:11:14<55:54,  4.91it/s]

{'loss': 0.1494, 'grad_norm': 4.871445655822754, 'learning_rate': 7.584078135077859e-06, 'epoch': 2.48}


 62%|██████▏   | 27001/43412 [2:11:24<55:34,  4.92it/s]

{'loss': 0.1068, 'grad_norm': 0.27028459310531616, 'learning_rate': 7.561043029577076e-06, 'epoch': 2.49}


 62%|██████▏   | 27051/43412 [2:11:35<55:32,  4.91it/s]

{'loss': 0.1585, 'grad_norm': 5.26657247543335, 'learning_rate': 7.538007924076292e-06, 'epoch': 2.49}


 62%|██████▏   | 27101/43412 [2:11:45<55:23,  4.91it/s]

{'loss': 0.1801, 'grad_norm': 3.821913242340088, 'learning_rate': 7.51497281857551e-06, 'epoch': 2.5}


 63%|██████▎   | 27151/43412 [2:11:55<55:14,  4.91it/s]

{'loss': 0.1353, 'grad_norm': 3.4920191764831543, 'learning_rate': 7.491937713074726e-06, 'epoch': 2.5}


 63%|██████▎   | 27201/43412 [2:12:05<54:50,  4.93it/s]

{'loss': 0.1769, 'grad_norm': 3.240116596221924, 'learning_rate': 7.468902607573943e-06, 'epoch': 2.51}


 63%|██████▎   | 27251/43412 [2:12:15<54:35,  4.93it/s]

{'loss': 0.1586, 'grad_norm': 9.102235794067383, 'learning_rate': 7.445867502073159e-06, 'epoch': 2.51}


 63%|██████▎   | 27301/43412 [2:12:25<54:42,  4.91it/s]

{'loss': 0.1699, 'grad_norm': 0.5531442165374756, 'learning_rate': 7.422832396572377e-06, 'epoch': 2.52}


 63%|██████▎   | 27351/43412 [2:12:35<54:44,  4.89it/s]

{'loss': 0.1168, 'grad_norm': 1.0109055042266846, 'learning_rate': 7.399797291071593e-06, 'epoch': 2.52}


 63%|██████▎   | 27401/43412 [2:12:46<54:33,  4.89it/s]

{'loss': 0.1652, 'grad_norm': 7.978381633758545, 'learning_rate': 7.37676218557081e-06, 'epoch': 2.52}


 63%|██████▎   | 27451/43412 [2:12:56<54:20,  4.89it/s]

{'loss': 0.167, 'grad_norm': 1.4639371633529663, 'learning_rate': 7.353727080070028e-06, 'epoch': 2.53}


 63%|██████▎   | 27501/43412 [2:13:06<54:15,  4.89it/s]

{'loss': 0.1469, 'grad_norm': 3.0572872161865234, 'learning_rate': 7.330691974569244e-06, 'epoch': 2.53}


 63%|██████▎   | 27551/43412 [2:13:16<53:44,  4.92it/s]

{'loss': 0.1513, 'grad_norm': 9.3152494430542, 'learning_rate': 7.307656869068461e-06, 'epoch': 2.54}


 64%|██████▎   | 27601/43412 [2:13:26<53:39,  4.91it/s]

{'loss': 0.1582, 'grad_norm': 5.968315601348877, 'learning_rate': 7.284621763567677e-06, 'epoch': 2.54}


 64%|██████▎   | 27651/43412 [2:13:36<53:21,  4.92it/s]

{'loss': 0.1071, 'grad_norm': 2.4594128131866455, 'learning_rate': 7.261586658066895e-06, 'epoch': 2.55}


 64%|██████▍   | 27701/43412 [2:13:47<53:15,  4.92it/s]

{'loss': 0.1488, 'grad_norm': 7.192209720611572, 'learning_rate': 7.238551552566111e-06, 'epoch': 2.55}


 64%|██████▍   | 27751/43412 [2:13:57<53:18,  4.90it/s]

{'loss': 0.1683, 'grad_norm': 7.584349632263184, 'learning_rate': 7.215516447065328e-06, 'epoch': 2.56}


 64%|██████▍   | 27801/43412 [2:14:07<52:58,  4.91it/s]

{'loss': 0.1268, 'grad_norm': 11.920284271240234, 'learning_rate': 7.192481341564544e-06, 'epoch': 2.56}


 64%|██████▍   | 27851/43412 [2:14:17<52:47,  4.91it/s]

{'loss': 0.1333, 'grad_norm': 3.0092267990112305, 'learning_rate': 7.169446236063762e-06, 'epoch': 2.57}


 64%|██████▍   | 27901/43412 [2:14:27<52:53,  4.89it/s]

{'loss': 0.1457, 'grad_norm': 0.7752646803855896, 'learning_rate': 7.146411130562979e-06, 'epoch': 2.57}


 64%|██████▍   | 27951/43412 [2:14:37<52:30,  4.91it/s]

{'loss': 0.131, 'grad_norm': 2.197432279586792, 'learning_rate': 7.123376025062195e-06, 'epoch': 2.58}


 65%|██████▍   | 28001/43412 [2:14:47<52:26,  4.90it/s]

{'loss': 0.1451, 'grad_norm': 0.7054200172424316, 'learning_rate': 7.100340919561413e-06, 'epoch': 2.58}


 65%|██████▍   | 28051/43412 [2:14:58<52:18,  4.89it/s]

{'loss': 0.1427, 'grad_norm': 1.5201425552368164, 'learning_rate': 7.077305814060629e-06, 'epoch': 2.58}


 65%|██████▍   | 28101/43412 [2:15:08<52:12,  4.89it/s]

{'loss': 0.1188, 'grad_norm': 0.6337586641311646, 'learning_rate': 7.054270708559846e-06, 'epoch': 2.59}


 65%|██████▍   | 28151/43412 [2:15:18<51:53,  4.90it/s]

{'loss': 0.1397, 'grad_norm': 0.3316611647605896, 'learning_rate': 7.031235603059062e-06, 'epoch': 2.59}


 65%|██████▍   | 28201/43412 [2:15:28<51:46,  4.90it/s]

{'loss': 0.1303, 'grad_norm': 12.464303970336914, 'learning_rate': 7.00820049755828e-06, 'epoch': 2.6}


 65%|██████▌   | 28251/43412 [2:15:38<51:39,  4.89it/s]

{'loss': 0.1468, 'grad_norm': 5.419243335723877, 'learning_rate': 6.985165392057496e-06, 'epoch': 2.6}


 65%|██████▌   | 28301/43412 [2:15:48<51:26,  4.90it/s]

{'loss': 0.1272, 'grad_norm': 0.49851179122924805, 'learning_rate': 6.962130286556713e-06, 'epoch': 2.61}


 65%|██████▌   | 28351/43412 [2:15:59<51:20,  4.89it/s]

{'loss': 0.1702, 'grad_norm': 2.5714118480682373, 'learning_rate': 6.939095181055929e-06, 'epoch': 2.61}


 65%|██████▌   | 28401/43412 [2:16:09<51:20,  4.87it/s]

{'loss': 0.1436, 'grad_norm': 7.8597235679626465, 'learning_rate': 6.916060075555147e-06, 'epoch': 2.62}


 66%|██████▌   | 28451/43412 [2:16:19<50:54,  4.90it/s]

{'loss': 0.1673, 'grad_norm': 3.3577065467834473, 'learning_rate': 6.893024970054364e-06, 'epoch': 2.62}


 66%|██████▌   | 28501/43412 [2:16:29<50:49,  4.89it/s]

{'loss': 0.1547, 'grad_norm': 12.261388778686523, 'learning_rate': 6.86998986455358e-06, 'epoch': 2.63}


 66%|██████▌   | 28551/43412 [2:16:39<50:31,  4.90it/s]

{'loss': 0.1357, 'grad_norm': 0.2475101500749588, 'learning_rate': 6.846954759052797e-06, 'epoch': 2.63}


 66%|██████▌   | 28601/43412 [2:16:49<50:30,  4.89it/s]

{'loss': 0.1429, 'grad_norm': 4.54005765914917, 'learning_rate': 6.823919653552014e-06, 'epoch': 2.64}


 66%|██████▌   | 28651/43412 [2:16:59<50:13,  4.90it/s]

{'loss': 0.1415, 'grad_norm': 0.5068523287773132, 'learning_rate': 6.800884548051231e-06, 'epoch': 2.64}


 66%|██████▌   | 28701/43412 [2:17:10<49:59,  4.90it/s]

{'loss': 0.1261, 'grad_norm': 0.4022388756275177, 'learning_rate': 6.777849442550447e-06, 'epoch': 2.64}


 66%|██████▌   | 28751/43412 [2:17:20<49:57,  4.89it/s]

{'loss': 0.1496, 'grad_norm': 7.030691623687744, 'learning_rate': 6.754814337049664e-06, 'epoch': 2.65}


 66%|██████▋   | 28801/43412 [2:17:30<49:40,  4.90it/s]

{'loss': 0.118, 'grad_norm': 13.780177116394043, 'learning_rate': 6.731779231548881e-06, 'epoch': 2.65}


 66%|██████▋   | 28851/43412 [2:17:40<49:30,  4.90it/s]

{'loss': 0.1803, 'grad_norm': 12.672889709472656, 'learning_rate': 6.708744126048098e-06, 'epoch': 2.66}


 67%|██████▋   | 28901/43412 [2:17:50<49:20,  4.90it/s]

{'loss': 0.133, 'grad_norm': 3.414564371109009, 'learning_rate': 6.685709020547315e-06, 'epoch': 2.66}


 67%|██████▋   | 28951/43412 [2:18:00<49:07,  4.91it/s]

{'loss': 0.1648, 'grad_norm': 7.6480512619018555, 'learning_rate': 6.662673915046531e-06, 'epoch': 2.67}


 67%|██████▋   | 29001/43412 [2:18:11<48:53,  4.91it/s]

{'loss': 0.1597, 'grad_norm': 0.12915565073490143, 'learning_rate': 6.639638809545749e-06, 'epoch': 2.67}


 67%|██████▋   | 29051/43412 [2:18:21<48:54,  4.89it/s]

{'loss': 0.1661, 'grad_norm': 9.468689918518066, 'learning_rate': 6.616603704044965e-06, 'epoch': 2.68}


 67%|██████▋   | 29101/43412 [2:18:31<48:41,  4.90it/s]

{'loss': 0.1301, 'grad_norm': 2.8949697017669678, 'learning_rate': 6.593568598544182e-06, 'epoch': 2.68}


 67%|██████▋   | 29151/43412 [2:18:41<48:13,  4.93it/s]

{'loss': 0.1376, 'grad_norm': 0.5570261478424072, 'learning_rate': 6.570533493043398e-06, 'epoch': 2.69}


 67%|██████▋   | 29201/43412 [2:18:51<48:27,  4.89it/s]

{'loss': 0.1694, 'grad_norm': 0.4459134340286255, 'learning_rate': 6.547498387542616e-06, 'epoch': 2.69}


 67%|██████▋   | 29251/43412 [2:19:01<48:16,  4.89it/s]

{'loss': 0.1039, 'grad_norm': 0.06550543010234833, 'learning_rate': 6.524463282041832e-06, 'epoch': 2.7}


 67%|██████▋   | 29301/43412 [2:19:12<48:05,  4.89it/s]

{'loss': 0.1753, 'grad_norm': 0.2307043820619583, 'learning_rate': 6.501428176541049e-06, 'epoch': 2.7}


 68%|██████▊   | 29351/43412 [2:19:22<47:38,  4.92it/s]

{'loss': 0.1394, 'grad_norm': 4.939142227172852, 'learning_rate': 6.478393071040265e-06, 'epoch': 2.7}


 68%|██████▊   | 29401/43412 [2:19:32<47:25,  4.92it/s]

{'loss': 0.1373, 'grad_norm': 1.4854774475097656, 'learning_rate': 6.455357965539483e-06, 'epoch': 2.71}


 68%|██████▊   | 29451/43412 [2:19:42<47:41,  4.88it/s]

{'loss': 0.13, 'grad_norm': 18.51213264465332, 'learning_rate': 6.4323228600387e-06, 'epoch': 2.71}


 68%|██████▊   | 29501/43412 [2:19:52<47:08,  4.92it/s]

{'loss': 0.1685, 'grad_norm': 5.973273754119873, 'learning_rate': 6.409287754537916e-06, 'epoch': 2.72}


 68%|██████▊   | 29551/43412 [2:20:02<47:06,  4.90it/s]

{'loss': 0.1469, 'grad_norm': 11.167139053344727, 'learning_rate': 6.386252649037134e-06, 'epoch': 2.72}


 68%|██████▊   | 29601/43412 [2:20:12<46:52,  4.91it/s]

{'loss': 0.1287, 'grad_norm': 1.5603680610656738, 'learning_rate': 6.36321754353635e-06, 'epoch': 2.73}


 68%|██████▊   | 29651/43412 [2:20:23<46:38,  4.92it/s]

{'loss': 0.1382, 'grad_norm': 1.0903180837631226, 'learning_rate': 6.340182438035567e-06, 'epoch': 2.73}


 68%|██████▊   | 29701/43412 [2:20:33<46:14,  4.94it/s]

{'loss': 0.147, 'grad_norm': 8.27341079711914, 'learning_rate': 6.317147332534783e-06, 'epoch': 2.74}


 69%|██████▊   | 29751/43412 [2:20:43<45:59,  4.95it/s]

{'loss': 0.1645, 'grad_norm': 12.510262489318848, 'learning_rate': 6.294112227034001e-06, 'epoch': 2.74}


 69%|██████▊   | 29801/43412 [2:20:53<46:10,  4.91it/s]

{'loss': 0.1606, 'grad_norm': 6.4014177322387695, 'learning_rate': 6.271077121533217e-06, 'epoch': 2.75}


 69%|██████▉   | 29851/43412 [2:21:03<46:13,  4.89it/s]

{'loss': 0.1364, 'grad_norm': 2.5504918098449707, 'learning_rate': 6.248042016032434e-06, 'epoch': 2.75}


 69%|██████▉   | 29901/43412 [2:21:13<45:54,  4.91it/s]

{'loss': 0.148, 'grad_norm': 0.8931390047073364, 'learning_rate': 6.225006910531652e-06, 'epoch': 2.75}


 69%|██████▉   | 29951/43412 [2:21:24<45:48,  4.90it/s]

{'loss': 0.1521, 'grad_norm': 2.8645944595336914, 'learning_rate': 6.201971805030868e-06, 'epoch': 2.76}


 69%|██████▉   | 30001/43412 [2:21:34<45:32,  4.91it/s]

{'loss': 0.1354, 'grad_norm': 3.702667713165283, 'learning_rate': 6.178936699530085e-06, 'epoch': 2.76}


 69%|██████▉   | 30051/43412 [2:21:44<45:27,  4.90it/s]

{'loss': 0.1509, 'grad_norm': 2.739983320236206, 'learning_rate': 6.155901594029301e-06, 'epoch': 2.77}


 69%|██████▉   | 30101/43412 [2:21:54<45:26,  4.88it/s]

{'loss': 0.1345, 'grad_norm': 10.230154037475586, 'learning_rate': 6.132866488528519e-06, 'epoch': 2.77}


 69%|██████▉   | 30151/43412 [2:22:04<45:13,  4.89it/s]

{'loss': 0.1462, 'grad_norm': 5.148848056793213, 'learning_rate': 6.109831383027735e-06, 'epoch': 2.78}


 70%|██████▉   | 30201/43412 [2:22:14<45:02,  4.89it/s]

{'loss': 0.1432, 'grad_norm': 8.434064865112305, 'learning_rate': 6.086796277526952e-06, 'epoch': 2.78}


 70%|██████▉   | 30251/43412 [2:22:24<44:50,  4.89it/s]

{'loss': 0.1722, 'grad_norm': 8.193312644958496, 'learning_rate': 6.063761172026168e-06, 'epoch': 2.79}


 70%|██████▉   | 30301/43412 [2:22:35<44:29,  4.91it/s]

{'loss': 0.1345, 'grad_norm': 2.7092254161834717, 'learning_rate': 6.040726066525386e-06, 'epoch': 2.79}


 70%|██████▉   | 30351/43412 [2:22:45<44:01,  4.95it/s]

{'loss': 0.1268, 'grad_norm': 5.000594139099121, 'learning_rate': 6.017690961024602e-06, 'epoch': 2.8}


 70%|███████   | 30401/43412 [2:22:55<44:23,  4.88it/s]

{'loss': 0.1638, 'grad_norm': 4.514840602874756, 'learning_rate': 5.994655855523819e-06, 'epoch': 2.8}


 70%|███████   | 30451/43412 [2:23:05<44:09,  4.89it/s]

{'loss': 0.1557, 'grad_norm': 4.502166271209717, 'learning_rate': 5.971620750023036e-06, 'epoch': 2.81}


 70%|███████   | 30501/43412 [2:23:15<43:54,  4.90it/s]

{'loss': 0.1268, 'grad_norm': 0.6718182563781738, 'learning_rate': 5.948585644522253e-06, 'epoch': 2.81}


 70%|███████   | 30551/43412 [2:23:25<43:47,  4.90it/s]

{'loss': 0.1249, 'grad_norm': 5.6081953048706055, 'learning_rate': 5.92555053902147e-06, 'epoch': 2.81}


 70%|███████   | 30601/43412 [2:23:36<43:20,  4.93it/s]

{'loss': 0.1578, 'grad_norm': 0.22327820956707, 'learning_rate': 5.902515433520686e-06, 'epoch': 2.82}


 71%|███████   | 30651/43412 [2:23:46<43:26,  4.90it/s]

{'loss': 0.1273, 'grad_norm': 0.16425269842147827, 'learning_rate': 5.879480328019903e-06, 'epoch': 2.82}


 71%|███████   | 30701/43412 [2:23:56<43:11,  4.91it/s]

{'loss': 0.1584, 'grad_norm': 0.18690793216228485, 'learning_rate': 5.85644522251912e-06, 'epoch': 2.83}


 71%|███████   | 30751/43412 [2:24:06<43:13,  4.88it/s]

{'loss': 0.1179, 'grad_norm': 3.0590341091156006, 'learning_rate': 5.833410117018337e-06, 'epoch': 2.83}


 71%|███████   | 30801/43412 [2:24:16<42:53,  4.90it/s]

{'loss': 0.1017, 'grad_norm': 4.307656764984131, 'learning_rate': 5.810375011517553e-06, 'epoch': 2.84}


 71%|███████   | 30851/43412 [2:24:26<42:59,  4.87it/s]

{'loss': 0.1475, 'grad_norm': 5.405308723449707, 'learning_rate': 5.78733990601677e-06, 'epoch': 2.84}


 71%|███████   | 30901/43412 [2:24:37<42:38,  4.89it/s]

{'loss': 0.1491, 'grad_norm': 6.03171968460083, 'learning_rate': 5.764304800515987e-06, 'epoch': 2.85}


 71%|███████▏  | 30951/43412 [2:24:47<42:18,  4.91it/s]

{'loss': 0.143, 'grad_norm': 4.709690570831299, 'learning_rate': 5.741269695015204e-06, 'epoch': 2.85}


 71%|███████▏  | 31001/43412 [2:24:57<42:15,  4.89it/s]

{'loss': 0.1402, 'grad_norm': 8.067234992980957, 'learning_rate': 5.718234589514421e-06, 'epoch': 2.86}


 72%|███████▏  | 31051/43412 [2:25:07<41:49,  4.93it/s]

{'loss': 0.1338, 'grad_norm': 0.3839910626411438, 'learning_rate': 5.695199484013637e-06, 'epoch': 2.86}


 72%|███████▏  | 31101/43412 [2:25:17<41:53,  4.90it/s]

{'loss': 0.1458, 'grad_norm': 0.8947722315788269, 'learning_rate': 5.6721643785128546e-06, 'epoch': 2.87}


 72%|███████▏  | 31151/43412 [2:25:27<41:42,  4.90it/s]

{'loss': 0.1587, 'grad_norm': 8.684982299804688, 'learning_rate': 5.649129273012071e-06, 'epoch': 2.87}


 72%|███████▏  | 31201/43412 [2:25:38<41:39,  4.89it/s]

{'loss': 0.1228, 'grad_norm': 1.8390471935272217, 'learning_rate': 5.626094167511288e-06, 'epoch': 2.87}


 72%|███████▏  | 31251/43412 [2:25:48<41:27,  4.89it/s]

{'loss': 0.1421, 'grad_norm': 1.3207262754440308, 'learning_rate': 5.603059062010504e-06, 'epoch': 2.88}


 72%|███████▏  | 31301/43412 [2:25:58<41:16,  4.89it/s]

{'loss': 0.1432, 'grad_norm': 9.246715545654297, 'learning_rate': 5.5800239565097216e-06, 'epoch': 2.88}


 72%|███████▏  | 31351/43412 [2:26:08<41:10,  4.88it/s]

{'loss': 0.1409, 'grad_norm': 1.1029958724975586, 'learning_rate': 5.556988851008938e-06, 'epoch': 2.89}


 72%|███████▏  | 31401/43412 [2:26:18<40:53,  4.90it/s]

{'loss': 0.1323, 'grad_norm': 3.5577101707458496, 'learning_rate': 5.533953745508155e-06, 'epoch': 2.89}


 72%|███████▏  | 31451/43412 [2:26:28<40:29,  4.92it/s]

{'loss': 0.1435, 'grad_norm': 13.865251541137695, 'learning_rate': 5.5109186400073725e-06, 'epoch': 2.9}


 73%|███████▎  | 31501/43412 [2:26:39<40:21,  4.92it/s]

{'loss': 0.1493, 'grad_norm': 6.818154811859131, 'learning_rate': 5.487883534506589e-06, 'epoch': 2.9}


 73%|███████▎  | 31551/43412 [2:26:49<40:18,  4.90it/s]

{'loss': 0.1389, 'grad_norm': 6.5054030418396, 'learning_rate': 5.4648484290058056e-06, 'epoch': 2.91}


 73%|███████▎  | 31601/43412 [2:26:59<40:14,  4.89it/s]

{'loss': 0.1669, 'grad_norm': 4.132950305938721, 'learning_rate': 5.441813323505022e-06, 'epoch': 2.91}


 73%|███████▎  | 31651/43412 [2:27:09<40:06,  4.89it/s]

{'loss': 0.163, 'grad_norm': 3.0733957290649414, 'learning_rate': 5.4187782180042395e-06, 'epoch': 2.92}


 73%|███████▎  | 31701/43412 [2:27:19<39:51,  4.90it/s]

{'loss': 0.131, 'grad_norm': 9.892181396484375, 'learning_rate': 5.395743112503456e-06, 'epoch': 2.92}


 73%|███████▎  | 31751/43412 [2:27:29<39:41,  4.90it/s]

{'loss': 0.1378, 'grad_norm': 2.32692289352417, 'learning_rate': 5.3727080070026726e-06, 'epoch': 2.93}


 73%|███████▎  | 31801/43412 [2:27:39<39:27,  4.90it/s]

{'loss': 0.1403, 'grad_norm': 4.605106353759766, 'learning_rate': 5.349672901501889e-06, 'epoch': 2.93}


 73%|███████▎  | 31851/43412 [2:27:50<39:28,  4.88it/s]

{'loss': 0.1163, 'grad_norm': 0.6528195738792419, 'learning_rate': 5.3266377960011065e-06, 'epoch': 2.93}


 73%|███████▎  | 31901/43412 [2:28:00<39:07,  4.90it/s]

{'loss': 0.1071, 'grad_norm': 3.407932758331299, 'learning_rate': 5.303602690500323e-06, 'epoch': 2.94}


 74%|███████▎  | 31951/43412 [2:28:10<38:52,  4.91it/s]

{'loss': 0.1443, 'grad_norm': 8.113716125488281, 'learning_rate': 5.2805675849995396e-06, 'epoch': 2.94}


 74%|███████▎  | 32001/43412 [2:28:20<38:45,  4.91it/s]

{'loss': 0.1075, 'grad_norm': 5.188842296600342, 'learning_rate': 5.257532479498757e-06, 'epoch': 2.95}


 74%|███████▍  | 32051/43412 [2:28:30<38:33,  4.91it/s]

{'loss': 0.1345, 'grad_norm': 14.460245132446289, 'learning_rate': 5.2344973739979735e-06, 'epoch': 2.95}


 74%|███████▍  | 32101/43412 [2:28:40<38:29,  4.90it/s]

{'loss': 0.1298, 'grad_norm': 5.148496150970459, 'learning_rate': 5.2114622684971905e-06, 'epoch': 2.96}


 74%|███████▍  | 32151/43412 [2:28:51<38:12,  4.91it/s]

{'loss': 0.1206, 'grad_norm': 6.923243045806885, 'learning_rate': 5.188427162996407e-06, 'epoch': 2.96}


 74%|███████▍  | 32201/43412 [2:29:01<38:06,  4.90it/s]

{'loss': 0.1669, 'grad_norm': 0.229193776845932, 'learning_rate': 5.165392057495624e-06, 'epoch': 2.97}


 74%|███████▍  | 32251/43412 [2:29:11<37:39,  4.94it/s]

{'loss': 0.1635, 'grad_norm': 1.5567162036895752, 'learning_rate': 5.1423569519948405e-06, 'epoch': 2.97}


 74%|███████▍  | 32301/43412 [2:29:21<37:47,  4.90it/s]

{'loss': 0.1295, 'grad_norm': 0.264157235622406, 'learning_rate': 5.1193218464940575e-06, 'epoch': 2.98}


 75%|███████▍  | 32351/43412 [2:29:31<37:34,  4.91it/s]

{'loss': 0.1774, 'grad_norm': 11.35776138305664, 'learning_rate': 5.096286740993274e-06, 'epoch': 2.98}


 75%|███████▍  | 32401/43412 [2:29:41<37:24,  4.91it/s]

{'loss': 0.1202, 'grad_norm': 11.054006576538086, 'learning_rate': 5.073251635492491e-06, 'epoch': 2.99}


 75%|███████▍  | 32451/43412 [2:29:52<37:21,  4.89it/s]

{'loss': 0.1505, 'grad_norm': 0.1843762993812561, 'learning_rate': 5.050216529991708e-06, 'epoch': 2.99}


 75%|███████▍  | 32501/43412 [2:30:02<36:52,  4.93it/s]

{'loss': 0.126, 'grad_norm': 0.20084379613399506, 'learning_rate': 5.0271814244909245e-06, 'epoch': 2.99}


 75%|███████▍  | 32551/43412 [2:30:12<37:05,  4.88it/s]

{'loss': 0.1405, 'grad_norm': 2.4524381160736084, 'learning_rate': 5.0041463189901415e-06, 'epoch': 3.0}


                                                       
 75%|███████▌  | 32559/43412 [2:31:43<33:57,  5.33it/s]

{'eval_loss': 0.29650455713272095, 'eval_f1_macro': 0.5806902124108231, 'eval_precision_macro': 0.6289273802339197, 'eval_recall_macro': 0.546084748923619, 'eval_accuracy': 0.6032067821599705, 'eval_runtime': 89.2843, 'eval_samples_per_second': 60.772, 'eval_steps_per_second': 7.605, 'epoch': 3.0}


 75%|███████▌  | 32601/43412 [2:31:53<36:48,  4.90it/s]   

{'loss': 0.1392, 'grad_norm': 0.7589386105537415, 'learning_rate': 4.981111213489358e-06, 'epoch': 3.0}


 75%|███████▌  | 32651/43412 [2:32:03<36:38,  4.89it/s]

{'loss': 0.0998, 'grad_norm': 4.059994220733643, 'learning_rate': 4.9580761079885745e-06, 'epoch': 3.01}


 75%|███████▌  | 32701/43412 [2:32:13<36:30,  4.89it/s]

{'loss': 0.1061, 'grad_norm': 9.507473945617676, 'learning_rate': 4.935041002487792e-06, 'epoch': 3.01}


 75%|███████▌  | 32751/43412 [2:32:23<36:31,  4.87it/s]

{'loss': 0.1129, 'grad_norm': 3.396885633468628, 'learning_rate': 4.9120058969870085e-06, 'epoch': 3.02}


 76%|███████▌  | 32801/43412 [2:32:34<36:15,  4.88it/s]

{'loss': 0.0855, 'grad_norm': 4.430655002593994, 'learning_rate': 4.8889707914862254e-06, 'epoch': 3.02}


 76%|███████▌  | 32851/43412 [2:32:44<36:02,  4.88it/s]

{'loss': 0.062, 'grad_norm': 1.798823595046997, 'learning_rate': 4.865935685985442e-06, 'epoch': 3.03}


 76%|███████▌  | 32901/43412 [2:32:54<35:31,  4.93it/s]

{'loss': 0.0918, 'grad_norm': 1.282507300376892, 'learning_rate': 4.842900580484659e-06, 'epoch': 3.03}


 76%|███████▌  | 32951/43412 [2:33:04<35:44,  4.88it/s]

{'loss': 0.1008, 'grad_norm': 5.163121223449707, 'learning_rate': 4.8198654749838755e-06, 'epoch': 3.04}


 76%|███████▌  | 33001/43412 [2:33:14<35:24,  4.90it/s]

{'loss': 0.0883, 'grad_norm': 0.22762012481689453, 'learning_rate': 4.7968303694830924e-06, 'epoch': 3.04}


 76%|███████▌  | 33051/43412 [2:33:24<34:56,  4.94it/s]

{'loss': 0.1026, 'grad_norm': 7.656461238861084, 'learning_rate': 4.773795263982309e-06, 'epoch': 3.05}


 76%|███████▌  | 33101/43412 [2:33:35<35:04,  4.90it/s]

{'loss': 0.0736, 'grad_norm': 4.9038310050964355, 'learning_rate': 4.750760158481526e-06, 'epoch': 3.05}


 76%|███████▋  | 33151/43412 [2:33:45<34:51,  4.91it/s]

{'loss': 0.0907, 'grad_norm': 0.10234902799129486, 'learning_rate': 4.7277250529807425e-06, 'epoch': 3.05}


 76%|███████▋  | 33201/43412 [2:33:55<34:31,  4.93it/s]

{'loss': 0.0889, 'grad_norm': 14.70960521697998, 'learning_rate': 4.70468994747996e-06, 'epoch': 3.06}


 77%|███████▋  | 33251/43412 [2:34:05<34:19,  4.93it/s]

{'loss': 0.1118, 'grad_norm': 0.0809929221868515, 'learning_rate': 4.681654841979177e-06, 'epoch': 3.06}


 77%|███████▋  | 33301/43412 [2:34:15<34:17,  4.91it/s]

{'loss': 0.081, 'grad_norm': 2.641123056411743, 'learning_rate': 4.658619736478393e-06, 'epoch': 3.07}


 77%|███████▋  | 33351/43412 [2:34:25<34:02,  4.93it/s]

{'loss': 0.0964, 'grad_norm': 4.218536376953125, 'learning_rate': 4.63558463097761e-06, 'epoch': 3.07}


 77%|███████▋  | 33401/43412 [2:34:35<33:53,  4.92it/s]

{'loss': 0.0895, 'grad_norm': 0.21608641743659973, 'learning_rate': 4.612549525476827e-06, 'epoch': 3.08}


 77%|███████▋  | 33451/43412 [2:34:46<33:51,  4.90it/s]

{'loss': 0.0881, 'grad_norm': 0.3072989881038666, 'learning_rate': 4.5895144199760434e-06, 'epoch': 3.08}


 77%|███████▋  | 33501/43412 [2:34:56<33:27,  4.94it/s]

{'loss': 0.0863, 'grad_norm': 2.599923849105835, 'learning_rate': 4.56647931447526e-06, 'epoch': 3.09}


 77%|███████▋  | 33551/43412 [2:35:06<33:32,  4.90it/s]

{'loss': 0.1051, 'grad_norm': 7.609246253967285, 'learning_rate': 4.543444208974477e-06, 'epoch': 3.09}


 77%|███████▋  | 33601/43412 [2:35:16<33:08,  4.93it/s]

{'loss': 0.0939, 'grad_norm': 4.820834636688232, 'learning_rate': 4.520409103473694e-06, 'epoch': 3.1}


 78%|███████▊  | 33651/43412 [2:35:26<33:11,  4.90it/s]

{'loss': 0.0936, 'grad_norm': 1.0173676013946533, 'learning_rate': 4.4973739979729104e-06, 'epoch': 3.1}


 78%|███████▊  | 33701/43412 [2:35:36<32:56,  4.91it/s]

{'loss': 0.1131, 'grad_norm': 6.386465549468994, 'learning_rate': 4.474338892472128e-06, 'epoch': 3.11}


 78%|███████▊  | 33751/43412 [2:35:47<32:58,  4.88it/s]

{'loss': 0.0704, 'grad_norm': 2.121934413909912, 'learning_rate': 4.451303786971345e-06, 'epoch': 3.11}


 78%|███████▊  | 33801/43412 [2:35:57<32:36,  4.91it/s]

{'loss': 0.1019, 'grad_norm': 0.08066972345113754, 'learning_rate': 4.428268681470561e-06, 'epoch': 3.11}


 78%|███████▊  | 33851/43412 [2:36:07<32:29,  4.90it/s]

{'loss': 0.1077, 'grad_norm': 2.005573272705078, 'learning_rate': 4.405233575969778e-06, 'epoch': 3.12}


 78%|███████▊  | 33901/43412 [2:36:17<32:20,  4.90it/s]

{'loss': 0.1382, 'grad_norm': 5.42318868637085, 'learning_rate': 4.382198470468995e-06, 'epoch': 3.12}


 78%|███████▊  | 33951/43412 [2:36:27<32:15,  4.89it/s]

{'loss': 0.1059, 'grad_norm': 0.31676268577575684, 'learning_rate': 4.359163364968212e-06, 'epoch': 3.13}


 78%|███████▊  | 34001/43412 [2:36:37<31:57,  4.91it/s]

{'loss': 0.097, 'grad_norm': 0.5504676699638367, 'learning_rate': 4.336128259467428e-06, 'epoch': 3.13}


 78%|███████▊  | 34051/43412 [2:36:48<31:50,  4.90it/s]

{'loss': 0.1082, 'grad_norm': 0.6776078939437866, 'learning_rate': 4.313093153966645e-06, 'epoch': 3.14}


 79%|███████▊  | 34101/43412 [2:36:58<31:43,  4.89it/s]

{'loss': 0.1083, 'grad_norm': 13.717680931091309, 'learning_rate': 4.290058048465862e-06, 'epoch': 3.14}


 79%|███████▊  | 34151/43412 [2:37:08<31:38,  4.88it/s]

{'loss': 0.1126, 'grad_norm': 0.17890781164169312, 'learning_rate': 4.267022942965079e-06, 'epoch': 3.15}


 79%|███████▉  | 34201/43412 [2:37:18<31:20,  4.90it/s]

{'loss': 0.089, 'grad_norm': 5.732766151428223, 'learning_rate': 4.243987837464295e-06, 'epoch': 3.15}


 79%|███████▉  | 34251/43412 [2:37:28<31:13,  4.89it/s]

{'loss': 0.1107, 'grad_norm': 0.1688990443944931, 'learning_rate': 4.220952731963513e-06, 'epoch': 3.16}


 79%|███████▉  | 34301/43412 [2:37:38<31:07,  4.88it/s]

{'loss': 0.1066, 'grad_norm': 2.0217459201812744, 'learning_rate': 4.197917626462729e-06, 'epoch': 3.16}


 79%|███████▉  | 34351/43412 [2:37:49<30:49,  4.90it/s]

{'loss': 0.1038, 'grad_norm': 0.07663950324058533, 'learning_rate': 4.174882520961946e-06, 'epoch': 3.17}


 79%|███████▉  | 34401/43412 [2:37:59<30:35,  4.91it/s]

{'loss': 0.1018, 'grad_norm': 11.684978485107422, 'learning_rate': 4.151847415461163e-06, 'epoch': 3.17}


 79%|███████▉  | 34451/43412 [2:38:09<30:30,  4.90it/s]

{'loss': 0.1092, 'grad_norm': 3.4231185913085938, 'learning_rate': 4.12881230996038e-06, 'epoch': 3.17}


 79%|███████▉  | 34501/43412 [2:38:19<30:24,  4.88it/s]

{'loss': 0.0903, 'grad_norm': 0.1405463069677353, 'learning_rate': 4.105777204459596e-06, 'epoch': 3.18}


 80%|███████▉  | 34551/43412 [2:38:29<29:57,  4.93it/s]

{'loss': 0.0853, 'grad_norm': 0.2413383424282074, 'learning_rate': 4.082742098958813e-06, 'epoch': 3.18}


 80%|███████▉  | 34601/43412 [2:38:39<29:50,  4.92it/s]

{'loss': 0.114, 'grad_norm': 5.6034040451049805, 'learning_rate': 4.05970699345803e-06, 'epoch': 3.19}


 80%|███████▉  | 34651/43412 [2:38:50<29:49,  4.90it/s]

{'loss': 0.1384, 'grad_norm': 16.798133850097656, 'learning_rate': 4.036671887957247e-06, 'epoch': 3.19}


 80%|███████▉  | 34701/43412 [2:39:00<29:43,  4.89it/s]

{'loss': 0.0909, 'grad_norm': 15.74837589263916, 'learning_rate': 4.013636782456463e-06, 'epoch': 3.2}


 80%|████████  | 34751/43412 [2:39:10<29:34,  4.88it/s]

{'loss': 0.1027, 'grad_norm': 16.486480712890625, 'learning_rate': 3.990601676955681e-06, 'epoch': 3.2}


 80%|████████  | 34801/43412 [2:39:20<29:12,  4.91it/s]

{'loss': 0.1036, 'grad_norm': 2.872699499130249, 'learning_rate': 3.967566571454898e-06, 'epoch': 3.21}


 80%|████████  | 34851/43412 [2:39:30<28:45,  4.96it/s]

{'loss': 0.0813, 'grad_norm': 1.002397894859314, 'learning_rate': 3.944531465954114e-06, 'epoch': 3.21}


 80%|████████  | 34901/43412 [2:39:40<28:58,  4.90it/s]

{'loss': 0.1285, 'grad_norm': 0.3039937913417816, 'learning_rate': 3.921496360453331e-06, 'epoch': 3.22}


 81%|████████  | 34951/43412 [2:39:50<28:43,  4.91it/s]

{'loss': 0.1058, 'grad_norm': 0.4577071964740753, 'learning_rate': 3.898461254952548e-06, 'epoch': 3.22}


 81%|████████  | 35001/43412 [2:40:01<28:33,  4.91it/s]

{'loss': 0.1019, 'grad_norm': 2.100341796875, 'learning_rate': 3.875426149451765e-06, 'epoch': 3.22}


 81%|████████  | 35051/43412 [2:40:11<28:21,  4.91it/s]

{'loss': 0.0788, 'grad_norm': 6.193408012390137, 'learning_rate': 3.852391043950981e-06, 'epoch': 3.23}


 81%|████████  | 35101/43412 [2:40:21<28:13,  4.91it/s]

{'loss': 0.1055, 'grad_norm': 0.22619789838790894, 'learning_rate': 3.829355938450198e-06, 'epoch': 3.23}


 81%|████████  | 35151/43412 [2:40:31<27:52,  4.94it/s]

{'loss': 0.1179, 'grad_norm': 0.14129240810871124, 'learning_rate': 3.806320832949415e-06, 'epoch': 3.24}


 81%|████████  | 35201/43412 [2:40:41<27:59,  4.89it/s]

{'loss': 0.0888, 'grad_norm': 5.121674537658691, 'learning_rate': 3.7832857274486317e-06, 'epoch': 3.24}


 81%|████████  | 35251/43412 [2:40:51<27:34,  4.93it/s]

{'loss': 0.1083, 'grad_norm': 1.9829961061477661, 'learning_rate': 3.760250621947849e-06, 'epoch': 3.25}


 81%|████████▏ | 35301/43412 [2:41:02<27:31,  4.91it/s]

{'loss': 0.0876, 'grad_norm': 3.6786601543426514, 'learning_rate': 3.7372155164470656e-06, 'epoch': 3.25}


 81%|████████▏ | 35351/43412 [2:41:12<27:27,  4.89it/s]

{'loss': 0.1146, 'grad_norm': 0.1374724805355072, 'learning_rate': 3.7141804109462826e-06, 'epoch': 3.26}


 82%|████████▏ | 35401/43412 [2:41:22<27:15,  4.90it/s]

{'loss': 0.0944, 'grad_norm': 1.5382434129714966, 'learning_rate': 3.691145305445499e-06, 'epoch': 3.26}


 82%|████████▏ | 35451/43412 [2:41:32<27:06,  4.89it/s]

{'loss': 0.0863, 'grad_norm': 1.2000313997268677, 'learning_rate': 3.668110199944716e-06, 'epoch': 3.27}


 82%|████████▏ | 35501/43412 [2:41:42<26:53,  4.90it/s]

{'loss': 0.1017, 'grad_norm': 1.4998581409454346, 'learning_rate': 3.6450750944439326e-06, 'epoch': 3.27}


 82%|████████▏ | 35551/43412 [2:41:52<26:45,  4.90it/s]

{'loss': 0.0832, 'grad_norm': 6.116176605224609, 'learning_rate': 3.6220399889431496e-06, 'epoch': 3.28}


 82%|████████▏ | 35601/43412 [2:42:02<26:34,  4.90it/s]

{'loss': 0.1225, 'grad_norm': 0.9322522878646851, 'learning_rate': 3.599004883442366e-06, 'epoch': 3.28}


 82%|████████▏ | 35651/43412 [2:42:13<26:27,  4.89it/s]

{'loss': 0.1102, 'grad_norm': 16.958560943603516, 'learning_rate': 3.575969777941583e-06, 'epoch': 3.28}


 82%|████████▏ | 35701/43412 [2:42:23<26:04,  4.93it/s]

{'loss': 0.1531, 'grad_norm': 0.6018526554107666, 'learning_rate': 3.5529346724407996e-06, 'epoch': 3.29}


 82%|████████▏ | 35751/43412 [2:42:33<26:04,  4.90it/s]

{'loss': 0.107, 'grad_norm': 6.04557991027832, 'learning_rate': 3.529899566940017e-06, 'epoch': 3.29}


 82%|████████▏ | 35801/43412 [2:42:43<25:45,  4.92it/s]

{'loss': 0.1178, 'grad_norm': 12.627701759338379, 'learning_rate': 3.506864461439234e-06, 'epoch': 3.3}


 83%|████████▎ | 35851/43412 [2:42:53<25:41,  4.90it/s]

{'loss': 0.1103, 'grad_norm': 6.059162139892578, 'learning_rate': 3.4838293559384505e-06, 'epoch': 3.3}


 83%|████████▎ | 35901/43412 [2:43:03<25:34,  4.90it/s]

{'loss': 0.0991, 'grad_norm': 1.5789694786071777, 'learning_rate': 3.4607942504376675e-06, 'epoch': 3.31}


 83%|████████▎ | 35951/43412 [2:43:14<25:11,  4.94it/s]

{'loss': 0.1035, 'grad_norm': 15.444649696350098, 'learning_rate': 3.437759144936884e-06, 'epoch': 3.31}


 83%|████████▎ | 36001/43412 [2:43:24<25:13,  4.90it/s]

{'loss': 0.0889, 'grad_norm': 0.49968820810317993, 'learning_rate': 3.414724039436101e-06, 'epoch': 3.32}


 83%|████████▎ | 36051/43412 [2:43:34<25:08,  4.88it/s]

{'loss': 0.1079, 'grad_norm': 13.003715515136719, 'learning_rate': 3.3916889339353175e-06, 'epoch': 3.32}


 83%|████████▎ | 36101/43412 [2:43:44<24:51,  4.90it/s]

{'loss': 0.1222, 'grad_norm': 1.1508827209472656, 'learning_rate': 3.3686538284345345e-06, 'epoch': 3.33}


 83%|████████▎ | 36151/43412 [2:43:54<24:41,  4.90it/s]

{'loss': 0.0973, 'grad_norm': 1.7516180276870728, 'learning_rate': 3.345618722933751e-06, 'epoch': 3.33}


 83%|████████▎ | 36201/43412 [2:44:04<24:31,  4.90it/s]

{'loss': 0.1048, 'grad_norm': 12.509742736816406, 'learning_rate': 3.322583617432968e-06, 'epoch': 3.34}


 84%|████████▎ | 36251/43412 [2:44:14<24:06,  4.95it/s]

{'loss': 0.1066, 'grad_norm': 1.3647338151931763, 'learning_rate': 3.299548511932185e-06, 'epoch': 3.34}


 84%|████████▎ | 36301/43412 [2:44:25<24:06,  4.92it/s]

{'loss': 0.0892, 'grad_norm': 0.19796055555343628, 'learning_rate': 3.276513406431402e-06, 'epoch': 3.34}


 84%|████████▎ | 36351/43412 [2:44:35<23:48,  4.94it/s]

{'loss': 0.1038, 'grad_norm': 0.03586950525641441, 'learning_rate': 3.2534783009306185e-06, 'epoch': 3.35}


 84%|████████▍ | 36401/43412 [2:44:45<23:51,  4.90it/s]

{'loss': 0.109, 'grad_norm': 0.3638087511062622, 'learning_rate': 3.2304431954298355e-06, 'epoch': 3.35}


 84%|████████▍ | 36451/43412 [2:44:55<23:37,  4.91it/s]

{'loss': 0.0821, 'grad_norm': 3.89811372756958, 'learning_rate': 3.207408089929052e-06, 'epoch': 3.36}


 84%|████████▍ | 36501/43412 [2:45:05<23:30,  4.90it/s]

{'loss': 0.1047, 'grad_norm': 2.39050030708313, 'learning_rate': 3.184372984428269e-06, 'epoch': 3.36}


 84%|████████▍ | 36551/43412 [2:45:15<23:20,  4.90it/s]

{'loss': 0.104, 'grad_norm': 0.16509591042995453, 'learning_rate': 3.1613378789274855e-06, 'epoch': 3.37}


 84%|████████▍ | 36601/43412 [2:45:26<23:13,  4.89it/s]

{'loss': 0.0951, 'grad_norm': 0.2738393545150757, 'learning_rate': 3.1383027734267025e-06, 'epoch': 3.37}


 84%|████████▍ | 36651/43412 [2:45:36<22:56,  4.91it/s]

{'loss': 0.1085, 'grad_norm': 12.11307144165039, 'learning_rate': 3.115267667925919e-06, 'epoch': 3.38}


 85%|████████▍ | 36701/43412 [2:45:46<22:48,  4.90it/s]

{'loss': 0.1092, 'grad_norm': 4.831409454345703, 'learning_rate': 3.092232562425136e-06, 'epoch': 3.38}


 85%|████████▍ | 36751/43412 [2:45:56<22:45,  4.88it/s]

{'loss': 0.1166, 'grad_norm': 5.308596611022949, 'learning_rate': 3.0691974569243534e-06, 'epoch': 3.39}


 85%|████████▍ | 36801/43412 [2:46:06<22:25,  4.91it/s]

{'loss': 0.0904, 'grad_norm': 0.13741715252399445, 'learning_rate': 3.04616235142357e-06, 'epoch': 3.39}


 85%|████████▍ | 36851/43412 [2:46:16<22:16,  4.91it/s]

{'loss': 0.1115, 'grad_norm': 1.2684589624404907, 'learning_rate': 3.023127245922787e-06, 'epoch': 3.4}


 85%|████████▌ | 36901/43412 [2:46:26<22:07,  4.90it/s]

{'loss': 0.0976, 'grad_norm': 0.23135912418365479, 'learning_rate': 3.0000921404220034e-06, 'epoch': 3.4}


 85%|████████▌ | 36951/43412 [2:46:37<21:56,  4.91it/s]

{'loss': 0.0963, 'grad_norm': 2.7271945476531982, 'learning_rate': 2.9770570349212204e-06, 'epoch': 3.4}


 85%|████████▌ | 37001/43412 [2:46:47<21:43,  4.92it/s]

{'loss': 0.102, 'grad_norm': 7.89882755279541, 'learning_rate': 2.954021929420437e-06, 'epoch': 3.41}


 85%|████████▌ | 37051/43412 [2:46:57<21:29,  4.93it/s]

{'loss': 0.1082, 'grad_norm': 14.732525825500488, 'learning_rate': 2.930986823919654e-06, 'epoch': 3.41}


 85%|████████▌ | 37101/43412 [2:47:07<21:30,  4.89it/s]

{'loss': 0.1242, 'grad_norm': 0.039948124438524246, 'learning_rate': 2.9079517184188704e-06, 'epoch': 3.42}


 86%|████████▌ | 37151/43412 [2:47:17<21:12,  4.92it/s]

{'loss': 0.0777, 'grad_norm': 0.13891686499118805, 'learning_rate': 2.8849166129180874e-06, 'epoch': 3.42}


 86%|████████▌ | 37201/43412 [2:47:27<21:01,  4.92it/s]

{'loss': 0.1208, 'grad_norm': 4.933363914489746, 'learning_rate': 2.861881507417304e-06, 'epoch': 3.43}


 86%|████████▌ | 37251/43412 [2:47:38<20:51,  4.92it/s]

{'loss': 0.1118, 'grad_norm': 5.1934943199157715, 'learning_rate': 2.8388464019165213e-06, 'epoch': 3.43}


 86%|████████▌ | 37301/43412 [2:47:48<20:51,  4.88it/s]

{'loss': 0.1369, 'grad_norm': 3.5948309898376465, 'learning_rate': 2.815811296415738e-06, 'epoch': 3.44}


 86%|████████▌ | 37351/43412 [2:47:58<20:37,  4.90it/s]

{'loss': 0.1263, 'grad_norm': 1.1824747323989868, 'learning_rate': 2.792776190914955e-06, 'epoch': 3.44}


 86%|████████▌ | 37401/43412 [2:48:08<20:18,  4.93it/s]

{'loss': 0.118, 'grad_norm': 20.382129669189453, 'learning_rate': 2.7697410854141714e-06, 'epoch': 3.45}


 86%|████████▋ | 37451/43412 [2:48:18<20:20,  4.88it/s]

{'loss': 0.1065, 'grad_norm': 15.745622634887695, 'learning_rate': 2.7467059799133883e-06, 'epoch': 3.45}


 86%|████████▋ | 37501/43412 [2:48:28<20:10,  4.88it/s]

{'loss': 0.0992, 'grad_norm': 20.101911544799805, 'learning_rate': 2.723670874412605e-06, 'epoch': 3.46}


 86%|████████▋ | 37551/43412 [2:48:38<19:55,  4.90it/s]

{'loss': 0.0895, 'grad_norm': 2.427661418914795, 'learning_rate': 2.700635768911822e-06, 'epoch': 3.46}


 87%|████████▋ | 37601/43412 [2:48:49<19:36,  4.94it/s]

{'loss': 0.0952, 'grad_norm': 0.05392531678080559, 'learning_rate': 2.6776006634110384e-06, 'epoch': 3.46}


 87%|████████▋ | 37651/43412 [2:48:59<19:37,  4.89it/s]

{'loss': 0.1135, 'grad_norm': 0.3574114143848419, 'learning_rate': 2.6545655579102553e-06, 'epoch': 3.47}


 87%|████████▋ | 37701/43412 [2:49:09<19:26,  4.90it/s]

{'loss': 0.1043, 'grad_norm': 15.03975772857666, 'learning_rate': 2.631530452409472e-06, 'epoch': 3.47}


 87%|████████▋ | 37751/43412 [2:49:19<19:08,  4.93it/s]

{'loss': 0.1224, 'grad_norm': 8.854674339294434, 'learning_rate': 2.6084953469086893e-06, 'epoch': 3.48}


 87%|████████▋ | 37801/43412 [2:49:29<19:03,  4.91it/s]

{'loss': 0.0845, 'grad_norm': 19.571081161499023, 'learning_rate': 2.5854602414079062e-06, 'epoch': 3.48}


 87%|████████▋ | 37851/43412 [2:49:39<18:54,  4.90it/s]

{'loss': 0.0717, 'grad_norm': 3.23403263092041, 'learning_rate': 2.5624251359071228e-06, 'epoch': 3.49}


 87%|████████▋ | 37901/43412 [2:49:49<18:44,  4.90it/s]

{'loss': 0.0882, 'grad_norm': 3.0644049644470215, 'learning_rate': 2.5393900304063397e-06, 'epoch': 3.49}


 87%|████████▋ | 37951/43412 [2:50:00<18:35,  4.89it/s]

{'loss': 0.1007, 'grad_norm': 0.3610784113407135, 'learning_rate': 2.5163549249055563e-06, 'epoch': 3.5}


 88%|████████▊ | 38001/43412 [2:50:10<18:20,  4.92it/s]

{'loss': 0.1159, 'grad_norm': 2.3017568588256836, 'learning_rate': 2.4933198194047732e-06, 'epoch': 3.5}


 88%|████████▊ | 38051/43412 [2:50:20<18:13,  4.90it/s]

{'loss': 0.0766, 'grad_norm': 5.006652355194092, 'learning_rate': 2.4702847139039898e-06, 'epoch': 3.51}


 88%|████████▊ | 38101/43412 [2:50:30<18:03,  4.90it/s]

{'loss': 0.1081, 'grad_norm': 0.2520756423473358, 'learning_rate': 2.4472496084032067e-06, 'epoch': 3.51}


 88%|████████▊ | 38151/43412 [2:50:40<17:54,  4.90it/s]

{'loss': 0.0618, 'grad_norm': 1.4886120557785034, 'learning_rate': 2.4242145029024237e-06, 'epoch': 3.52}


 88%|████████▊ | 38201/43412 [2:50:50<17:44,  4.90it/s]

{'loss': 0.0686, 'grad_norm': 3.017711877822876, 'learning_rate': 2.4011793974016403e-06, 'epoch': 3.52}


 88%|████████▊ | 38251/43412 [2:51:01<17:32,  4.90it/s]

{'loss': 0.1246, 'grad_norm': 1.1574145555496216, 'learning_rate': 2.3781442919008572e-06, 'epoch': 3.52}


 88%|████████▊ | 38301/43412 [2:51:11<17:22,  4.90it/s]

{'loss': 0.0789, 'grad_norm': 10.861990928649902, 'learning_rate': 2.3551091864000738e-06, 'epoch': 3.53}


 88%|████████▊ | 38351/43412 [2:51:21<17:06,  4.93it/s]

{'loss': 0.0935, 'grad_norm': 6.066778659820557, 'learning_rate': 2.3320740808992907e-06, 'epoch': 3.53}


 88%|████████▊ | 38401/43412 [2:51:31<17:01,  4.91it/s]

{'loss': 0.1091, 'grad_norm': 8.626241683959961, 'learning_rate': 2.3090389753985077e-06, 'epoch': 3.54}


 89%|████████▊ | 38451/43412 [2:51:41<16:57,  4.88it/s]

{'loss': 0.0884, 'grad_norm': 1.0424163341522217, 'learning_rate': 2.2860038698977242e-06, 'epoch': 3.54}


 89%|████████▊ | 38501/43412 [2:51:51<16:42,  4.90it/s]

{'loss': 0.113, 'grad_norm': 15.8342866897583, 'learning_rate': 2.262968764396941e-06, 'epoch': 3.55}


 89%|████████▉ | 38551/43412 [2:52:01<16:29,  4.91it/s]

{'loss': 0.0819, 'grad_norm': 4.484166145324707, 'learning_rate': 2.2399336588961577e-06, 'epoch': 3.55}


 89%|████████▉ | 38601/43412 [2:52:12<16:21,  4.90it/s]

{'loss': 0.1217, 'grad_norm': 2.3690927028656006, 'learning_rate': 2.2168985533953747e-06, 'epoch': 3.56}


 89%|████████▉ | 38651/43412 [2:52:22<16:08,  4.92it/s]

{'loss': 0.1059, 'grad_norm': 6.874204635620117, 'learning_rate': 2.1938634478945917e-06, 'epoch': 3.56}


 89%|████████▉ | 38701/43412 [2:52:32<15:58,  4.91it/s]

{'loss': 0.1158, 'grad_norm': 8.01368522644043, 'learning_rate': 2.170828342393808e-06, 'epoch': 3.57}


 89%|████████▉ | 38751/43412 [2:52:42<15:45,  4.93it/s]

{'loss': 0.1231, 'grad_norm': 5.616844654083252, 'learning_rate': 2.147793236893025e-06, 'epoch': 3.57}


 89%|████████▉ | 38801/43412 [2:52:52<15:39,  4.91it/s]

{'loss': 0.0863, 'grad_norm': 11.937386512756348, 'learning_rate': 2.1247581313922417e-06, 'epoch': 3.58}


 89%|████████▉ | 38851/43412 [2:53:02<15:29,  4.91it/s]

{'loss': 0.077, 'grad_norm': 1.4031076431274414, 'learning_rate': 2.1017230258914587e-06, 'epoch': 3.58}


 90%|████████▉ | 38901/43412 [2:53:13<15:21,  4.89it/s]

{'loss': 0.109, 'grad_norm': 1.6270891427993774, 'learning_rate': 2.0786879203906756e-06, 'epoch': 3.58}


 90%|████████▉ | 38951/43412 [2:53:23<15:06,  4.92it/s]

{'loss': 0.0851, 'grad_norm': 16.8028621673584, 'learning_rate': 2.0556528148898926e-06, 'epoch': 3.59}


 90%|████████▉ | 39001/43412 [2:53:33<15:03,  4.88it/s]

{'loss': 0.1106, 'grad_norm': 15.698975563049316, 'learning_rate': 2.032617709389109e-06, 'epoch': 3.59}


 90%|████████▉ | 39051/43412 [2:53:43<14:50,  4.90it/s]

{'loss': 0.0947, 'grad_norm': 2.1206915378570557, 'learning_rate': 2.009582603888326e-06, 'epoch': 3.6}


 90%|█████████ | 39101/43412 [2:53:53<14:42,  4.89it/s]

{'loss': 0.1171, 'grad_norm': 0.2560296058654785, 'learning_rate': 1.9865474983875427e-06, 'epoch': 3.6}


 90%|█████████ | 39151/43412 [2:54:03<14:23,  4.93it/s]

{'loss': 0.0863, 'grad_norm': 0.5903702974319458, 'learning_rate': 1.9635123928867596e-06, 'epoch': 3.61}


 90%|█████████ | 39201/43412 [2:54:14<14:10,  4.95it/s]

{'loss': 0.1162, 'grad_norm': 0.12296278774738312, 'learning_rate': 1.9404772873859766e-06, 'epoch': 3.61}


 90%|█████████ | 39251/43412 [2:54:24<14:15,  4.86it/s]

{'loss': 0.0967, 'grad_norm': 5.197003364562988, 'learning_rate': 1.917442181885193e-06, 'epoch': 3.62}


 91%|█████████ | 39301/43412 [2:54:34<14:00,  4.89it/s]

{'loss': 0.0905, 'grad_norm': 0.8484534621238708, 'learning_rate': 1.8944070763844099e-06, 'epoch': 3.62}


 91%|█████████ | 39351/43412 [2:54:44<13:45,  4.92it/s]

{'loss': 0.0912, 'grad_norm': 0.4338374733924866, 'learning_rate': 1.8713719708836266e-06, 'epoch': 3.63}


 91%|█████████ | 39401/43412 [2:54:54<13:39,  4.90it/s]

{'loss': 0.1194, 'grad_norm': 1.0379207134246826, 'learning_rate': 1.8483368653828438e-06, 'epoch': 3.63}


 91%|█████████ | 39451/43412 [2:55:04<13:27,  4.90it/s]

{'loss': 0.1024, 'grad_norm': 6.639780044555664, 'learning_rate': 1.8253017598820606e-06, 'epoch': 3.63}


 91%|█████████ | 39501/43412 [2:55:15<13:17,  4.91it/s]

{'loss': 0.1213, 'grad_norm': 0.3797440826892853, 'learning_rate': 1.8022666543812773e-06, 'epoch': 3.64}


 91%|█████████ | 39551/43412 [2:55:25<13:10,  4.88it/s]

{'loss': 0.0927, 'grad_norm': 0.06090246140956879, 'learning_rate': 1.779231548880494e-06, 'epoch': 3.64}


 91%|█████████ | 39601/43412 [2:55:35<12:58,  4.90it/s]

{'loss': 0.092, 'grad_norm': 0.9911276698112488, 'learning_rate': 1.7561964433797108e-06, 'epoch': 3.65}


 91%|█████████▏| 39651/43412 [2:55:45<12:47,  4.90it/s]

{'loss': 0.0873, 'grad_norm': 6.304294109344482, 'learning_rate': 1.7331613378789278e-06, 'epoch': 3.65}


 91%|█████████▏| 39701/43412 [2:55:55<12:37,  4.90it/s]

{'loss': 0.0838, 'grad_norm': 4.735485076904297, 'learning_rate': 1.7101262323781445e-06, 'epoch': 3.66}


 92%|█████████▏| 39751/43412 [2:56:05<12:19,  4.95it/s]

{'loss': 0.1002, 'grad_norm': 24.137466430664062, 'learning_rate': 1.6870911268773613e-06, 'epoch': 3.66}


 92%|█████████▏| 39801/43412 [2:56:15<12:19,  4.88it/s]

{'loss': 0.1055, 'grad_norm': 0.0706896111369133, 'learning_rate': 1.664056021376578e-06, 'epoch': 3.67}


 92%|█████████▏| 39851/43412 [2:56:26<12:08,  4.89it/s]

{'loss': 0.1121, 'grad_norm': 11.187481880187988, 'learning_rate': 1.6410209158757948e-06, 'epoch': 3.67}


 92%|█████████▏| 39901/43412 [2:56:36<11:55,  4.91it/s]

{'loss': 0.1215, 'grad_norm': 4.804137229919434, 'learning_rate': 1.6179858103750115e-06, 'epoch': 3.68}


 92%|█████████▏| 39951/43412 [2:56:46<11:46,  4.90it/s]

{'loss': 0.0749, 'grad_norm': 6.233026027679443, 'learning_rate': 1.5949507048742285e-06, 'epoch': 3.68}


 92%|█████████▏| 40001/43412 [2:56:56<11:34,  4.91it/s]

{'loss': 0.152, 'grad_norm': 3.5735418796539307, 'learning_rate': 1.5719155993734453e-06, 'epoch': 3.69}


 92%|█████████▏| 40051/43412 [2:57:06<11:23,  4.92it/s]

{'loss': 0.0912, 'grad_norm': 7.744278907775879, 'learning_rate': 1.548880493872662e-06, 'epoch': 3.69}


 92%|█████████▏| 40101/43412 [2:57:16<11:18,  4.88it/s]

{'loss': 0.1166, 'grad_norm': 3.313210964202881, 'learning_rate': 1.5258453883718788e-06, 'epoch': 3.69}


 92%|█████████▏| 40151/43412 [2:57:27<11:08,  4.88it/s]

{'loss': 0.1162, 'grad_norm': 6.58150577545166, 'learning_rate': 1.5028102828710955e-06, 'epoch': 3.7}


 93%|█████████▎| 40201/43412 [2:57:37<10:53,  4.91it/s]

{'loss': 0.1315, 'grad_norm': 12.355570793151855, 'learning_rate': 1.4797751773703125e-06, 'epoch': 3.7}


 93%|█████████▎| 40251/43412 [2:57:47<10:42,  4.92it/s]

{'loss': 0.1292, 'grad_norm': 0.40740686655044556, 'learning_rate': 1.4567400718695292e-06, 'epoch': 3.71}


 93%|█████████▎| 40301/43412 [2:57:57<10:32,  4.92it/s]

{'loss': 0.1319, 'grad_norm': 15.26648235321045, 'learning_rate': 1.433704966368746e-06, 'epoch': 3.71}


 93%|█████████▎| 40351/43412 [2:58:07<10:27,  4.88it/s]

{'loss': 0.1025, 'grad_norm': 4.001504421234131, 'learning_rate': 1.4106698608679627e-06, 'epoch': 3.72}


 93%|█████████▎| 40401/43412 [2:58:17<10:11,  4.93it/s]

{'loss': 0.128, 'grad_norm': 16.0627384185791, 'learning_rate': 1.3876347553671795e-06, 'epoch': 3.72}


 93%|█████████▎| 40451/43412 [2:58:28<10:03,  4.91it/s]

{'loss': 0.0885, 'grad_norm': 0.3194390833377838, 'learning_rate': 1.3645996498663967e-06, 'epoch': 3.73}


 93%|█████████▎| 40501/43412 [2:58:38<09:56,  4.88it/s]

{'loss': 0.0987, 'grad_norm': 9.083839416503906, 'learning_rate': 1.3415645443656134e-06, 'epoch': 3.73}


 93%|█████████▎| 40551/43412 [2:58:48<09:45,  4.89it/s]

{'loss': 0.0998, 'grad_norm': 21.203227996826172, 'learning_rate': 1.3185294388648302e-06, 'epoch': 3.74}


 94%|█████████▎| 40601/43412 [2:58:58<09:33,  4.90it/s]

{'loss': 0.1084, 'grad_norm': 1.810583472251892, 'learning_rate': 1.295494333364047e-06, 'epoch': 3.74}


 94%|█████████▎| 40651/43412 [2:59:08<09:24,  4.89it/s]

{'loss': 0.0948, 'grad_norm': 2.43005633354187, 'learning_rate': 1.2724592278632637e-06, 'epoch': 3.75}


 94%|█████████▍| 40701/43412 [2:59:18<09:09,  4.93it/s]

{'loss': 0.0994, 'grad_norm': 0.056614749133586884, 'learning_rate': 1.2494241223624804e-06, 'epoch': 3.75}


 94%|█████████▍| 40751/43412 [2:59:28<09:04,  4.89it/s]

{'loss': 0.0871, 'grad_norm': 3.3827309608459473, 'learning_rate': 1.2263890168616974e-06, 'epoch': 3.75}


 94%|█████████▍| 40801/43412 [2:59:39<08:54,  4.89it/s]

{'loss': 0.1204, 'grad_norm': 17.953237533569336, 'learning_rate': 1.2033539113609142e-06, 'epoch': 3.76}


 94%|█████████▍| 40851/43412 [2:59:49<08:43,  4.90it/s]

{'loss': 0.0988, 'grad_norm': 0.18887925148010254, 'learning_rate': 1.180318805860131e-06, 'epoch': 3.76}


 94%|█████████▍| 40901/43412 [2:59:59<08:31,  4.90it/s]

{'loss': 0.1061, 'grad_norm': 1.8327041864395142, 'learning_rate': 1.1572837003593477e-06, 'epoch': 3.77}


 94%|█████████▍| 40951/43412 [3:00:09<08:20,  4.92it/s]

{'loss': 0.0816, 'grad_norm': 6.37905740737915, 'learning_rate': 1.1342485948585644e-06, 'epoch': 3.77}


 94%|█████████▍| 41001/43412 [3:00:19<08:12,  4.89it/s]

{'loss': 0.1091, 'grad_norm': 1.7815093994140625, 'learning_rate': 1.1112134893577814e-06, 'epoch': 3.78}


 95%|█████████▍| 41051/43412 [3:00:29<08:01,  4.90it/s]

{'loss': 0.1058, 'grad_norm': 0.9082793593406677, 'learning_rate': 1.0881783838569981e-06, 'epoch': 3.78}


 95%|█████████▍| 41101/43412 [3:00:40<07:53,  4.88it/s]

{'loss': 0.0894, 'grad_norm': 4.808686256408691, 'learning_rate': 1.0651432783562149e-06, 'epoch': 3.79}


 95%|█████████▍| 41151/43412 [3:00:50<07:42,  4.89it/s]

{'loss': 0.0811, 'grad_norm': 0.19043898582458496, 'learning_rate': 1.0421081728554319e-06, 'epoch': 3.79}


 95%|█████████▍| 41201/43412 [3:01:00<07:31,  4.89it/s]

{'loss': 0.1108, 'grad_norm': 8.932888984680176, 'learning_rate': 1.0190730673546486e-06, 'epoch': 3.8}


 95%|█████████▌| 41251/43412 [3:01:10<07:20,  4.90it/s]

{'loss': 0.1052, 'grad_norm': 17.520551681518555, 'learning_rate': 9.960379618538654e-07, 'epoch': 3.8}


 95%|█████████▌| 41301/43412 [3:01:20<07:10,  4.90it/s]

{'loss': 0.1006, 'grad_norm': 9.84227466583252, 'learning_rate': 9.730028563530821e-07, 'epoch': 3.81}


 95%|█████████▌| 41351/43412 [3:01:30<07:00,  4.90it/s]

{'loss': 0.0861, 'grad_norm': 2.776395320892334, 'learning_rate': 9.49967750852299e-07, 'epoch': 3.81}


 95%|█████████▌| 41401/43412 [3:01:41<06:50,  4.90it/s]

{'loss': 0.0739, 'grad_norm': 3.6675944328308105, 'learning_rate': 9.269326453515158e-07, 'epoch': 3.81}


 95%|█████████▌| 41451/43412 [3:01:51<06:40,  4.90it/s]

{'loss': 0.1162, 'grad_norm': 14.31552505493164, 'learning_rate': 9.038975398507326e-07, 'epoch': 3.82}


 96%|█████████▌| 41501/43412 [3:02:01<06:31,  4.88it/s]

{'loss': 0.0918, 'grad_norm': 5.446174144744873, 'learning_rate': 8.808624343499494e-07, 'epoch': 3.82}


 96%|█████████▌| 41551/43412 [3:02:11<06:16,  4.94it/s]

{'loss': 0.1017, 'grad_norm': 4.124997615814209, 'learning_rate': 8.578273288491662e-07, 'epoch': 3.83}


 96%|█████████▌| 41601/43412 [3:02:21<06:07,  4.92it/s]

{'loss': 0.0917, 'grad_norm': 4.798072814941406, 'learning_rate': 8.34792223348383e-07, 'epoch': 3.83}


 96%|█████████▌| 41651/43412 [3:02:31<06:00,  4.89it/s]

{'loss': 0.1266, 'grad_norm': 6.659280776977539, 'learning_rate': 8.117571178475998e-07, 'epoch': 3.84}


 96%|█████████▌| 41701/43412 [3:02:41<05:49,  4.90it/s]

{'loss': 0.0923, 'grad_norm': 7.883464813232422, 'learning_rate': 7.887220123468166e-07, 'epoch': 3.84}


 96%|█████████▌| 41751/43412 [3:02:52<05:38,  4.90it/s]

{'loss': 0.0975, 'grad_norm': 9.64897632598877, 'learning_rate': 7.656869068460335e-07, 'epoch': 3.85}


 96%|█████████▋| 41801/43412 [3:03:02<05:29,  4.89it/s]

{'loss': 0.104, 'grad_norm': 0.6010503172874451, 'learning_rate': 7.426518013452502e-07, 'epoch': 3.85}


 96%|█████████▋| 41851/43412 [3:03:12<05:18,  4.89it/s]

{'loss': 0.0995, 'grad_norm': 6.79343843460083, 'learning_rate': 7.196166958444669e-07, 'epoch': 3.86}


 97%|█████████▋| 41901/43412 [3:03:22<05:08,  4.90it/s]

{'loss': 0.1063, 'grad_norm': 5.796242713928223, 'learning_rate': 6.965815903436839e-07, 'epoch': 3.86}


 97%|█████████▋| 41951/43412 [3:03:32<04:59,  4.88it/s]

{'loss': 0.1, 'grad_norm': 3.8310351371765137, 'learning_rate': 6.735464848429006e-07, 'epoch': 3.87}


 97%|█████████▋| 42001/43412 [3:03:42<04:48,  4.89it/s]

{'loss': 0.1404, 'grad_norm': 0.26333388686180115, 'learning_rate': 6.505113793421175e-07, 'epoch': 3.87}


 97%|█████████▋| 42051/43412 [3:03:53<04:37,  4.91it/s]

{'loss': 0.1296, 'grad_norm': 10.279221534729004, 'learning_rate': 6.274762738413343e-07, 'epoch': 3.87}


 97%|█████████▋| 42101/43412 [3:04:03<04:27,  4.90it/s]

{'loss': 0.083, 'grad_norm': 0.0781816840171814, 'learning_rate': 6.044411683405511e-07, 'epoch': 3.88}


 97%|█████████▋| 42151/43412 [3:04:13<04:17,  4.89it/s]

{'loss': 0.1175, 'grad_norm': 0.07919865101575851, 'learning_rate': 5.814060628397679e-07, 'epoch': 3.88}


 97%|█████████▋| 42201/43412 [3:04:23<04:07,  4.89it/s]

{'loss': 0.1056, 'grad_norm': 1.189443588256836, 'learning_rate': 5.583709573389846e-07, 'epoch': 3.89}


 97%|█████████▋| 42251/43412 [3:04:33<03:56,  4.90it/s]

{'loss': 0.1036, 'grad_norm': 0.2298041135072708, 'learning_rate': 5.353358518382015e-07, 'epoch': 3.89}


 97%|█████████▋| 42301/43412 [3:04:43<03:47,  4.89it/s]

{'loss': 0.1351, 'grad_norm': 5.359370231628418, 'learning_rate': 5.123007463374182e-07, 'epoch': 3.9}


 98%|█████████▊| 42351/43412 [3:04:54<03:37,  4.88it/s]

{'loss': 0.1104, 'grad_norm': 4.295743465423584, 'learning_rate': 4.892656408366351e-07, 'epoch': 3.9}


 98%|█████████▊| 42401/43412 [3:05:04<03:26,  4.90it/s]

{'loss': 0.0898, 'grad_norm': 5.888671398162842, 'learning_rate': 4.6623053533585184e-07, 'epoch': 3.91}


 98%|█████████▊| 42451/43412 [3:05:14<03:15,  4.91it/s]

{'loss': 0.1137, 'grad_norm': 1.065148949623108, 'learning_rate': 4.4319542983506865e-07, 'epoch': 3.91}


 98%|█████████▊| 42501/43412 [3:05:24<03:05,  4.91it/s]

{'loss': 0.1024, 'grad_norm': 0.11589058488607407, 'learning_rate': 4.201603243342855e-07, 'epoch': 3.92}


 98%|█████████▊| 42551/43412 [3:05:34<02:55,  4.90it/s]

{'loss': 0.1008, 'grad_norm': 2.7642745971679688, 'learning_rate': 3.971252188335023e-07, 'epoch': 3.92}


 98%|█████████▊| 42601/43412 [3:05:44<02:45,  4.89it/s]

{'loss': 0.069, 'grad_norm': 7.381715774536133, 'learning_rate': 3.740901133327191e-07, 'epoch': 3.93}


 98%|█████████▊| 42651/43412 [3:05:54<02:35,  4.89it/s]

{'loss': 0.139, 'grad_norm': 0.7521736025810242, 'learning_rate': 3.5105500783193587e-07, 'epoch': 3.93}


 98%|█████████▊| 42701/43412 [3:06:05<02:25,  4.89it/s]

{'loss': 0.1092, 'grad_norm': 5.963848114013672, 'learning_rate': 3.280199023311527e-07, 'epoch': 3.93}


 98%|█████████▊| 42751/43412 [3:06:15<02:15,  4.88it/s]

{'loss': 0.1192, 'grad_norm': 5.571871757507324, 'learning_rate': 3.0498479683036953e-07, 'epoch': 3.94}


 99%|█████████▊| 42801/43412 [3:06:25<02:04,  4.90it/s]

{'loss': 0.1019, 'grad_norm': 7.219573974609375, 'learning_rate': 2.8194969132958634e-07, 'epoch': 3.94}


 99%|█████████▊| 42851/43412 [3:06:35<01:54,  4.90it/s]

{'loss': 0.0993, 'grad_norm': 0.5664328932762146, 'learning_rate': 2.589145858288031e-07, 'epoch': 3.95}


 99%|█████████▉| 42901/43412 [3:06:45<01:43,  4.94it/s]

{'loss': 0.0948, 'grad_norm': 2.1179592609405518, 'learning_rate': 2.3587948032801993e-07, 'epoch': 3.95}


 99%|█████████▉| 42951/43412 [3:06:55<01:34,  4.89it/s]

{'loss': 0.1185, 'grad_norm': 8.121331214904785, 'learning_rate': 2.128443748272367e-07, 'epoch': 3.96}


 99%|█████████▉| 43001/43412 [3:07:06<01:23,  4.92it/s]

{'loss': 0.0764, 'grad_norm': 4.1567463874816895, 'learning_rate': 1.8980926932645354e-07, 'epoch': 3.96}


 99%|█████████▉| 43051/43412 [3:07:16<01:13,  4.91it/s]

{'loss': 0.1217, 'grad_norm': 2.6949052810668945, 'learning_rate': 1.6677416382567032e-07, 'epoch': 3.97}


 99%|█████████▉| 43101/43412 [3:07:26<01:03,  4.90it/s]

{'loss': 0.0702, 'grad_norm': 2.1923532485961914, 'learning_rate': 1.4373905832488715e-07, 'epoch': 3.97}


 99%|█████████▉| 43151/43412 [3:07:36<00:53,  4.90it/s]

{'loss': 0.0964, 'grad_norm': 0.09308475255966187, 'learning_rate': 1.2070395282410393e-07, 'epoch': 3.98}


100%|█████████▉| 43201/43412 [3:07:46<00:43,  4.89it/s]

{'loss': 0.1162, 'grad_norm': 1.646806001663208, 'learning_rate': 9.766884732332075e-08, 'epoch': 3.98}


100%|█████████▉| 43251/43412 [3:07:56<00:32,  4.89it/s]

{'loss': 0.0926, 'grad_norm': 5.4483771324157715, 'learning_rate': 7.463374182253755e-08, 'epoch': 3.99}


100%|█████████▉| 43301/43412 [3:08:06<00:22,  4.90it/s]

{'loss': 0.1024, 'grad_norm': 2.0327296257019043, 'learning_rate': 5.159863632175435e-08, 'epoch': 3.99}


100%|█████████▉| 43351/43412 [3:08:17<00:12,  4.89it/s]

{'loss': 0.1329, 'grad_norm': 0.13774633407592773, 'learning_rate': 2.8563530820971162e-08, 'epoch': 3.99}


100%|█████████▉| 43401/43412 [3:08:27<00:02,  4.88it/s]

{'loss': 0.0804, 'grad_norm': 0.2748449146747589, 'learning_rate': 5.528425320187967e-09, 'epoch': 4.0}


                                                       
100%|██████████| 43412/43412 [3:09:58<00:00,  5.33it/s]

{'eval_loss': 0.3436893820762634, 'eval_f1_macro': 0.5809349359556722, 'eval_precision_macro': 0.6001711797057409, 'eval_recall_macro': 0.5642203499394266, 'eval_accuracy': 0.599705123479543, 'eval_runtime': 89.29, 'eval_samples_per_second': 60.768, 'eval_steps_per_second': 7.604, 'epoch': 4.0}


100%|██████████| 43412/43412 [3:10:01<00:00,  3.81it/s]

{'train_runtime': 11404.4174, 'train_samples_per_second': 15.226, 'train_steps_per_second': 3.807, 'train_loss': 0.1715104971395092, 'epoch': 4.0}





TrainOutput(global_step=43412, training_loss=0.1715104971395092, metrics={'train_runtime': 11404.4174, 'train_samples_per_second': 15.226, 'train_steps_per_second': 3.807, 'total_flos': 1.1422163665152e+16, 'train_loss': 0.1715104971395092, 'epoch': 4.0})

In [13]:
# Write the trained model to "bert-goemotions-final" so that later cells can
# reload it with from_pretrained() instead of retraining.
trainer.save_model(output_dir="bert-goemotions-final")


In [14]:
# Store the tokenizer files next to the model weights; the tuple of written
# file paths returned by save_pretrained() is echoed as the cell output.
tokenizer.save_pretrained(save_directory="bert-goemotions-final")


('bert-goemotions-final\\tokenizer_config.json',
 'bert-goemotions-final\\special_tokens_map.json',
 'bert-goemotions-final\\vocab.txt',
 'bert-goemotions-final\\added_tokens.json',
 'bert-goemotions-final\\tokenizer.json')

In [15]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the artefacts that were just written to disk. problem_type is passed
# explicitly so the reloaded model is configured for multi-label classification.
model = BertForSequenceClassification.from_pretrained(
    "bert-goemotions-final", problem_type="multi_label_classification"
)
tokenizer = BertTokenizer.from_pretrained("bert-goemotions-final")


In [None]:
# Score the test split with the metrics configured on the trainer; the
# returned metrics dict is echoed as the cell output.
trainer.evaluate(test_ds)


100%|██████████| 679/679 [01:28<00:00,  7.67it/s]


{'eval_loss': 0.2316652089357376,
 'eval_f1_macro': 0.5919607132686104,
 'eval_precision_macro': 0.6597682519226583,
 'eval_recall_macro': 0.5403244714443878,
 'eval_accuracy': 0.6073337018610651,
 'eval_runtime': 88.7951,
 'eval_samples_per_second': 61.118,
 'eval_steps_per_second': 7.647,
 'epoch': 4.0}

In [17]:
# Run the trainer's prediction loop over the test split and keep the result
# (predictions and label_ids are inspected in the next cell).
pred_output = trainer.predict(test_dataset=test_ds)


100%|██████████| 679/679 [01:28<00:00,  7.66it/s]


In [18]:
# `pred_output.predictions` holds the logits (float32). It is intentionally not
# referenced as a bare expression here: a cell only echoes its final
# expression, so an earlier bare reference is evaluated and silently discarded.
pred_output.label_ids  # ground truth (multi-hot)


array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [20]:
# Smoke test: classify one sentence with the reloaded model.
text = "I hate eating salad."
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(model.device)
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    outputs = model(**inputs)
    # Multi-label head: an independent sigmoid per class. The batch holds a
    # single sentence, so row 0 is the only row.
    probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]
    labels = [i for i, p in enumerate(probs) if p >= 0.5]

# Index -> name lookup derived from ekman_mapping, the same dict whose key
# order defines ekman_to_index (and therefore the training label columns).
# Deriving it here instead of re-typing the names keeps the two in sync.
ekman_labels = list(ekman_mapping)
predicted = [ekman_labels[i] for i in labels]
print("Predicted emotions:", predicted)


Predicted emotions: ['anger']


In [5]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the saved tokenizer and model. .eval() puts the model in evaluation
# mode (dropout off) and returns the model itself; the trailing bare name
# echoes the architecture summary as the cell output.
tokenizer = BertTokenizer.from_pretrained("bert-goemotions-final")
model = BertForSequenceClassification.from_pretrained("bert-goemotions-final").eval()
model


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
def map_labels(labels):
    """Map raw emotion label ids to de-duplicated Ekman class indices.

    Parameters
    ----------
    labels : str or iterable of (str | int)
        Either a comma-separated string of indices into ``emotion_list``
        (for example ``"3,10"``, the form accepted by the earlier definition
        of this function) or an iterable of such indices given as strings
        or ints.

    Returns
    -------
    list of int
        Unique values of ``ekman_to_index`` for the given emotions, in
        ascending order. Empty input yields an empty list.

    Raises
    ------
    ValueError
        If an id cannot be converted with ``int()``.
    IndexError
        If an id is outside ``emotion_list``.
    """
    if isinstance(labels, str):
        # Split first: iterating a string directly would walk it character by
        # character, breaking on ',' and on multi-digit ids such as "10".
        labels = [token for token in labels.split(',') if token.strip()]

    ekman_ids = set()  # a set removes duplicates when several ids share a class
    for label_id in labels:
        label = emotion_list[int(label_id)]  # index -> emotion name
        ekman_ids.update(
            ekman_to_index[k] for k, v in ekman_mapping.items() if label in v
        )
    return sorted(ekman_ids)  # deterministic order
# Element-wise conversion of each row's label ids into Ekman class indices.
# NOTE(review): map_labels indexes emotion_list, so test_df['labels'] must
# still hold raw emotion ids here. If test_df was produced by
# load_and_process (which already maps to Ekman indices), this would remap
# them a second time -- confirm how test_df was loaded in this session.
test_df['new_labels'] = test_df['labels'].map(map_labels)


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the Ekman indices. The explicit class list 0..6 fixes the
# column order so that column i corresponds to Ekman index i.
mlb = MultiLabelBinarizer(classes=list(range(7)))
y_true = mlb.fit(test_df['new_labels']).transform(test_df['new_labels'])  # (N, 7)


In [13]:
# Tokenise the whole test set in a single call and get PyTorch tensors back.
# padding=True pads every row to the longest sequence in the set;
# truncation=True is used without an explicit max_length.
test_encodings = tokenizer(
    text=test_df["text"].to_list(),
    truncation=True,
    padding=True,
    return_tensors="pt",
)


In [14]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Manual evaluation loop over the tokenised test set (no Trainer involved).
# Each dataset item is (input_ids, attention_mask, multi-hot labels).
dataset = TensorDataset(
    test_encodings["input_ids"],
    test_encodings["attention_mask"],
    torch.tensor(y_true, dtype=torch.float)
)
loader = DataLoader(dataset, batch_size=16)

y_preds = []
y_labels = []

# Do not rely on an earlier cell having switched the model to eval mode:
# dropout must be off for deterministic predictions.
model.eval()
# Run every batch on whatever device the model currently lives on, so the cell
# keeps working if the model is moved to a GPU.
device = model.device

with torch.no_grad():
    for batch in loader:
        input_ids, attention_mask, labels = batch
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        logits = outputs.logits
        # Independent sigmoid per class, thresholded at 0.5 (multi-label).
        preds = (torch.sigmoid(logits) > 0.5).int()

        # Move predictions back to the CPU so the final .numpy() call is valid
        # regardless of the model's device.
        y_preds.append(preds.cpu())
        y_labels.append(labels.int())

# Stack the per-batch results into (N, num_classes) integer arrays.
# Note: y_true is rebound here to the labels as they came out of the loader.
y_preds = torch.cat(y_preds).numpy()
y_true = torch.cat(y_labels).numpy()


In [15]:
from sklearn.metrics import hamming_loss, accuracy_score

# Hamming loss: fraction of individual label slots predicted wrongly.
# Subset accuracy: accuracy_score on multi-hot rows counts a sample as correct
# only when every label in the row matches.
# NOTE(review): trainer.evaluate(test_ds) earlier in this notebook reported
# eval_accuracy of about 0.61, noticeably higher than the subset accuracy
# printed here -- confirm both evaluations use the same labels and metric.
hamming = hamming_loss(y_true=y_true, y_pred=y_preds)
subset_acc = accuracy_score(y_true=y_true, y_pred=y_preds)

print(
    f"Hamming Loss: {hamming:.4f}",
    f"Subset Accuracy: {subset_acc:.4f}",
    sep="\n",
)


Hamming Loss: 0.1704
Subset Accuracy: 0.3586
