In [1]:
from transformers import pipeline
from transformers import AutoTokenizer
import torch
# import numpy as np
import pandas as pd
from transformers import TrainingArguments, Trainer
# Hub checkpoint id (a string, not a model object); later cells pass it to
# `pipeline(...)` and `AutoTokenizer.from_pretrained(...)`.
model = "distilbert-base-uncased-finetuned-sst-2-english"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device exactly once: Apple's Metal backend (MPS) when
# PyTorch reports it as available, otherwise the CPU.
#
# An explicit torch.device is kept instead of an integer ordinal (0 / -1):
# the object names its backend unambiguously, whereas a bare 0 is only an
# index that the consuming library has to map to some accelerator.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {'MPS' if device.type == 'mps' else 'CPU'}")

Using device: MPS


In [3]:
# Sentiment classifier built from the checkpoint named in `model`,
# placed on the device selected in the earlier cell.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model=model,
    device=device,
)


In [4]:
import os

# Export MKL_SERVICE_FORCE_INTEL=1 into this process's environment.
# NOTE(review): this cell runs after `torch` was imported in the first cell;
# if the variable has to be visible at library import time it should be set
# before those imports — TODO confirm.
os.environ.update(MKL_SERVICE_FORCE_INTEL="1")


In [5]:
# Smoke test: run the pipeline on two short sentences and display the
# predicted label and score for each.
data = [
    "I love you",
    "I hate you",
]
r = sentiment_pipeline(data)
r

[{'label': 'POSITIVE', 'score': 0.9998656511306763},
 {'label': 'NEGATIVE', 'score': 0.9991129040718079}]

In [6]:
# Load the tokenizer published with the `model` checkpoint, then echo it so
# the cell output shows its configuration.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
# CUDA-specific probe, one value per line:
#   1) whether PyTorch can use CUDA,
#   2) how many CUDA devices it sees.
# This says nothing about the MPS backend checked in the earlier cell.
print(torch.cuda.is_available(), torch.cuda.device_count(), sep="\n")


False
0


In [8]:
# from datasets import load_dataset

# # Load dataset from a CSV file
# data_files = {"train": "path_to_train.csv", "validation": "path_to_val.csv"}  # Update paths
# dataset = load_dataset("csv", data_files=data_files)

# # Example structure: {'text': ..., 'label': ...}
# print(dataset)


In [9]:
from utils_classes import load_and_process_comments


def _flatten_batches(batches):
    """Collect every text and label from an iterable of batches in ONE pass.

    Each batch is indexed as ``batch[0]`` (its texts) and ``batch[1]`` (its
    labels). Walking ``batches`` a single time keeps every text paired with
    its own label even if ``batches`` is a one-shot iterator or yields its
    batches in a different order each time it is iterated; two independent
    passes (one for texts, one for labels) would not guarantee that.

    Returns:
        A ``(texts, labels)`` tuple of two flat lists of equal length.
    """
    texts, labels = [], []
    for batch in batches:
        texts.extend(batch[0])
        labels.extend(batch[1])
    return texts, labels


train_comments, val_comments, test_comments, test_labels = load_and_process_comments(
    train_path='train',
    batch_size=50,
)
# NOTE(review): `val_comments` is not referenced by any cell visible here —
# confirm whether it should be used for evaluation during training.

# Flatten train_comments
train_texts, train_labels = _flatten_batches(train_comments)

# Flatten test_comments
# NOTE(review): this rebinds `test_labels`, discarding the value returned by
# load_and_process_comments above — presumably equivalent; verify.
test_texts, test_labels = _flatten_batches(test_comments)


In [10]:
# Tokenize both splits with one shared set of options so they cannot drift
# apart: truncate to at most 512 tokens and pad within each call.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)


In [11]:
# Inspection only: show which container type the tokenizer call returned.
type(train_encodings)


transformers.tokenization_utils_base.BatchEncoding

In [12]:
class CommentsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from feature name to
            an indexable sequence holding one entry per sample.
        labels: Indexable, sized sequence with one label per sample.

    Raises:
        ValueError: If any feature sequence in ``encodings`` has a different
            length than ``labels``.
    """

    def __init__(self, encodings, labels):
        # Fail early on misaligned inputs instead of raising mid-training or
        # silently ignoring trailing samples.
        for key, values in encodings.items():
            if len(values) != len(labels):
                raise ValueError(
                    f"encodings[{key!r}] has {len(values)} entries "
                    f"but {len(labels)} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict has one entry per key in ``encodings`` plus a ``"labels"``
        entry. ``torch.as_tensor`` is used so values that are already tensors
        are passed through without the copy-construct UserWarning that
        ``torch.tensor(tensor)`` emits; in that case the returned tensor may
        share memory with the stored data.
        """
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.as_tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in a CommentsDataset.
train_dataset = CommentsDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CommentsDataset(encodings=test_encodings, labels=test_labels)

In [13]:
from transformers import AutoModelForSequenceClassification

# Model to fine-tune: the base "distilbert-base-uncased" checkpoint (not the
# SST-2 checkpoint named in `model`) with a 2-class classification head.
# Change `num_labels` if the dataset has a different number of classes.
model_new = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
    num_labels=2,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
training_args = TrainingArguments(
    # --- output locations ---
    output_dir="./results",
    logging_dir="./logs",
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- cadence: log every 10 steps; evaluate and checkpoint once per epoch ---
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Reload the best checkpoint once training finishes.
    load_best_model_at_end=True,
)

In [None]:
# Fine-tune `model_new` with the settings in `training_args`.
#
# The tokenizer is handed over via `processing_class`; the `tokenizer=`
# keyword is deprecated in recent Transformers releases (the captured warning
# on this call suggests that deprecation — confirm against the installed
# version).
#
# NOTE(review): `eval_dataset` is the *test* split, and `training_args`
# sets load_best_model_at_end=True, so the checkpoint that is kept is chosen
# using test data. Consider evaluating on a validation split instead.
trainer = Trainer(
    model=model_new,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
)

trainer.train()

  trainer = Trainer(
  0%|          | 10/5625 [00:29<4:04:18,  2.61s/it]

{'loss': 0.6913, 'grad_norm': 1.0757776498794556, 'learning_rate': 1.9964444444444447e-05, 'epoch': 0.01}


  0%|          | 20/5625 [00:54<3:58:54,  2.56s/it]

{'loss': 0.6766, 'grad_norm': 1.3927199840545654, 'learning_rate': 1.992888888888889e-05, 'epoch': 0.01}


  1%|          | 30/5625 [01:19<3:56:47,  2.54s/it]

{'loss': 0.6389, 'grad_norm': 2.456381320953369, 'learning_rate': 1.9893333333333335e-05, 'epoch': 0.02}


  1%|          | 40/5625 [01:45<3:57:24,  2.55s/it]

{'loss': 0.5756, 'grad_norm': 2.932981491088867, 'learning_rate': 1.985777777777778e-05, 'epoch': 0.02}


  1%|          | 50/5625 [02:10<3:55:07,  2.53s/it]

{'loss': 0.4786, 'grad_norm': 6.559523105621338, 'learning_rate': 1.9822222222222226e-05, 'epoch': 0.03}


  1%|          | 60/5625 [02:35<3:54:45,  2.53s/it]

{'loss': 0.3612, 'grad_norm': 3.3584725856781006, 'learning_rate': 1.9786666666666668e-05, 'epoch': 0.03}


  1%|          | 70/5625 [03:01<3:54:27,  2.53s/it]

{'loss': 0.3962, 'grad_norm': 8.029385566711426, 'learning_rate': 1.9751111111111114e-05, 'epoch': 0.04}


  1%|▏         | 80/5625 [03:26<3:53:11,  2.52s/it]

{'loss': 0.3264, 'grad_norm': 9.017687797546387, 'learning_rate': 1.9715555555555556e-05, 'epoch': 0.04}


  2%|▏         | 90/5625 [03:51<3:57:24,  2.57s/it]

{'loss': 0.3786, 'grad_norm': 7.496423244476318, 'learning_rate': 1.968e-05, 'epoch': 0.05}


  2%|▏         | 100/5625 [04:18<4:10:45,  2.72s/it]

{'loss': 0.3244, 'grad_norm': 6.624642848968506, 'learning_rate': 1.9644444444444447e-05, 'epoch': 0.05}


  2%|▏         | 110/5625 [04:46<4:12:31,  2.75s/it]

{'loss': 0.4023, 'grad_norm': 4.734385967254639, 'learning_rate': 1.960888888888889e-05, 'epoch': 0.06}


  2%|▏         | 120/5625 [05:13<4:08:36,  2.71s/it]

{'loss': 0.3528, 'grad_norm': 10.13439655303955, 'learning_rate': 1.9573333333333335e-05, 'epoch': 0.06}


  2%|▏         | 130/5625 [05:40<4:06:01,  2.69s/it]

{'loss': 0.3575, 'grad_norm': 6.145967483520508, 'learning_rate': 1.953777777777778e-05, 'epoch': 0.07}


  2%|▏         | 140/5625 [06:07<4:06:24,  2.70s/it]

{'loss': 0.2396, 'grad_norm': 8.374120712280273, 'learning_rate': 1.9502222222222226e-05, 'epoch': 0.07}


  3%|▎         | 150/5625 [06:34<4:05:13,  2.69s/it]

{'loss': 0.3129, 'grad_norm': 10.080734252929688, 'learning_rate': 1.9466666666666668e-05, 'epoch': 0.08}


  3%|▎         | 160/5625 [07:01<4:03:44,  2.68s/it]

{'loss': 0.325, 'grad_norm': 8.429008483886719, 'learning_rate': 1.9431111111111113e-05, 'epoch': 0.09}


  3%|▎         | 170/5625 [07:28<4:05:03,  2.70s/it]

{'loss': 0.2917, 'grad_norm': 7.159471035003662, 'learning_rate': 1.9395555555555555e-05, 'epoch': 0.09}


  3%|▎         | 180/5625 [07:55<4:03:46,  2.69s/it]

{'loss': 0.195, 'grad_norm': 3.8266632556915283, 'learning_rate': 1.936e-05, 'epoch': 0.1}


  3%|▎         | 190/5625 [08:22<4:05:32,  2.71s/it]

{'loss': 0.3125, 'grad_norm': 12.619710922241211, 'learning_rate': 1.9324444444444447e-05, 'epoch': 0.1}


  4%|▎         | 200/5625 [08:49<4:03:48,  2.70s/it]

{'loss': 0.2434, 'grad_norm': 7.255180835723877, 'learning_rate': 1.928888888888889e-05, 'epoch': 0.11}


  4%|▎         | 210/5625 [09:16<4:02:02,  2.68s/it]

{'loss': 0.3675, 'grad_norm': 7.606292247772217, 'learning_rate': 1.9253333333333334e-05, 'epoch': 0.11}


  4%|▍         | 220/5625 [09:42<4:02:59,  2.70s/it]

{'loss': 0.3758, 'grad_norm': 5.968516826629639, 'learning_rate': 1.921777777777778e-05, 'epoch': 0.12}


  4%|▍         | 230/5625 [10:09<3:59:32,  2.66s/it]

{'loss': 0.2455, 'grad_norm': 9.170674324035645, 'learning_rate': 1.9182222222222225e-05, 'epoch': 0.12}


  4%|▍         | 240/5625 [10:36<3:59:29,  2.67s/it]

{'loss': 0.2999, 'grad_norm': 2.9509646892547607, 'learning_rate': 1.9146666666666667e-05, 'epoch': 0.13}


  4%|▍         | 250/5625 [11:03<4:01:02,  2.69s/it]

{'loss': 0.3168, 'grad_norm': 10.969223022460938, 'learning_rate': 1.9111111111111113e-05, 'epoch': 0.13}


  5%|▍         | 260/5625 [11:29<3:57:15,  2.65s/it]

{'loss': 0.2412, 'grad_norm': 8.459951400756836, 'learning_rate': 1.9075555555555555e-05, 'epoch': 0.14}


  5%|▍         | 270/5625 [11:56<4:01:04,  2.70s/it]

{'loss': 0.2508, 'grad_norm': 3.707149028778076, 'learning_rate': 1.904e-05, 'epoch': 0.14}


  5%|▍         | 280/5625 [12:23<4:01:37,  2.71s/it]

{'loss': 0.2428, 'grad_norm': 9.7522611618042, 'learning_rate': 1.9004444444444446e-05, 'epoch': 0.15}


  5%|▌         | 290/5625 [12:50<4:00:03,  2.70s/it]

{'loss': 0.3071, 'grad_norm': 16.806697845458984, 'learning_rate': 1.896888888888889e-05, 'epoch': 0.15}


  5%|▌         | 300/5625 [13:17<4:00:23,  2.71s/it]

{'loss': 0.2768, 'grad_norm': 8.48902702331543, 'learning_rate': 1.8933333333333334e-05, 'epoch': 0.16}


  6%|▌         | 310/5625 [13:44<3:56:53,  2.67s/it]

{'loss': 0.3604, 'grad_norm': 4.833974361419678, 'learning_rate': 1.889777777777778e-05, 'epoch': 0.17}


  6%|▌         | 320/5625 [14:11<4:01:10,  2.73s/it]

{'loss': 0.3532, 'grad_norm': 9.287692070007324, 'learning_rate': 1.8862222222222225e-05, 'epoch': 0.17}


  6%|▌         | 330/5625 [14:38<3:56:36,  2.68s/it]

{'loss': 0.2223, 'grad_norm': 3.173433303833008, 'learning_rate': 1.8826666666666667e-05, 'epoch': 0.18}


  6%|▌         | 340/5625 [15:05<3:51:59,  2.63s/it]

{'loss': 0.2896, 'grad_norm': 7.8174662590026855, 'learning_rate': 1.8791111111111113e-05, 'epoch': 0.18}


  6%|▌         | 350/5625 [15:31<3:51:04,  2.63s/it]

{'loss': 0.247, 'grad_norm': 6.554244518280029, 'learning_rate': 1.8755555555555558e-05, 'epoch': 0.19}


  6%|▋         | 360/5625 [15:58<3:53:13,  2.66s/it]

{'loss': 0.3593, 'grad_norm': 3.635120153427124, 'learning_rate': 1.8720000000000004e-05, 'epoch': 0.19}


  7%|▋         | 370/5625 [16:25<3:57:40,  2.71s/it]

{'loss': 0.3269, 'grad_norm': 8.25832748413086, 'learning_rate': 1.8684444444444446e-05, 'epoch': 0.2}


  7%|▋         | 380/5625 [16:52<4:00:03,  2.75s/it]

{'loss': 0.2589, 'grad_norm': 6.62159538269043, 'learning_rate': 1.8648888888888888e-05, 'epoch': 0.2}


  7%|▋         | 390/5625 [17:19<3:56:36,  2.71s/it]

{'loss': 0.3142, 'grad_norm': 9.64205551147461, 'learning_rate': 1.8613333333333334e-05, 'epoch': 0.21}


  7%|▋         | 400/5625 [17:47<4:00:07,  2.76s/it]

{'loss': 0.2657, 'grad_norm': 6.696769714355469, 'learning_rate': 1.857777777777778e-05, 'epoch': 0.21}


  7%|▋         | 410/5625 [18:14<3:58:41,  2.75s/it]

{'loss': 0.2942, 'grad_norm': 4.526856899261475, 'learning_rate': 1.8542222222222225e-05, 'epoch': 0.22}


  7%|▋         | 420/5625 [18:42<3:56:41,  2.73s/it]

{'loss': 0.2776, 'grad_norm': 15.033031463623047, 'learning_rate': 1.8506666666666667e-05, 'epoch': 0.22}


  8%|▊         | 430/5625 [19:09<3:57:17,  2.74s/it]

{'loss': 0.2989, 'grad_norm': 5.736593723297119, 'learning_rate': 1.8471111111111112e-05, 'epoch': 0.23}


  8%|▊         | 440/5625 [19:36<3:54:02,  2.71s/it]

{'loss': 0.2989, 'grad_norm': 4.316392421722412, 'learning_rate': 1.8435555555555558e-05, 'epoch': 0.23}


  8%|▊         | 450/5625 [20:03<3:53:50,  2.71s/it]

{'loss': 0.1939, 'grad_norm': 2.568378210067749, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.24}


  8%|▊         | 460/5625 [20:31<3:54:54,  2.73s/it]

{'loss': 0.229, 'grad_norm': 11.667183876037598, 'learning_rate': 1.8364444444444446e-05, 'epoch': 0.25}


  8%|▊         | 470/5625 [20:58<3:53:57,  2.72s/it]

{'loss': 0.2582, 'grad_norm': 11.034014701843262, 'learning_rate': 1.832888888888889e-05, 'epoch': 0.25}


  9%|▊         | 480/5625 [21:25<3:52:42,  2.71s/it]

{'loss': 0.2944, 'grad_norm': 6.830078125, 'learning_rate': 1.8293333333333333e-05, 'epoch': 0.26}


  9%|▊         | 490/5625 [21:52<3:50:45,  2.70s/it]

{'loss': 0.2513, 'grad_norm': 4.388125896453857, 'learning_rate': 1.825777777777778e-05, 'epoch': 0.26}


  9%|▉         | 500/5625 [22:19<3:54:43,  2.75s/it]

{'loss': 0.2418, 'grad_norm': 14.415300369262695, 'learning_rate': 1.8222222222222224e-05, 'epoch': 0.27}


  9%|▉         | 510/5625 [22:47<3:57:32,  2.79s/it]

{'loss': 0.2191, 'grad_norm': 6.5063934326171875, 'learning_rate': 1.8186666666666666e-05, 'epoch': 0.27}


  9%|▉         | 520/5625 [23:15<3:53:07,  2.74s/it]

{'loss': 0.1914, 'grad_norm': 4.6506781578063965, 'learning_rate': 1.8151111111111112e-05, 'epoch': 0.28}


  9%|▉         | 530/5625 [23:42<3:51:05,  2.72s/it]

{'loss': 0.352, 'grad_norm': 4.886295795440674, 'learning_rate': 1.8115555555555558e-05, 'epoch': 0.28}


 10%|▉         | 540/5625 [24:09<3:52:38,  2.74s/it]

{'loss': 0.2607, 'grad_norm': 8.531559944152832, 'learning_rate': 1.8080000000000003e-05, 'epoch': 0.29}


 10%|▉         | 550/5625 [24:37<3:52:57,  2.75s/it]

{'loss': 0.225, 'grad_norm': 5.085914611816406, 'learning_rate': 1.8044444444444445e-05, 'epoch': 0.29}


 10%|▉         | 560/5625 [25:04<3:50:54,  2.74s/it]

{'loss': 0.2471, 'grad_norm': 3.216747999191284, 'learning_rate': 1.800888888888889e-05, 'epoch': 0.3}


 10%|█         | 570/5625 [25:32<3:49:45,  2.73s/it]

{'loss': 0.3189, 'grad_norm': 7.305058479309082, 'learning_rate': 1.7973333333333333e-05, 'epoch': 0.3}


 10%|█         | 580/5625 [25:59<3:45:13,  2.68s/it]

{'loss': 0.1887, 'grad_norm': 4.74844217300415, 'learning_rate': 1.793777777777778e-05, 'epoch': 0.31}


 10%|█         | 590/5625 [26:26<3:48:26,  2.72s/it]

{'loss': 0.326, 'grad_norm': 13.131484031677246, 'learning_rate': 1.7902222222222224e-05, 'epoch': 0.31}


 11%|█         | 600/5625 [26:53<3:45:46,  2.70s/it]

{'loss': 0.4355, 'grad_norm': 8.238785743713379, 'learning_rate': 1.7866666666666666e-05, 'epoch': 0.32}


 11%|█         | 610/5625 [27:20<3:45:17,  2.70s/it]

{'loss': 0.2268, 'grad_norm': 8.31891918182373, 'learning_rate': 1.783111111111111e-05, 'epoch': 0.33}


 11%|█         | 620/5625 [27:47<3:47:26,  2.73s/it]

{'loss': 0.3022, 'grad_norm': 10.503755569458008, 'learning_rate': 1.7795555555555557e-05, 'epoch': 0.33}


 11%|█         | 630/5625 [28:14<3:47:39,  2.73s/it]

{'loss': 0.2784, 'grad_norm': 10.737902641296387, 'learning_rate': 1.7760000000000003e-05, 'epoch': 0.34}


 11%|█▏        | 640/5625 [28:42<3:47:27,  2.74s/it]

{'loss': 0.2005, 'grad_norm': 1.7203443050384521, 'learning_rate': 1.7724444444444445e-05, 'epoch': 0.34}


 12%|█▏        | 650/5625 [29:09<3:45:52,  2.72s/it]

{'loss': 0.2493, 'grad_norm': 5.000640869140625, 'learning_rate': 1.768888888888889e-05, 'epoch': 0.35}


 12%|█▏        | 660/5625 [29:36<3:43:16,  2.70s/it]

{'loss': 0.2526, 'grad_norm': 7.030618667602539, 'learning_rate': 1.7653333333333336e-05, 'epoch': 0.35}


 12%|█▏        | 670/5625 [30:03<3:45:21,  2.73s/it]

{'loss': 0.2732, 'grad_norm': 5.974021911621094, 'learning_rate': 1.761777777777778e-05, 'epoch': 0.36}


 12%|█▏        | 680/5625 [30:31<3:52:19,  2.82s/it]

{'loss': 0.3383, 'grad_norm': 11.876544952392578, 'learning_rate': 1.7582222222222224e-05, 'epoch': 0.36}


 12%|█▏        | 690/5625 [30:58<3:45:01,  2.74s/it]

{'loss': 0.2253, 'grad_norm': 5.734549522399902, 'learning_rate': 1.7546666666666666e-05, 'epoch': 0.37}


 12%|█▏        | 700/5625 [31:25<3:40:25,  2.69s/it]

{'loss': 0.2409, 'grad_norm': 5.807636260986328, 'learning_rate': 1.751111111111111e-05, 'epoch': 0.37}


 13%|█▎        | 710/5625 [31:52<3:40:34,  2.69s/it]

{'loss': 0.2335, 'grad_norm': 10.784811019897461, 'learning_rate': 1.7475555555555557e-05, 'epoch': 0.38}


 13%|█▎        | 720/5625 [32:20<3:43:56,  2.74s/it]

{'loss': 0.149, 'grad_norm': 1.1077824831008911, 'learning_rate': 1.7440000000000002e-05, 'epoch': 0.38}


 13%|█▎        | 730/5625 [32:47<3:44:12,  2.75s/it]

{'loss': 0.2249, 'grad_norm': 7.006741523742676, 'learning_rate': 1.7404444444444445e-05, 'epoch': 0.39}


 13%|█▎        | 740/5625 [33:14<3:41:28,  2.72s/it]

{'loss': 0.3977, 'grad_norm': 12.056144714355469, 'learning_rate': 1.736888888888889e-05, 'epoch': 0.39}


 13%|█▎        | 750/5625 [33:41<3:38:47,  2.69s/it]

{'loss': 0.2982, 'grad_norm': 17.632282257080078, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.4}


 14%|█▎        | 760/5625 [34:08<3:38:58,  2.70s/it]

{'loss': 0.2504, 'grad_norm': 13.545416831970215, 'learning_rate': 1.729777777777778e-05, 'epoch': 0.41}


 14%|█▎        | 770/5625 [34:36<3:42:12,  2.75s/it]

{'loss': 0.2558, 'grad_norm': 5.918613433837891, 'learning_rate': 1.7262222222222223e-05, 'epoch': 0.41}


 14%|█▍        | 780/5625 [35:03<3:39:51,  2.72s/it]

{'loss': 0.2227, 'grad_norm': 3.7579104900360107, 'learning_rate': 1.7226666666666665e-05, 'epoch': 0.42}


 14%|█▍        | 790/5625 [35:30<3:37:24,  2.70s/it]

{'loss': 0.2473, 'grad_norm': 8.422957420349121, 'learning_rate': 1.719111111111111e-05, 'epoch': 0.42}


 14%|█▍        | 800/5625 [35:57<3:36:40,  2.69s/it]

{'loss': 0.2338, 'grad_norm': 10.09135913848877, 'learning_rate': 1.7155555555555557e-05, 'epoch': 0.43}


 14%|█▍        | 810/5625 [36:24<3:39:11,  2.73s/it]

{'loss': 0.326, 'grad_norm': 1.8441932201385498, 'learning_rate': 1.7120000000000002e-05, 'epoch': 0.43}


 15%|█▍        | 820/5625 [36:51<3:36:15,  2.70s/it]

{'loss': 0.2386, 'grad_norm': 8.489684104919434, 'learning_rate': 1.7084444444444444e-05, 'epoch': 0.44}


 15%|█▍        | 830/5625 [37:18<3:36:41,  2.71s/it]

{'loss': 0.2085, 'grad_norm': 2.050258159637451, 'learning_rate': 1.704888888888889e-05, 'epoch': 0.44}


 15%|█▍        | 840/5625 [37:46<3:41:43,  2.78s/it]

{'loss': 0.1484, 'grad_norm': 8.556418418884277, 'learning_rate': 1.7013333333333335e-05, 'epoch': 0.45}


 15%|█▌        | 850/5625 [38:13<3:39:04,  2.75s/it]

{'loss': 0.2151, 'grad_norm': 15.983199119567871, 'learning_rate': 1.697777777777778e-05, 'epoch': 0.45}


 15%|█▌        | 860/5625 [38:41<3:40:57,  2.78s/it]

{'loss': 0.4639, 'grad_norm': 16.557130813598633, 'learning_rate': 1.6942222222222223e-05, 'epoch': 0.46}


 15%|█▌        | 870/5625 [39:08<3:35:38,  2.72s/it]

{'loss': 0.2152, 'grad_norm': 6.674469470977783, 'learning_rate': 1.690666666666667e-05, 'epoch': 0.46}


 16%|█▌        | 880/5625 [39:35<3:34:00,  2.71s/it]

{'loss': 0.1539, 'grad_norm': 10.337217330932617, 'learning_rate': 1.687111111111111e-05, 'epoch': 0.47}


 16%|█▌        | 890/5625 [40:02<3:33:25,  2.70s/it]

{'loss': 0.2329, 'grad_norm': 9.434793472290039, 'learning_rate': 1.6835555555555556e-05, 'epoch': 0.47}


 16%|█▌        | 900/5625 [40:30<3:35:18,  2.73s/it]

{'loss': 0.1659, 'grad_norm': 5.493890762329102, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.48}


 16%|█▌        | 910/5625 [40:57<3:34:00,  2.72s/it]

{'loss': 0.2617, 'grad_norm': 6.332819938659668, 'learning_rate': 1.6764444444444444e-05, 'epoch': 0.49}


 16%|█▋        | 920/5625 [41:24<3:34:29,  2.74s/it]

{'loss': 0.2751, 'grad_norm': 7.062108516693115, 'learning_rate': 1.672888888888889e-05, 'epoch': 0.49}


 17%|█▋        | 930/5625 [41:51<3:31:33,  2.70s/it]

{'loss': 0.1749, 'grad_norm': 4.748235702514648, 'learning_rate': 1.6693333333333335e-05, 'epoch': 0.5}


 17%|█▋        | 940/5625 [42:19<3:35:26,  2.76s/it]

{'loss': 0.2242, 'grad_norm': 6.867767810821533, 'learning_rate': 1.665777777777778e-05, 'epoch': 0.5}


 17%|█▋        | 950/5625 [42:47<3:34:59,  2.76s/it]

{'loss': 0.2207, 'grad_norm': 2.1321587562561035, 'learning_rate': 1.6622222222222223e-05, 'epoch': 0.51}


 17%|█▋        | 960/5625 [43:13<3:27:17,  2.67s/it]

{'loss': 0.3239, 'grad_norm': 7.744108200073242, 'learning_rate': 1.6586666666666668e-05, 'epoch': 0.51}


 17%|█▋        | 970/5625 [43:40<3:25:40,  2.65s/it]

{'loss': 0.2161, 'grad_norm': 3.4093165397644043, 'learning_rate': 1.6551111111111114e-05, 'epoch': 0.52}


 17%|█▋        | 980/5625 [44:07<3:30:21,  2.72s/it]

{'loss': 0.3407, 'grad_norm': 10.257011413574219, 'learning_rate': 1.651555555555556e-05, 'epoch': 0.52}


 18%|█▊        | 990/5625 [44:34<3:27:52,  2.69s/it]

{'loss': 0.2633, 'grad_norm': 11.209884643554688, 'learning_rate': 1.648e-05, 'epoch': 0.53}


 18%|█▊        | 1000/5625 [45:00<3:23:35,  2.64s/it]

{'loss': 0.1393, 'grad_norm': 6.915460586547852, 'learning_rate': 1.6444444444444444e-05, 'epoch': 0.53}


 18%|█▊        | 1010/5625 [45:27<3:23:05,  2.64s/it]

{'loss': 0.2989, 'grad_norm': 13.20694351196289, 'learning_rate': 1.640888888888889e-05, 'epoch': 0.54}


 18%|█▊        | 1020/5625 [45:53<3:23:16,  2.65s/it]

{'loss': 0.3122, 'grad_norm': 13.23774528503418, 'learning_rate': 1.6373333333333335e-05, 'epoch': 0.54}


 18%|█▊        | 1030/5625 [46:20<3:26:48,  2.70s/it]

{'loss': 0.2971, 'grad_norm': 5.748946189880371, 'learning_rate': 1.633777777777778e-05, 'epoch': 0.55}


 18%|█▊        | 1040/5625 [46:47<3:26:15,  2.70s/it]

{'loss': 0.2338, 'grad_norm': 6.457972526550293, 'learning_rate': 1.6302222222222222e-05, 'epoch': 0.55}


 19%|█▊        | 1050/5625 [47:14<3:24:02,  2.68s/it]

{'loss': 0.3085, 'grad_norm': 13.74535846710205, 'learning_rate': 1.6266666666666668e-05, 'epoch': 0.56}


 19%|█▉        | 1060/5625 [47:40<3:22:00,  2.66s/it]

{'loss': 0.159, 'grad_norm': 6.1180853843688965, 'learning_rate': 1.6231111111111113e-05, 'epoch': 0.57}


 19%|█▉        | 1070/5625 [48:07<3:24:23,  2.69s/it]

{'loss': 0.2016, 'grad_norm': 1.4568464756011963, 'learning_rate': 1.619555555555556e-05, 'epoch': 0.57}


 19%|█▉        | 1080/5625 [48:34<3:21:13,  2.66s/it]

{'loss': 0.171, 'grad_norm': 8.095763206481934, 'learning_rate': 1.616e-05, 'epoch': 0.58}


 19%|█▉        | 1090/5625 [49:00<3:18:37,  2.63s/it]

{'loss': 0.3233, 'grad_norm': 12.538481712341309, 'learning_rate': 1.6124444444444443e-05, 'epoch': 0.58}


 20%|█▉        | 1100/5625 [49:26<3:17:08,  2.61s/it]

{'loss': 0.2243, 'grad_norm': 2.875605821609497, 'learning_rate': 1.608888888888889e-05, 'epoch': 0.59}


 20%|█▉        | 1110/5625 [49:53<3:20:38,  2.67s/it]

{'loss': 0.2715, 'grad_norm': 5.928725242614746, 'learning_rate': 1.6053333333333334e-05, 'epoch': 0.59}


 20%|█▉        | 1120/5625 [50:18<3:19:29,  2.66s/it]

{'loss': 0.2464, 'grad_norm': 2.8307576179504395, 'learning_rate': 1.601777777777778e-05, 'epoch': 0.6}


 20%|██        | 1130/5625 [50:44<3:14:59,  2.60s/it]

{'loss': 0.2078, 'grad_norm': 8.62723445892334, 'learning_rate': 1.5982222222222222e-05, 'epoch': 0.6}


 20%|██        | 1140/5625 [51:09<3:09:05,  2.53s/it]

{'loss': 0.3249, 'grad_norm': 1.7110470533370972, 'learning_rate': 1.5946666666666668e-05, 'epoch': 0.61}


 20%|██        | 1150/5625 [51:34<3:06:53,  2.51s/it]

{'loss': 0.251, 'grad_norm': 11.364871978759766, 'learning_rate': 1.5911111111111113e-05, 'epoch': 0.61}


 21%|██        | 1160/5625 [52:02<3:28:17,  2.80s/it]

{'loss': 0.2146, 'grad_norm': 6.277186393737793, 'learning_rate': 1.587555555555556e-05, 'epoch': 0.62}


 21%|██        | 1170/5625 [52:28<3:08:27,  2.54s/it]

{'loss': 0.1724, 'grad_norm': 4.517725467681885, 'learning_rate': 1.584e-05, 'epoch': 0.62}


 21%|██        | 1180/5625 [52:53<3:04:41,  2.49s/it]

{'loss': 0.2876, 'grad_norm': 7.320858001708984, 'learning_rate': 1.5804444444444446e-05, 'epoch': 0.63}


 21%|██        | 1190/5625 [53:18<3:07:01,  2.53s/it]

{'loss': 0.2097, 'grad_norm': 2.4553780555725098, 'learning_rate': 1.576888888888889e-05, 'epoch': 0.63}


 21%|██▏       | 1200/5625 [53:43<3:05:00,  2.51s/it]

{'loss': 0.1817, 'grad_norm': 5.256978511810303, 'learning_rate': 1.5733333333333334e-05, 'epoch': 0.64}


 22%|██▏       | 1210/5625 [54:08<3:03:25,  2.49s/it]

{'loss': 0.2002, 'grad_norm': 11.39245319366455, 'learning_rate': 1.569777777777778e-05, 'epoch': 0.65}


 22%|██▏       | 1220/5625 [54:33<3:00:18,  2.46s/it]

{'loss': 0.2436, 'grad_norm': 7.446401119232178, 'learning_rate': 1.5662222222222222e-05, 'epoch': 0.65}


 22%|██▏       | 1230/5625 [54:57<2:57:10,  2.42s/it]

{'loss': 0.3104, 'grad_norm': 6.895421028137207, 'learning_rate': 1.5626666666666667e-05, 'epoch': 0.66}


 22%|██▏       | 1240/5625 [55:21<2:56:51,  2.42s/it]

{'loss': 0.2377, 'grad_norm': 4.99259090423584, 'learning_rate': 1.5591111111111113e-05, 'epoch': 0.66}


 22%|██▏       | 1250/5625 [55:46<2:56:10,  2.42s/it]

{'loss': 0.3064, 'grad_norm': 6.466382026672363, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.67}


 22%|██▏       | 1260/5625 [56:10<2:56:07,  2.42s/it]

{'loss': 0.204, 'grad_norm': 4.862316131591797, 'learning_rate': 1.552e-05, 'epoch': 0.67}


 23%|██▎       | 1270/5625 [56:34<2:55:11,  2.41s/it]

{'loss': 0.2079, 'grad_norm': 8.428715705871582, 'learning_rate': 1.5484444444444446e-05, 'epoch': 0.68}


 23%|██▎       | 1280/5625 [56:58<2:54:59,  2.42s/it]

{'loss': 0.2408, 'grad_norm': 5.714728832244873, 'learning_rate': 1.544888888888889e-05, 'epoch': 0.68}


 23%|██▎       | 1290/5625 [57:22<2:58:37,  2.47s/it]

{'loss': 0.2337, 'grad_norm': 10.027798652648926, 'learning_rate': 1.5413333333333337e-05, 'epoch': 0.69}


 23%|██▎       | 1300/5625 [57:47<2:54:42,  2.42s/it]

{'loss': 0.2405, 'grad_norm': 12.364171028137207, 'learning_rate': 1.537777777777778e-05, 'epoch': 0.69}


 23%|██▎       | 1310/5625 [58:11<2:54:14,  2.42s/it]

{'loss': 0.2376, 'grad_norm': 7.690675735473633, 'learning_rate': 1.534222222222222e-05, 'epoch': 0.7}


 23%|██▎       | 1320/5625 [58:35<2:53:14,  2.41s/it]

{'loss': 0.2064, 'grad_norm': 2.14184832572937, 'learning_rate': 1.5306666666666667e-05, 'epoch': 0.7}


 24%|██▎       | 1330/5625 [58:59<2:52:16,  2.41s/it]

{'loss': 0.2426, 'grad_norm': 5.736889839172363, 'learning_rate': 1.5271111111111112e-05, 'epoch': 0.71}


 24%|██▍       | 1340/5625 [59:23<2:52:02,  2.41s/it]

{'loss': 0.2613, 'grad_norm': 6.879069805145264, 'learning_rate': 1.5235555555555556e-05, 'epoch': 0.71}


 24%|██▍       | 1350/5625 [59:49<2:58:21,  2.50s/it]

{'loss': 0.2425, 'grad_norm': 8.424748420715332, 'learning_rate': 1.5200000000000002e-05, 'epoch': 0.72}


 24%|██▍       | 1360/5625 [1:00:14<2:56:29,  2.48s/it]

{'loss': 0.1776, 'grad_norm': 9.644362449645996, 'learning_rate': 1.5164444444444446e-05, 'epoch': 0.73}


 24%|██▍       | 1370/5625 [1:00:38<2:53:09,  2.44s/it]

{'loss': 0.216, 'grad_norm': 12.625131607055664, 'learning_rate': 1.5128888888888891e-05, 'epoch': 0.73}


 25%|██▍       | 1380/5625 [1:01:03<2:54:01,  2.46s/it]

{'loss': 0.2429, 'grad_norm': 1.2342337369918823, 'learning_rate': 1.5093333333333335e-05, 'epoch': 0.74}


 25%|██▍       | 1390/5625 [1:01:27<2:52:15,  2.44s/it]

{'loss': 0.4043, 'grad_norm': 5.366588115692139, 'learning_rate': 1.505777777777778e-05, 'epoch': 0.74}


 25%|██▍       | 1400/5625 [1:01:52<2:50:02,  2.41s/it]

{'loss': 0.1984, 'grad_norm': 4.821699619293213, 'learning_rate': 1.5022222222222223e-05, 'epoch': 0.75}


 25%|██▌       | 1410/5625 [1:02:16<2:56:15,  2.51s/it]

{'loss': 0.2805, 'grad_norm': 10.031403541564941, 'learning_rate': 1.4986666666666667e-05, 'epoch': 0.75}


 25%|██▌       | 1420/5625 [1:02:43<3:05:09,  2.64s/it]

{'loss': 0.2872, 'grad_norm': 4.845560073852539, 'learning_rate': 1.4951111111111112e-05, 'epoch': 0.76}


 25%|██▌       | 1430/5625 [1:03:09<2:59:10,  2.56s/it]

{'loss': 0.2627, 'grad_norm': 6.214970111846924, 'learning_rate': 1.4915555555555556e-05, 'epoch': 0.76}


 26%|██▌       | 1440/5625 [1:03:35<3:08:51,  2.71s/it]

{'loss': 0.198, 'grad_norm': 6.623776912689209, 'learning_rate': 1.4880000000000002e-05, 'epoch': 0.77}


 26%|██▌       | 1450/5625 [1:04:02<3:01:18,  2.61s/it]

{'loss': 0.3065, 'grad_norm': 7.18418550491333, 'learning_rate': 1.4844444444444445e-05, 'epoch': 0.77}


 26%|██▌       | 1460/5625 [1:04:28<3:03:14,  2.64s/it]

{'loss': 0.2246, 'grad_norm': 5.602612018585205, 'learning_rate': 1.4808888888888891e-05, 'epoch': 0.78}


 26%|██▌       | 1470/5625 [1:04:54<2:55:40,  2.54s/it]

{'loss': 0.2121, 'grad_norm': 2.711733818054199, 'learning_rate': 1.4773333333333335e-05, 'epoch': 0.78}


 26%|██▋       | 1480/5625 [1:05:19<2:54:35,  2.53s/it]

{'loss': 0.1339, 'grad_norm': 3.677867889404297, 'learning_rate': 1.473777777777778e-05, 'epoch': 0.79}


 26%|██▋       | 1490/5625 [1:05:44<2:52:13,  2.50s/it]

{'loss': 0.1865, 'grad_norm': 10.120269775390625, 'learning_rate': 1.4702222222222224e-05, 'epoch': 0.79}


 27%|██▋       | 1500/5625 [1:06:09<2:57:31,  2.58s/it]

{'loss': 0.2609, 'grad_norm': 2.705946445465088, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.8}


 27%|██▋       | 1510/5625 [1:06:35<2:53:08,  2.52s/it]

{'loss': 0.253, 'grad_norm': 8.181340217590332, 'learning_rate': 1.4631111111111112e-05, 'epoch': 0.81}


 27%|██▋       | 1520/5625 [1:07:00<2:51:32,  2.51s/it]

{'loss': 0.2069, 'grad_norm': 6.150300979614258, 'learning_rate': 1.4595555555555556e-05, 'epoch': 0.81}


 27%|██▋       | 1530/5625 [1:07:25<2:50:47,  2.50s/it]

{'loss': 0.2479, 'grad_norm': 3.3802995681762695, 'learning_rate': 1.4560000000000001e-05, 'epoch': 0.82}


 27%|██▋       | 1540/5625 [1:07:51<2:53:50,  2.55s/it]

{'loss': 0.1259, 'grad_norm': 9.865365028381348, 'learning_rate': 1.4524444444444445e-05, 'epoch': 0.82}


 28%|██▊       | 1550/5625 [1:08:18<2:59:28,  2.64s/it]

{'loss': 0.3359, 'grad_norm': 14.169591903686523, 'learning_rate': 1.448888888888889e-05, 'epoch': 0.83}


 28%|██▊       | 1560/5625 [1:08:43<2:48:06,  2.48s/it]

{'loss': 0.2173, 'grad_norm': 7.634241580963135, 'learning_rate': 1.4453333333333334e-05, 'epoch': 0.83}


 28%|██▊       | 1570/5625 [1:09:08<2:49:34,  2.51s/it]

{'loss': 0.2465, 'grad_norm': 3.1743688583374023, 'learning_rate': 1.441777777777778e-05, 'epoch': 0.84}


 28%|██▊       | 1580/5625 [1:09:32<2:45:15,  2.45s/it]

{'loss': 0.2257, 'grad_norm': 7.822437763214111, 'learning_rate': 1.4382222222222224e-05, 'epoch': 0.84}


 28%|██▊       | 1590/5625 [1:09:57<2:47:22,  2.49s/it]

{'loss': 0.1241, 'grad_norm': 4.239272117614746, 'learning_rate': 1.434666666666667e-05, 'epoch': 0.85}


 28%|██▊       | 1600/5625 [1:10:24<2:59:00,  2.67s/it]

{'loss': 0.2612, 'grad_norm': 8.734350204467773, 'learning_rate': 1.4311111111111111e-05, 'epoch': 0.85}


 29%|██▊       | 1610/5625 [1:10:49<2:54:45,  2.61s/it]

{'loss': 0.3215, 'grad_norm': 11.838032722473145, 'learning_rate': 1.4275555555555555e-05, 'epoch': 0.86}


 29%|██▉       | 1620/5625 [1:11:15<2:52:24,  2.58s/it]

{'loss': 0.2588, 'grad_norm': 9.924620628356934, 'learning_rate': 1.4240000000000001e-05, 'epoch': 0.86}


 29%|██▉       | 1630/5625 [1:11:48<3:49:00,  3.44s/it]

{'loss': 0.2452, 'grad_norm': 5.8739333152771, 'learning_rate': 1.4204444444444445e-05, 'epoch': 0.87}


 29%|██▉       | 1640/5625 [1:12:14<2:52:23,  2.60s/it]

{'loss': 0.223, 'grad_norm': 2.9123754501342773, 'learning_rate': 1.416888888888889e-05, 'epoch': 0.87}


 29%|██▉       | 1650/5625 [1:12:40<2:48:22,  2.54s/it]

{'loss': 0.2104, 'grad_norm': 3.6849215030670166, 'learning_rate': 1.4133333333333334e-05, 'epoch': 0.88}


 30%|██▉       | 1660/5625 [1:13:04<2:40:42,  2.43s/it]

{'loss': 0.156, 'grad_norm': 0.8356369733810425, 'learning_rate': 1.409777777777778e-05, 'epoch': 0.89}


 30%|██▉       | 1670/5625 [1:13:29<2:41:06,  2.44s/it]

{'loss': 0.15, 'grad_norm': 8.323168754577637, 'learning_rate': 1.4062222222222223e-05, 'epoch': 0.89}


 30%|██▉       | 1680/5625 [1:13:53<2:41:14,  2.45s/it]

{'loss': 0.1792, 'grad_norm': 9.915739059448242, 'learning_rate': 1.4026666666666669e-05, 'epoch': 0.9}


 30%|███       | 1690/5625 [1:14:18<2:44:08,  2.50s/it]

{'loss': 0.1204, 'grad_norm': 0.29058414697647095, 'learning_rate': 1.3991111111111113e-05, 'epoch': 0.9}


 30%|███       | 1700/5625 [1:14:43<2:39:21,  2.44s/it]

{'loss': 0.3339, 'grad_norm': 5.567606449127197, 'learning_rate': 1.3955555555555558e-05, 'epoch': 0.91}


 30%|███       | 1710/5625 [1:15:08<2:45:22,  2.53s/it]

{'loss': 0.1502, 'grad_norm': 4.657992839813232, 'learning_rate': 1.392e-05, 'epoch': 0.91}


 31%|███       | 1720/5625 [1:15:32<2:40:32,  2.47s/it]

{'loss': 0.2004, 'grad_norm': 1.1733512878417969, 'learning_rate': 1.3884444444444444e-05, 'epoch': 0.92}


 31%|███       | 1730/5625 [1:15:57<2:40:48,  2.48s/it]

{'loss': 0.3412, 'grad_norm': 3.332545518875122, 'learning_rate': 1.384888888888889e-05, 'epoch': 0.92}


 31%|███       | 1740/5625 [1:16:22<2:45:28,  2.56s/it]

{'loss': 0.2961, 'grad_norm': 2.673657178878784, 'learning_rate': 1.3813333333333334e-05, 'epoch': 0.93}


 31%|███       | 1750/5625 [1:16:47<2:42:49,  2.52s/it]

{'loss': 0.1296, 'grad_norm': 11.939236640930176, 'learning_rate': 1.377777777777778e-05, 'epoch': 0.93}


 31%|███▏      | 1760/5625 [1:17:14<2:47:58,  2.61s/it]

{'loss': 0.2212, 'grad_norm': 2.7254035472869873, 'learning_rate': 1.3742222222222223e-05, 'epoch': 0.94}


 31%|███▏      | 1770/5625 [1:17:39<2:39:04,  2.48s/it]

{'loss': 0.2241, 'grad_norm': 9.871827125549316, 'learning_rate': 1.3706666666666669e-05, 'epoch': 0.94}


 32%|███▏      | 1780/5625 [1:18:07<2:55:21,  2.74s/it]

{'loss': 0.1827, 'grad_norm': 3.4168996810913086, 'learning_rate': 1.3671111111111113e-05, 'epoch': 0.95}


 32%|███▏      | 1790/5625 [1:18:32<2:45:14,  2.59s/it]

{'loss': 0.2723, 'grad_norm': 7.253110408782959, 'learning_rate': 1.3635555555555558e-05, 'epoch': 0.95}


 32%|███▏      | 1800/5625 [1:18:58<2:40:03,  2.51s/it]

{'loss': 0.2116, 'grad_norm': 1.7197450399398804, 'learning_rate': 1.3600000000000002e-05, 'epoch': 0.96}


 32%|███▏      | 1810/5625 [1:19:23<2:36:46,  2.47s/it]

{'loss': 0.2572, 'grad_norm': 10.525096893310547, 'learning_rate': 1.3564444444444444e-05, 'epoch': 0.97}


 32%|███▏      | 1820/5625 [1:19:47<2:32:38,  2.41s/it]

{'loss': 0.259, 'grad_norm': 5.322013854980469, 'learning_rate': 1.352888888888889e-05, 'epoch': 0.97}


 33%|███▎      | 1830/5625 [1:20:11<2:33:37,  2.43s/it]

{'loss': 0.1919, 'grad_norm': 4.7418365478515625, 'learning_rate': 1.3493333333333333e-05, 'epoch': 0.98}


 33%|███▎      | 1840/5625 [1:20:36<2:36:10,  2.48s/it]

{'loss': 0.2501, 'grad_norm': 4.602412700653076, 'learning_rate': 1.3457777777777779e-05, 'epoch': 0.98}


 33%|███▎      | 1850/5625 [1:21:01<2:38:54,  2.53s/it]

{'loss': 0.2876, 'grad_norm': 2.022270441055298, 'learning_rate': 1.3422222222222223e-05, 'epoch': 0.99}


 33%|███▎      | 1860/5625 [1:21:26<2:40:26,  2.56s/it]

{'loss': 0.2597, 'grad_norm': 12.392839431762695, 'learning_rate': 1.3386666666666668e-05, 'epoch': 0.99}


 33%|███▎      | 1870/5625 [1:21:52<2:42:12,  2.59s/it]

{'loss': 0.174, 'grad_norm': 11.084526062011719, 'learning_rate': 1.3351111111111112e-05, 'epoch': 1.0}


                                                       
 33%|███▎      | 1875/5625 [1:28:04<2:40:15,  2.56s/it]

{'eval_loss': 0.21309563517570496, 'eval_runtime': 358.4394, 'eval_samples_per_second': 20.924, 'eval_steps_per_second': 1.308, 'epoch': 1.0}


 33%|███▎      | 1880/5625 [1:28:23<30:00:16, 28.84s/it]  

{'loss': 0.1155, 'grad_norm': 1.718192458152771, 'learning_rate': 1.3315555555555558e-05, 'epoch': 1.0}


 34%|███▎      | 1890/5625 [1:28:50<3:26:57,  3.32s/it] 

{'loss': 0.1107, 'grad_norm': 1.441074252128601, 'learning_rate': 1.3280000000000002e-05, 'epoch': 1.01}


 34%|███▍      | 1900/5625 [1:29:15<2:37:15,  2.53s/it]

{'loss': 0.2593, 'grad_norm': 11.789787292480469, 'learning_rate': 1.3244444444444447e-05, 'epoch': 1.01}


 34%|███▍      | 1910/5625 [1:29:40<2:40:12,  2.59s/it]

{'loss': 0.146, 'grad_norm': 6.909331798553467, 'learning_rate': 1.320888888888889e-05, 'epoch': 1.02}


 34%|███▍      | 1920/5625 [1:30:09<3:03:11,  2.97s/it]

{'loss': 0.1054, 'grad_norm': 0.17156977951526642, 'learning_rate': 1.3173333333333333e-05, 'epoch': 1.02}


 34%|███▍      | 1930/5625 [1:30:36<2:40:48,  2.61s/it]

{'loss': 0.2051, 'grad_norm': 8.53600788116455, 'learning_rate': 1.3137777777777779e-05, 'epoch': 1.03}


 34%|███▍      | 1940/5625 [1:31:02<2:39:00,  2.59s/it]

{'loss': 0.1532, 'grad_norm': 7.050012588500977, 'learning_rate': 1.3102222222222223e-05, 'epoch': 1.03}


 35%|███▍      | 1950/5625 [1:31:27<2:39:21,  2.60s/it]

{'loss': 0.1523, 'grad_norm': 9.591782569885254, 'learning_rate': 1.3066666666666668e-05, 'epoch': 1.04}


 35%|███▍      | 1960/5625 [1:31:53<2:35:54,  2.55s/it]

{'loss': 0.1343, 'grad_norm': 1.1332614421844482, 'learning_rate': 1.3031111111111112e-05, 'epoch': 1.05}


 35%|███▌      | 1970/5625 [1:32:19<2:35:07,  2.55s/it]

{'loss': 0.1654, 'grad_norm': 4.9645562171936035, 'learning_rate': 1.2995555555555557e-05, 'epoch': 1.05}


 35%|███▌      | 1980/5625 [1:32:44<2:35:01,  2.55s/it]

{'loss': 0.1574, 'grad_norm': 6.546961307525635, 'learning_rate': 1.2960000000000001e-05, 'epoch': 1.06}


 35%|███▌      | 1990/5625 [1:33:12<2:55:33,  2.90s/it]

{'loss': 0.0934, 'grad_norm': 0.20022746920585632, 'learning_rate': 1.2924444444444447e-05, 'epoch': 1.06}


 36%|███▌      | 2000/5625 [1:33:39<2:47:28,  2.77s/it]

{'loss': 0.0953, 'grad_norm': 15.046198844909668, 'learning_rate': 1.288888888888889e-05, 'epoch': 1.07}


 36%|███▌      | 2010/5625 [1:34:10<3:10:33,  3.16s/it]

{'loss': 0.1003, 'grad_norm': 5.953245639801025, 'learning_rate': 1.2853333333333336e-05, 'epoch': 1.07}


 36%|███▌      | 2020/5625 [1:34:38<2:43:34,  2.72s/it]

{'loss': 0.1522, 'grad_norm': 10.708683967590332, 'learning_rate': 1.2817777777777778e-05, 'epoch': 1.08}


 36%|███▌      | 2030/5625 [1:35:06<2:41:52,  2.70s/it]

{'loss': 0.0818, 'grad_norm': 0.9081658720970154, 'learning_rate': 1.2782222222222222e-05, 'epoch': 1.08}


 36%|███▋      | 2040/5625 [1:35:31<2:30:01,  2.51s/it]

{'loss': 0.0478, 'grad_norm': 0.14287753403186798, 'learning_rate': 1.2746666666666668e-05, 'epoch': 1.09}


 36%|███▋      | 2050/5625 [1:35:56<2:34:23,  2.59s/it]

{'loss': 0.2085, 'grad_norm': 10.323556900024414, 'learning_rate': 1.2711111111111112e-05, 'epoch': 1.09}


 37%|███▋      | 2060/5625 [1:36:22<2:30:58,  2.54s/it]

{'loss': 0.3154, 'grad_norm': 1.0298333168029785, 'learning_rate': 1.2675555555555557e-05, 'epoch': 1.1}


 37%|███▋      | 2070/5625 [1:36:47<2:27:48,  2.49s/it]

{'loss': 0.0708, 'grad_norm': 7.507911205291748, 'learning_rate': 1.2640000000000001e-05, 'epoch': 1.1}


 37%|███▋      | 2080/5625 [1:37:13<2:29:25,  2.53s/it]

{'loss': 0.194, 'grad_norm': 6.836507320404053, 'learning_rate': 1.2604444444444446e-05, 'epoch': 1.11}


 37%|███▋      | 2090/5625 [1:37:38<2:28:01,  2.51s/it]

{'loss': 0.1916, 'grad_norm': 7.1635236740112305, 'learning_rate': 1.256888888888889e-05, 'epoch': 1.11}


 37%|███▋      | 2100/5625 [1:38:05<2:34:51,  2.64s/it]

{'loss': 0.1746, 'grad_norm': 3.2747185230255127, 'learning_rate': 1.2533333333333336e-05, 'epoch': 1.12}


 38%|███▊      | 2110/5625 [1:38:32<2:41:45,  2.76s/it]

{'loss': 0.1815, 'grad_norm': 6.954092979431152, 'learning_rate': 1.249777777777778e-05, 'epoch': 1.13}


 38%|███▊      | 2120/5625 [1:39:00<2:41:46,  2.77s/it]

{'loss': 0.1378, 'grad_norm': 1.5691715478897095, 'learning_rate': 1.2462222222222222e-05, 'epoch': 1.13}


 38%|███▊      | 2130/5625 [1:39:30<2:50:30,  2.93s/it]

{'loss': 0.1296, 'grad_norm': 9.24160099029541, 'learning_rate': 1.2426666666666667e-05, 'epoch': 1.14}


 38%|███▊      | 2140/5625 [1:39:55<2:25:10,  2.50s/it]

{'loss': 0.082, 'grad_norm': 0.7997148036956787, 'learning_rate': 1.2391111111111111e-05, 'epoch': 1.14}


 38%|███▊      | 2150/5625 [1:40:19<2:21:28,  2.44s/it]

{'loss': 0.1894, 'grad_norm': 5.457069396972656, 'learning_rate': 1.2355555555555557e-05, 'epoch': 1.15}


 38%|███▊      | 2160/5625 [1:40:44<2:20:21,  2.43s/it]

{'loss': 0.154, 'grad_norm': 7.519636631011963, 'learning_rate': 1.232e-05, 'epoch': 1.15}


 39%|███▊      | 2170/5625 [1:41:08<2:19:58,  2.43s/it]

{'loss': 0.0892, 'grad_norm': 1.962371826171875, 'learning_rate': 1.2284444444444446e-05, 'epoch': 1.16}


 39%|███▉      | 2180/5625 [1:41:32<2:20:35,  2.45s/it]

{'loss': 0.1909, 'grad_norm': 7.720769882202148, 'learning_rate': 1.224888888888889e-05, 'epoch': 1.16}


 39%|███▉      | 2190/5625 [1:41:58<2:28:20,  2.59s/it]

{'loss': 0.1719, 'grad_norm': 5.830695152282715, 'learning_rate': 1.2213333333333336e-05, 'epoch': 1.17}


 39%|███▉      | 2200/5625 [1:42:25<2:27:31,  2.58s/it]

{'loss': 0.1743, 'grad_norm': 14.22662353515625, 'learning_rate': 1.217777777777778e-05, 'epoch': 1.17}


 39%|███▉      | 2210/5625 [1:42:50<2:21:58,  2.49s/it]

{'loss': 0.1326, 'grad_norm': 8.774093627929688, 'learning_rate': 1.2142222222222225e-05, 'epoch': 1.18}


 39%|███▉      | 2220/5625 [1:43:15<2:23:14,  2.52s/it]

{'loss': 0.2502, 'grad_norm': 5.436602592468262, 'learning_rate': 1.2106666666666667e-05, 'epoch': 1.18}


 40%|███▉      | 2230/5625 [1:43:41<2:28:20,  2.62s/it]

{'loss': 0.1901, 'grad_norm': 11.161641120910645, 'learning_rate': 1.2071111111111111e-05, 'epoch': 1.19}


 40%|███▉      | 2240/5625 [1:44:06<2:15:29,  2.40s/it]

{'loss': 0.1169, 'grad_norm': 0.24536347389221191, 'learning_rate': 1.2035555555555556e-05, 'epoch': 1.19}


 40%|████      | 2250/5625 [1:44:30<2:13:04,  2.37s/it]

{'loss': 0.1735, 'grad_norm': 7.500782012939453, 'learning_rate': 1.2e-05, 'epoch': 1.2}


 40%|████      | 2260/5625 [1:44:53<2:12:48,  2.37s/it]

{'loss': 0.0597, 'grad_norm': 0.3656861186027527, 'learning_rate': 1.1964444444444446e-05, 'epoch': 1.21}


 40%|████      | 2270/5625 [1:45:17<2:11:28,  2.35s/it]

{'loss': 0.1532, 'grad_norm': 7.257423400878906, 'learning_rate': 1.192888888888889e-05, 'epoch': 1.21}


 41%|████      | 2280/5625 [1:45:41<2:10:52,  2.35s/it]

{'loss': 0.1591, 'grad_norm': 12.047918319702148, 'learning_rate': 1.1893333333333335e-05, 'epoch': 1.22}


 41%|████      | 2290/5625 [1:46:04<2:10:58,  2.36s/it]

{'loss': 0.221, 'grad_norm': 15.256983757019043, 'learning_rate': 1.1857777777777779e-05, 'epoch': 1.22}


 41%|████      | 2300/5625 [1:46:28<2:11:42,  2.38s/it]

{'loss': 0.1132, 'grad_norm': 14.877728462219238, 'learning_rate': 1.1822222222222225e-05, 'epoch': 1.23}


 41%|████      | 2310/5625 [1:46:52<2:11:26,  2.38s/it]

{'loss': 0.1806, 'grad_norm': 2.3256466388702393, 'learning_rate': 1.1786666666666668e-05, 'epoch': 1.23}


 41%|████      | 2320/5625 [1:47:15<2:09:57,  2.36s/it]

{'loss': 0.1417, 'grad_norm': 9.971715927124023, 'learning_rate': 1.1751111111111112e-05, 'epoch': 1.24}


 41%|████▏     | 2330/5625 [1:47:39<2:09:38,  2.36s/it]

{'loss': 0.1694, 'grad_norm': 15.03469181060791, 'learning_rate': 1.1715555555555556e-05, 'epoch': 1.24}


 42%|████▏     | 2340/5625 [1:48:02<2:09:21,  2.36s/it]

{'loss': 0.1352, 'grad_norm': 1.326996088027954, 'learning_rate': 1.168e-05, 'epoch': 1.25}


 42%|████▏     | 2350/5625 [1:48:26<2:09:00,  2.36s/it]

{'loss': 0.2449, 'grad_norm': 13.153261184692383, 'learning_rate': 1.1644444444444446e-05, 'epoch': 1.25}


 42%|████▏     | 2360/5625 [1:48:50<2:07:49,  2.35s/it]

{'loss': 0.2044, 'grad_norm': 7.513379096984863, 'learning_rate': 1.160888888888889e-05, 'epoch': 1.26}


 42%|████▏     | 2370/5625 [1:49:16<2:23:26,  2.64s/it]

{'loss': 0.132, 'grad_norm': 1.419888973236084, 'learning_rate': 1.1573333333333335e-05, 'epoch': 1.26}


 42%|████▏     | 2380/5625 [1:49:43<2:22:50,  2.64s/it]

{'loss': 0.2417, 'grad_norm': 7.545433044433594, 'learning_rate': 1.1537777777777779e-05, 'epoch': 1.27}


 42%|████▏     | 2390/5625 [1:50:09<2:20:49,  2.61s/it]

{'loss': 0.1856, 'grad_norm': 5.593334197998047, 'learning_rate': 1.1502222222222224e-05, 'epoch': 1.27}


 43%|████▎     | 2400/5625 [1:50:35<2:20:27,  2.61s/it]

{'loss': 0.1999, 'grad_norm': 10.662130355834961, 'learning_rate': 1.1466666666666668e-05, 'epoch': 1.28}


 43%|████▎     | 2410/5625 [1:51:02<2:22:12,  2.65s/it]

{'loss': 0.1193, 'grad_norm': 3.1097311973571777, 'learning_rate': 1.1431111111111114e-05, 'epoch': 1.29}


 43%|████▎     | 2420/5625 [1:51:28<2:21:42,  2.65s/it]

{'loss': 0.1355, 'grad_norm': 12.058769226074219, 'learning_rate': 1.1395555555555558e-05, 'epoch': 1.29}


 43%|████▎     | 2430/5625 [1:51:55<2:20:28,  2.64s/it]

{'loss': 0.1139, 'grad_norm': 6.820870399475098, 'learning_rate': 1.136e-05, 'epoch': 1.3}


 43%|████▎     | 2440/5625 [1:52:21<2:19:50,  2.63s/it]

{'loss': 0.1311, 'grad_norm': 1.1441140174865723, 'learning_rate': 1.1324444444444445e-05, 'epoch': 1.3}


 44%|████▎     | 2450/5625 [1:52:48<2:21:48,  2.68s/it]

{'loss': 0.2665, 'grad_norm': 8.868013381958008, 'learning_rate': 1.1288888888888889e-05, 'epoch': 1.31}


 44%|████▎     | 2460/5625 [1:53:16<2:25:24,  2.76s/it]

{'loss': 0.2025, 'grad_norm': 11.477438926696777, 'learning_rate': 1.1253333333333335e-05, 'epoch': 1.31}


 44%|████▍     | 2470/5625 [1:53:43<2:21:39,  2.69s/it]

{'loss': 0.2005, 'grad_norm': 15.096490859985352, 'learning_rate': 1.1217777777777778e-05, 'epoch': 1.32}


 44%|████▍     | 2480/5625 [1:54:09<2:17:53,  2.63s/it]

{'loss': 0.1241, 'grad_norm': 20.60124969482422, 'learning_rate': 1.1182222222222224e-05, 'epoch': 1.32}


 44%|████▍     | 2490/5625 [1:54:35<2:16:24,  2.61s/it]

{'loss': 0.1756, 'grad_norm': 6.538668155670166, 'learning_rate': 1.1146666666666668e-05, 'epoch': 1.33}


 44%|████▍     | 2500/5625 [1:55:00<2:05:22,  2.41s/it]

{'loss': 0.1772, 'grad_norm': 2.898080825805664, 'learning_rate': 1.1111111111111113e-05, 'epoch': 1.33}


 45%|████▍     | 2510/5625 [1:55:24<2:02:08,  2.35s/it]

{'loss': 0.1704, 'grad_norm': 12.864710807800293, 'learning_rate': 1.1075555555555557e-05, 'epoch': 1.34}


 45%|████▍     | 2520/5625 [1:55:49<2:09:11,  2.50s/it]

{'loss': 0.1113, 'grad_norm': 5.339456558227539, 'learning_rate': 1.1040000000000001e-05, 'epoch': 1.34}


 45%|████▍     | 2530/5625 [1:56:13<2:05:57,  2.44s/it]

{'loss': 0.1113, 'grad_norm': 18.80276870727539, 'learning_rate': 1.1004444444444445e-05, 'epoch': 1.35}


 45%|████▌     | 2540/5625 [1:56:39<2:10:19,  2.53s/it]

{'loss': 0.2425, 'grad_norm': 10.661674499511719, 'learning_rate': 1.0968888888888889e-05, 'epoch': 1.35}


 45%|████▌     | 2550/5625 [1:57:03<2:05:25,  2.45s/it]

{'loss': 0.1782, 'grad_norm': 4.403253078460693, 'learning_rate': 1.0933333333333334e-05, 'epoch': 1.36}


 46%|████▌     | 2560/5625 [1:57:28<2:08:19,  2.51s/it]

{'loss': 0.1845, 'grad_norm': 9.663712501525879, 'learning_rate': 1.0897777777777778e-05, 'epoch': 1.37}


 46%|████▌     | 2570/5625 [1:57:54<2:08:15,  2.52s/it]

{'loss': 0.0994, 'grad_norm': 0.14004811644554138, 'learning_rate': 1.0862222222222224e-05, 'epoch': 1.37}


 46%|████▌     | 2580/5625 [1:58:20<2:10:50,  2.58s/it]

{'loss': 0.2209, 'grad_norm': 4.943680763244629, 'learning_rate': 1.0826666666666667e-05, 'epoch': 1.38}


 46%|████▌     | 2590/5625 [1:58:45<2:08:52,  2.55s/it]

{'loss': 0.2121, 'grad_norm': 7.462039470672607, 'learning_rate': 1.0791111111111113e-05, 'epoch': 1.38}


 46%|████▌     | 2600/5625 [1:59:10<2:06:12,  2.50s/it]

{'loss': 0.1842, 'grad_norm': 7.893235206604004, 'learning_rate': 1.0755555555555557e-05, 'epoch': 1.39}


 46%|████▋     | 2610/5625 [1:59:35<2:08:14,  2.55s/it]

{'loss': 0.2598, 'grad_norm': 11.460413932800293, 'learning_rate': 1.072e-05, 'epoch': 1.39}


 47%|████▋     | 2620/5625 [2:00:00<2:02:58,  2.46s/it]

{'loss': 0.119, 'grad_norm': 4.431056499481201, 'learning_rate': 1.0684444444444446e-05, 'epoch': 1.4}


 47%|████▋     | 2630/5625 [2:00:25<2:03:29,  2.47s/it]

{'loss': 0.1203, 'grad_norm': 0.5134333372116089, 'learning_rate': 1.064888888888889e-05, 'epoch': 1.4}


 47%|████▋     | 2640/5625 [2:00:50<2:06:37,  2.55s/it]

{'loss': 0.2358, 'grad_norm': 0.3783318102359772, 'learning_rate': 1.0613333333333334e-05, 'epoch': 1.41}


 47%|████▋     | 2650/5625 [2:01:15<2:07:19,  2.57s/it]

{'loss': 0.1773, 'grad_norm': 13.503410339355469, 'learning_rate': 1.0577777777777778e-05, 'epoch': 1.41}


 47%|████▋     | 2660/5625 [2:01:40<2:03:18,  2.50s/it]

{'loss': 0.1514, 'grad_norm': 16.606210708618164, 'learning_rate': 1.0542222222222223e-05, 'epoch': 1.42}


 47%|████▋     | 2670/5625 [2:02:06<2:06:51,  2.58s/it]

{'loss': 0.1848, 'grad_norm': 20.04993438720703, 'learning_rate': 1.0506666666666667e-05, 'epoch': 1.42}


 48%|████▊     | 2680/5625 [2:02:31<2:06:27,  2.58s/it]

{'loss': 0.1617, 'grad_norm': 15.405360221862793, 'learning_rate': 1.0471111111111113e-05, 'epoch': 1.43}


 48%|████▊     | 2690/5625 [2:02:57<2:07:42,  2.61s/it]

{'loss': 0.1595, 'grad_norm': 15.146735191345215, 'learning_rate': 1.0435555555555557e-05, 'epoch': 1.43}


 48%|████▊     | 2700/5625 [2:03:22<2:01:52,  2.50s/it]

{'loss': 0.0858, 'grad_norm': 12.44192886352539, 'learning_rate': 1.04e-05, 'epoch': 1.44}


 48%|████▊     | 2710/5625 [2:03:47<2:00:05,  2.47s/it]

{'loss': 0.1304, 'grad_norm': 10.11918830871582, 'learning_rate': 1.0364444444444446e-05, 'epoch': 1.45}


 48%|████▊     | 2720/5625 [2:04:13<2:01:32,  2.51s/it]

{'loss': 0.1398, 'grad_norm': 1.7583073377609253, 'learning_rate': 1.032888888888889e-05, 'epoch': 1.45}


 49%|████▊     | 2730/5625 [2:04:38<2:02:06,  2.53s/it]

{'loss': 0.2298, 'grad_norm': 6.531718730926514, 'learning_rate': 1.0293333333333335e-05, 'epoch': 1.46}


 49%|████▊     | 2740/5625 [2:05:03<2:00:18,  2.50s/it]

{'loss': 0.2075, 'grad_norm': 6.732921600341797, 'learning_rate': 1.0257777777777777e-05, 'epoch': 1.46}


 49%|████▉     | 2750/5625 [2:05:28<2:01:07,  2.53s/it]

{'loss': 0.1111, 'grad_norm': 9.490497589111328, 'learning_rate': 1.0222222222222223e-05, 'epoch': 1.47}


 49%|████▉     | 2760/5625 [2:05:54<2:02:23,  2.56s/it]

{'loss': 0.1324, 'grad_norm': 0.8730683922767639, 'learning_rate': 1.0186666666666667e-05, 'epoch': 1.47}


 49%|████▉     | 2770/5625 [2:06:19<1:59:53,  2.52s/it]

{'loss': 0.2283, 'grad_norm': 5.666689872741699, 'learning_rate': 1.0151111111111112e-05, 'epoch': 1.48}


 49%|████▉     | 2780/5625 [2:06:45<2:01:01,  2.55s/it]

{'loss': 0.0585, 'grad_norm': 2.8144750595092773, 'learning_rate': 1.0115555555555556e-05, 'epoch': 1.48}


 50%|████▉     | 2790/5625 [2:07:10<1:57:47,  2.49s/it]

{'loss': 0.048, 'grad_norm': 0.5049982070922852, 'learning_rate': 1.008e-05, 'epoch': 1.49}


 50%|████▉     | 2800/5625 [2:07:35<1:57:44,  2.50s/it]

{'loss': 0.2264, 'grad_norm': 11.560365676879883, 'learning_rate': 1.0044444444444446e-05, 'epoch': 1.49}


 50%|████▉     | 2810/5625 [2:08:00<1:56:20,  2.48s/it]

{'loss': 0.2331, 'grad_norm': 0.17538684606552124, 'learning_rate': 1.000888888888889e-05, 'epoch': 1.5}


 50%|█████     | 2820/5625 [2:08:25<1:55:01,  2.46s/it]

{'loss': 0.2485, 'grad_norm': 0.2957993447780609, 'learning_rate': 9.973333333333333e-06, 'epoch': 1.5}


 50%|█████     | 2830/5625 [2:08:50<1:56:41,  2.51s/it]

{'loss': 0.1614, 'grad_norm': 7.185286045074463, 'learning_rate': 9.937777777777779e-06, 'epoch': 1.51}


 50%|█████     | 2840/5625 [2:09:15<1:53:53,  2.45s/it]

{'loss': 0.1553, 'grad_norm': 5.46243143081665, 'learning_rate': 9.902222222222223e-06, 'epoch': 1.51}


 51%|█████     | 2850/5625 [2:09:40<1:56:28,  2.52s/it]

{'loss': 0.1289, 'grad_norm': 0.5388346314430237, 'learning_rate': 9.866666666666668e-06, 'epoch': 1.52}


 51%|█████     | 2860/5625 [2:10:06<2:00:21,  2.61s/it]

{'loss': 0.187, 'grad_norm': 11.62816047668457, 'learning_rate': 9.831111111111112e-06, 'epoch': 1.53}


 51%|█████     | 2870/5625 [2:10:31<1:53:52,  2.48s/it]

{'loss': 0.1249, 'grad_norm': 19.096059799194336, 'learning_rate': 9.795555555555556e-06, 'epoch': 1.53}


 51%|█████     | 2880/5625 [2:10:55<1:52:17,  2.45s/it]

{'loss': 0.1925, 'grad_norm': 17.897075653076172, 'learning_rate': 9.760000000000001e-06, 'epoch': 1.54}


 51%|█████▏    | 2890/5625 [2:11:21<1:55:42,  2.54s/it]

{'loss': 0.0603, 'grad_norm': 1.5814893245697021, 'learning_rate': 9.724444444444445e-06, 'epoch': 1.54}


 52%|█████▏    | 2900/5625 [2:11:46<1:53:57,  2.51s/it]

{'loss': 0.1353, 'grad_norm': 11.335566520690918, 'learning_rate': 9.688888888888889e-06, 'epoch': 1.55}


 52%|█████▏    | 2910/5625 [2:12:10<1:51:42,  2.47s/it]

{'loss': 0.1256, 'grad_norm': 6.975654602050781, 'learning_rate': 9.653333333333335e-06, 'epoch': 1.55}


 52%|█████▏    | 2920/5625 [2:12:36<1:53:06,  2.51s/it]

{'loss': 0.0623, 'grad_norm': 0.10907703638076782, 'learning_rate': 9.617777777777778e-06, 'epoch': 1.56}


 52%|█████▏    | 2930/5625 [2:13:00<1:51:15,  2.48s/it]

{'loss': 0.1716, 'grad_norm': 6.521307945251465, 'learning_rate': 9.582222222222222e-06, 'epoch': 1.56}


 52%|█████▏    | 2940/5625 [2:13:25<1:52:19,  2.51s/it]

{'loss': 0.1848, 'grad_norm': 2.815077066421509, 'learning_rate': 9.546666666666668e-06, 'epoch': 1.57}


 52%|█████▏    | 2950/5625 [2:13:51<1:53:21,  2.54s/it]

{'loss': 0.1483, 'grad_norm': 2.47059965133667, 'learning_rate': 9.511111111111112e-06, 'epoch': 1.57}


 53%|█████▎    | 2960/5625 [2:14:17<1:50:17,  2.48s/it]

{'loss': 0.1036, 'grad_norm': 7.543852806091309, 'learning_rate': 9.475555555555557e-06, 'epoch': 1.58}


 53%|█████▎    | 2970/5625 [2:14:41<1:45:04,  2.37s/it]

{'loss': 0.1984, 'grad_norm': 15.131213188171387, 'learning_rate': 9.440000000000001e-06, 'epoch': 1.58}


 53%|█████▎    | 2980/5625 [2:15:04<1:43:16,  2.34s/it]

{'loss': 0.0784, 'grad_norm': 17.112356185913086, 'learning_rate': 9.404444444444445e-06, 'epoch': 1.59}


 53%|█████▎    | 2990/5625 [2:15:28<1:43:17,  2.35s/it]

{'loss': 0.0803, 'grad_norm': 12.209057807922363, 'learning_rate': 9.368888888888889e-06, 'epoch': 1.59}


 53%|█████▎    | 3000/5625 [2:15:51<1:43:02,  2.36s/it]

{'loss': 0.2797, 'grad_norm': 16.179040908813477, 'learning_rate': 9.333333333333334e-06, 'epoch': 1.6}


 54%|█████▎    | 3010/5625 [2:16:15<1:42:53,  2.36s/it]

{'loss': 0.1166, 'grad_norm': 15.807929039001465, 'learning_rate': 9.297777777777778e-06, 'epoch': 1.61}


 54%|█████▎    | 3020/5625 [2:16:38<1:42:30,  2.36s/it]

{'loss': 0.1777, 'grad_norm': 9.08372688293457, 'learning_rate': 9.262222222222222e-06, 'epoch': 1.61}


 54%|█████▍    | 3030/5625 [2:17:02<1:41:24,  2.34s/it]

{'loss': 0.1771, 'grad_norm': 0.47143951058387756, 'learning_rate': 9.226666666666668e-06, 'epoch': 1.62}


 54%|█████▍    | 3040/5625 [2:17:26<1:41:57,  2.37s/it]

{'loss': 0.2033, 'grad_norm': 17.670042037963867, 'learning_rate': 9.191111111111111e-06, 'epoch': 1.62}


 54%|█████▍    | 3050/5625 [2:17:49<1:42:01,  2.38s/it]

{'loss': 0.1516, 'grad_norm': 6.0523295402526855, 'learning_rate': 9.155555555555557e-06, 'epoch': 1.63}


 54%|█████▍    | 3060/5625 [2:18:13<1:41:24,  2.37s/it]

{'loss': 0.1875, 'grad_norm': 14.00075626373291, 'learning_rate': 9.12e-06, 'epoch': 1.63}


 55%|█████▍    | 3070/5625 [2:18:36<1:39:58,  2.35s/it]

{'loss': 0.1982, 'grad_norm': 0.9187277555465698, 'learning_rate': 9.084444444444446e-06, 'epoch': 1.64}


 55%|█████▍    | 3080/5625 [2:19:01<1:51:40,  2.63s/it]

{'loss': 0.198, 'grad_norm': 3.779987335205078, 'learning_rate': 9.048888888888888e-06, 'epoch': 1.64}


 55%|█████▍    | 3090/5625 [2:19:28<1:53:27,  2.69s/it]

{'loss': 0.1862, 'grad_norm': 0.8129512667655945, 'learning_rate': 9.013333333333334e-06, 'epoch': 1.65}


 55%|█████▌    | 3100/5625 [2:19:54<1:43:31,  2.46s/it]

{'loss': 0.2403, 'grad_norm': 6.538107395172119, 'learning_rate': 8.977777777777778e-06, 'epoch': 1.65}


 55%|█████▌    | 3110/5625 [2:20:17<1:39:16,  2.37s/it]

{'loss': 0.1767, 'grad_norm': 6.875643730163574, 'learning_rate': 8.942222222222223e-06, 'epoch': 1.66}


 55%|█████▌    | 3120/5625 [2:20:41<1:41:54,  2.44s/it]

{'loss': 0.2229, 'grad_norm': 4.571364879608154, 'learning_rate': 8.906666666666667e-06, 'epoch': 1.66}


 56%|█████▌    | 3130/5625 [2:21:08<1:54:52,  2.76s/it]

{'loss': 0.1452, 'grad_norm': 12.925074577331543, 'learning_rate': 8.871111111111111e-06, 'epoch': 1.67}


 56%|█████▌    | 3140/5625 [2:21:35<1:49:17,  2.64s/it]

{'loss': 0.2309, 'grad_norm': 13.607156753540039, 'learning_rate': 8.835555555555557e-06, 'epoch': 1.67}


 56%|█████▌    | 3150/5625 [2:22:01<1:48:58,  2.64s/it]

{'loss': 0.0743, 'grad_norm': 11.530865669250488, 'learning_rate': 8.8e-06, 'epoch': 1.68}


 56%|█████▌    | 3160/5625 [2:22:27<1:46:01,  2.58s/it]

{'loss': 0.1398, 'grad_norm': 14.052227020263672, 'learning_rate': 8.764444444444446e-06, 'epoch': 1.69}


 56%|█████▋    | 3170/5625 [2:22:53<1:46:26,  2.60s/it]

{'loss': 0.1656, 'grad_norm': 0.3216961920261383, 'learning_rate': 8.72888888888889e-06, 'epoch': 1.69}


 57%|█████▋    | 3180/5625 [2:23:18<1:44:28,  2.56s/it]

{'loss': 0.1314, 'grad_norm': 19.33456039428711, 'learning_rate': 8.693333333333334e-06, 'epoch': 1.7}


 57%|█████▋    | 3190/5625 [2:23:44<1:41:53,  2.51s/it]

{'loss': 0.157, 'grad_norm': 2.8182291984558105, 'learning_rate': 8.657777777777778e-06, 'epoch': 1.7}


 57%|█████▋    | 3200/5625 [2:24:09<1:41:57,  2.52s/it]

{'loss': 0.2016, 'grad_norm': 0.38021039962768555, 'learning_rate': 8.622222222222223e-06, 'epoch': 1.71}


 57%|█████▋    | 3210/5625 [2:24:35<1:40:19,  2.49s/it]

{'loss': 0.2193, 'grad_norm': 6.54796838760376, 'learning_rate': 8.586666666666667e-06, 'epoch': 1.71}


 57%|█████▋    | 3220/5625 [2:24:59<1:38:18,  2.45s/it]

{'loss': 0.1283, 'grad_norm': 11.131301879882812, 'learning_rate': 8.551111111111112e-06, 'epoch': 1.72}


 57%|█████▋    | 3230/5625 [2:25:24<1:36:42,  2.42s/it]

{'loss': 0.1845, 'grad_norm': 12.091300964355469, 'learning_rate': 8.515555555555556e-06, 'epoch': 1.72}


 58%|█████▊    | 3240/5625 [2:25:49<1:37:17,  2.45s/it]

{'loss': 0.1054, 'grad_norm': 3.8272523880004883, 'learning_rate': 8.48e-06, 'epoch': 1.73}


 58%|█████▊    | 3250/5625 [2:26:13<1:37:28,  2.46s/it]

{'loss': 0.1038, 'grad_norm': 23.007713317871094, 'learning_rate': 8.444444444444446e-06, 'epoch': 1.73}


 58%|█████▊    | 3260/5625 [2:26:38<1:36:52,  2.46s/it]

{'loss': 0.1783, 'grad_norm': 15.974729537963867, 'learning_rate': 8.40888888888889e-06, 'epoch': 1.74}


 58%|█████▊    | 3270/5625 [2:27:02<1:36:43,  2.46s/it]

{'loss': 0.0726, 'grad_norm': 15.35232925415039, 'learning_rate': 8.373333333333335e-06, 'epoch': 1.74}


 58%|█████▊    | 3280/5625 [2:27:27<1:37:43,  2.50s/it]

{'loss': 0.1507, 'grad_norm': 1.3426826000213623, 'learning_rate': 8.337777777777777e-06, 'epoch': 1.75}


 58%|█████▊    | 3290/5625 [2:27:52<1:36:49,  2.49s/it]

{'loss': 0.1372, 'grad_norm': 12.053617477416992, 'learning_rate': 8.302222222222223e-06, 'epoch': 1.75}


 59%|█████▊    | 3300/5625 [2:28:18<1:43:40,  2.68s/it]

{'loss': 0.1403, 'grad_norm': 17.060285568237305, 'learning_rate': 8.266666666666667e-06, 'epoch': 1.76}


 59%|█████▉    | 3310/5625 [2:28:43<1:33:55,  2.43s/it]

{'loss': 0.1784, 'grad_norm': 1.4140105247497559, 'learning_rate': 8.231111111111112e-06, 'epoch': 1.77}


 59%|█████▉    | 3320/5625 [2:29:07<1:30:46,  2.36s/it]

{'loss': 0.083, 'grad_norm': 15.324326515197754, 'learning_rate': 8.195555555555556e-06, 'epoch': 1.77}


 59%|█████▉    | 3330/5625 [2:29:30<1:30:35,  2.37s/it]

{'loss': 0.1698, 'grad_norm': 6.563732147216797, 'learning_rate': 8.16e-06, 'epoch': 1.78}


 59%|█████▉    | 3340/5625 [2:29:54<1:30:08,  2.37s/it]

{'loss': 0.1726, 'grad_norm': 16.2083797454834, 'learning_rate': 8.124444444444445e-06, 'epoch': 1.78}


 60%|█████▉    | 3350/5625 [2:30:18<1:29:04,  2.35s/it]

{'loss': 0.1949, 'grad_norm': 9.157088279724121, 'learning_rate': 8.08888888888889e-06, 'epoch': 1.79}


 60%|█████▉    | 3360/5625 [2:30:41<1:28:47,  2.35s/it]

{'loss': 0.1903, 'grad_norm': 14.696646690368652, 'learning_rate': 8.053333333333335e-06, 'epoch': 1.79}


 60%|█████▉    | 3370/5625 [2:31:05<1:28:34,  2.36s/it]

{'loss': 0.2051, 'grad_norm': 4.152538299560547, 'learning_rate': 8.017777777777779e-06, 'epoch': 1.8}


 60%|██████    | 3380/5625 [2:31:28<1:27:57,  2.35s/it]

{'loss': 0.0915, 'grad_norm': 0.42692136764526367, 'learning_rate': 7.982222222222224e-06, 'epoch': 1.8}


 60%|██████    | 3390/5625 [2:31:52<1:28:04,  2.36s/it]

{'loss': 0.1166, 'grad_norm': 15.78721809387207, 'learning_rate': 7.946666666666666e-06, 'epoch': 1.81}


 60%|██████    | 3400/5625 [2:32:16<1:27:50,  2.37s/it]

{'loss': 0.1392, 'grad_norm': 12.083277702331543, 'learning_rate': 7.911111111111112e-06, 'epoch': 1.81}


 61%|██████    | 3410/5625 [2:32:39<1:26:54,  2.35s/it]

{'loss': 0.0896, 'grad_norm': 2.4911937713623047, 'learning_rate': 7.875555555555556e-06, 'epoch': 1.82}


 61%|██████    | 3420/5625 [2:33:03<1:26:40,  2.36s/it]

{'loss': 0.1199, 'grad_norm': 0.23258990049362183, 'learning_rate': 7.840000000000001e-06, 'epoch': 1.82}


 61%|██████    | 3430/5625 [2:33:26<1:26:30,  2.36s/it]

{'loss': 0.1548, 'grad_norm': 3.2532312870025635, 'learning_rate': 7.804444444444445e-06, 'epoch': 1.83}


 61%|██████    | 3440/5625 [2:33:51<1:35:35,  2.63s/it]

{'loss': 0.0828, 'grad_norm': 1.8161818981170654, 'learning_rate': 7.768888888888889e-06, 'epoch': 1.83}


 61%|██████▏   | 3450/5625 [2:34:15<1:25:41,  2.36s/it]

{'loss': 0.1488, 'grad_norm': 0.19879984855651855, 'learning_rate': 7.733333333333334e-06, 'epoch': 1.84}


 62%|██████▏   | 3460/5625 [2:34:39<1:25:01,  2.36s/it]

{'loss': 0.0698, 'grad_norm': 16.50620460510254, 'learning_rate': 7.697777777777778e-06, 'epoch': 1.85}


 62%|██████▏   | 3470/5625 [2:35:02<1:24:19,  2.35s/it]

{'loss': 0.1131, 'grad_norm': 12.78193473815918, 'learning_rate': 7.662222222222224e-06, 'epoch': 1.85}


 62%|██████▏   | 3480/5625 [2:35:26<1:24:15,  2.36s/it]

{'loss': 0.1391, 'grad_norm': 0.06519392877817154, 'learning_rate': 7.626666666666668e-06, 'epoch': 1.86}


 62%|██████▏   | 3490/5625 [2:35:49<1:23:40,  2.35s/it]

{'loss': 0.1402, 'grad_norm': 3.0600407123565674, 'learning_rate': 7.5911111111111115e-06, 'epoch': 1.86}


 62%|██████▏   | 3500/5625 [2:36:13<1:23:46,  2.37s/it]

{'loss': 0.2786, 'grad_norm': 9.521130561828613, 'learning_rate': 7.555555555555556e-06, 'epoch': 1.87}


 62%|██████▏   | 3510/5625 [2:36:36<1:22:55,  2.35s/it]

{'loss': 0.1963, 'grad_norm': 7.063930988311768, 'learning_rate': 7.520000000000001e-06, 'epoch': 1.87}


 63%|██████▎   | 3520/5625 [2:37:00<1:22:36,  2.35s/it]

{'loss': 0.0127, 'grad_norm': 1.9830913543701172, 'learning_rate': 7.4844444444444455e-06, 'epoch': 1.88}


 63%|██████▎   | 3530/5625 [2:37:24<1:22:20,  2.36s/it]

{'loss': 0.1943, 'grad_norm': 7.095722675323486, 'learning_rate': 7.44888888888889e-06, 'epoch': 1.88}


 63%|██████▎   | 3540/5625 [2:37:47<1:21:43,  2.35s/it]

{'loss': 0.1599, 'grad_norm': 27.318241119384766, 'learning_rate': 7.413333333333333e-06, 'epoch': 1.89}


 63%|██████▎   | 3550/5625 [2:38:11<1:21:47,  2.36s/it]

{'loss': 0.275, 'grad_norm': 12.140780448913574, 'learning_rate': 7.377777777777778e-06, 'epoch': 1.89}


 63%|██████▎   | 3560/5625 [2:38:37<1:38:28,  2.86s/it]

{'loss': 0.1572, 'grad_norm': 9.859357833862305, 'learning_rate': 7.342222222222223e-06, 'epoch': 1.9}


 63%|██████▎   | 3570/5625 [2:39:10<1:50:53,  3.24s/it]

{'loss': 0.2107, 'grad_norm': 15.866235733032227, 'learning_rate': 7.306666666666667e-06, 'epoch': 1.9}


 64%|██████▎   | 3580/5625 [2:39:41<1:46:10,  3.12s/it]

{'loss': 0.1851, 'grad_norm': 1.5387541055679321, 'learning_rate': 7.271111111111112e-06, 'epoch': 1.91}


 64%|██████▍   | 3590/5625 [2:40:12<1:43:41,  3.06s/it]

{'loss': 0.3322, 'grad_norm': 6.372743129730225, 'learning_rate': 7.235555555555556e-06, 'epoch': 1.91}


 64%|██████▍   | 3600/5625 [2:40:43<1:43:11,  3.06s/it]

{'loss': 0.1777, 'grad_norm': 0.5141207575798035, 'learning_rate': 7.2000000000000005e-06, 'epoch': 1.92}


 64%|██████▍   | 3610/5625 [2:41:13<1:42:43,  3.06s/it]

{'loss': 0.2369, 'grad_norm': 7.807558059692383, 'learning_rate': 7.164444444444445e-06, 'epoch': 1.93}


 64%|██████▍   | 3620/5625 [2:41:44<1:42:40,  3.07s/it]

{'loss': 0.2933, 'grad_norm': 15.195480346679688, 'learning_rate': 7.12888888888889e-06, 'epoch': 1.93}


 65%|██████▍   | 3630/5625 [2:42:15<1:41:31,  3.05s/it]

{'loss': 0.158, 'grad_norm': 1.078739047050476, 'learning_rate': 7.093333333333335e-06, 'epoch': 1.94}


 65%|██████▍   | 3640/5625 [2:42:45<1:41:16,  3.06s/it]

{'loss': 0.0747, 'grad_norm': 0.46880975365638733, 'learning_rate': 7.057777777777778e-06, 'epoch': 1.94}


 65%|██████▍   | 3650/5625 [2:43:16<1:40:54,  3.07s/it]

{'loss': 0.2597, 'grad_norm': 10.51248836517334, 'learning_rate': 7.022222222222222e-06, 'epoch': 1.95}


 65%|██████▌   | 3660/5625 [2:43:47<1:39:48,  3.05s/it]

{'loss': 0.1488, 'grad_norm': 10.352507591247559, 'learning_rate': 6.986666666666667e-06, 'epoch': 1.95}


 65%|██████▌   | 3670/5625 [2:44:17<1:39:37,  3.06s/it]

{'loss': 0.077, 'grad_norm': 4.74559211730957, 'learning_rate': 6.951111111111112e-06, 'epoch': 1.96}


 65%|██████▌   | 3680/5625 [2:44:48<1:39:28,  3.07s/it]

{'loss': 0.1665, 'grad_norm': 6.937324047088623, 'learning_rate': 6.915555555555556e-06, 'epoch': 1.96}


 66%|██████▌   | 3690/5625 [2:45:19<1:38:57,  3.07s/it]

{'loss': 0.1844, 'grad_norm': 12.172054290771484, 'learning_rate': 6.88e-06, 'epoch': 1.97}


 66%|██████▌   | 3700/5625 [2:45:50<1:38:18,  3.06s/it]

{'loss': 0.256, 'grad_norm': 11.870054244995117, 'learning_rate': 6.844444444444445e-06, 'epoch': 1.97}


 66%|██████▌   | 3710/5625 [2:46:21<1:37:34,  3.06s/it]

{'loss': 0.0827, 'grad_norm': 3.323122024536133, 'learning_rate': 6.80888888888889e-06, 'epoch': 1.98}


 66%|██████▌   | 3720/5625 [2:46:51<1:37:13,  3.06s/it]

{'loss': 0.0704, 'grad_norm': 1.7273080348968506, 'learning_rate': 6.773333333333334e-06, 'epoch': 1.98}


 66%|██████▋   | 3730/5625 [2:47:22<1:36:22,  3.05s/it]

{'loss': 0.341, 'grad_norm': 15.824512481689453, 'learning_rate': 6.737777777777779e-06, 'epoch': 1.99}


 66%|██████▋   | 3740/5625 [2:47:50<1:21:47,  2.60s/it]

{'loss': 0.1405, 'grad_norm': 0.33198991417884827, 'learning_rate': 6.702222222222224e-06, 'epoch': 1.99}


 67%|██████▋   | 3750/5625 [2:48:16<1:23:51,  2.68s/it]

{'loss': 0.1424, 'grad_norm': 9.74022388458252, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


                                                       
 67%|██████▋   | 3750/5625 [2:54:34<1:23:51,  2.68s/it]

{'eval_loss': 0.23395000398159027, 'eval_runtime': 378.027, 'eval_samples_per_second': 19.84, 'eval_steps_per_second': 1.241, 'epoch': 2.0}


 67%|██████▋   | 3760/5625 [2:55:07<3:48:01,  7.34s/it]  

{'loss': 0.0871, 'grad_norm': 1.9280635118484497, 'learning_rate': 6.631111111111111e-06, 'epoch': 2.01}


 67%|██████▋   | 3770/5625 [2:55:34<1:26:01,  2.78s/it]

{'loss': 0.1608, 'grad_norm': 10.542685508728027, 'learning_rate': 6.595555555555556e-06, 'epoch': 2.01}


 67%|██████▋   | 3780/5625 [2:56:00<1:20:34,  2.62s/it]

{'loss': 0.1255, 'grad_norm': 0.22466155886650085, 'learning_rate': 6.560000000000001e-06, 'epoch': 2.02}


 67%|██████▋   | 3790/5625 [2:56:28<1:26:00,  2.81s/it]

{'loss': 0.0922, 'grad_norm': 14.791167259216309, 'learning_rate': 6.524444444444445e-06, 'epoch': 2.02}


 68%|██████▊   | 3800/5625 [2:56:56<1:28:00,  2.89s/it]

{'loss': 0.0919, 'grad_norm': 8.266104698181152, 'learning_rate': 6.488888888888889e-06, 'epoch': 2.03}


 68%|██████▊   | 3810/5625 [2:57:25<1:27:48,  2.90s/it]

{'loss': 0.1124, 'grad_norm': 1.2754251956939697, 'learning_rate': 6.453333333333334e-06, 'epoch': 2.03}


 68%|██████▊   | 3820/5625 [2:57:54<1:24:49,  2.82s/it]

{'loss': 0.0789, 'grad_norm': 0.5824791789054871, 'learning_rate': 6.417777777777779e-06, 'epoch': 2.04}


 68%|██████▊   | 3830/5625 [2:58:22<1:25:17,  2.85s/it]

{'loss': 0.1065, 'grad_norm': 1.2696528434753418, 'learning_rate': 6.382222222222223e-06, 'epoch': 2.04}


 68%|██████▊   | 3840/5625 [2:58:51<1:24:58,  2.86s/it]

{'loss': 0.1486, 'grad_norm': 2.488131046295166, 'learning_rate': 6.346666666666668e-06, 'epoch': 2.05}


 68%|██████▊   | 3850/5625 [2:59:19<1:23:42,  2.83s/it]

{'loss': 0.0877, 'grad_norm': 6.8854875564575195, 'learning_rate': 6.311111111111111e-06, 'epoch': 2.05}


 69%|██████▊   | 3860/5625 [2:59:48<1:22:52,  2.82s/it]

{'loss': 0.118, 'grad_norm': 7.130287170410156, 'learning_rate': 6.275555555555556e-06, 'epoch': 2.06}


 69%|██████▉   | 3870/5625 [3:00:16<1:22:07,  2.81s/it]

{'loss': 0.1154, 'grad_norm': 6.2988176345825195, 'learning_rate': 6.24e-06, 'epoch': 2.06}


 69%|██████▉   | 3880/5625 [3:00:43<1:19:21,  2.73s/it]

{'loss': 0.1317, 'grad_norm': 0.4870106875896454, 'learning_rate': 6.204444444444445e-06, 'epoch': 2.07}


 69%|██████▉   | 3890/5625 [3:01:10<1:17:42,  2.69s/it]

{'loss': 0.1522, 'grad_norm': 3.1646478176116943, 'learning_rate': 6.16888888888889e-06, 'epoch': 2.07}


 69%|██████▉   | 3900/5625 [3:01:37<1:18:03,  2.71s/it]

{'loss': 0.0388, 'grad_norm': 0.39191195368766785, 'learning_rate': 6.133333333333334e-06, 'epoch': 2.08}


 70%|██████▉   | 3910/5625 [3:02:05<1:17:45,  2.72s/it]

{'loss': 0.0839, 'grad_norm': 0.14401362836360931, 'learning_rate': 6.097777777777778e-06, 'epoch': 2.09}


 70%|██████▉   | 3920/5625 [3:02:32<1:20:00,  2.82s/it]

{'loss': 0.1893, 'grad_norm': 11.101832389831543, 'learning_rate': 6.062222222222223e-06, 'epoch': 2.09}


 70%|██████▉   | 3930/5625 [3:03:00<1:19:19,  2.81s/it]

{'loss': 0.1082, 'grad_norm': 8.312585830688477, 'learning_rate': 6.026666666666668e-06, 'epoch': 2.1}


 70%|███████   | 3940/5625 [3:03:27<1:17:23,  2.76s/it]

{'loss': 0.1333, 'grad_norm': 0.9290149807929993, 'learning_rate': 5.991111111111112e-06, 'epoch': 2.1}


 70%|███████   | 3950/5625 [3:03:54<1:15:28,  2.70s/it]

{'loss': 0.0885, 'grad_norm': 14.712575912475586, 'learning_rate': 5.955555555555555e-06, 'epoch': 2.11}


 70%|███████   | 3960/5625 [3:04:22<1:15:10,  2.71s/it]

{'loss': 0.0849, 'grad_norm': 10.04532241821289, 'learning_rate': 5.92e-06, 'epoch': 2.11}


 71%|███████   | 3970/5625 [3:04:49<1:15:38,  2.74s/it]

{'loss': 0.1242, 'grad_norm': 6.147139072418213, 'learning_rate': 5.884444444444445e-06, 'epoch': 2.12}


 71%|███████   | 3980/5625 [3:05:16<1:13:53,  2.70s/it]

{'loss': 0.0756, 'grad_norm': 0.19679391384124756, 'learning_rate': 5.8488888888888895e-06, 'epoch': 2.12}


 71%|███████   | 3990/5625 [3:05:43<1:13:11,  2.69s/it]

{'loss': 0.1676, 'grad_norm': 11.52263069152832, 'learning_rate': 5.813333333333334e-06, 'epoch': 2.13}


 71%|███████   | 4000/5625 [3:06:10<1:13:09,  2.70s/it]

{'loss': 0.1066, 'grad_norm': 12.621536254882812, 'learning_rate': 5.777777777777778e-06, 'epoch': 2.13}


 71%|███████▏  | 4010/5625 [3:06:37<1:12:35,  2.70s/it]

{'loss': 0.0247, 'grad_norm': 11.307174682617188, 'learning_rate': 5.742222222222223e-06, 'epoch': 2.14}


 71%|███████▏  | 4020/5625 [3:07:04<1:12:41,  2.72s/it]

{'loss': 0.2746, 'grad_norm': 10.446011543273926, 'learning_rate': 5.706666666666667e-06, 'epoch': 2.14}


 72%|███████▏  | 4030/5625 [3:07:32<1:14:07,  2.79s/it]

{'loss': 0.103, 'grad_norm': 13.233839988708496, 'learning_rate': 5.671111111111112e-06, 'epoch': 2.15}


 72%|███████▏  | 4040/5625 [3:07:58<1:10:21,  2.66s/it]

{'loss': 0.1833, 'grad_norm': 4.650296211242676, 'learning_rate': 5.635555555555557e-06, 'epoch': 2.15}


 72%|███████▏  | 4049/5625 [3:08:22<1:12:13,  2.75s/it]

In [9]:
import torch

# Environment sanity check: show which PyTorch build this kernel has loaded.
print("PyTorch version:", torch.__version__)

# Report whether the MPS (Metal GPU) backend is usable on this machine.
# This only queries availability; it does not move any model or tensor.
print(
    "MPS is available and will be used for acceleration!"
    if torch.backends.mps.is_available()
    else "MPS is not available. Running on CPU."
)


PyTorch version: 2.2.2
MPS is available and will be used for acceleration!
