In [2]:
import pandas as pd


# Load the review data (the preview below shows the columns userName, score,
# at and content).
# A forward slash is used as the path separator: the earlier backslash form
# relied on the invalid escape sequence '\d' (which newer Python versions warn
# about) and only resolved to the intended file on Windows.
df = pd.read_csv('dataset/dataset.csv')


print(df.head())


          userName  score                   at  \
0  Pengguna Google      5  2024-09-08 03:31:50   
1  Pengguna Google      1  2024-09-08 03:29:56   
2  Pengguna Google      5  2024-09-08 03:26:38   
3  Pengguna Google      5  2024-09-08 03:25:14   
4  Pengguna Google      1  2024-09-08 03:24:01   

                                             content  
0                                      makasih toped  
1  Aplikasi php sudah banyak dikasih promo &sudah...  
2                                        mantab... 👍  
3                                          Good good  
4  Sangat buruk sebagai pengguna lama akun affali...  


In [3]:

def label_sentiment(score):
    """Translate a numeric review score into a sentiment class name.

    Args:
        score: Rating to classify; it is compared numerically against 4 and 3.

    Returns:
        'Positif' when ``score`` is at least 4, 'Netral' when it equals 3
        exactly, and 'Negatif' for every other value (including anything
        strictly between 3 and 4).
    """
    if score >= 4:
        return 'Positif'
    return 'Netral' if score == 3 else 'Negatif'

# Derive a sentiment class for every review from its score.
score_sentiments = df['score'].apply(label_sentiment)
df['sentiment'] = score_sentiments


# Preview the text next to its score and the derived label.
preview_columns = ['content', 'score', 'sentiment']
print(df[preview_columns].head())


                                             content  score sentiment
0                                      makasih toped      5   Positif
1  Aplikasi php sudah banyak dikasih promo &sudah...      1   Negatif
2                                        mantab... 👍      5   Positif
3                                          Good good      5   Positif
4  Sangat buruk sebagai pengguna lama akun affali...      1   Negatif


In [4]:
import re
import pandas as pd


def preprocess_text(text):
    """Normalise a review string before tokenisation.

    Every character that is not an ASCII letter or whitespace is removed,
    then the text is lower-cased and stripped of leading/trailing whitespace.

    Args:
        text: Raw review content. Non-string values (e.g. NaN, None) are
            accepted.

    Returns:
        The cleaned string, or '' when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''
    # Flags must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so passing ``re.I|re.A`` (== 258) there capped the
    # number of removed characters at 258 and applied no flags at all.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)
    return text.lower().strip()


# Replace missing review text with an empty string.
content_filled = df['content'].fillna('')
df['content'] = content_filled


# Store the normalised text in its own column, next to the raw text.
df['cleaned_content'] = df['content'].apply(preprocess_text)


# Show raw and cleaned text side by side for a quick visual check.
cleaning_preview = df[['content', 'cleaned_content']].head()
print(cleaning_preview)


                                             content  \
0                                      makasih toped   
1  Aplikasi php sudah banyak dikasih promo &sudah...   
2                                        mantab... 👍   
3                                          Good good   
4  Sangat buruk sebagai pengguna lama akun affali...   

                                     cleaned_content  
0                                      makasih toped  
1  aplikasi php sudah banyak dikasih promo sudah ...  
2                                             mantab  
3                                          good good  
4  sangat buruk sebagai pengguna lama akun affali...  


In [5]:
from transformers import BertTokenizer


# Tokenizer matching the IndoBERT checkpoint that is fine-tuned further down.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")


# Sanity-check the tokenizer on the first two reviews only. Encoding the whole
# corpus here was wasted work: just two rows are printed, and the train/test
# splits are tokenised separately after the split.
# padding='max_length' keeps each printed row at max_length (128) ids even
# though only two short texts are in the batch.
preview_encodings = tokenizer(
    df['cleaned_content'].head(2).tolist(),
    truncation=True,
    padding='max_length',
    max_length=128,
)


print(preview_encodings['input_ids'])


  from .autonotebook import tqdm as notebook_tqdm


[[2, 6679, 2246, 133, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 1339, 12648, 259, 271, 8852, 4525, 259, 6039, 469, 316, 1192, 11750, 16850, 737, 3854, 12510, 737, 22180, 14813, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [6]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the reviews for evaluation, stratified on the sentiment
# label so both splits keep the same class proportions.
# random_state is fixed so the split (and therefore every reported metric) is
# reproducible across kernel restarts; without it each run shuffles differently.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['cleaned_content'],
    df['sentiment'],
    test_size=0.2,
    stratify=df['sentiment'],
    random_state=42,
)


# Tokenise each split on its own; sequences are truncated to 128 tokens and
# padded to the longest sequence within the split.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=128)


In [7]:
import torch


# Integer class id assigned to each sentiment name.
label_dict = dict(Negatif=0, Netral=1, Positif=2)
# Encode both label Series with the same mapping.
# NOTE(review): Series.map yields NaN for values that are not keys of the
# dict, so re-running this cell on already-encoded labels would blank them out.
train_labels, test_labels = [
    labels.map(label_dict) for labels in (train_labels, test_labels)
]


class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name
            to an indexable collection holding one entry per example.
        labels: Indexable collection of labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Each encoding feature is converted to a tensor under its own name and
        the label is added under the key 'labels'.
        """
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap each split's encodings and integer labels (as plain lists) for the Trainer.
train_dataset = SentimentDataset(
    encodings=train_encodings,
    labels=train_labels.tolist(),
)
test_dataset = SentimentDataset(
    encodings=test_encodings,
    labels=test_labels.tolist(),
)


# IndoBERT

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments


# IndoBERT checkpoint with a 3-label sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p2",
    num_labels=3,
)


training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate on eval_dataset once per epoch.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
)


# The held-out split doubles as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)


trainer.train()


# TinyBERT

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


# NOTE: this cell rebinds `model`, `tokenizer`, `train_encodings`,
# `test_encodings`, `train_dataset`, `test_dataset`, `training_args` and
# `trainer`, replacing the objects created by earlier cells.
model = AutoModelForSequenceClassification.from_pretrained(
    "huawei-noah/TinyBERT_General_4L_312D",
    num_labels=3,
)
tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")


# Re-tokenise both splits with the TinyBERT tokenizer (same truncation and
# padding settings as before).
train_encodings = tokenizer(
    train_texts.tolist(),
    max_length=128,
    truncation=True,
    padding=True,
)
test_encodings = tokenizer(
    test_texts.tolist(),
    max_length=128,
    truncation=True,
    padding=True,
)


# Rebuild the datasets on top of the new encodings; the labels are unchanged.
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels.tolist())
test_dataset = SentimentDataset(encodings=test_encodings, labels=test_labels.tolist())


training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate on eval_dataset once per epoch.
    evaluation_strategy="epoch",
)


trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)


trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 2/75000 [01:24<880:59:48, 42.29s/it]
                                                   
  0%|          | 12/75000 [00:00<1:08:19, 18.29it/s]

{'loss': 1.0988, 'grad_norm': 1.3116143941879272, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


                                                    
  0%|          | 24/75000 [00:01<53:30, 23.35it/s]  

{'loss': 1.0944, 'grad_norm': 1.7751038074493408, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


                                                  
  0%|          | 33/75000 [00:01<51:48, 24.11it/s]  

{'loss': 1.0894, 'grad_norm': 1.1989943981170654, 'learning_rate': 3e-06, 'epoch': 0.0}


                                                  
  0%|          | 42/75000 [00:02<50:56, 24.52it/s]  

{'loss': 1.0781, 'grad_norm': 1.4209517240524292, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


                                                  
  0%|          | 54/75000 [00:02<51:09, 24.42it/s]  

{'loss': 1.0613, 'grad_norm': 1.6097891330718994, 'learning_rate': 5e-06, 'epoch': 0.0}


                                                  
  0%|          | 63/75000 [00:02<52:41, 23.70it/s]  

{'loss': 1.033, 'grad_norm': 1.5323057174682617, 'learning_rate': 6e-06, 'epoch': 0.0}


                                                  
  0%|          | 72/75000 [00:03<51:36, 24.20it/s]  

{'loss': 0.9983, 'grad_norm': 1.6836086511611938, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.0}


                                                  
  0%|          | 84/75000 [00:03<50:52, 24.55it/s]  

{'loss': 0.9664, 'grad_norm': 1.8953711986541748, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.0}


                                                  
  0%|          | 93/75000 [00:04<51:10, 24.39it/s]  

{'loss': 0.9019, 'grad_norm': 1.7791447639465332, 'learning_rate': 9e-06, 'epoch': 0.0}


                                                  
  0%|          | 105/75000 [00:04<50:04, 24.93it/s] 

{'loss': 0.9066, 'grad_norm': 1.2083836793899536, 'learning_rate': 1e-05, 'epoch': 0.0}


                                                   
  0%|          | 114/75000 [00:05<50:29, 24.72it/s] 

{'loss': 0.8507, 'grad_norm': 1.892709732055664, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.0}


                                                   
  0%|          | 123/75000 [00:05<50:44, 24.60it/s] 

{'loss': 0.8231, 'grad_norm': 1.3668420314788818, 'learning_rate': 1.2e-05, 'epoch': 0.0}


                                                   
  0%|          | 135/75000 [00:05<50:17, 24.81it/s] 

{'loss': 0.7338, 'grad_norm': 1.8544846773147583, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.01}


                                                   
  0%|          | 144/75000 [00:06<51:46, 24.10it/s] 

{'loss': 0.7538, 'grad_norm': 2.1945302486419678, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.01}


                                                   
  0%|          | 153/75000 [00:06<51:10, 24.38it/s] 

{'loss': 0.699, 'grad_norm': 1.0131860971450806, 'learning_rate': 1.5e-05, 'epoch': 0.01}


                                                   
  0%|          | 165/75000 [00:07<50:59, 24.46it/s] 

{'loss': 0.7012, 'grad_norm': 0.9873828291893005, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.01}


                                                   
  0%|          | 174/75000 [00:07<50:58, 24.46it/s] 

{'loss': 0.6337, 'grad_norm': 1.3643829822540283, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.01}


                                                   
  0%|          | 183/75000 [00:07<51:33, 24.18it/s] 

{'loss': 0.6237, 'grad_norm': 1.7066338062286377, 'learning_rate': 1.8e-05, 'epoch': 0.01}


                                                   
  0%|          | 195/75000 [00:08<50:12, 24.83it/s] 

{'loss': 0.6213, 'grad_norm': 2.0286905765533447, 'learning_rate': 1.9e-05, 'epoch': 0.01}


                                                   
  0%|          | 204/75000 [00:08<50:17, 24.79it/s] 

{'loss': 0.7552, 'grad_norm': 3.9919629096984863, 'learning_rate': 2e-05, 'epoch': 0.01}


                                                   
  0%|          | 213/75000 [00:09<50:27, 24.70it/s] 

{'loss': 0.6616, 'grad_norm': 5.892150402069092, 'learning_rate': 2.1e-05, 'epoch': 0.01}


                                                   
  0%|          | 222/75000 [00:09<50:28, 24.69it/s] 

{'loss': 0.5607, 'grad_norm': 2.541869878768921, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.01}


                                                   
  0%|          | 234/75000 [00:09<51:29, 24.20it/s] 

{'loss': 0.7093, 'grad_norm': 3.4431586265563965, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.01}


                                                   
  0%|          | 243/75000 [00:10<50:37, 24.61it/s] 

{'loss': 0.6305, 'grad_norm': 3.391209363937378, 'learning_rate': 2.4e-05, 'epoch': 0.01}


                                                   
  0%|          | 252/75000 [00:10<51:02, 24.41it/s] 

{'loss': 0.516, 'grad_norm': 3.8747828006744385, 'learning_rate': 2.5e-05, 'epoch': 0.01}


                                                   
  0%|          | 264/75000 [00:11<50:08, 24.84it/s] 

{'loss': 0.5068, 'grad_norm': 4.5635666847229, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.01}


                                                   
  0%|          | 273/75000 [00:11<50:32, 24.64it/s] 

{'loss': 0.5603, 'grad_norm': 12.415044784545898, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.01}


                                                   
  0%|          | 282/75000 [00:11<51:31, 24.17it/s] 

{'loss': 0.5058, 'grad_norm': 8.092973709106445, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.01}


                                                   
  0%|          | 294/75000 [00:12<50:39, 24.58it/s] 

{'loss': 0.586, 'grad_norm': 8.047506332397461, 'learning_rate': 2.9e-05, 'epoch': 0.01}


                                                   
  0%|          | 303/75000 [00:12<52:34, 23.68it/s] 

{'loss': 0.5442, 'grad_norm': 2.665869951248169, 'learning_rate': 3e-05, 'epoch': 0.01}


                                                   
  0%|          | 315/75000 [00:13<50:49, 24.49it/s] 

{'loss': 0.5078, 'grad_norm': 6.23593807220459, 'learning_rate': 3.1e-05, 'epoch': 0.01}


                                                   
  0%|          | 324/75000 [00:13<51:04, 24.37it/s] 

{'loss': 0.4531, 'grad_norm': 4.856557369232178, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.01}


                                                   
  0%|          | 333/75000 [00:13<51:12, 24.30it/s] 

{'loss': 0.4547, 'grad_norm': 6.579380035400391, 'learning_rate': 3.3e-05, 'epoch': 0.01}


                                                   
  0%|          | 345/75000 [00:14<51:05, 24.35it/s] 

{'loss': 0.47, 'grad_norm': 8.84716510772705, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.01}


                                                   
  0%|          | 354/75000 [00:14<50:30, 24.63it/s] 

{'loss': 0.4793, 'grad_norm': 7.355767250061035, 'learning_rate': 3.5e-05, 'epoch': 0.01}


                                                   
  0%|          | 363/75000 [00:15<50:44, 24.51it/s] 

{'loss': 0.5308, 'grad_norm': 0.6117813587188721, 'learning_rate': 3.6e-05, 'epoch': 0.01}


                                                   
  0%|          | 375/75000 [00:15<50:32, 24.61it/s] 

{'loss': 0.7483, 'grad_norm': 8.203561782836914, 'learning_rate': 3.7e-05, 'epoch': 0.01}


                                                   
  1%|          | 384/75000 [00:16<52:51, 23.53it/s] 

{'loss': 0.5323, 'grad_norm': 3.214229106903076, 'learning_rate': 3.8e-05, 'epoch': 0.02}


                                                   
  1%|          | 393/75000 [00:16<51:43, 24.04it/s] 

{'loss': 0.5742, 'grad_norm': 3.4071547985076904, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.02}


                                                   
  1%|          | 402/75000 [00:16<51:05, 24.34it/s] 

{'loss': 0.4225, 'grad_norm': 3.211223840713501, 'learning_rate': 4e-05, 'epoch': 0.02}


                                                   
  1%|          | 414/75000 [00:17<50:49, 24.46it/s] 

{'loss': 0.4709, 'grad_norm': 3.7085933685302734, 'learning_rate': 4.1e-05, 'epoch': 0.02}


                                                   
  1%|          | 423/75000 [00:17<50:59, 24.37it/s] 

{'loss': 0.4855, 'grad_norm': 2.7950046062469482, 'learning_rate': 4.2e-05, 'epoch': 0.02}


                                                   
  1%|          | 435/75000 [00:18<51:18, 24.22it/s] 

{'loss': 0.4353, 'grad_norm': 10.67270278930664, 'learning_rate': 4.3e-05, 'epoch': 0.02}


                                                   
  1%|          | 444/75000 [00:18<50:32, 24.59it/s] 

{'loss': 0.4578, 'grad_norm': 16.485889434814453, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.02}


                                                   
  1%|          | 453/75000 [00:18<50:11, 24.76it/s] 

{'loss': 0.4683, 'grad_norm': 7.534764289855957, 'learning_rate': 4.5e-05, 'epoch': 0.02}


                                                   
  1%|          | 462/75000 [00:19<52:22, 23.72it/s] 

{'loss': 0.5224, 'grad_norm': 3.533252716064453, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.02}


                                                   
  1%|          | 474/75000 [00:19<50:48, 24.45it/s] 

{'loss': 0.4578, 'grad_norm': 5.430450439453125, 'learning_rate': 4.7e-05, 'epoch': 0.02}


                                                   
  1%|          | 483/75000 [00:20<51:33, 24.09it/s] 

{'loss': 0.4767, 'grad_norm': 6.8285136222839355, 'learning_rate': 4.8e-05, 'epoch': 0.02}


                                                   
  1%|          | 492/75000 [00:20<51:27, 24.13it/s] 

{'loss': 0.5088, 'grad_norm': 5.810445785522461, 'learning_rate': 4.9e-05, 'epoch': 0.02}


                                                   
  1%|          | 500/75000 [00:20<50:51, 24.41it/s] 

{'loss': 0.3942, 'grad_norm': 9.954153060913086, 'learning_rate': 5e-05, 'epoch': 0.02}


                                                     
  1%|          | 513/75000 [00:21<1:04:02, 19.39it/s]

{'loss': 0.4332, 'grad_norm': 16.402345657348633, 'learning_rate': 4.999328859060403e-05, 'epoch': 0.02}


                                                     
  1%|          | 525/75000 [00:22<53:30, 23.20it/s] 

{'loss': 0.4103, 'grad_norm': 7.713848114013672, 'learning_rate': 4.998657718120805e-05, 'epoch': 0.02}


                                                   
  1%|          | 534/75000 [00:22<53:44, 23.10it/s] 

{'loss': 0.4097, 'grad_norm': 16.32294273376465, 'learning_rate': 4.997986577181208e-05, 'epoch': 0.02}


                                                   
  1%|          | 543/75000 [00:23<51:16, 24.20it/s] 

{'loss': 0.4001, 'grad_norm': 15.986708641052246, 'learning_rate': 4.997315436241611e-05, 'epoch': 0.02}


                                                   
  1%|          | 555/75000 [00:23<50:34, 24.53it/s] 

{'loss': 0.4994, 'grad_norm': 4.6486124992370605, 'learning_rate': 4.996644295302014e-05, 'epoch': 0.02}


                                                   
  1%|          | 564/75000 [00:23<51:08, 24.26it/s] 

{'loss': 0.4011, 'grad_norm': 16.934770584106445, 'learning_rate': 4.995973154362416e-05, 'epoch': 0.02}


                                                   
  1%|          | 573/75000 [00:24<52:10, 23.78it/s] 

{'loss': 0.618, 'grad_norm': 4.360320091247559, 'learning_rate': 4.995302013422819e-05, 'epoch': 0.02}


                                                   
  1%|          | 582/75000 [00:24<51:53, 23.90it/s] 

{'loss': 0.4753, 'grad_norm': 15.060856819152832, 'learning_rate': 4.994630872483222e-05, 'epoch': 0.02}


                                                   
  1%|          | 594/75000 [00:25<51:21, 24.14it/s] 

{'loss': 0.5207, 'grad_norm': 4.855071544647217, 'learning_rate': 4.9939597315436246e-05, 'epoch': 0.02}


                                                   
  1%|          | 603/75000 [00:25<51:39, 24.01it/s] 

{'loss': 0.4463, 'grad_norm': 8.714061737060547, 'learning_rate': 4.9932885906040274e-05, 'epoch': 0.02}


                                                   
  1%|          | 612/75000 [00:25<51:32, 24.06it/s] 

{'loss': 0.5099, 'grad_norm': 8.24974536895752, 'learning_rate': 4.9926174496644296e-05, 'epoch': 0.02}


                                                   
  1%|          | 624/75000 [00:26<51:40, 23.99it/s] 

{'loss': 0.4909, 'grad_norm': 2.5058257579803467, 'learning_rate': 4.9919463087248325e-05, 'epoch': 0.02}


                                                   
  1%|          | 633/75000 [00:26<51:17, 24.17it/s] 

{'loss': 0.4144, 'grad_norm': 4.816734313964844, 'learning_rate': 4.991275167785235e-05, 'epoch': 0.03}


                                                   
  1%|          | 645/75000 [00:27<51:48, 23.92it/s] 

{'loss': 0.4499, 'grad_norm': 2.0370359420776367, 'learning_rate': 4.9906040268456375e-05, 'epoch': 0.03}


                                                   
  1%|          | 654/75000 [00:27<53:30, 23.16it/s] 

{'loss': 0.4982, 'grad_norm': 26.256145477294922, 'learning_rate': 4.989932885906041e-05, 'epoch': 0.03}


                                                   
  1%|          | 663/75000 [00:28<52:03, 23.80it/s] 

{'loss': 0.382, 'grad_norm': 14.863920211791992, 'learning_rate': 4.989261744966443e-05, 'epoch': 0.03}


                                                   
  1%|          | 672/75000 [00:28<50:48, 24.38it/s] 

{'loss': 0.3859, 'grad_norm': 7.168298244476318, 'learning_rate': 4.988590604026846e-05, 'epoch': 0.03}


                                                   
  1%|          | 684/75000 [00:28<50:13, 24.66it/s] 

{'loss': 0.4154, 'grad_norm': 12.969358444213867, 'learning_rate': 4.987919463087248e-05, 'epoch': 0.03}


                                                   
  1%|          | 693/75000 [00:29<49:44, 24.90it/s] 

{'loss': 0.4581, 'grad_norm': 2.6518380641937256, 'learning_rate': 4.987248322147651e-05, 'epoch': 0.03}


                                                   
  1%|          | 705/75000 [00:29<49:56, 24.79it/s] 

{'loss': 0.4237, 'grad_norm': 19.493364334106445, 'learning_rate': 4.986577181208054e-05, 'epoch': 0.03}


                                                   
  1%|          | 714/75000 [00:30<51:06, 24.22it/s] 

{'loss': 0.4376, 'grad_norm': 11.340117454528809, 'learning_rate': 4.985906040268457e-05, 'epoch': 0.03}


                                                   
  1%|          | 723/75000 [00:30<51:24, 24.08it/s] 

{'loss': 0.3593, 'grad_norm': 2.0391931533813477, 'learning_rate': 4.9852348993288597e-05, 'epoch': 0.03}


                                                   
  1%|          | 735/75000 [00:31<51:02, 24.25it/s] 

{'loss': 0.4853, 'grad_norm': 4.26596212387085, 'learning_rate': 4.984563758389262e-05, 'epoch': 0.03}


                                                   
  1%|          | 744/75000 [00:31<51:48, 23.89it/s] 

{'loss': 0.5235, 'grad_norm': 2.9156999588012695, 'learning_rate': 4.983892617449665e-05, 'epoch': 0.03}


                                                   
  1%|          | 753/75000 [00:31<51:38, 23.96it/s] 

{'loss': 0.5218, 'grad_norm': 4.807976722717285, 'learning_rate': 4.983221476510067e-05, 'epoch': 0.03}


                                                   
  1%|          | 762/75000 [00:32<51:23, 24.08it/s] 

{'loss': 0.432, 'grad_norm': 3.1014912128448486, 'learning_rate': 4.98255033557047e-05, 'epoch': 0.03}


                                                   
  1%|          | 774/75000 [00:32<52:01, 23.78it/s] 

{'loss': 0.3545, 'grad_norm': 5.955693244934082, 'learning_rate': 4.981879194630873e-05, 'epoch': 0.03}


                                                   
  1%|          | 783/75000 [00:33<50:55, 24.29it/s] 

{'loss': 0.5376, 'grad_norm': 4.445351600646973, 'learning_rate': 4.9812080536912754e-05, 'epoch': 0.03}


                                                   
  1%|          | 795/75000 [00:33<51:15, 24.12it/s] 

{'loss': 0.2597, 'grad_norm': 9.242044448852539, 'learning_rate': 4.980536912751678e-05, 'epoch': 0.03}


                                                   
  1%|          | 804/75000 [00:33<50:34, 24.45it/s] 

{'loss': 0.5409, 'grad_norm': 20.640310287475586, 'learning_rate': 4.9798657718120805e-05, 'epoch': 0.03}


                                                   
  1%|          | 813/75000 [00:34<50:39, 24.41it/s] 

{'loss': 0.393, 'grad_norm': 31.50870704650879, 'learning_rate': 4.979194630872483e-05, 'epoch': 0.03}


                                                   
  1%|          | 822/75000 [00:34<51:17, 24.10it/s] 

{'loss': 0.4287, 'grad_norm': 8.309524536132812, 'learning_rate': 4.978523489932886e-05, 'epoch': 0.03}


                                                   
  1%|          | 834/75000 [00:35<50:02, 24.70it/s] 

{'loss': 0.4451, 'grad_norm': 12.924861907958984, 'learning_rate': 4.977852348993289e-05, 'epoch': 0.03}


                                                   
  1%|          | 843/75000 [00:35<50:55, 24.27it/s] 

{'loss': 0.4536, 'grad_norm': 0.6234538555145264, 'learning_rate': 4.977181208053692e-05, 'epoch': 0.03}


                                                   
  1%|          | 855/75000 [00:35<51:37, 23.93it/s] 

{'loss': 0.5158, 'grad_norm': 16.29106903076172, 'learning_rate': 4.976510067114094e-05, 'epoch': 0.03}


                                                   
  1%|          | 864/75000 [00:36<50:48, 24.32it/s] 

{'loss': 0.3209, 'grad_norm': 1.259250283241272, 'learning_rate': 4.975838926174497e-05, 'epoch': 0.03}


                                                   
  1%|          | 873/75000 [00:36<49:58, 24.72it/s] 

{'loss': 0.3933, 'grad_norm': 4.73485803604126, 'learning_rate': 4.975167785234899e-05, 'epoch': 0.03}


                                                   
  1%|          | 882/75000 [00:37<50:21, 24.53it/s] 

{'loss': 0.4195, 'grad_norm': 8.330217361450195, 'learning_rate': 4.974496644295302e-05, 'epoch': 0.04}


                                                   
  1%|          | 894/75000 [00:37<50:29, 24.46it/s] 

{'loss': 0.4035, 'grad_norm': 14.365376472473145, 'learning_rate': 4.9738255033557055e-05, 'epoch': 0.04}


                                                   
  1%|          | 903/75000 [00:37<50:45, 24.33it/s] 

{'loss': 0.3445, 'grad_norm': 8.631732940673828, 'learning_rate': 4.9731543624161077e-05, 'epoch': 0.04}


                                                   
  1%|          | 912/75000 [00:38<51:47, 23.84it/s] 

{'loss': 0.4793, 'grad_norm': 13.955089569091797, 'learning_rate': 4.9724832214765105e-05, 'epoch': 0.04}


                                                   
  1%|          | 924/75000 [00:38<51:06, 24.15it/s] 

{'loss': 0.4366, 'grad_norm': 3.5505003929138184, 'learning_rate': 4.971812080536913e-05, 'epoch': 0.04}


                                                   
  1%|          | 933/75000 [00:39<51:12, 24.11it/s] 

{'loss': 0.3003, 'grad_norm': 8.212333679199219, 'learning_rate': 4.9711409395973155e-05, 'epoch': 0.04}


                                                   
  1%|▏         | 942/75000 [00:39<51:04, 24.16it/s] 

{'loss': 0.3909, 'grad_norm': 7.791329383850098, 'learning_rate': 4.9704697986577184e-05, 'epoch': 0.04}


                                                   
  1%|▏         | 954/75000 [00:40<50:48, 24.29it/s] 

{'loss': 0.4505, 'grad_norm': 6.713415145874023, 'learning_rate': 4.969798657718121e-05, 'epoch': 0.04}


                                                   
  1%|▏         | 963/75000 [00:40<50:34, 24.40it/s] 

{'loss': 0.3913, 'grad_norm': 3.7163732051849365, 'learning_rate': 4.969127516778524e-05, 'epoch': 0.04}


                                                   
  1%|▏         | 972/75000 [00:40<50:55, 24.23it/s] 

{'loss': 0.4067, 'grad_norm': 2.7424190044403076, 'learning_rate': 4.968456375838926e-05, 'epoch': 0.04}


                                                   
  1%|▏         | 984/75000 [00:41<54:49, 22.50it/s] 

{'loss': 0.3958, 'grad_norm': 3.973292112350464, 'learning_rate': 4.967785234899329e-05, 'epoch': 0.04}


                                                   
  1%|▏         | 993/75000 [00:41<53:42, 22.96it/s] 

{'loss': 0.4028, 'grad_norm': 2.5181679725646973, 'learning_rate': 4.967114093959731e-05, 'epoch': 0.04}


                                                   
  1%|▏         | 1000/75000 [00:42<51:57, 23.74it/s]

{'loss': 0.3595, 'grad_norm': 9.411211013793945, 'learning_rate': 4.966442953020135e-05, 'epoch': 0.04}


                                                      
  1%|▏         | 1014/75000 [00:42<1:01:09, 20.16it/s]

{'loss': 0.4709, 'grad_norm': 4.747110843658447, 'learning_rate': 4.965771812080537e-05, 'epoch': 0.04}


                                                      
  1%|▏         | 1023/75000 [00:43<54:55, 22.45it/s]

{'loss': 0.3306, 'grad_norm': 12.697327613830566, 'learning_rate': 4.96510067114094e-05, 'epoch': 0.04}


                                                    
  1%|▏         | 1032/75000 [00:43<55:38, 22.16it/s]

{'loss': 0.4066, 'grad_norm': 7.498948097229004, 'learning_rate': 4.964429530201343e-05, 'epoch': 0.04}


                                                    
  1%|▏         | 1044/75000 [00:44<51:58, 23.71it/s]

{'loss': 0.5696, 'grad_norm': 8.43144416809082, 'learning_rate': 4.963758389261745e-05, 'epoch': 0.04}


                                                    
  1%|▏         | 1053/75000 [00:44<52:03, 23.68it/s]

{'loss': 0.3687, 'grad_norm': 3.633849620819092, 'learning_rate': 4.963087248322148e-05, 'epoch': 0.04}


                                                    
  1%|▏         | 1065/75000 [00:45<50:41, 24.31it/s]

{'loss': 0.4228, 'grad_norm': 7.0218095779418945, 'learning_rate': 4.9624161073825506e-05, 'epoch': 0.04}


                                                    
  1%|▏         | 1074/75000 [00:45<51:10, 24.08it/s]

{'loss': 0.3839, 'grad_norm': 1.4894604682922363, 'learning_rate': 4.9617449664429535e-05, 'epoch': 0.04}


                                                    
  1%|▏         | 1083/75000 [00:45<50:37, 24.33it/s]

{'loss': 0.4306, 'grad_norm': 6.847921848297119, 'learning_rate': 4.961073825503356e-05, 'epoch': 0.04}


                                                    
  1%|▏         | 1095/75000 [00:46<51:12, 24.05it/s]

{'loss': 0.3678, 'grad_norm': 7.070922374725342, 'learning_rate': 4.9604026845637585e-05, 'epoch': 0.04}


                                                    
  1%|▏         | 1104/75000 [00:46<51:17, 24.01it/s]

{'loss': 0.2743, 'grad_norm': 0.9924837946891785, 'learning_rate': 4.9597315436241614e-05, 'epoch': 0.04}


                                                    
  1%|▏         | 1113/75000 [00:47<52:52, 23.29it/s]

{'loss': 0.5257, 'grad_norm': 4.567101001739502, 'learning_rate': 4.9590604026845635e-05, 'epoch': 0.04}


                                                    
  2%|▏         | 1125/75000 [00:47<50:39, 24.30it/s]

{'loss': 0.274, 'grad_norm': 3.958390235900879, 'learning_rate': 4.958389261744967e-05, 'epoch': 0.04}


                                                    
  2%|▏         | 1134/75000 [00:47<51:03, 24.11it/s]

{'loss': 0.4245, 'grad_norm': 16.36243438720703, 'learning_rate': 4.957718120805369e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1143/75000 [00:48<51:04, 24.10it/s]

{'loss': 0.4357, 'grad_norm': 13.528900146484375, 'learning_rate': 4.957046979865772e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1152/75000 [00:48<51:06, 24.08it/s]

{'loss': 0.3444, 'grad_norm': 3.960496425628662, 'learning_rate': 4.956375838926175e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1164/75000 [00:49<49:50, 24.69it/s]

{'loss': 0.5109, 'grad_norm': 12.784829139709473, 'learning_rate': 4.955704697986577e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1173/75000 [00:49<51:59, 23.67it/s]

{'loss': 0.3187, 'grad_norm': 2.7326717376708984, 'learning_rate': 4.95503355704698e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1182/75000 [00:49<51:36, 23.84it/s]

{'loss': 0.3273, 'grad_norm': 3.4209580421447754, 'learning_rate': 4.954362416107383e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1194/75000 [00:50<50:43, 24.25it/s]

{'loss': 0.5072, 'grad_norm': 6.30408239364624, 'learning_rate': 4.953691275167786e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1203/75000 [00:50<53:13, 23.11it/s]

{'loss': 0.451, 'grad_norm': 6.042940139770508, 'learning_rate': 4.953020134228188e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1212/75000 [00:51<52:05, 23.61it/s]

{'loss': 0.4189, 'grad_norm': 4.241903781890869, 'learning_rate': 4.952348993288591e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1224/75000 [00:51<51:01, 24.10it/s]

{'loss': 0.4352, 'grad_norm': 12.849108695983887, 'learning_rate': 4.9516778523489936e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1233/75000 [00:52<50:18, 24.44it/s]

{'loss': 0.2911, 'grad_norm': 11.688567161560059, 'learning_rate': 4.951006711409396e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1242/75000 [00:52<51:06, 24.05it/s]

{'loss': 0.4719, 'grad_norm': 6.859592437744141, 'learning_rate': 4.950335570469799e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1254/75000 [00:52<50:26, 24.37it/s]

{'loss': 0.343, 'grad_norm': 1.9389395713806152, 'learning_rate': 4.9496644295302015e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1263/75000 [00:53<50:23, 24.39it/s]

{'loss': 0.3243, 'grad_norm': 1.0583902597427368, 'learning_rate': 4.948993288590604e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1275/75000 [00:53<50:22, 24.39it/s]

{'loss': 0.4092, 'grad_norm': 2.556692123413086, 'learning_rate': 4.948322147651007e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1284/75000 [00:54<50:36, 24.27it/s]

{'loss': 0.4696, 'grad_norm': 42.25453567504883, 'learning_rate': 4.9476510067114094e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1293/75000 [00:54<53:36, 22.92it/s]

{'loss': 0.4694, 'grad_norm': 3.8264119625091553, 'learning_rate': 4.946979865771812e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1305/75000 [00:55<51:09, 24.01it/s]

{'loss': 0.4587, 'grad_norm': 5.9487738609313965, 'learning_rate': 4.946308724832215e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1314/75000 [00:55<52:10, 23.54it/s]

{'loss': 0.4993, 'grad_norm': 10.513191223144531, 'learning_rate': 4.945637583892618e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1323/75000 [00:55<51:48, 23.70it/s]

{'loss': 0.4828, 'grad_norm': 7.759576797485352, 'learning_rate': 4.94496644295302e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1332/75000 [00:56<52:11, 23.52it/s]

{'loss': 0.4355, 'grad_norm': 9.151788711547852, 'learning_rate': 4.944295302013423e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1344/75000 [00:56<51:04, 24.04it/s]

{'loss': 0.4346, 'grad_norm': 4.405697345733643, 'learning_rate': 4.943624161073826e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1353/75000 [00:57<50:46, 24.18it/s]

{'loss': 0.3973, 'grad_norm': 5.635621070861816, 'learning_rate': 4.9429530201342287e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1362/75000 [00:57<52:28, 23.39it/s]

{'loss': 0.3544, 'grad_norm': 4.225963592529297, 'learning_rate': 4.9422818791946315e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1374/75000 [00:58<54:52, 22.36it/s]

{'loss': 0.3731, 'grad_norm': 6.563172817230225, 'learning_rate': 4.941610738255034e-05, 'epoch': 0.05}


                                                    
  2%|▏         | 1383/75000 [00:58<53:45, 22.82it/s]

{'loss': 0.3108, 'grad_norm': 7.735836029052734, 'learning_rate': 4.9409395973154365e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1392/75000 [00:58<52:48, 23.23it/s]

{'loss': 0.4509, 'grad_norm': 8.158673286437988, 'learning_rate': 4.940268456375839e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1404/75000 [00:59<52:29, 23.37it/s]

{'loss': 0.3437, 'grad_norm': 43.09151840209961, 'learning_rate': 4.9395973154362416e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1413/75000 [00:59<52:12, 23.49it/s]

{'loss': 0.4955, 'grad_norm': 10.811455726623535, 'learning_rate': 4.9389261744966444e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1425/75000 [01:00<51:06, 23.99it/s]

{'loss': 0.4647, 'grad_norm': 12.31628131866455, 'learning_rate': 4.938255033557047e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1434/75000 [01:00<51:49, 23.66it/s]

{'loss': 0.3993, 'grad_norm': 9.374700546264648, 'learning_rate': 4.93758389261745e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1443/75000 [01:00<51:12, 23.94it/s]

{'loss': 0.3517, 'grad_norm': 6.833562850952148, 'learning_rate': 4.936912751677852e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1452/75000 [01:01<51:17, 23.90it/s]

{'loss': 0.2627, 'grad_norm': 5.363276958465576, 'learning_rate': 4.936241610738255e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1464/75000 [01:01<53:34, 22.87it/s]

{'loss': 0.4021, 'grad_norm': 5.838996887207031, 'learning_rate': 4.935570469798658e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1473/75000 [01:02<51:33, 23.77it/s]

{'loss': 0.518, 'grad_norm': 7.052812099456787, 'learning_rate': 4.934899328859061e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1482/75000 [01:02<51:21, 23.86it/s]

{'loss': 0.3658, 'grad_norm': 5.191783905029297, 'learning_rate': 4.934228187919464e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1494/75000 [01:03<50:02, 24.48it/s]

{'loss': 0.4207, 'grad_norm': 10.67072582244873, 'learning_rate': 4.933557046979866e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1500/75000 [01:03<50:14, 24.39it/s]

{'loss': 0.4194, 'grad_norm': 2.102109670639038, 'learning_rate': 4.932885906040269e-05, 'epoch': 0.06}


                                                      
  2%|▏         | 1515/75000 [01:04<1:02:06, 19.72it/s]

{'loss': 0.4248, 'grad_norm': 19.59059715270996, 'learning_rate': 4.932214765100671e-05, 'epoch': 0.06}


                                                      
  2%|▏         | 1524/75000 [01:04<54:36, 22.42it/s]

{'loss': 0.4043, 'grad_norm': 14.720948219299316, 'learning_rate': 4.931543624161074e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1533/75000 [01:05<52:11, 23.46it/s]

{'loss': 0.2538, 'grad_norm': 4.270163059234619, 'learning_rate': 4.9308724832214767e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1542/75000 [01:05<51:57, 23.56it/s]

{'loss': 0.3418, 'grad_norm': 6.461110591888428, 'learning_rate': 4.9302013422818795e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1554/75000 [01:05<50:39, 24.17it/s]

{'loss': 0.4232, 'grad_norm': 6.085655689239502, 'learning_rate': 4.9295302013422824e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1563/75000 [01:06<50:33, 24.21it/s]

{'loss': 0.3634, 'grad_norm': 15.693767547607422, 'learning_rate': 4.9288590604026845e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1572/75000 [01:06<51:14, 23.88it/s]

{'loss': 0.3608, 'grad_norm': 5.450690269470215, 'learning_rate': 4.9281879194630874e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1584/75000 [01:07<50:36, 24.18it/s]

{'loss': 0.3797, 'grad_norm': 5.530896186828613, 'learning_rate': 4.92751677852349e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1593/75000 [01:07<51:25, 23.79it/s]

{'loss': 0.3981, 'grad_norm': 6.543654441833496, 'learning_rate': 4.926845637583893e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1602/75000 [01:07<51:16, 23.86it/s]

{'loss': 0.4482, 'grad_norm': 6.79025411605835, 'learning_rate': 4.926174496644296e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1614/75000 [01:08<49:52, 24.52it/s]

{'loss': 0.3716, 'grad_norm': 2.075640916824341, 'learning_rate': 4.925503355704698e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1623/75000 [01:08<50:43, 24.11it/s]

{'loss': 0.5376, 'grad_norm': 4.190098762512207, 'learning_rate': 4.924832214765101e-05, 'epoch': 0.06}


                                                    
  2%|▏         | 1635/75000 [01:09<52:43, 23.19it/s]

{'loss': 0.3853, 'grad_norm': 9.343158721923828, 'learning_rate': 4.924161073825503e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1644/75000 [01:09<52:09, 23.44it/s]

{'loss': 0.3671, 'grad_norm': 8.677580833435059, 'learning_rate': 4.923489932885906e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1653/75000 [01:10<51:15, 23.85it/s]

{'loss': 0.3468, 'grad_norm': 5.252985954284668, 'learning_rate': 4.922818791946309e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1662/75000 [01:10<50:46, 24.07it/s]

{'loss': 0.3742, 'grad_norm': 9.43946647644043, 'learning_rate': 4.922147651006712e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1674/75000 [01:10<49:10, 24.85it/s]

{'loss': 0.4155, 'grad_norm': 9.650106430053711, 'learning_rate': 4.9214765100671146e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1683/75000 [01:11<49:28, 24.70it/s]

{'loss': 0.3818, 'grad_norm': 3.758129835128784, 'learning_rate': 4.920805369127517e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1695/75000 [01:11<49:04, 24.90it/s]

{'loss': 0.3856, 'grad_norm': 7.799080848693848, 'learning_rate': 4.9201342281879196e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1704/75000 [01:12<49:28, 24.69it/s]

{'loss': 0.3213, 'grad_norm': 0.4736994206905365, 'learning_rate': 4.9194630872483225e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1713/75000 [01:12<52:30, 23.26it/s]

{'loss': 0.368, 'grad_norm': 0.8661525845527649, 'learning_rate': 4.918791946308725e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1722/75000 [01:12<52:06, 23.44it/s]

{'loss': 0.4423, 'grad_norm': 2.1185765266418457, 'learning_rate': 4.918120805369128e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1734/75000 [01:13<51:21, 23.78it/s]

{'loss': 0.4858, 'grad_norm': 8.034360885620117, 'learning_rate': 4.9174496644295304e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1743/75000 [01:13<51:20, 23.78it/s]

{'loss': 0.3556, 'grad_norm': 4.147830009460449, 'learning_rate': 4.916778523489933e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1752/75000 [01:14<51:09, 23.86it/s]

{'loss': 0.3377, 'grad_norm': 3.2265474796295166, 'learning_rate': 4.9161073825503354e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1764/75000 [01:14<50:47, 24.03it/s]

{'loss': 0.444, 'grad_norm': 12.461884498596191, 'learning_rate': 4.915436241610738e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1773/75000 [01:15<50:51, 24.00it/s]

{'loss': 0.3926, 'grad_norm': 6.961667060852051, 'learning_rate': 4.914765100671141e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1782/75000 [01:15<52:34, 23.21it/s]

{'loss': 0.4004, 'grad_norm': 5.127128601074219, 'learning_rate': 4.914093959731544e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1794/75000 [01:15<50:55, 23.96it/s]

{'loss': 0.3512, 'grad_norm': 4.183096885681152, 'learning_rate': 4.913422818791947e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1803/75000 [01:16<51:30, 23.69it/s]

{'loss': 0.3752, 'grad_norm': 1.52204430103302, 'learning_rate': 4.912751677852349e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1815/75000 [01:16<51:38, 23.62it/s]

{'loss': 0.374, 'grad_norm': 3.451469898223877, 'learning_rate': 4.912080536912752e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1824/75000 [01:17<51:39, 23.61it/s]

{'loss': 0.3775, 'grad_norm': 5.7904181480407715, 'learning_rate': 4.911409395973155e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1833/75000 [01:17<51:07, 23.85it/s]

{'loss': 0.3431, 'grad_norm': 9.338968276977539, 'learning_rate': 4.9107382550335576e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1842/75000 [01:18<51:03, 23.88it/s]

{'loss': 0.4526, 'grad_norm': 11.914287567138672, 'learning_rate': 4.91006711409396e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1854/75000 [01:18<50:27, 24.16it/s]

{'loss': 0.3889, 'grad_norm': 1.5787476301193237, 'learning_rate': 4.9093959731543626e-05, 'epoch': 0.07}


                                                    
  2%|▏         | 1863/75000 [01:18<51:50, 23.51it/s]

{'loss': 0.3493, 'grad_norm': 9.604819297790527, 'learning_rate': 4.9087248322147654e-05, 'epoch': 0.07}


                                                    
  2%|▎         | 1875/75000 [01:19<50:28, 24.15it/s]

{'loss': 0.4694, 'grad_norm': 6.204939365386963, 'learning_rate': 4.9080536912751676e-05, 'epoch': 0.07}


                                                    
  3%|▎         | 1884/75000 [01:19<50:54, 23.94it/s]

{'loss': 0.3024, 'grad_norm': 4.426483154296875, 'learning_rate': 4.907382550335571e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1893/75000 [01:20<51:22, 23.72it/s]

{'loss': 0.361, 'grad_norm': 3.161574125289917, 'learning_rate': 4.906711409395973e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1902/75000 [01:20<51:49, 23.51it/s]

{'loss': 0.5321, 'grad_norm': 4.915019989013672, 'learning_rate': 4.906040268456376e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1914/75000 [01:21<50:19, 24.20it/s]

{'loss': 0.5333, 'grad_norm': 13.672557830810547, 'learning_rate': 4.905369127516779e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1923/75000 [01:21<50:36, 24.06it/s]

{'loss': 0.3631, 'grad_norm': 3.117110013961792, 'learning_rate': 4.904697986577181e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1932/75000 [01:21<52:04, 23.39it/s]

{'loss': 0.3521, 'grad_norm': 4.270979881286621, 'learning_rate': 4.904026845637584e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1944/75000 [01:22<55:18, 22.02it/s]

{'loss': 0.4427, 'grad_norm': 6.9525580406188965, 'learning_rate': 4.903355704697987e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1953/75000 [01:22<52:33, 23.16it/s]

{'loss': 0.3083, 'grad_norm': 6.962002277374268, 'learning_rate': 4.90268456375839e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1965/75000 [01:23<50:54, 23.91it/s]

{'loss': 0.3961, 'grad_norm': 7.733392715454102, 'learning_rate': 4.902013422818792e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1974/75000 [01:23<51:09, 23.79it/s]

{'loss': 0.2985, 'grad_norm': 8.915204048156738, 'learning_rate': 4.901342281879195e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1983/75000 [01:23<50:53, 23.91it/s]

{'loss': 0.4546, 'grad_norm': 0.9501029253005981, 'learning_rate': 4.900671140939598e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 1992/75000 [01:24<54:59, 22.13it/s]

{'loss': 0.2551, 'grad_norm': 1.1402658224105835, 'learning_rate': 4.9e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2000/75000 [01:24<54:36, 22.28it/s]

{'loss': 0.3458, 'grad_norm': 4.491228103637695, 'learning_rate': 4.8993288590604034e-05, 'epoch': 0.08}


                                                      
  3%|▎         | 2015/75000 [01:25<1:00:34, 20.08it/s]

{'loss': 0.369, 'grad_norm': 7.784203052520752, 'learning_rate': 4.8986577181208056e-05, 'epoch': 0.08}


                                                      
  3%|▎         | 2024/75000 [01:26<53:53, 22.57it/s]

{'loss': 0.4013, 'grad_norm': 4.807943820953369, 'learning_rate': 4.8979865771812084e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2033/75000 [01:26<52:11, 23.30it/s]

{'loss': 0.4839, 'grad_norm': 2.6259350776672363, 'learning_rate': 4.8973154362416106e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2042/75000 [01:26<51:56, 23.41it/s]

{'loss': 0.3801, 'grad_norm': 2.876549243927002, 'learning_rate': 4.8966442953020134e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2054/75000 [01:27<51:56, 23.40it/s]

{'loss': 0.3008, 'grad_norm': 1.9068974256515503, 'learning_rate': 4.895973154362416e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2063/75000 [01:27<51:08, 23.77it/s]

{'loss': 0.3726, 'grad_norm': 3.3645668029785156, 'learning_rate': 4.895302013422819e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2072/75000 [01:28<52:20, 23.22it/s]

{'loss': 0.3127, 'grad_norm': 1.2612035274505615, 'learning_rate': 4.894630872483222e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2084/75000 [01:28<51:04, 23.80it/s]

{'loss': 0.301, 'grad_norm': 3.3327553272247314, 'learning_rate': 4.893959731543624e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2093/75000 [01:29<51:36, 23.54it/s]

{'loss': 0.4088, 'grad_norm': 8.609478950500488, 'learning_rate': 4.893288590604027e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2102/75000 [01:29<52:40, 23.07it/s]

{'loss': 0.4134, 'grad_norm': 5.639704704284668, 'learning_rate': 4.89261744966443e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2114/75000 [01:29<51:26, 23.62it/s]

{'loss': 0.3515, 'grad_norm': 9.479950904846191, 'learning_rate': 4.891946308724832e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2123/75000 [01:30<54:02, 22.48it/s]

{'loss': 0.2602, 'grad_norm': 6.025579452514648, 'learning_rate': 4.8912751677852356e-05, 'epoch': 0.08}


                                                    
  3%|▎         | 2132/75000 [01:30<51:58, 23.37it/s]

{'loss': 0.3044, 'grad_norm': 3.6255600452423096, 'learning_rate': 4.890604026845638e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2144/75000 [01:31<50:53, 23.86it/s]

{'loss': 0.3895, 'grad_norm': 14.356486320495605, 'learning_rate': 4.8899328859060406e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2153/75000 [01:31<51:06, 23.76it/s]

{'loss': 0.3927, 'grad_norm': 9.343496322631836, 'learning_rate': 4.889261744966443e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2162/75000 [01:32<52:17, 23.21it/s]

{'loss': 0.3463, 'grad_norm': 5.087536334991455, 'learning_rate': 4.888590604026846e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2174/75000 [01:32<51:02, 23.78it/s]

{'loss': 0.4192, 'grad_norm': 5.837234020233154, 'learning_rate': 4.8879194630872485e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2183/75000 [01:32<55:04, 22.03it/s]

{'loss': 0.3932, 'grad_norm': 5.362022876739502, 'learning_rate': 4.8872483221476514e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2192/75000 [01:33<52:35, 23.08it/s]

{'loss': 0.285, 'grad_norm': 3.8169753551483154, 'learning_rate': 4.886577181208054e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2204/75000 [01:33<51:58, 23.34it/s]

{'loss': 0.4298, 'grad_norm': 2.3903021812438965, 'learning_rate': 4.8859060402684564e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2213/75000 [01:34<51:13, 23.68it/s]

{'loss': 0.3519, 'grad_norm': 4.915894031524658, 'learning_rate': 4.885234899328859e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2225/75000 [01:34<50:27, 24.03it/s]

{'loss': 0.4311, 'grad_norm': 11.968145370483398, 'learning_rate': 4.8845637583892614e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2234/75000 [01:35<51:21, 23.61it/s]

{'loss': 0.4043, 'grad_norm': 13.523951530456543, 'learning_rate': 4.883892617449665e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2243/75000 [01:35<51:17, 23.64it/s]

{'loss': 0.3928, 'grad_norm': 4.226171970367432, 'learning_rate': 4.883221476510068e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2252/75000 [01:35<56:02, 21.63it/s]

{'loss': 0.3822, 'grad_norm': 1.6503746509552002, 'learning_rate': 4.88255033557047e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2264/75000 [01:36<51:42, 23.44it/s]

{'loss': 0.506, 'grad_norm': 2.196451425552368, 'learning_rate': 4.881879194630873e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2273/75000 [01:36<51:35, 23.50it/s]

{'loss': 0.3614, 'grad_norm': 13.2523775100708, 'learning_rate': 4.881208053691275e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2282/75000 [01:37<51:17, 23.63it/s]

{'loss': 0.5049, 'grad_norm': 6.903136253356934, 'learning_rate': 4.880536912751678e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2294/75000 [01:37<50:21, 24.06it/s]

{'loss': 0.4285, 'grad_norm': 7.81834602355957, 'learning_rate': 4.879865771812081e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2303/75000 [01:38<50:58, 23.77it/s]

{'loss': 0.3995, 'grad_norm': 4.544469356536865, 'learning_rate': 4.8791946308724836e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2312/75000 [01:38<51:18, 23.61it/s]

{'loss': 0.2798, 'grad_norm': 10.797348022460938, 'learning_rate': 4.8785234899328864e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2324/75000 [01:39<53:46, 22.52it/s]

{'loss': 0.4703, 'grad_norm': 25.89810562133789, 'learning_rate': 4.8778523489932886e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2333/75000 [01:39<52:20, 23.14it/s]

{'loss': 0.3617, 'grad_norm': 3.3228588104248047, 'learning_rate': 4.8771812080536915e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2342/75000 [01:39<51:53, 23.33it/s]

{'loss': 0.4316, 'grad_norm': 13.15185546875, 'learning_rate': 4.8765100671140937e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2354/75000 [01:40<50:43, 23.87it/s]

{'loss': 0.3502, 'grad_norm': 2.692988157272339, 'learning_rate': 4.875838926174497e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2363/75000 [01:40<51:39, 23.44it/s]

{'loss': 0.3848, 'grad_norm': 4.257599830627441, 'learning_rate': 4.8751677852349e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2372/75000 [01:41<52:14, 23.17it/s]

{'loss': 0.4103, 'grad_norm': 2.3885300159454346, 'learning_rate': 4.874496644295302e-05, 'epoch': 0.09}


                                                    
  3%|▎         | 2384/75000 [01:41<51:37, 23.44it/s]

{'loss': 0.3957, 'grad_norm': 2.39809250831604, 'learning_rate': 4.873825503355705e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2393/75000 [01:41<52:34, 23.02it/s]

{'loss': 0.3479, 'grad_norm': 1.1406453847885132, 'learning_rate': 4.873154362416107e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2402/75000 [01:42<53:08, 22.77it/s]

{'loss': 0.4295, 'grad_norm': 8.187786102294922, 'learning_rate': 4.87248322147651e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2414/75000 [01:42<51:11, 23.63it/s]

{'loss': 0.3063, 'grad_norm': 6.318447113037109, 'learning_rate': 4.871812080536913e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2423/75000 [01:43<50:43, 23.85it/s]

{'loss': 0.3042, 'grad_norm': 6.322178840637207, 'learning_rate': 4.871140939597316e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2432/75000 [01:43<51:12, 23.62it/s]

{'loss': 0.4124, 'grad_norm': 5.983307838439941, 'learning_rate': 4.870469798657719e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2444/75000 [01:44<50:38, 23.88it/s]

{'loss': 0.4955, 'grad_norm': 16.927749633789062, 'learning_rate': 4.869798657718121e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2453/75000 [01:44<51:46, 23.35it/s]

{'loss': 0.4384, 'grad_norm': 3.7167482376098633, 'learning_rate': 4.869127516778524e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2462/75000 [01:44<52:35, 22.99it/s]

{'loss': 0.2565, 'grad_norm': 4.343326568603516, 'learning_rate': 4.868456375838926e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2474/75000 [01:45<50:58, 23.72it/s]

{'loss': 0.2784, 'grad_norm': 4.600754737854004, 'learning_rate': 4.8677852348993294e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2483/75000 [01:45<52:42, 22.93it/s]

{'loss': 0.3433, 'grad_norm': 6.821732044219971, 'learning_rate': 4.8671140939597316e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2492/75000 [01:46<52:24, 23.06it/s]

{'loss': 0.3028, 'grad_norm': 8.93161392211914, 'learning_rate': 4.8664429530201344e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2500/75000 [01:46<51:24, 23.51it/s]

{'loss': 0.2802, 'grad_norm': 7.283571243286133, 'learning_rate': 4.865771812080537e-05, 'epoch': 0.1}


                                                      
  3%|▎         | 2513/75000 [01:47<1:02:14, 19.41it/s]

{'loss': 0.4202, 'grad_norm': 7.5948004722595215, 'learning_rate': 4.8651006711409395e-05, 'epoch': 0.1}


                                                      
  3%|▎         | 2522/75000 [01:47<54:02, 22.35it/s]

{'loss': 0.4328, 'grad_norm': 3.7962331771850586, 'learning_rate': 4.864429530201342e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2534/75000 [01:48<52:10, 23.15it/s]

{'loss': 0.3823, 'grad_norm': 6.532349586486816, 'learning_rate': 4.863758389261745e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2543/75000 [01:48<50:27, 23.94it/s]

{'loss': 0.3585, 'grad_norm': 1.0810264348983765, 'learning_rate': 4.863087248322148e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2552/75000 [01:49<50:11, 24.06it/s]

{'loss': 0.3636, 'grad_norm': 8.162856101989746, 'learning_rate': 4.862416107382551e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2564/75000 [01:49<51:02, 23.65it/s]

{'loss': 0.2647, 'grad_norm': 2.288961410522461, 'learning_rate': 4.861744966442953e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2573/75000 [01:49<50:03, 24.12it/s]

{'loss': 0.3437, 'grad_norm': 12.644082069396973, 'learning_rate': 4.861073825503356e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2582/75000 [01:50<49:58, 24.15it/s]

{'loss': 0.3858, 'grad_norm': 13.351898193359375, 'learning_rate': 4.860402684563759e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2594/75000 [01:50<51:21, 23.49it/s]

{'loss': 0.4562, 'grad_norm': 9.904934883117676, 'learning_rate': 4.8597315436241616e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2603/75000 [01:51<51:24, 23.47it/s]

{'loss': 0.3462, 'grad_norm': 2.04388165473938, 'learning_rate': 4.859060402684564e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2615/75000 [01:51<50:27, 23.91it/s]

{'loss': 0.4576, 'grad_norm': 15.630359649658203, 'learning_rate': 4.858389261744967e-05, 'epoch': 0.1}


                                                    
  3%|▎         | 2624/75000 [01:52<53:00, 22.76it/s]

{'loss': 0.3593, 'grad_norm': 9.160200119018555, 'learning_rate': 4.8577181208053695e-05, 'epoch': 0.1}


                                                    
  4%|▎         | 2633/75000 [01:52<50:58, 23.66it/s]

{'loss': 0.3764, 'grad_norm': 3.5777361392974854, 'learning_rate': 4.857046979865772e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2642/75000 [01:52<50:43, 23.78it/s]

{'loss': 0.3345, 'grad_norm': 5.0516791343688965, 'learning_rate': 4.8563758389261746e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2654/75000 [01:53<50:40, 23.80it/s]

{'loss': 0.3855, 'grad_norm': 3.9661567211151123, 'learning_rate': 4.8557046979865774e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2663/75000 [01:53<50:57, 23.66it/s]

{'loss': 0.4704, 'grad_norm': 2.451723337173462, 'learning_rate': 4.85503355704698e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2672/75000 [01:54<50:54, 23.68it/s]

{'loss': 0.2576, 'grad_norm': 10.504595756530762, 'learning_rate': 4.8543624161073824e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2684/75000 [01:54<50:49, 23.72it/s]

{'loss': 0.3443, 'grad_norm': 6.382250785827637, 'learning_rate': 4.853691275167785e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2693/75000 [01:55<51:22, 23.46it/s]

{'loss': 0.4088, 'grad_norm': 6.111443519592285, 'learning_rate': 4.853020134228188e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2702/75000 [01:55<51:09, 23.55it/s]

{'loss': 0.414, 'grad_norm': 6.074635982513428, 'learning_rate': 4.852348993288591e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2711/75000 [01:55<56:12, 21.44it/s]

{'loss': 0.3509, 'grad_norm': 11.582777976989746, 'learning_rate': 4.851677852348994e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2723/75000 [01:56<52:18, 23.03it/s]

{'loss': 0.266, 'grad_norm': 5.363979339599609, 'learning_rate': 4.851006711409396e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2732/75000 [01:56<52:45, 22.83it/s]

{'loss': 0.4623, 'grad_norm': 27.042537689208984, 'learning_rate': 4.850335570469799e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2744/75000 [01:57<52:14, 23.05it/s]

{'loss': 0.3585, 'grad_norm': 10.352961540222168, 'learning_rate': 4.849664429530202e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2753/75000 [01:57<52:20, 23.00it/s]

{'loss': 0.3371, 'grad_norm': 7.63236141204834, 'learning_rate': 4.848993288590604e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2762/75000 [01:58<52:55, 22.75it/s]

{'loss': 0.3748, 'grad_norm': 0.710597813129425, 'learning_rate': 4.848322147651007e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2774/75000 [01:58<55:16, 21.78it/s]

{'loss': 0.2679, 'grad_norm': 1.0117379426956177, 'learning_rate': 4.8476510067114096e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2783/75000 [01:58<52:28, 22.94it/s]

{'loss': 0.443, 'grad_norm': 13.943826675415039, 'learning_rate': 4.8469798657718125e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2792/75000 [01:59<52:09, 23.08it/s]

{'loss': 0.4245, 'grad_norm': 7.954198837280273, 'learning_rate': 4.846308724832215e-05, 'epoch': 0.11}


                                                    
  4%|▎         | 2804/75000 [01:59<52:10, 23.06it/s]

{'loss': 0.5341, 'grad_norm': 14.013322830200195, 'learning_rate': 4.8456375838926175e-05, 'epoch': 0.11}


                                                    
  4%|▍         | 2813/75000 [02:00<52:05, 23.09it/s]

{'loss': 0.4181, 'grad_norm': 11.501877784729004, 'learning_rate': 4.8449664429530204e-05, 'epoch': 0.11}


                                                    
  4%|▍         | 2822/75000 [02:00<51:49, 23.21it/s]

{'loss': 0.3279, 'grad_norm': 4.840210437774658, 'learning_rate': 4.844295302013423e-05, 'epoch': 0.11}


                                                    
  4%|▍         | 2834/75000 [02:01<50:39, 23.75it/s]

{'loss': 0.3659, 'grad_norm': 1.3254601955413818, 'learning_rate': 4.843624161073826e-05, 'epoch': 0.11}


                                                    
  4%|▍         | 2843/75000 [02:01<52:05, 23.08it/s]

{'loss': 0.4275, 'grad_norm': 9.470460891723633, 'learning_rate': 4.842953020134228e-05, 'epoch': 0.11}


                                                    
  4%|▍         | 2852/75000 [02:01<51:39, 23.28it/s]

{'loss': 0.3223, 'grad_norm': 12.75432300567627, 'learning_rate': 4.842281879194631e-05, 'epoch': 0.11}


                                                    
  4%|▍         | 2864/75000 [02:02<52:08, 23.06it/s]

{'loss': 0.3473, 'grad_norm': 1.840166449546814, 'learning_rate': 4.841610738255033e-05, 'epoch': 0.11}


                                                    
  4%|▍         | 2873/75000 [02:02<53:56, 22.29it/s]

{'loss': 0.3359, 'grad_norm': 6.351339817047119, 'learning_rate': 4.840939597315436e-05, 'epoch': 0.11}


                                                    
  4%|▍         | 2882/75000 [02:03<53:38, 22.40it/s]

{'loss': 0.4319, 'grad_norm': 8.514195442199707, 'learning_rate': 4.84026845637584e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2894/75000 [02:03<51:19, 23.41it/s]

{'loss': 0.4151, 'grad_norm': 5.585944652557373, 'learning_rate': 4.839597315436242e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2903/75000 [02:04<50:29, 23.80it/s]

{'loss': 0.4233, 'grad_norm': 8.620409965515137, 'learning_rate': 4.838926174496645e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2912/75000 [02:04<51:23, 23.37it/s]

{'loss': 0.2882, 'grad_norm': 8.762982368469238, 'learning_rate': 4.838255033557047e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2924/75000 [02:05<50:36, 23.73it/s]

{'loss': 0.3309, 'grad_norm': 1.9726577997207642, 'learning_rate': 4.83758389261745e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2933/75000 [02:05<51:11, 23.46it/s]

{'loss': 0.3815, 'grad_norm': 3.804250955581665, 'learning_rate': 4.8369127516778526e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2942/75000 [02:05<53:26, 22.47it/s]

{'loss': 0.2604, 'grad_norm': 2.1294124126434326, 'learning_rate': 4.8362416107382555e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2954/75000 [02:06<51:40, 23.23it/s]

{'loss': 0.4882, 'grad_norm': 21.694936752319336, 'learning_rate': 4.835570469798658e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2963/75000 [02:06<52:17, 22.96it/s]

{'loss': 0.3081, 'grad_norm': 1.1413875818252563, 'learning_rate': 4.8348993288590605e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2972/75000 [02:07<51:13, 23.44it/s]

{'loss': 0.3446, 'grad_norm': 6.327580451965332, 'learning_rate': 4.8342281879194633e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2984/75000 [02:07<52:17, 22.95it/s]

{'loss': 0.4815, 'grad_norm': 2.4228053092956543, 'learning_rate': 4.8335570469798655e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 2993/75000 [02:07<51:00, 23.53it/s]

{'loss': 0.3896, 'grad_norm': 8.845487594604492, 'learning_rate': 4.8328859060402684e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 3000/75000 [02:08<49:53, 24.05it/s]

{'loss': 0.4146, 'grad_norm': 8.564385414123535, 'learning_rate': 4.832214765100672e-05, 'epoch': 0.12}


                                                      
  4%|▍         | 3014/75000 [02:09<59:03, 20.31it/s]  

{'loss': 0.4489, 'grad_norm': 15.13084602355957, 'learning_rate': 4.831543624161074e-05, 'epoch': 0.12}


                                                      
  4%|▍         | 3023/75000 [02:09<54:55, 21.84it/s]

{'loss': 0.3826, 'grad_norm': 8.437830924987793, 'learning_rate': 4.830872483221477e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 3032/75000 [02:10<1:06:17, 18.09it/s]

{'loss': 0.3033, 'grad_norm': 4.230763912200928, 'learning_rate': 4.830201342281879e-05, 'epoch': 0.12}


                                                      
  4%|▍         | 3043/75000 [02:10<55:33, 21.58it/s]

{'loss': 0.3456, 'grad_norm': 8.943860054016113, 'learning_rate': 4.829530201342282e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 3052/75000 [02:11<1:03:21, 18.92it/s]

{'loss': 0.3824, 'grad_norm': 7.59787130355835, 'learning_rate': 4.828859060402685e-05, 'epoch': 0.12}


                                                      
  4%|▍         | 3062/75000 [02:11<1:00:56, 19.68it/s]

{'loss': 0.3814, 'grad_norm': 6.9431304931640625, 'learning_rate': 4.828187919463088e-05, 'epoch': 0.12}


                                                      
  4%|▍         | 3074/75000 [02:12<53:24, 22.45it/s]

{'loss': 0.3981, 'grad_norm': 5.121312141418457, 'learning_rate': 4.8275167785234905e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 3083/75000 [02:12<52:17, 22.92it/s]

{'loss': 0.4345, 'grad_norm': 4.7154059410095215, 'learning_rate': 4.826845637583893e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 3092/75000 [02:12<54:33, 21.96it/s]

{'loss': 0.4319, 'grad_norm': 11.10753059387207, 'learning_rate': 4.8261744966442956e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 3104/75000 [02:13<53:38, 22.34it/s]

{'loss': 0.267, 'grad_norm': 4.381641387939453, 'learning_rate': 4.825503355704698e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 3113/75000 [02:13<54:39, 21.92it/s]

{'loss': 0.2605, 'grad_norm': 11.755895614624023, 'learning_rate': 4.824832214765101e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 3122/75000 [02:14<54:29, 21.98it/s]

{'loss': 0.3784, 'grad_norm': 8.545806884765625, 'learning_rate': 4.824161073825504e-05, 'epoch': 0.12}


                                                    
  4%|▍         | 3134/75000 [02:14<53:58, 22.19it/s]

{'loss': 0.2059, 'grad_norm': 6.243729591369629, 'learning_rate': 4.823489932885906e-05, 'epoch': 0.13}


                                                    
  4%|▍         | 3143/75000 [02:15<54:43, 21.89it/s]

{'loss': 0.5272, 'grad_norm': 2.3854963779449463, 'learning_rate': 4.822818791946309e-05, 'epoch': 0.13}


                                                    
  4%|▍         | 3152/75000 [02:15<58:05, 20.62it/s]

{'loss': 0.3918, 'grad_norm': 10.209918975830078, 'learning_rate': 4.822147651006711e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3161/75000 [02:16<1:08:51, 17.39it/s]

{'loss': 0.3242, 'grad_norm': 13.126078605651855, 'learning_rate': 4.821476510067114e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3172/75000 [02:17<1:25:00, 14.08it/s]

{'loss': 0.4215, 'grad_norm': 0.8320462107658386, 'learning_rate': 4.820805369127517e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3182/75000 [02:17<1:18:26, 15.26it/s]

{'loss': 0.403, 'grad_norm': 2.340843677520752, 'learning_rate': 4.82013422818792e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3192/75000 [02:18<1:08:23, 17.50it/s]

{'loss': 0.3581, 'grad_norm': 7.616116523742676, 'learning_rate': 4.819463087248323e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3202/75000 [02:18<1:03:31, 18.84it/s]

{'loss': 0.2569, 'grad_norm': 4.2784810066223145, 'learning_rate': 4.818791946308725e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3213/75000 [02:19<1:04:49, 18.45it/s]

{'loss': 0.4466, 'grad_norm': 6.215148448944092, 'learning_rate': 4.818120805369128e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3222/75000 [02:19<1:08:26, 17.48it/s]

{'loss': 0.435, 'grad_norm': 1.4806690216064453, 'learning_rate': 4.81744966442953e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3232/75000 [02:20<1:10:25, 16.98it/s]

{'loss': 0.31, 'grad_norm': 1.5211561918258667, 'learning_rate': 4.8167785234899335e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3242/75000 [02:21<1:02:39, 19.09it/s]

{'loss': 0.4335, 'grad_norm': 8.421146392822266, 'learning_rate': 4.816107382550336e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3252/75000 [02:21<1:03:52, 18.72it/s]

{'loss': 0.4122, 'grad_norm': 5.369472503662109, 'learning_rate': 4.8154362416107385e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3264/75000 [02:22<1:00:33, 19.74it/s]

{'loss': 0.3484, 'grad_norm': 7.2004594802856445, 'learning_rate': 4.8147651006711414e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3274/75000 [02:22<1:00:01, 19.92it/s]

{'loss': 0.3712, 'grad_norm': 14.115731239318848, 'learning_rate': 4.8140939597315436e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3283/75000 [02:23<1:00:17, 19.82it/s]

{'loss': 0.3632, 'grad_norm': 1.750133991241455, 'learning_rate': 4.8134228187919464e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3294/75000 [02:23<59:05, 20.22it/s]  

{'loss': 0.4261, 'grad_norm': 10.220624923706055, 'learning_rate': 4.812751677852349e-05, 'epoch': 0.13}


                                                    
  4%|▍         | 3303/75000 [02:24<59:17, 20.15it/s]

{'loss': 0.3454, 'grad_norm': 4.102756023406982, 'learning_rate': 4.812080536912752e-05, 'epoch': 0.13}


                                                    
  4%|▍         | 3312/75000 [02:24<1:00:20, 19.80it/s]

{'loss': 0.3947, 'grad_norm': 18.849349975585938, 'learning_rate': 4.811409395973154e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3322/75000 [02:25<1:02:14, 19.20it/s]

{'loss': 0.283, 'grad_norm': 5.023336887359619, 'learning_rate': 4.810738255033557e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3332/75000 [02:25<1:01:30, 19.42it/s]

{'loss': 0.3362, 'grad_norm': 3.825035810470581, 'learning_rate': 4.81006711409396e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3344/75000 [02:26<59:47, 19.97it/s]

{'loss': 0.3381, 'grad_norm': 5.222691535949707, 'learning_rate': 4.809395973154362e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3352/75000 [02:26<1:06:17, 18.01it/s]

{'loss': 0.4136, 'grad_norm': 3.2111432552337646, 'learning_rate': 4.808724832214766e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3362/75000 [02:27<1:04:46, 18.43it/s]

{'loss': 0.3086, 'grad_norm': 5.788478851318359, 'learning_rate': 4.808053691275168e-05, 'epoch': 0.13}


                                                      
  4%|▍         | 3372/75000 [02:27<1:04:03, 18.63it/s]

{'loss': 0.2837, 'grad_norm': 5.432831287384033, 'learning_rate': 4.807382550335571e-05, 'epoch': 0.13}


                                                      
  5%|▍         | 3382/75000 [02:28<1:04:32, 18.50it/s]

{'loss': 0.3627, 'grad_norm': 3.4436371326446533, 'learning_rate': 4.8067114093959736e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3393/75000 [02:28<1:03:00, 18.94it/s]

{'loss': 0.308, 'grad_norm': 7.764435768127441, 'learning_rate': 4.806040268456376e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3404/75000 [02:29<1:02:34, 19.07it/s]

{'loss': 0.3425, 'grad_norm': 21.355125427246094, 'learning_rate': 4.8053691275167786e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3414/75000 [02:29<1:01:11, 19.50it/s]

{'loss': 0.3342, 'grad_norm': 7.972958087921143, 'learning_rate': 4.8046979865771815e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3422/75000 [02:30<1:02:27, 19.10it/s]

{'loss': 0.557, 'grad_norm': 20.39080238342285, 'learning_rate': 4.8040268456375843e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3433/75000 [02:30<1:01:53, 19.27it/s]

{'loss': 0.4189, 'grad_norm': 3.381418466567993, 'learning_rate': 4.8033557046979865e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3441/75000 [02:31<1:01:05, 19.52it/s]

{'loss': 0.3793, 'grad_norm': 5.587764263153076, 'learning_rate': 4.8026845637583894e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3453/75000 [02:31<1:01:00, 19.55it/s]

{'loss': 0.2754, 'grad_norm': 4.415328502655029, 'learning_rate': 4.802013422818792e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3463/75000 [02:32<1:02:11, 19.17it/s]

{'loss': 0.4489, 'grad_norm': 19.80845832824707, 'learning_rate': 4.801342281879195e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3474/75000 [02:33<1:01:07, 19.51it/s]

{'loss': 0.5026, 'grad_norm': 8.661416053771973, 'learning_rate': 4.800671140939598e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3483/75000 [02:33<1:03:45, 18.70it/s]

{'loss': 0.2491, 'grad_norm': 9.6940279006958, 'learning_rate': 4.8e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3492/75000 [02:34<1:02:51, 18.96it/s]

{'loss': 0.4418, 'grad_norm': 24.953601837158203, 'learning_rate': 4.799328859060403e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3500/75000 [02:34<1:02:25, 19.09it/s]

{'loss': 0.3339, 'grad_norm': 14.075594902038574, 'learning_rate': 4.798657718120805e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3514/75000 [02:38<2:10:08,  9.16it/s]

{'loss': 0.4362, 'grad_norm': 4.385720729827881, 'learning_rate': 4.797986577181208e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3523/75000 [02:38<1:24:18, 14.13it/s]

{'loss': 0.3136, 'grad_norm': 7.780208110809326, 'learning_rate': 4.797315436241611e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3533/75000 [02:39<1:17:09, 15.44it/s]

{'loss': 0.3614, 'grad_norm': 4.839570999145508, 'learning_rate': 4.796644295302014e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3543/75000 [02:39<1:10:33, 16.88it/s]

{'loss': 0.3292, 'grad_norm': 5.006289482116699, 'learning_rate': 4.7959731543624166e-05, 'epoch': 0.14}


                                                      
  5%|▍         | 3554/75000 [02:40<58:35, 20.32it/s]  

{'loss': 0.4718, 'grad_norm': 4.636200904846191, 'learning_rate': 4.795302013422819e-05, 'epoch': 0.14}


                                                    
  5%|▍         | 3563/75000 [02:40<54:35, 21.81it/s]

{'loss': 0.4028, 'grad_norm': 3.916158676147461, 'learning_rate': 4.7946308724832216e-05, 'epoch': 0.14}


                                                    
  5%|▍         | 3572/75000 [02:41<53:22, 22.30it/s]

{'loss': 0.371, 'grad_norm': 2.0018038749694824, 'learning_rate': 4.7939597315436245e-05, 'epoch': 0.14}


                                                    
  5%|▍         | 3584/75000 [02:41<55:00, 21.64it/s]

{'loss': 0.3652, 'grad_norm': 6.449899196624756, 'learning_rate': 4.793288590604027e-05, 'epoch': 0.14}


                                                    
  5%|▍         | 3593/75000 [02:42<53:30, 22.24it/s]

{'loss': 0.3779, 'grad_norm': 8.885716438293457, 'learning_rate': 4.79261744966443e-05, 'epoch': 0.14}


                                                    
  5%|▍         | 3602/75000 [02:42<53:04, 22.42it/s]

{'loss': 0.4155, 'grad_norm': 4.378307342529297, 'learning_rate': 4.7919463087248323e-05, 'epoch': 0.14}


                                                    
  5%|▍         | 3611/75000 [02:42<56:52, 20.92it/s]

{'loss': 0.3591, 'grad_norm': 4.369993686676025, 'learning_rate': 4.791275167785235e-05, 'epoch': 0.14}


                                                    
  5%|▍         | 3623/75000 [02:43<57:16, 20.77it/s]

{'loss': 0.4701, 'grad_norm': 9.786579132080078, 'learning_rate': 4.7906040268456374e-05, 'epoch': 0.14}


                                                    
  5%|▍         | 3632/75000 [02:43<55:15, 21.53it/s]

{'loss': 0.335, 'grad_norm': 0.6364625692367554, 'learning_rate': 4.78993288590604e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3644/75000 [02:44<54:12, 21.94it/s]

{'loss': 0.3867, 'grad_norm': 4.548059940338135, 'learning_rate': 4.789261744966443e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3653/75000 [02:44<54:06, 21.97it/s]

{'loss': 0.3408, 'grad_norm': 4.010698318481445, 'learning_rate': 4.788590604026846e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3662/75000 [02:45<58:16, 20.40it/s]

{'loss': 0.3588, 'grad_norm': 3.9962127208709717, 'learning_rate': 4.787919463087249e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3674/75000 [02:45<55:50, 21.29it/s]

{'loss': 0.3104, 'grad_norm': 2.5629563331604004, 'learning_rate': 4.787248322147651e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3683/75000 [02:46<55:47, 21.31it/s]

{'loss': 0.3585, 'grad_norm': 2.1163413524627686, 'learning_rate': 4.786577181208054e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3692/75000 [02:46<55:30, 21.41it/s]

{'loss': 0.3481, 'grad_norm': 1.7738735675811768, 'learning_rate': 4.785906040268457e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3704/75000 [02:47<56:46, 20.93it/s]

{'loss': 0.2982, 'grad_norm': 9.466076850891113, 'learning_rate': 4.7852348993288595e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3713/75000 [02:47<55:41, 21.34it/s]

{'loss': 0.2705, 'grad_norm': 5.638863563537598, 'learning_rate': 4.7845637583892624e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3722/75000 [02:48<55:37, 21.36it/s]

{'loss': 0.3866, 'grad_norm': 5.247855186462402, 'learning_rate': 4.7838926174496646e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3734/75000 [02:48<55:50, 21.27it/s]

{'loss': 0.378, 'grad_norm': 5.067558765411377, 'learning_rate': 4.7832214765100674e-05, 'epoch': 0.15}


                                                    
  5%|▍         | 3743/75000 [02:49<55:22, 21.45it/s]

{'loss': 0.3407, 'grad_norm': 7.475472927093506, 'learning_rate': 4.7825503355704696e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3752/75000 [02:49<1:01:11, 19.41it/s]

{'loss': 0.3568, 'grad_norm': 8.213970184326172, 'learning_rate': 4.7818791946308725e-05, 'epoch': 0.15}


                                                      
  5%|▌         | 3764/75000 [02:50<56:27, 21.03it/s]

{'loss': 0.35, 'grad_norm': 5.349862098693848, 'learning_rate': 4.781208053691276e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3773/75000 [02:50<56:01, 21.19it/s]

{'loss': 0.3572, 'grad_norm': 3.5041184425354004, 'learning_rate': 4.780536912751678e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3782/75000 [02:51<55:35, 21.35it/s]

{'loss': 0.2451, 'grad_norm': 8.339896202087402, 'learning_rate': 4.779865771812081e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3794/75000 [02:51<55:09, 21.52it/s]

{'loss': 0.3178, 'grad_norm': 3.4644663333892822, 'learning_rate': 4.779194630872483e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3803/75000 [02:51<55:07, 21.53it/s]

{'loss': 0.2609, 'grad_norm': 4.961986064910889, 'learning_rate': 4.778523489932886e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3812/75000 [02:52<55:15, 21.47it/s]

{'loss': 0.3427, 'grad_norm': 3.244101047515869, 'learning_rate': 4.777852348993289e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3824/75000 [02:52<56:22, 21.04it/s]

{'loss': 0.4488, 'grad_norm': 4.2227911949157715, 'learning_rate': 4.777181208053692e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3833/75000 [02:53<55:59, 21.19it/s]

{'loss': 0.3475, 'grad_norm': 0.7619775533676147, 'learning_rate': 4.7765100671140946e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3842/75000 [02:53<55:50, 21.24it/s]

{'loss': 0.3723, 'grad_norm': 4.616543292999268, 'learning_rate': 4.775838926174497e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3854/75000 [02:54<55:39, 21.30it/s]

{'loss': 0.2274, 'grad_norm': 12.432145118713379, 'learning_rate': 4.7751677852348996e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3863/75000 [02:54<55:47, 21.25it/s]

{'loss': 0.3689, 'grad_norm': 5.017914295196533, 'learning_rate': 4.774496644295302e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3872/75000 [02:55<55:39, 21.30it/s]

{'loss': 0.4216, 'grad_norm': 4.382110595703125, 'learning_rate': 4.773825503355705e-05, 'epoch': 0.15}


                                                    
  5%|▌         | 3884/75000 [02:55<55:37, 21.31it/s]

{'loss': 0.3416, 'grad_norm': 12.496879577636719, 'learning_rate': 4.7731543624161075e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 3893/75000 [02:56<59:08, 20.04it/s]  

{'loss': 0.3675, 'grad_norm': 7.773672580718994, 'learning_rate': 4.7724832214765104e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3902/75000 [02:56<56:46, 20.87it/s]

{'loss': 0.4095, 'grad_norm': 6.837217330932617, 'learning_rate': 4.771812080536913e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3914/75000 [02:57<55:02, 21.52it/s]

{'loss': 0.3282, 'grad_norm': 4.492092132568359, 'learning_rate': 4.7711409395973154e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3923/75000 [02:57<55:17, 21.43it/s]

{'loss': 0.4749, 'grad_norm': 15.66370677947998, 'learning_rate': 4.770469798657718e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3932/75000 [02:58<56:04, 21.12it/s]

{'loss': 0.265, 'grad_norm': 3.8438191413879395, 'learning_rate': 4.769798657718121e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3944/75000 [02:58<55:30, 21.34it/s]

{'loss': 0.3312, 'grad_norm': 5.97737455368042, 'learning_rate': 4.769127516778524e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3953/75000 [02:59<56:11, 21.07it/s]

{'loss': 0.399, 'grad_norm': 4.589356899261475, 'learning_rate': 4.768456375838926e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3962/75000 [02:59<57:14, 20.68it/s]

{'loss': 0.3151, 'grad_norm': 5.4972100257873535, 'learning_rate': 4.767785234899329e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3974/75000 [03:00<56:01, 21.13it/s]

{'loss': 0.3595, 'grad_norm': 2.9614005088806152, 'learning_rate': 4.767114093959732e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3983/75000 [03:00<54:44, 21.62it/s]

{'loss': 0.3152, 'grad_norm': 3.9891862869262695, 'learning_rate': 4.766442953020134e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 3992/75000 [03:00<53:44, 22.02it/s]

{'loss': 0.3246, 'grad_norm': 3.758237838745117, 'learning_rate': 4.7657718120805376e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 4000/75000 [03:01<53:56, 21.93it/s]

{'loss': 0.2458, 'grad_norm': 7.39774751663208, 'learning_rate': 4.76510067114094e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 4013/75000 [03:02<1:04:03, 18.47it/s]

{'loss': 0.3217, 'grad_norm': 7.536201477050781, 'learning_rate': 4.7644295302013426e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 4020/75000 [03:02<59:03, 20.03it/s]

{'loss': 0.5372, 'grad_norm': 15.247597694396973, 'learning_rate': 4.7637583892617455e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 4032/75000 [03:03<1:09:47, 16.95it/s]

{'loss': 0.4001, 'grad_norm': 3.945066452026367, 'learning_rate': 4.7630872483221476e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 4042/75000 [03:03<1:03:50, 18.52it/s]

{'loss': 0.3251, 'grad_norm': 4.218967437744141, 'learning_rate': 4.7624161073825505e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 4053/75000 [03:04<1:01:12, 19.32it/s]

{'loss': 0.3508, 'grad_norm': 3.396616220474243, 'learning_rate': 4.7617449664429534e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 4061/75000 [03:04<1:00:40, 19.49it/s]

{'loss': 0.3737, 'grad_norm': 7.3311967849731445, 'learning_rate': 4.761073825503356e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 4073/75000 [03:05<59:40, 19.81it/s]

{'loss': 0.4573, 'grad_norm': 2.739884614944458, 'learning_rate': 4.7604026845637584e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 4081/75000 [03:05<1:01:41, 19.16it/s]

{'loss': 0.4552, 'grad_norm': 3.286351442337036, 'learning_rate': 4.759731543624161e-05, 'epoch': 0.16}


                                                      
  5%|▌         | 4093/75000 [03:06<59:09, 19.98it/s]

{'loss': 0.4228, 'grad_norm': 1.1263536214828491, 'learning_rate': 4.759060402684564e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 4102/75000 [03:06<56:50, 20.79it/s]

{'loss': 0.3503, 'grad_norm': 2.2521862983703613, 'learning_rate': 4.758389261744966e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 4114/75000 [03:07<55:15, 21.38it/s]

{'loss': 0.4124, 'grad_norm': 4.949965953826904, 'learning_rate': 4.75771812080537e-05, 'epoch': 0.16}


                                                    
  5%|▌         | 4123/75000 [03:07<55:19, 21.35it/s]

{'loss': 0.4903, 'grad_norm': 4.640139579772949, 'learning_rate': 4.757046979865772e-05, 'epoch': 0.16}


                                                    
  6%|▌         | 4132/75000 [03:08<55:43, 21.20it/s]

{'loss': 0.3304, 'grad_norm': 2.1319985389709473, 'learning_rate': 4.756375838926175e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4144/75000 [03:08<54:53, 21.51it/s]

{'loss': 0.3354, 'grad_norm': 2.6518781185150146, 'learning_rate': 4.755704697986577e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4153/75000 [03:09<53:17, 22.16it/s]

{'loss': 0.3429, 'grad_norm': 12.822389602661133, 'learning_rate': 4.75503355704698e-05, 'epoch': 0.17}


                                                      
  6%|▌         | 4162/75000 [03:09<56:51, 20.77it/s]

{'loss': 0.3555, 'grad_norm': 1.2503817081451416, 'learning_rate': 4.754362416107383e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4174/75000 [03:10<53:14, 22.17it/s]

{'loss': 0.3529, 'grad_norm': 5.894168376922607, 'learning_rate': 4.7536912751677856e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4183/75000 [03:10<52:03, 22.67it/s]

{'loss': 0.3629, 'grad_norm': 11.771062850952148, 'learning_rate': 4.7530201342281884e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4192/75000 [03:11<52:04, 22.66it/s]

{'loss': 0.3666, 'grad_norm': 11.129125595092773, 'learning_rate': 4.7523489932885906e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4204/75000 [03:11<54:38, 21.59it/s]

{'loss': 0.4531, 'grad_norm': 2.1827142238616943, 'learning_rate': 4.7516778523489935e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4213/75000 [03:12<54:49, 21.52it/s]

{'loss': 0.3091, 'grad_norm': 2.817373275756836, 'learning_rate': 4.751006711409396e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4222/75000 [03:12<55:15, 21.34it/s]

{'loss': 0.2681, 'grad_norm': 3.0673513412475586, 'learning_rate': 4.7503355704697985e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4234/75000 [03:13<54:34, 21.61it/s]

{'loss': 0.5257, 'grad_norm': 6.612462520599365, 'learning_rate': 4.749664429530202e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4243/75000 [03:13<57:38, 20.46it/s]

{'loss': 0.3805, 'grad_norm': 4.864968776702881, 'learning_rate': 4.748993288590604e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4252/75000 [03:13<56:31, 20.86it/s]

{'loss': 0.4055, 'grad_norm': 18.42658042907715, 'learning_rate': 4.748322147651007e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4264/75000 [03:14<56:02, 21.04it/s]

{'loss': 0.4153, 'grad_norm': 5.048088550567627, 'learning_rate': 4.747651006711409e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4273/75000 [03:14<55:33, 21.22it/s]

{'loss': 0.3531, 'grad_norm': 5.876439094543457, 'learning_rate': 4.746979865771812e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4282/75000 [03:15<55:31, 21.23it/s]

{'loss': 0.3108, 'grad_norm': 8.620153427124023, 'learning_rate': 4.746308724832215e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4294/75000 [03:15<55:01, 21.41it/s]

{'loss': 0.2189, 'grad_norm': 4.498604774475098, 'learning_rate': 4.745637583892618e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4303/75000 [03:16<56:56, 20.70it/s]

{'loss': 0.3222, 'grad_norm': 3.6637089252471924, 'learning_rate': 4.7449664429530207e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4312/75000 [03:16<55:48, 21.11it/s]

{'loss': 0.4141, 'grad_norm': 6.894354820251465, 'learning_rate': 4.744295302013423e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4324/75000 [03:17<56:35, 20.81it/s]

{'loss': 0.3543, 'grad_norm': 2.0487592220306396, 'learning_rate': 4.743624161073826e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4333/75000 [03:17<55:56, 21.05it/s]

{'loss': 0.3667, 'grad_norm': 1.1318341493606567, 'learning_rate': 4.742953020134228e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4342/75000 [03:18<56:50, 20.72it/s]

{'loss': 0.2203, 'grad_norm': 0.739234209060669, 'learning_rate': 4.7422818791946314e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4354/75000 [03:18<56:39, 20.78it/s]

{'loss': 0.4002, 'grad_norm': 5.8532257080078125, 'learning_rate': 4.741610738255034e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4363/75000 [03:19<55:37, 21.16it/s]

{'loss': 0.3684, 'grad_norm': 5.098097324371338, 'learning_rate': 4.7409395973154364e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4372/75000 [03:19<57:22, 20.51it/s]

{'loss': 0.3989, 'grad_norm': 3.340759038925171, 'learning_rate': 4.740268456375839e-05, 'epoch': 0.17}


                                                    
  6%|▌         | 4384/75000 [03:20<55:13, 21.31it/s]

{'loss': 0.3389, 'grad_norm': 2.5885262489318848, 'learning_rate': 4.7395973154362415e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4393/75000 [03:20<56:22, 20.88it/s]

{'loss': 0.5353, 'grad_norm': 7.07647705078125, 'learning_rate': 4.738926174496644e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4404/75000 [03:21<59:17, 19.85it/s]  

{'loss': 0.382, 'grad_norm': 8.761993408203125, 'learning_rate': 4.738255033557047e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4413/75000 [03:21<57:29, 20.46it/s]

{'loss': 0.3423, 'grad_norm': 11.855944633483887, 'learning_rate': 4.73758389261745e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4422/75000 [03:22<57:14, 20.55it/s]

{'loss': 0.373, 'grad_norm': 5.114828109741211, 'learning_rate': 4.736912751677853e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4434/75000 [03:22<1:00:31, 19.43it/s]

{'loss': 0.3033, 'grad_norm': 9.554969787597656, 'learning_rate': 4.736241610738255e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4443/75000 [03:23<57:06, 20.59it/s]

{'loss': 0.3744, 'grad_norm': 3.185915946960449, 'learning_rate': 4.735570469798658e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4452/75000 [03:23<58:08, 20.23it/s]

{'loss': 0.4415, 'grad_norm': 6.761259078979492, 'learning_rate': 4.73489932885906e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4463/75000 [03:24<1:01:25, 19.14it/s]

{'loss': 0.371, 'grad_norm': 2.5112078189849854, 'learning_rate': 4.7342281879194636e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4472/75000 [03:24<56:30, 20.80it/s]

{'loss': 0.3359, 'grad_norm': 9.373588562011719, 'learning_rate': 4.7335570469798665e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4484/75000 [03:25<54:46, 21.46it/s]

{'loss': 0.3274, 'grad_norm': 8.075209617614746, 'learning_rate': 4.7328859060402687e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4493/75000 [03:25<55:30, 21.17it/s]

{'loss': 0.3121, 'grad_norm': 8.15015983581543, 'learning_rate': 4.7322147651006715e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4500/75000 [03:25<59:15, 19.83it/s]

{'loss': 0.2309, 'grad_norm': 4.813826560974121, 'learning_rate': 4.731543624161074e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4514/75000 [03:27<1:07:43, 17.35it/s]

{'loss': 0.3702, 'grad_norm': 27.99703025817871, 'learning_rate': 4.7308724832214765e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4522/75000 [03:27<1:02:09, 18.90it/s]

{'loss': 0.3697, 'grad_norm': 1.3700697422027588, 'learning_rate': 4.7302013422818794e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4534/75000 [03:28<57:20, 20.48it/s]

{'loss': 0.3422, 'grad_norm': 8.686705589294434, 'learning_rate': 4.729530201342282e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4543/75000 [03:28<57:01, 20.59it/s]

{'loss': 0.3704, 'grad_norm': 1.6836639642715454, 'learning_rate': 4.728859060402685e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4552/75000 [03:28<56:47, 20.67it/s]

{'loss': 0.3116, 'grad_norm': 7.2337117195129395, 'learning_rate': 4.728187919463087e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4563/75000 [03:29<1:02:12, 18.87it/s]

{'loss': 0.3374, 'grad_norm': 6.905903339385986, 'learning_rate': 4.72751677852349e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4573/75000 [03:30<1:05:46, 17.84it/s]

{'loss': 0.4143, 'grad_norm': 2.360677480697632, 'learning_rate': 4.726845637583892e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4584/75000 [03:30<58:38, 20.01it/s]  

{'loss': 0.2735, 'grad_norm': 3.694851875305176, 'learning_rate': 4.726174496644296e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4593/75000 [03:30<56:50, 20.64it/s]

{'loss': 0.3381, 'grad_norm': 5.483936309814453, 'learning_rate': 4.725503355704699e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4602/75000 [03:31<58:10, 20.17it/s]

{'loss': 0.3835, 'grad_norm': 3.91381573677063, 'learning_rate': 4.724832214765101e-05, 'epoch': 0.18}


                                                    
  6%|▌         | 4614/75000 [03:32<57:01, 20.57it/s]

{'loss': 0.4074, 'grad_norm': 1.184897541999817, 'learning_rate': 4.724161073825504e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4622/75000 [03:32<1:00:57, 19.24it/s]

{'loss': 0.3228, 'grad_norm': 2.159796953201294, 'learning_rate': 4.723489932885906e-05, 'epoch': 0.18}


                                                      
  6%|▌         | 4634/75000 [03:33<56:48, 20.65it/s]

{'loss': 0.325, 'grad_norm': 7.495087146759033, 'learning_rate': 4.722818791946309e-05, 'epoch': 0.19}


                                                    
  6%|▌         | 4643/75000 [03:33<56:46, 20.65it/s]

{'loss': 0.3523, 'grad_norm': 4.901764392852783, 'learning_rate': 4.7221476510067116e-05, 'epoch': 0.19}


                                                    
  6%|▌         | 4652/75000 [03:33<56:51, 20.62it/s]

{'loss': 0.5083, 'grad_norm': 5.062805652618408, 'learning_rate': 4.7214765100671145e-05, 'epoch': 0.19}


                                                    
  6%|▌         | 4661/75000 [03:34<57:06, 20.53it/s]

{'loss': 0.3192, 'grad_norm': 4.156803131103516, 'learning_rate': 4.720805369127517e-05, 'epoch': 0.19}


                                                      
  6%|▌         | 4672/75000 [03:34<59:24, 19.73it/s]  

{'loss': 0.3427, 'grad_norm': 5.810929775238037, 'learning_rate': 4.7201342281879195e-05, 'epoch': 0.19}


                                                    
  6%|▌         | 4684/75000 [03:35<58:29, 20.03it/s]

{'loss': 0.2515, 'grad_norm': 7.881313323974609, 'learning_rate': 4.7194630872483224e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4693/75000 [03:35<56:56, 20.58it/s]

{'loss': 0.4194, 'grad_norm': 6.487396717071533, 'learning_rate': 4.718791946308725e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4702/75000 [03:36<55:20, 21.17it/s]

{'loss': 0.3545, 'grad_norm': 8.033446311950684, 'learning_rate': 4.718120805369128e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4714/75000 [03:36<53:44, 21.80it/s]

{'loss': 0.2601, 'grad_norm': 2.496222734451294, 'learning_rate': 4.71744966442953e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4723/75000 [03:37<53:44, 21.80it/s]

{'loss': 0.3716, 'grad_norm': 7.124742031097412, 'learning_rate': 4.716778523489933e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4732/75000 [03:37<53:58, 21.70it/s]

{'loss': 0.4313, 'grad_norm': 4.826117515563965, 'learning_rate': 4.716107382550336e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4744/75000 [03:38<54:02, 21.67it/s]

{'loss': 0.4488, 'grad_norm': 4.1159467697143555, 'learning_rate': 4.715436241610738e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4753/75000 [03:38<54:07, 21.63it/s]

{'loss': 0.3644, 'grad_norm': 6.125425338745117, 'learning_rate': 4.714765100671141e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4762/75000 [03:39<54:42, 21.40it/s]

{'loss': 0.3502, 'grad_norm': 2.2916908264160156, 'learning_rate': 4.714093959731544e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4774/75000 [03:39<53:59, 21.68it/s]

{'loss': 0.3011, 'grad_norm': 4.545671463012695, 'learning_rate': 4.713422818791947e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4783/75000 [03:40<53:31, 21.86it/s]

{'loss': 0.3405, 'grad_norm': 5.120434284210205, 'learning_rate': 4.712751677852349e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4792/75000 [03:40<53:29, 21.87it/s]

{'loss': 0.4244, 'grad_norm': 20.88670539855957, 'learning_rate': 4.712080536912752e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4804/75000 [03:41<53:29, 21.87it/s]

{'loss': 0.4125, 'grad_norm': 3.774336338043213, 'learning_rate': 4.7114093959731546e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4813/75000 [03:41<53:36, 21.82it/s]

{'loss': 0.3469, 'grad_norm': 2.5534322261810303, 'learning_rate': 4.7107382550335574e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4822/75000 [03:41<53:31, 21.85it/s]

{'loss': 0.3169, 'grad_norm': 4.685521125793457, 'learning_rate': 4.71006711409396e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4834/75000 [03:42<55:00, 21.26it/s]

{'loss': 0.2802, 'grad_norm': 4.619132041931152, 'learning_rate': 4.7093959731543625e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4843/75000 [03:42<54:53, 21.30it/s]

{'loss': 0.3752, 'grad_norm': 4.7706475257873535, 'learning_rate': 4.708724832214765e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4852/75000 [03:43<53:55, 21.68it/s]

{'loss': 0.4152, 'grad_norm': 5.546014785766602, 'learning_rate': 4.708053691275168e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4864/75000 [03:43<54:48, 21.33it/s]

{'loss': 0.458, 'grad_norm': 2.4239859580993652, 'learning_rate': 4.7073825503355704e-05, 'epoch': 0.19}


                                                    
  6%|▋         | 4873/75000 [03:44<54:31, 21.44it/s]

{'loss': 0.3987, 'grad_norm': 7.749297618865967, 'learning_rate': 4.706711409395973e-05, 'epoch': 0.19}


                                                    
  7%|▋         | 4882/75000 [03:44<54:56, 21.27it/s]

{'loss': 0.3359, 'grad_norm': 5.19368839263916, 'learning_rate': 4.706040268456376e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4894/75000 [03:45<53:28, 21.85it/s]

{'loss': 0.3429, 'grad_norm': 9.39710521697998, 'learning_rate': 4.705369127516779e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4903/75000 [03:45<53:52, 21.69it/s]

{'loss': 0.3057, 'grad_norm': 3.501336097717285, 'learning_rate': 4.704697986577181e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4912/75000 [03:46<53:39, 21.77it/s]

{'loss': 0.3681, 'grad_norm': 5.290151596069336, 'learning_rate': 4.704026845637584e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4924/75000 [03:46<56:21, 20.72it/s]

{'loss': 0.3203, 'grad_norm': 6.909508228302002, 'learning_rate': 4.703355704697987e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4933/75000 [03:47<56:28, 20.68it/s]

{'loss': 0.3445, 'grad_norm': 3.057708501815796, 'learning_rate': 4.7026845637583897e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4942/75000 [03:47<59:01, 19.78it/s]

{'loss': 0.3531, 'grad_norm': 6.203635215759277, 'learning_rate': 4.7020134228187925e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4954/75000 [03:48<54:54, 21.26it/s]

{'loss': 0.3553, 'grad_norm': 9.055878639221191, 'learning_rate': 4.701342281879195e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4963/75000 [03:48<54:50, 21.28it/s]

{'loss': 0.483, 'grad_norm': 6.869553089141846, 'learning_rate': 4.7006711409395975e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4972/75000 [03:48<53:55, 21.64it/s]

{'loss': 0.3797, 'grad_norm': 2.369492292404175, 'learning_rate': 4.7e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4984/75000 [03:49<53:55, 21.64it/s]

{'loss': 0.4183, 'grad_norm': 1.8725049495697021, 'learning_rate': 4.6993288590604026e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 4993/75000 [03:49<53:29, 21.82it/s]

{'loss': 0.329, 'grad_norm': 2.8810343742370605, 'learning_rate': 4.698657718120806e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5000/75000 [03:50<53:00, 22.01it/s]

{'loss': 0.4203, 'grad_norm': 3.4606359004974365, 'learning_rate': 4.697986577181208e-05, 'epoch': 0.2}


                                                      
  7%|▋         | 5014/75000 [03:51<1:09:06, 16.88it/s]

{'loss': 0.25, 'grad_norm': 5.192765712738037, 'learning_rate': 4.697315436241611e-05, 'epoch': 0.2}


                                                      
  7%|▋         | 5023/75000 [03:51<58:51, 19.82it/s]  

{'loss': 0.3206, 'grad_norm': 2.6630139350891113, 'learning_rate': 4.696644295302013e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5032/75000 [03:52<54:53, 21.25it/s]

{'loss': 0.4038, 'grad_norm': 3.570382595062256, 'learning_rate': 4.695973154362416e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5044/75000 [03:52<56:12, 20.74it/s]

{'loss': 0.2841, 'grad_norm': 6.427110195159912, 'learning_rate': 4.695302013422819e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5053/75000 [03:53<54:22, 21.44it/s]

{'loss': 0.3871, 'grad_norm': 13.003752708435059, 'learning_rate': 4.694630872483222e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5062/75000 [03:53<54:07, 21.53it/s]

{'loss': 0.2813, 'grad_norm': 3.2830021381378174, 'learning_rate': 4.693959731543625e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5074/75000 [03:54<53:10, 21.92it/s]

{'loss': 0.4555, 'grad_norm': 0.9843200445175171, 'learning_rate': 4.693288590604027e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5083/75000 [03:54<54:22, 21.43it/s]

{'loss': 0.2469, 'grad_norm': 0.8409639000892639, 'learning_rate': 4.69261744966443e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5092/75000 [03:55<55:03, 21.16it/s]

{'loss': 0.3446, 'grad_norm': 3.6488118171691895, 'learning_rate': 4.691946308724832e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5104/75000 [03:55<56:44, 20.53it/s]

{'loss': 0.3083, 'grad_norm': 8.680975914001465, 'learning_rate': 4.691275167785235e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5113/75000 [03:56<54:09, 21.50it/s]

{'loss': 0.3952, 'grad_norm': 1.9991194009780884, 'learning_rate': 4.690604026845638e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5122/75000 [03:56<56:49, 20.49it/s]

{'loss': 0.3809, 'grad_norm': 8.40574836730957, 'learning_rate': 4.6899328859060405e-05, 'epoch': 0.2}


                                                    
  7%|▋         | 5134/75000 [03:57<53:47, 21.65it/s]

{'loss': 0.4319, 'grad_norm': 5.319028377532959, 'learning_rate': 4.6892617449664434e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5143/75000 [03:57<56:05, 20.76it/s]

{'loss': 0.4558, 'grad_norm': 3.4258084297180176, 'learning_rate': 4.6885906040268455e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5152/75000 [03:57<55:53, 20.83it/s]

{'loss': 0.4793, 'grad_norm': 1.9234000444412231, 'learning_rate': 4.6879194630872484e-05, 'epoch': 0.21}


                                                      
  7%|▋         | 5162/75000 [03:58<1:01:43, 18.86it/s]

{'loss': 0.4479, 'grad_norm': 10.314254760742188, 'learning_rate': 4.687248322147651e-05, 'epoch': 0.21}


                                                      
  7%|▋         | 5174/75000 [03:59<56:47, 20.49it/s]

{'loss': 0.4265, 'grad_norm': 12.705106735229492, 'learning_rate': 4.686577181208054e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5183/75000 [03:59<57:45, 20.14it/s]

{'loss': 0.423, 'grad_norm': 7.4689202308654785, 'learning_rate': 4.685906040268457e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5192/75000 [03:59<56:58, 20.42it/s]

{'loss': 0.3538, 'grad_norm': 4.643073081970215, 'learning_rate': 4.685234899328859e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5201/75000 [04:00<57:14, 20.32it/s]

{'loss': 0.3219, 'grad_norm': 1.1785976886749268, 'learning_rate': 4.684563758389262e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5213/75000 [04:00<1:01:15, 18.99it/s]

{'loss': 0.4162, 'grad_norm': 1.08719801902771, 'learning_rate': 4.683892617449664e-05, 'epoch': 0.21}


                                                      
  7%|▋         | 5222/75000 [04:01<57:43, 20.14it/s]

{'loss': 0.4745, 'grad_norm': 9.085742950439453, 'learning_rate': 4.683221476510068e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5234/75000 [04:02<56:39, 20.52it/s]

{'loss': 0.3746, 'grad_norm': 1.1487675905227661, 'learning_rate': 4.6825503355704706e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5243/75000 [04:02<56:28, 20.59it/s]

{'loss': 0.3743, 'grad_norm': 5.186017036437988, 'learning_rate': 4.681879194630873e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5254/75000 [04:03<58:01, 20.03it/s]

{'loss': 0.5421, 'grad_norm': 3.3698129653930664, 'learning_rate': 4.6812080536912756e-05, 'epoch': 0.21}


                                                      
  7%|▋         | 5262/75000 [04:03<59:03, 19.68it/s]  

{'loss': 0.3529, 'grad_norm': 3.370413064956665, 'learning_rate': 4.680536912751678e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5273/75000 [04:03<56:30, 20.57it/s]

{'loss': 0.3811, 'grad_norm': 1.6700514554977417, 'learning_rate': 4.6798657718120806e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5282/75000 [04:04<55:55, 20.78it/s]

{'loss': 0.3397, 'grad_norm': 5.102237224578857, 'learning_rate': 4.6791946308724835e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5294/75000 [04:04<55:56, 20.77it/s]

{'loss': 0.5443, 'grad_norm': 10.27430248260498, 'learning_rate': 4.678523489932886e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5303/75000 [04:05<55:23, 20.97it/s]

{'loss': 0.3924, 'grad_norm': 3.505183219909668, 'learning_rate': 4.677852348993289e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5312/75000 [04:05<55:40, 20.86it/s]

{'loss': 0.3163, 'grad_norm': 9.751174926757812, 'learning_rate': 4.6771812080536914e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5324/75000 [04:06<58:11, 19.95it/s]

{'loss': 0.3222, 'grad_norm': 1.990936279296875, 'learning_rate': 4.676510067114094e-05, 'epoch': 0.21}


                                                      
  7%|▋         | 5332/75000 [04:06<58:37, 19.80it/s]

{'loss': 0.3627, 'grad_norm': 3.0976977348327637, 'learning_rate': 4.6758389261744964e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5344/75000 [04:07<56:17, 20.62it/s]

{'loss': 0.4692, 'grad_norm': 2.429328203201294, 'learning_rate': 4.6751677852349e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5353/75000 [04:07<54:22, 21.35it/s]

{'loss': 0.3522, 'grad_norm': 1.686549425125122, 'learning_rate': 4.674496644295302e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5362/75000 [04:08<53:14, 21.80it/s]

{'loss': 0.3567, 'grad_norm': 4.728954792022705, 'learning_rate': 4.673825503355705e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5374/75000 [04:08<53:22, 21.74it/s]

{'loss': 0.2937, 'grad_norm': 0.7117664813995361, 'learning_rate': 4.673154362416108e-05, 'epoch': 0.21}


                                                    
  7%|▋         | 5383/75000 [04:09<52:25, 22.13it/s]

{'loss': 0.5156, 'grad_norm': 7.8733296394348145, 'learning_rate': 4.67248322147651e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5392/75000 [04:09<55:15, 20.99it/s]

{'loss': 0.3799, 'grad_norm': 3.9037885665893555, 'learning_rate': 4.671812080536913e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5404/75000 [04:10<56:08, 20.66it/s]

{'loss': 0.3564, 'grad_norm': 1.5829851627349854, 'learning_rate': 4.671140939597316e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5413/75000 [04:10<56:44, 20.44it/s]

{'loss': 0.3125, 'grad_norm': 4.456943511962891, 'learning_rate': 4.6704697986577186e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5422/75000 [04:11<56:45, 20.43it/s]

{'loss': 0.4074, 'grad_norm': 7.082636833190918, 'learning_rate': 4.6697986577181214e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5434/75000 [04:11<55:51, 20.75it/s]

{'loss': 0.4326, 'grad_norm': 9.631388664245605, 'learning_rate': 4.6691275167785236e-05, 'epoch': 0.22}


                                                      
  7%|▋         | 5443/75000 [04:12<56:28, 20.53it/s]

{'loss': 0.2684, 'grad_norm': 6.311165809631348, 'learning_rate': 4.6684563758389264e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5452/75000 [04:12<56:46, 20.41it/s]

{'loss': 0.5079, 'grad_norm': 6.421510696411133, 'learning_rate': 4.6677852348993286e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5463/75000 [04:13<58:30, 19.81it/s]

{'loss': 0.4195, 'grad_norm': 1.3394114971160889, 'learning_rate': 4.667114093959732e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5472/75000 [04:13<59:44, 19.40it/s]

{'loss': 0.3133, 'grad_norm': 4.108067989349365, 'learning_rate': 4.666442953020134e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5483/75000 [04:14<1:00:07, 19.27it/s]

{'loss': 0.3952, 'grad_norm': 2.162108898162842, 'learning_rate': 4.665771812080537e-05, 'epoch': 0.22}


                                                      
  7%|▋         | 5494/75000 [04:14<1:00:05, 19.28it/s]

{'loss': 0.3906, 'grad_norm': 3.1963677406311035, 'learning_rate': 4.66510067114094e-05, 'epoch': 0.22}


                                                      
  7%|▋         | 5500/75000 [04:15<1:04:52, 17.86it/s]

{'loss': 0.2614, 'grad_norm': 5.260818958282471, 'learning_rate': 4.664429530201342e-05, 'epoch': 0.22}


                                                      
  7%|▋         | 5512/75000 [04:16<1:14:18, 15.59it/s]

{'loss': 0.3813, 'grad_norm': 0.9118140339851379, 'learning_rate': 4.663758389261745e-05, 'epoch': 0.22}


                                                      
  7%|▋         | 5523/75000 [04:16<1:04:21, 17.99it/s]

{'loss': 0.3964, 'grad_norm': 4.699179649353027, 'learning_rate': 4.663087248322148e-05, 'epoch': 0.22}


                                                      
  7%|▋         | 5532/75000 [04:17<1:01:02, 18.97it/s]

{'loss': 0.3045, 'grad_norm': 5.1098408699035645, 'learning_rate': 4.662416107382551e-05, 'epoch': 0.22}


                                                      
  7%|▋         | 5542/75000 [04:17<59:13, 19.55it/s]

{'loss': 0.4444, 'grad_norm': 4.215862274169922, 'learning_rate': 4.661744966442953e-05, 'epoch': 0.22}


                                                      
  7%|▋         | 5552/75000 [04:18<1:00:09, 19.24it/s]

{'loss': 0.2925, 'grad_norm': 1.6418536901474, 'learning_rate': 4.661073825503356e-05, 'epoch': 0.22}


                                                      
  7%|▋         | 5562/75000 [04:18<58:49, 19.67it/s]

{'loss': 0.2608, 'grad_norm': 9.932950973510742, 'learning_rate': 4.660402684563759e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5574/75000 [04:19<57:33, 20.10it/s]

{'loss': 0.4136, 'grad_norm': 7.034643650054932, 'learning_rate': 4.6597315436241615e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5583/75000 [04:19<56:25, 20.50it/s]

{'loss': 0.3388, 'grad_norm': 3.727855682373047, 'learning_rate': 4.6590604026845644e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5592/75000 [04:20<55:49, 20.72it/s]

{'loss': 0.3678, 'grad_norm': 8.533095359802246, 'learning_rate': 4.6583892617449666e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5604/75000 [04:20<55:45, 20.74it/s]

{'loss': 0.392, 'grad_norm': 5.292483806610107, 'learning_rate': 4.6577181208053694e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5613/75000 [04:21<56:44, 20.38it/s]

{'loss': 0.2374, 'grad_norm': 2.324920654296875, 'learning_rate': 4.6570469798657716e-05, 'epoch': 0.22}


                                                    
  7%|▋         | 5622/75000 [04:21<56:00, 20.64it/s]

{'loss': 0.3596, 'grad_norm': 8.243095397949219, 'learning_rate': 4.6563758389261744e-05, 'epoch': 0.22}


                                                    
  8%|▊         | 5634/75000 [04:22<55:52, 20.69it/s]

{'loss': 0.3787, 'grad_norm': 4.8193769454956055, 'learning_rate': 4.655704697986577e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5643/75000 [04:22<55:57, 20.66it/s]

{'loss': 0.3604, 'grad_norm': 5.279313087463379, 'learning_rate': 4.65503355704698e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5652/75000 [04:23<55:17, 20.90it/s]

{'loss': 0.3114, 'grad_norm': 4.0732574462890625, 'learning_rate': 4.654362416107383e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5664/75000 [04:23<55:42, 20.74it/s]

{'loss': 0.351, 'grad_norm': 12.148902893066406, 'learning_rate': 4.653691275167785e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5673/75000 [04:24<57:41, 20.03it/s]

{'loss': 0.4001, 'grad_norm': 1.9579318761825562, 'learning_rate': 4.653020134228188e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5682/75000 [04:24<57:07, 20.22it/s]

{'loss': 0.3197, 'grad_norm': 4.559133052825928, 'learning_rate': 4.652348993288591e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5694/75000 [04:25<55:21, 20.87it/s]

{'loss': 0.3908, 'grad_norm': 2.962291717529297, 'learning_rate': 4.651677852348994e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5703/75000 [04:25<56:47, 20.34it/s]

{'loss': 0.3458, 'grad_norm': 5.614072322845459, 'learning_rate': 4.6510067114093966e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5712/75000 [04:26<56:31, 20.43it/s]

{'loss': 0.3502, 'grad_norm': 2.845191240310669, 'learning_rate': 4.650335570469799e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5724/75000 [04:26<57:42, 20.01it/s]

{'loss': 0.3905, 'grad_norm': 2.5877304077148438, 'learning_rate': 4.6496644295302016e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5732/75000 [04:27<59:13, 19.49it/s]

{'loss': 0.2743, 'grad_norm': 7.733984470367432, 'learning_rate': 4.648993288590604e-05, 'epoch': 0.23}


                                                      
  8%|▊         | 5742/75000 [04:27<59:32, 19.39it/s]

{'loss': 0.3139, 'grad_norm': 3.6001763343811035, 'learning_rate': 4.648322147651007e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5754/75000 [04:28<56:26, 20.45it/s]

{'loss': 0.4652, 'grad_norm': 3.0864439010620117, 'learning_rate': 4.6476510067114095e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5763/75000 [04:28<55:46, 20.69it/s]

{'loss': 0.3539, 'grad_norm': 2.0680532455444336, 'learning_rate': 4.6469798657718124e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5772/75000 [04:29<56:41, 20.35it/s]

{'loss': 0.3238, 'grad_norm': 1.808609962463379, 'learning_rate': 4.646308724832215e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5784/75000 [04:29<56:55, 20.26it/s]

{'loss': 0.3088, 'grad_norm': 1.5611578226089478, 'learning_rate': 4.6456375838926174e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5793/75000 [04:30<56:14, 20.51it/s]

{'loss': 0.3457, 'grad_norm': 3.4023497104644775, 'learning_rate': 4.64496644295302e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5802/75000 [04:30<56:51, 20.28it/s]

{'loss': 0.3155, 'grad_norm': 5.268247604370117, 'learning_rate': 4.644295302013423e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5814/75000 [04:31<55:19, 20.85it/s]

{'loss': 0.4565, 'grad_norm': 2.7367169857025146, 'learning_rate': 4.643624161073826e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5823/75000 [04:31<57:34, 20.02it/s]

{'loss': 0.3681, 'grad_norm': 6.397464275360107, 'learning_rate': 4.642953020134229e-05, 'epoch': 0.23}


                                                      
  8%|▊         | 5832/75000 [04:32<58:59, 19.54it/s]  

{'loss': 0.3447, 'grad_norm': 2.0630321502685547, 'learning_rate': 4.642281879194631e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5844/75000 [04:32<56:26, 20.42it/s]

{'loss': 0.3666, 'grad_norm': 5.032108783721924, 'learning_rate': 4.641610738255034e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5853/75000 [04:33<55:34, 20.74it/s]

{'loss': 0.3734, 'grad_norm': 0.4865943491458893, 'learning_rate': 4.640939597315436e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5862/75000 [04:33<56:52, 20.26it/s]

{'loss': 0.344, 'grad_norm': 1.469289779663086, 'learning_rate': 4.640268456375839e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5874/75000 [04:34<56:11, 20.50it/s]

{'loss': 0.4093, 'grad_norm': 15.126214981079102, 'learning_rate': 4.6395973154362424e-05, 'epoch': 0.23}


                                                    
  8%|▊         | 5883/75000 [04:34<57:57, 19.88it/s]

{'loss': 0.2212, 'grad_norm': 4.2781982421875, 'learning_rate': 4.6389261744966446e-05, 'epoch': 0.24}


                                                      
  8%|▊         | 5892/75000 [04:35<1:02:37, 18.39it/s]

{'loss': 0.4168, 'grad_norm': 3.9940130710601807, 'learning_rate': 4.6382550335570474e-05, 'epoch': 0.24}


                                                      
  8%|▊         | 5904/75000 [04:35<57:55, 19.88it/s]

{'loss': 0.2934, 'grad_norm': 8.14118766784668, 'learning_rate': 4.6375838926174496e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 5913/75000 [04:36<57:10, 20.14it/s]

{'loss': 0.3238, 'grad_norm': 5.904720783233643, 'learning_rate': 4.6369127516778525e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 5922/75000 [04:36<57:03, 20.18it/s]

{'loss': 0.2917, 'grad_norm': 1.0589925050735474, 'learning_rate': 4.636241610738255e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 5934/75000 [04:37<55:09, 20.87it/s]

{'loss': 0.2587, 'grad_norm': 3.9367456436157227, 'learning_rate': 4.635570469798658e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 5943/75000 [04:37<56:04, 20.53it/s]

{'loss': 0.4344, 'grad_norm': 2.0419821739196777, 'learning_rate': 4.634899328859061e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 5952/75000 [04:38<57:20, 20.07it/s]

{'loss': 0.3291, 'grad_norm': 1.9178622961044312, 'learning_rate': 4.634228187919463e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 5964/75000 [04:38<56:44, 20.28it/s]

{'loss': 0.3577, 'grad_norm': 2.8553626537323, 'learning_rate': 4.633557046979866e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 5973/75000 [04:39<56:23, 20.40it/s]

{'loss': 0.4467, 'grad_norm': 3.647557258605957, 'learning_rate': 4.632885906040268e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 5982/75000 [04:39<55:50, 20.60it/s]

{'loss': 0.34, 'grad_norm': 5.446438312530518, 'learning_rate': 4.632214765100671e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 5994/75000 [04:40<56:15, 20.44it/s]

{'loss': 0.3588, 'grad_norm': 7.93130350112915, 'learning_rate': 4.631543624161074e-05, 'epoch': 0.24}


                                                      
  8%|▊         | 6000/75000 [04:40<1:01:14, 18.78it/s]

{'loss': 0.3793, 'grad_norm': 4.830775737762451, 'learning_rate': 4.630872483221477e-05, 'epoch': 0.24}


                                                      
  8%|▊         | 6013/75000 [04:41<1:12:23, 15.88it/s]

{'loss': 0.3126, 'grad_norm': 6.981159687042236, 'learning_rate': 4.63020134228188e-05, 'epoch': 0.24}


                                                      
  8%|▊         | 6024/75000 [04:41<57:50, 19.88it/s]  

{'loss': 0.3358, 'grad_norm': 10.273917198181152, 'learning_rate': 4.629530201342282e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6033/75000 [04:42<55:46, 20.61it/s]

{'loss': 0.3165, 'grad_norm': 4.70413875579834, 'learning_rate': 4.628859060402685e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6042/75000 [04:42<54:27, 21.10it/s]

{'loss': 0.3062, 'grad_norm': 1.1825894117355347, 'learning_rate': 4.6281879194630876e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6054/75000 [04:43<54:55, 20.92it/s]

{'loss': 0.3143, 'grad_norm': 6.402121067047119, 'learning_rate': 4.6275167785234904e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6063/75000 [04:43<53:55, 21.31it/s]

{'loss': 0.4419, 'grad_norm': 9.733360290527344, 'learning_rate': 4.626845637583893e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6072/75000 [04:44<53:16, 21.57it/s]

{'loss': 0.3274, 'grad_norm': 0.5568538904190063, 'learning_rate': 4.6261744966442954e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6084/75000 [04:44<52:22, 21.93it/s]

{'loss': 0.3621, 'grad_norm': 5.830625534057617, 'learning_rate': 4.625503355704698e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6093/75000 [04:45<52:23, 21.92it/s]

{'loss': 0.3967, 'grad_norm': 6.648717880249023, 'learning_rate': 4.6248322147651005e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6102/75000 [04:45<53:47, 21.35it/s]

{'loss': 0.4381, 'grad_norm': 5.362377643585205, 'learning_rate': 4.624161073825504e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6114/75000 [04:46<54:43, 20.98it/s]

{'loss': 0.2843, 'grad_norm': 7.615537166595459, 'learning_rate': 4.623489932885906e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6123/75000 [04:46<57:54, 19.82it/s]

{'loss': 0.2816, 'grad_norm': 1.1466180086135864, 'learning_rate': 4.622818791946309e-05, 'epoch': 0.24}


                                                    
  8%|▊         | 6132/75000 [04:47<54:51, 20.92it/s]

{'loss': 0.3184, 'grad_norm': 4.826162815093994, 'learning_rate': 4.622147651006712e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6144/75000 [04:47<53:40, 21.38it/s]

{'loss': 0.5253, 'grad_norm': 9.56184196472168, 'learning_rate': 4.621476510067114e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6153/75000 [04:48<53:58, 21.26it/s]

{'loss': 0.2578, 'grad_norm': 6.350054740905762, 'learning_rate': 4.620805369127517e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6162/75000 [04:48<55:27, 20.69it/s]

{'loss': 0.395, 'grad_norm': 7.771651268005371, 'learning_rate': 4.62013422818792e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6171/75000 [04:48<57:45, 19.86it/s]

{'loss': 0.3939, 'grad_norm': 4.87445068359375, 'learning_rate': 4.6194630872483226e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6183/75000 [04:49<58:57, 19.45it/s]

{'loss': 0.3642, 'grad_norm': 10.457026481628418, 'learning_rate': 4.618791946308725e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6192/75000 [04:50<59:26, 19.29it/s]

{'loss': 0.2639, 'grad_norm': 3.786489248275757, 'learning_rate': 4.618120805369128e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6203/75000 [04:50<58:14, 19.69it/s]

{'loss': 0.2986, 'grad_norm': 19.22458839416504, 'learning_rate': 4.6174496644295305e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6213/75000 [04:51<58:24, 19.63it/s]

{'loss': 0.4489, 'grad_norm': 6.140187740325928, 'learning_rate': 4.616778523489933e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6222/75000 [04:51<1:00:55, 18.82it/s]

{'loss': 0.3258, 'grad_norm': 7.308470726013184, 'learning_rate': 4.616107382550336e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6231/75000 [04:52<59:29, 19.26it/s]

{'loss': 0.2957, 'grad_norm': 14.113238334655762, 'learning_rate': 4.6154362416107384e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6243/75000 [04:52<59:51, 19.14it/s]  

{'loss': 0.3372, 'grad_norm': 3.151455879211426, 'learning_rate': 4.614765100671141e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6252/75000 [04:53<1:00:36, 18.90it/s]

{'loss': 0.4881, 'grad_norm': 1.5035220384597778, 'learning_rate': 4.6140939597315434e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6261/75000 [04:53<59:50, 19.15it/s]

{'loss': 0.3826, 'grad_norm': 2.3154654502868652, 'learning_rate': 4.613422818791946e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6273/75000 [04:54<58:58, 19.42it/s]

{'loss': 0.4439, 'grad_norm': 2.495025634765625, 'learning_rate': 4.612751677852349e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6283/75000 [04:54<1:00:29, 18.93it/s]

{'loss': 0.304, 'grad_norm': 1.9753155708312988, 'learning_rate': 4.612080536912752e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6294/75000 [04:55<59:29, 19.25it/s]

{'loss': 0.2377, 'grad_norm': 2.5934457778930664, 'learning_rate': 4.611409395973155e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6304/75000 [04:55<58:05, 19.71it/s]

{'loss': 0.3596, 'grad_norm': 11.0869140625, 'learning_rate': 4.610738255033557e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6313/75000 [04:56<1:00:10, 19.02it/s]

{'loss': 0.3878, 'grad_norm': 2.956727981567383, 'learning_rate': 4.61006711409396e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6323/75000 [04:56<58:11, 19.67it/s]

{'loss': 0.2855, 'grad_norm': 2.689978837966919, 'learning_rate': 4.609395973154363e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6333/75000 [04:57<1:04:28, 17.75it/s]

{'loss': 0.4496, 'grad_norm': 2.299898147583008, 'learning_rate': 4.608724832214765e-05, 'epoch': 0.25}


                                                      
  8%|▊         | 6344/75000 [04:57<58:59, 19.40it/s]  

{'loss': 0.2388, 'grad_norm': 3.1158299446105957, 'learning_rate': 4.6080536912751685e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6351/75000 [04:58<58:43, 19.48it/s]

{'loss': 0.4135, 'grad_norm': 1.1467458009719849, 'learning_rate': 4.6073825503355706e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6363/75000 [04:58<58:25, 19.58it/s]

{'loss': 0.3055, 'grad_norm': 3.507918357849121, 'learning_rate': 4.6067114093959735e-05, 'epoch': 0.25}


                                                    
  8%|▊         | 6373/75000 [04:59<58:21, 19.60it/s]

{'loss': 0.378, 'grad_norm': 4.486663818359375, 'learning_rate': 4.606040268456376e-05, 'epoch': 0.25}


                                                    
  9%|▊         | 6382/75000 [04:59<58:47, 19.45it/s]

{'loss': 0.3976, 'grad_norm': 4.167145252227783, 'learning_rate': 4.6053691275167785e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 6392/75000 [05:00<1:00:50, 18.79it/s]

{'loss': 0.323, 'grad_norm': 9.731345176696777, 'learning_rate': 4.6046979865771814e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 6402/75000 [05:00<59:36, 19.18it/s]

{'loss': 0.3483, 'grad_norm': 1.0449961423873901, 'learning_rate': 4.604026845637584e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6412/75000 [05:01<59:15, 19.29it/s]

{'loss': 0.3736, 'grad_norm': 4.139800548553467, 'learning_rate': 4.603355704697987e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6423/75000 [05:01<58:27, 19.55it/s]

{'loss': 0.3473, 'grad_norm': 8.233661651611328, 'learning_rate': 4.602684563758389e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6434/75000 [05:02<58:07, 19.66it/s]

{'loss': 0.3806, 'grad_norm': 2.342620611190796, 'learning_rate': 4.602013422818792e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6443/75000 [05:03<58:53, 19.40it/s]

{'loss': 0.3828, 'grad_norm': 2.1333084106445312, 'learning_rate': 4.601342281879194e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 6452/75000 [05:03<1:02:08, 18.39it/s]

{'loss': 0.3339, 'grad_norm': 4.114003658294678, 'learning_rate': 4.600671140939598e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 6464/75000 [05:04<58:31, 19.52it/s]

{'loss': 0.2345, 'grad_norm': 6.877819538116455, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6473/75000 [05:04<59:27, 19.21it/s]  

{'loss': 0.3139, 'grad_norm': 6.272565841674805, 'learning_rate': 4.599328859060403e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6483/75000 [05:05<58:24, 19.55it/s]

{'loss': 0.256, 'grad_norm': 9.256997108459473, 'learning_rate': 4.598657718120806e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6492/75000 [05:05<56:29, 20.21it/s]

{'loss': 0.4432, 'grad_norm': 12.89069938659668, 'learning_rate': 4.597986577181208e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6500/75000 [05:05<57:10, 19.97it/s]

{'loss': 0.4363, 'grad_norm': 8.42404556274414, 'learning_rate': 4.597315436241611e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 6512/75000 [05:06<1:06:56, 17.05it/s]

{'loss': 0.3834, 'grad_norm': 1.8367704153060913, 'learning_rate': 4.5966442953020136e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 6524/75000 [05:07<58:46, 19.42it/s]  

{'loss': 0.2275, 'grad_norm': 9.553085327148438, 'learning_rate': 4.5959731543624165e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6532/75000 [05:07<56:09, 20.32it/s]

{'loss': 0.3747, 'grad_norm': 2.3566527366638184, 'learning_rate': 4.595302013422819e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6541/75000 [05:08<55:53, 20.41it/s]

{'loss': 0.4879, 'grad_norm': 5.326749801635742, 'learning_rate': 4.5946308724832215e-05, 'epoch': 0.26}


                                                    
  9%|▊         | 6550/75000 [05:08<54:17, 21.01it/s]

{'loss': 0.3699, 'grad_norm': 11.542953491210938, 'learning_rate': 4.5939597315436243e-05, 'epoch': 0.26}


                                                      
  9%|▊         | 6562/75000 [05:09<56:40, 20.12it/s]

{'loss': 0.4947, 'grad_norm': 5.52243185043335, 'learning_rate': 4.5932885906040265e-05, 'epoch': 0.26}


                                                    
  9%|▉         | 6574/75000 [05:09<54:59, 20.74it/s]

{'loss': 0.4025, 'grad_norm': 7.063526630401611, 'learning_rate': 4.59261744966443e-05, 'epoch': 0.26}


                                                    
  9%|▉         | 6583/75000 [05:10<54:54, 20.77it/s]

{'loss': 0.2424, 'grad_norm': 4.141355037689209, 'learning_rate': 4.591946308724833e-05, 'epoch': 0.26}


                                                    
  9%|▉         | 6592/75000 [05:10<55:46, 20.44it/s]

{'loss': 0.3622, 'grad_norm': 4.008228778839111, 'learning_rate': 4.591275167785235e-05, 'epoch': 0.26}


                                                    
  9%|▉         | 6604/75000 [05:11<54:26, 20.94it/s]

{'loss': 0.2938, 'grad_norm': 3.8276710510253906, 'learning_rate': 4.590604026845638e-05, 'epoch': 0.26}


                                                      
  9%|▉         | 6613/75000 [05:11<57:53, 19.69it/s]

{'loss': 0.324, 'grad_norm': 4.90342903137207, 'learning_rate': 4.58993288590604e-05, 'epoch': 0.26}


                                                    
  9%|▉         | 6622/75000 [05:12<56:23, 20.21it/s]

{'loss': 0.338, 'grad_norm': 9.354639053344727, 'learning_rate': 4.589261744966443e-05, 'epoch': 0.26}


                                                    
  9%|▉         | 6634/75000 [05:12<55:15, 20.62it/s]

{'loss': 0.4258, 'grad_norm': 3.4173262119293213, 'learning_rate': 4.588590604026846e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6643/75000 [05:13<55:01, 20.70it/s]

{'loss': 0.3231, 'grad_norm': 5.279852867126465, 'learning_rate': 4.587919463087249e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6652/75000 [05:13<56:08, 20.29it/s]

{'loss': 0.272, 'grad_norm': 4.165163040161133, 'learning_rate': 4.5872483221476515e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6664/75000 [05:14<56:43, 20.08it/s]

{'loss': 0.2282, 'grad_norm': 12.90716552734375, 'learning_rate': 4.586577181208054e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6673/75000 [05:14<55:58, 20.35it/s]

{'loss': 0.2329, 'grad_norm': 8.270874977111816, 'learning_rate': 4.5859060402684566e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6682/75000 [05:15<55:11, 20.63it/s]

{'loss': 0.3741, 'grad_norm': 9.565910339355469, 'learning_rate': 4.585234899328859e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6694/75000 [05:15<55:03, 20.68it/s]

{'loss': 0.4826, 'grad_norm': 3.7981173992156982, 'learning_rate': 4.584563758389262e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6703/75000 [05:16<54:44, 20.80it/s]

{'loss': 0.4242, 'grad_norm': 2.99980092048645, 'learning_rate': 4.583892617449665e-05, 'epoch': 0.27}


                                                      
  9%|▉         | 6712/75000 [05:16<58:24, 19.48it/s]  

{'loss': 0.3714, 'grad_norm': 1.262550950050354, 'learning_rate': 4.583221476510067e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6723/75000 [05:17<56:26, 20.16it/s]

{'loss': 0.2847, 'grad_norm': 4.124421119689941, 'learning_rate': 4.58255033557047e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6732/75000 [05:17<56:37, 20.09it/s]

{'loss': 0.2489, 'grad_norm': 3.212435483932495, 'learning_rate': 4.581879194630872e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6744/75000 [05:18<55:01, 20.67it/s]

{'loss': 0.4584, 'grad_norm': 3.515092372894287, 'learning_rate': 4.581208053691275e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6753/75000 [05:18<54:59, 20.68it/s]

{'loss': 0.2836, 'grad_norm': 5.697812557220459, 'learning_rate': 4.580536912751678e-05, 'epoch': 0.27}


                                                      
  9%|▉         | 6764/75000 [05:19<59:09, 19.22it/s]  

{'loss': 0.3783, 'grad_norm': 7.71139669418335, 'learning_rate': 4.579865771812081e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6773/75000 [05:19<56:22, 20.17it/s]

{'loss': 0.3809, 'grad_norm': 4.612039566040039, 'learning_rate': 4.579194630872484e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6782/75000 [05:20<55:26, 20.51it/s]

{'loss': 0.4576, 'grad_norm': 2.1066431999206543, 'learning_rate': 4.578523489932886e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6794/75000 [05:20<54:55, 20.70it/s]

{'loss': 0.4325, 'grad_norm': 6.18121862411499, 'learning_rate': 4.577852348993289e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6803/75000 [05:21<55:15, 20.57it/s]

{'loss': 0.3168, 'grad_norm': 2.568225145339966, 'learning_rate': 4.5771812080536916e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6814/75000 [05:21<57:40, 19.70it/s]

{'loss': 0.4599, 'grad_norm': 5.109865665435791, 'learning_rate': 4.5765100671140945e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6822/75000 [05:22<56:10, 20.23it/s]

{'loss': 0.3669, 'grad_norm': 3.4854300022125244, 'learning_rate': 4.575838926174497e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6834/75000 [05:22<55:15, 20.56it/s]

{'loss': 0.3358, 'grad_norm': 3.759261131286621, 'learning_rate': 4.5751677852348995e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6843/75000 [05:23<54:48, 20.73it/s]

{'loss': 0.4446, 'grad_norm': 5.97281551361084, 'learning_rate': 4.5744966442953024e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6852/75000 [05:23<55:02, 20.63it/s]

{'loss': 0.2868, 'grad_norm': 2.2309365272521973, 'learning_rate': 4.5738255033557046e-05, 'epoch': 0.27}


                                                      
  9%|▉         | 6864/75000 [05:24<57:45, 19.66it/s]  

{'loss': 0.3508, 'grad_norm': 3.8509321212768555, 'learning_rate': 4.5731543624161074e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6872/75000 [05:24<56:56, 19.94it/s]

{'loss': 0.3444, 'grad_norm': 5.098649501800537, 'learning_rate': 4.57248322147651e-05, 'epoch': 0.27}


                                                    
  9%|▉         | 6884/75000 [05:25<55:08, 20.59it/s]

{'loss': 0.3285, 'grad_norm': 6.5601091384887695, 'learning_rate': 4.571812080536913e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6893/75000 [05:25<55:47, 20.34it/s]

{'loss': 0.4344, 'grad_norm': 5.254306793212891, 'learning_rate': 4.571140939597316e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6902/75000 [05:26<55:18, 20.52it/s]

{'loss': 0.2356, 'grad_norm': 9.720727920532227, 'learning_rate': 4.570469798657718e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6914/75000 [05:26<56:33, 20.06it/s]

{'loss': 0.3865, 'grad_norm': 4.279430866241455, 'learning_rate': 4.569798657718121e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6923/75000 [05:27<55:29, 20.45it/s]

{'loss': 0.349, 'grad_norm': 2.2089147567749023, 'learning_rate': 4.569127516778524e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6932/75000 [05:27<55:48, 20.33it/s]

{'loss': 0.3705, 'grad_norm': 1.8140512704849243, 'learning_rate': 4.568456375838927e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6944/75000 [05:28<53:55, 21.04it/s]

{'loss': 0.3279, 'grad_norm': 10.391048431396484, 'learning_rate': 4.567785234899329e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6953/75000 [05:28<55:37, 20.39it/s]

{'loss': 0.2749, 'grad_norm': 2.8197782039642334, 'learning_rate': 4.567114093959732e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6964/75000 [05:29<55:35, 20.40it/s]

{'loss': 0.3511, 'grad_norm': 3.4259345531463623, 'learning_rate': 4.5664429530201346e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6973/75000 [05:29<55:47, 20.32it/s]

{'loss': 0.2295, 'grad_norm': 2.666958808898926, 'learning_rate': 4.565771812080537e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6982/75000 [05:30<54:49, 20.68it/s]

{'loss': 0.3936, 'grad_norm': 7.227280139923096, 'learning_rate': 4.5651006711409396e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 6994/75000 [05:30<55:12, 20.53it/s]

{'loss': 0.432, 'grad_norm': 2.5648186206817627, 'learning_rate': 4.5644295302013425e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7000/75000 [05:30<54:58, 20.62it/s]

{'loss': 0.3139, 'grad_norm': 3.0109829902648926, 'learning_rate': 4.5637583892617453e-05, 'epoch': 0.28}


                                                      
  9%|▉         | 7014/75000 [05:32<1:08:32, 16.53it/s]

{'loss': 0.3576, 'grad_norm': 2.1830034255981445, 'learning_rate': 4.5630872483221475e-05, 'epoch': 0.28}


                                                      
  9%|▉         | 7024/75000 [05:32<57:54, 19.56it/s]  

{'loss': 0.4619, 'grad_norm': 2.394906997680664, 'learning_rate': 4.5624161073825504e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7033/75000 [05:33<56:08, 20.18it/s]

{'loss': 0.2653, 'grad_norm': 5.218151092529297, 'learning_rate': 4.561744966442953e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7042/75000 [05:33<55:29, 20.41it/s]

{'loss': 0.3462, 'grad_norm': 6.49889612197876, 'learning_rate': 4.561073825503356e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7054/75000 [05:34<56:29, 20.05it/s]

{'loss': 0.3681, 'grad_norm': 0.4873902499675751, 'learning_rate': 4.560402684563759e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7063/75000 [05:34<56:47, 19.94it/s]

{'loss': 0.3327, 'grad_norm': 3.962689161300659, 'learning_rate': 4.559731543624161e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7072/75000 [05:34<55:43, 20.32it/s]

{'loss': 0.3536, 'grad_norm': 2.805122137069702, 'learning_rate': 4.559060402684564e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7084/75000 [05:35<56:21, 20.08it/s]

{'loss': 0.274, 'grad_norm': 6.24314022064209, 'learning_rate': 4.558389261744966e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7093/75000 [05:35<55:32, 20.38it/s]

{'loss': 0.3905, 'grad_norm': 1.803537130355835, 'learning_rate': 4.557718120805369e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7102/75000 [05:36<57:14, 19.77it/s]

{'loss': 0.2912, 'grad_norm': 5.7911882400512695, 'learning_rate': 4.5570469798657725e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7114/75000 [05:37<55:01, 20.57it/s]

{'loss': 0.3795, 'grad_norm': 5.49617338180542, 'learning_rate': 4.556375838926175e-05, 'epoch': 0.28}


                                                    
  9%|▉         | 7123/75000 [05:37<54:39, 20.70it/s]

{'loss': 0.3279, 'grad_norm': 2.5290167331695557, 'learning_rate': 4.5557046979865776e-05, 'epoch': 0.28}


                                                    
 10%|▉         | 7132/75000 [05:37<54:58, 20.57it/s]

{'loss': 0.237, 'grad_norm': 3.6973304748535156, 'learning_rate': 4.55503355704698e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7144/75000 [05:38<55:58, 20.21it/s]

{'loss': 0.3392, 'grad_norm': 2.4003713130950928, 'learning_rate': 4.5543624161073826e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7152/75000 [05:38<57:23, 19.70it/s]

{'loss': 0.373, 'grad_norm': 10.415186882019043, 'learning_rate': 4.5536912751677855e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7164/75000 [05:39<55:06, 20.51it/s]

{'loss': 0.4182, 'grad_norm': 9.989782333374023, 'learning_rate': 4.553020134228188e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7173/75000 [05:39<55:07, 20.51it/s]

{'loss': 0.3458, 'grad_norm': 3.2477738857269287, 'learning_rate': 4.552348993288591e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7182/75000 [05:40<54:58, 20.56it/s]

{'loss': 0.3616, 'grad_norm': 5.122692108154297, 'learning_rate': 4.5516778523489933e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7192/75000 [05:40<58:27, 19.33it/s]

{'loss': 0.3434, 'grad_norm': 0.8755768537521362, 'learning_rate': 4.551006711409396e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7202/75000 [05:41<57:19, 19.71it/s]

{'loss': 0.3082, 'grad_norm': 3.0995657444000244, 'learning_rate': 4.5503355704697984e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7214/75000 [05:42<57:43, 19.57it/s]

{'loss': 0.2803, 'grad_norm': 0.6110240817070007, 'learning_rate': 4.549664429530201e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7223/75000 [05:42<57:50, 19.53it/s]

{'loss': 0.349, 'grad_norm': 29.89638328552246, 'learning_rate': 4.548993288590605e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7234/75000 [05:43<56:35, 19.95it/s]

{'loss': 0.2911, 'grad_norm': 2.3597257137298584, 'learning_rate': 4.548322147651007e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 7242/75000 [05:43<1:02:16, 18.14it/s]

{'loss': 0.372, 'grad_norm': 6.59394645690918, 'learning_rate': 4.54765100671141e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 7253/75000 [05:44<58:54, 19.17it/s]

{'loss': 0.165, 'grad_norm': 4.787361145019531, 'learning_rate': 4.546979865771812e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7262/75000 [05:44<1:00:01, 18.81it/s]

{'loss': 0.4098, 'grad_norm': 3.9290640354156494, 'learning_rate': 4.546308724832215e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 7273/75000 [05:45<58:45, 19.21it/s]

{'loss': 0.4309, 'grad_norm': 2.3426549434661865, 'learning_rate': 4.545637583892618e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 7282/75000 [05:45<58:57, 19.14it/s]

{'loss': 0.3305, 'grad_norm': 2.4729971885681152, 'learning_rate': 4.5449664429530205e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7294/75000 [05:46<57:25, 19.65it/s]

{'loss': 0.2948, 'grad_norm': 6.271608352661133, 'learning_rate': 4.5442953020134234e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7302/75000 [05:46<1:05:52, 17.13it/s]

{'loss': 0.2477, 'grad_norm': 0.9850167036056519, 'learning_rate': 4.5436241610738256e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 7311/75000 [05:47<1:00:51, 18.53it/s]

{'loss': 0.3575, 'grad_norm': 4.841651916503906, 'learning_rate': 4.5429530201342284e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 7322/75000 [05:47<1:00:13, 18.73it/s]

{'loss': 0.3869, 'grad_norm': 11.692490577697754, 'learning_rate': 4.5422818791946306e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 7332/75000 [05:48<58:38, 19.23it/s]

{'loss': 0.3416, 'grad_norm': 2.171032428741455, 'learning_rate': 4.541610738255034e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7342/75000 [05:48<58:09, 19.39it/s]

{'loss': 0.2655, 'grad_norm': 12.681577682495117, 'learning_rate': 4.540939597315437e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7352/75000 [05:49<58:13, 19.36it/s]

{'loss': 0.2622, 'grad_norm': 2.403139352798462, 'learning_rate': 4.540268456375839e-05, 'epoch': 0.29}


                                                      
 10%|▉         | 7364/75000 [05:49<59:55, 18.81it/s]  

{'loss': 0.531, 'grad_norm': 13.637492179870605, 'learning_rate': 4.539597315436242e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7374/75000 [05:50<58:29, 19.27it/s]

{'loss': 0.26, 'grad_norm': 3.3406875133514404, 'learning_rate': 4.538926174496644e-05, 'epoch': 0.29}


                                                    
 10%|▉         | 7383/75000 [05:50<58:30, 19.26it/s]

{'loss': 0.3951, 'grad_norm': 1.363208293914795, 'learning_rate': 4.538255033557047e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7393/75000 [05:51<57:02, 19.75it/s]

{'loss': 0.3467, 'grad_norm': 2.8784334659576416, 'learning_rate': 4.53758389261745e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7402/75000 [05:51<58:22, 19.30it/s]

{'loss': 0.3899, 'grad_norm': 2.3368170261383057, 'learning_rate': 4.536912751677853e-05, 'epoch': 0.3}


                                                      
 10%|▉         | 7411/75000 [05:52<59:02, 19.08it/s]

{'loss': 0.4045, 'grad_norm': 10.104990005493164, 'learning_rate': 4.5362416107382556e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7423/75000 [05:52<57:34, 19.56it/s]

{'loss': 0.2759, 'grad_norm': 3.9959325790405273, 'learning_rate': 4.535570469798658e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7433/75000 [05:53<58:11, 19.35it/s]

{'loss': 0.3862, 'grad_norm': 5.536287784576416, 'learning_rate': 4.5348993288590606e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7443/75000 [05:53<57:47, 19.48it/s]

{'loss': 0.4329, 'grad_norm': 15.765776634216309, 'learning_rate': 4.534228187919463e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7451/75000 [05:54<57:53, 19.45it/s]

{'loss': 0.4937, 'grad_norm': 3.9509689807891846, 'learning_rate': 4.5335570469798664e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7463/75000 [05:55<58:36, 19.21it/s]

{'loss': 0.365, 'grad_norm': 1.842581033706665, 'learning_rate': 4.5328859060402685e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7473/75000 [05:55<57:59, 19.41it/s]

{'loss': 0.3062, 'grad_norm': 10.319418907165527, 'learning_rate': 4.5322147651006714e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7482/75000 [05:55<58:05, 19.37it/s]

{'loss': 0.4217, 'grad_norm': 3.3308539390563965, 'learning_rate': 4.531543624161074e-05, 'epoch': 0.3}


                                                    
 10%|▉         | 7493/75000 [05:56<56:34, 19.89it/s]

{'loss': 0.3628, 'grad_norm': 0.6778048276901245, 'learning_rate': 4.5308724832214764e-05, 'epoch': 0.3}


                                                    
 10%|█         | 7500/75000 [05:56<56:05, 20.06it/s]

{'loss': 0.3558, 'grad_norm': 2.5141093730926514, 'learning_rate': 4.530201342281879e-05, 'epoch': 0.3}


                                                      
 10%|█         | 7512/75000 [05:57<1:09:47, 16.12it/s]

{'loss': 0.2788, 'grad_norm': 9.894217491149902, 'learning_rate': 4.529530201342282e-05, 'epoch': 0.3}


                                                      
 10%|█         | 7522/75000 [05:58<1:03:52, 17.61it/s]

{'loss': 0.338, 'grad_norm': 6.020656585693359, 'learning_rate': 4.528859060402685e-05, 'epoch': 0.3}


                                                      
 10%|█         | 7533/75000 [05:59<58:52, 19.10it/s]

{'loss': 0.5301, 'grad_norm': 3.8788044452667236, 'learning_rate': 4.528187919463088e-05, 'epoch': 0.3}


                                                    
 10%|█         | 7543/75000 [05:59<58:30, 19.22it/s]

{'loss': 0.3128, 'grad_norm': 4.074212074279785, 'learning_rate': 4.52751677852349e-05, 'epoch': 0.3}


                                                    
 10%|█         | 7552/75000 [05:59<57:25, 19.58it/s]

{'loss': 0.4119, 'grad_norm': 10.96511173248291, 'learning_rate': 4.526845637583893e-05, 'epoch': 0.3}


                                                    
 10%|█         | 7563/75000 [06:00<58:25, 19.24it/s]

{'loss': 0.4096, 'grad_norm': 3.7509610652923584, 'learning_rate': 4.526174496644295e-05, 'epoch': 0.3}


                                                      
 10%|█         | 7573/75000 [06:01<1:00:05, 18.70it/s]

{'loss': 0.389, 'grad_norm': 4.863401412963867, 'learning_rate': 4.5255033557046986e-05, 'epoch': 0.3}


                                                      
 10%|█         | 7582/75000 [06:01<58:43, 19.13it/s]

{'loss': 0.29, 'grad_norm': 6.978484153747559, 'learning_rate': 4.524832214765101e-05, 'epoch': 0.3}


                                                    
 10%|█         | 7593/75000 [06:02<56:38, 19.84it/s]

{'loss': 0.3724, 'grad_norm': 8.969144821166992, 'learning_rate': 4.5241610738255036e-05, 'epoch': 0.3}


                                                    
 10%|█         | 7602/75000 [06:02<58:32, 19.19it/s]

{'loss': 0.4342, 'grad_norm': 0.7582075595855713, 'learning_rate': 4.5234899328859065e-05, 'epoch': 0.3}


                                                    
 10%|█         | 7612/75000 [06:03<57:13, 19.63it/s]

{'loss': 0.3688, 'grad_norm': 2.4463372230529785, 'learning_rate': 4.5228187919463086e-05, 'epoch': 0.3}


                                                      
 10%|█         | 7623/75000 [06:03<59:12, 18.97it/s]  

{'loss': 0.3163, 'grad_norm': 18.5849666595459, 'learning_rate': 4.5221476510067115e-05, 'epoch': 0.3}


                                                    
 10%|█         | 7631/75000 [06:04<59:25, 18.89it/s]

{'loss': 0.3037, 'grad_norm': 4.4306111335754395, 'learning_rate': 4.5214765100671144e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7643/75000 [06:04<57:47, 19.43it/s]

{'loss': 0.4154, 'grad_norm': 3.332805871963501, 'learning_rate': 4.520805369127517e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7652/75000 [06:05<57:47, 19.42it/s]

{'loss': 0.311, 'grad_norm': 7.4332170486450195, 'learning_rate': 4.5201342281879194e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7662/75000 [06:05<58:36, 19.15it/s]

{'loss': 0.5384, 'grad_norm': 9.093486785888672, 'learning_rate': 4.519463087248322e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7673/75000 [06:06<56:39, 19.81it/s]

{'loss': 0.3103, 'grad_norm': 4.445775508880615, 'learning_rate': 4.518791946308725e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7684/75000 [06:06<54:22, 20.63it/s]

{'loss': 0.2886, 'grad_norm': 5.3870110511779785, 'learning_rate': 4.518120805369128e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7693/75000 [06:07<54:25, 20.61it/s]

{'loss': 0.446, 'grad_norm': 0.5734545588493347, 'learning_rate': 4.517449664429531e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7702/75000 [06:07<55:55, 20.05it/s]

{'loss': 0.2808, 'grad_norm': 1.653735876083374, 'learning_rate': 4.516778523489933e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7714/75000 [06:08<54:12, 20.69it/s]

{'loss': 0.3281, 'grad_norm': 1.2066845893859863, 'learning_rate': 4.516107382550336e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7723/75000 [06:08<59:41, 18.78it/s]

{'loss': 0.2792, 'grad_norm': 8.57590389251709, 'learning_rate': 4.515436241610739e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7734/75000 [06:09<56:02, 20.01it/s]

{'loss': 0.1637, 'grad_norm': 10.062469482421875, 'learning_rate': 4.514765100671141e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7742/75000 [06:09<58:10, 19.27it/s]

{'loss': 0.3092, 'grad_norm': 2.5917229652404785, 'learning_rate': 4.514093959731544e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7753/75000 [06:10<55:48, 20.08it/s]

{'loss': 0.3097, 'grad_norm': 2.3955740928649902, 'learning_rate': 4.5134228187919466e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7762/75000 [06:10<55:13, 20.29it/s]

{'loss': 0.4402, 'grad_norm': 7.746185302734375, 'learning_rate': 4.5127516778523494e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7774/75000 [06:11<55:40, 20.13it/s]

{'loss': 0.4, 'grad_norm': 13.505059242248535, 'learning_rate': 4.5120805369127516e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7783/75000 [06:11<55:08, 20.32it/s]

{'loss': 0.2842, 'grad_norm': 5.57073974609375, 'learning_rate': 4.5114093959731545e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7792/75000 [06:12<54:58, 20.38it/s]

{'loss': 0.3255, 'grad_norm': 1.2665841579437256, 'learning_rate': 4.510738255033557e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7804/75000 [06:12<54:38, 20.50it/s]

{'loss': 0.3216, 'grad_norm': 2.3760781288146973, 'learning_rate': 4.51006711409396e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7813/75000 [06:13<54:46, 20.44it/s]

{'loss': 0.4009, 'grad_norm': 5.255484580993652, 'learning_rate': 4.509395973154363e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7824/75000 [06:13<55:47, 20.07it/s]

{'loss': 0.3961, 'grad_norm': 2.941593647003174, 'learning_rate': 4.508724832214765e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7833/75000 [06:14<55:06, 20.32it/s]

{'loss': 0.3478, 'grad_norm': 1.6910287141799927, 'learning_rate': 4.508053691275168e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7842/75000 [06:14<54:56, 20.37it/s]

{'loss': 0.3701, 'grad_norm': 5.013025283813477, 'learning_rate': 4.50738255033557e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7854/75000 [06:15<53:56, 20.75it/s]

{'loss': 0.3019, 'grad_norm': 4.386809349060059, 'learning_rate': 4.506711409395973e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7863/75000 [06:15<56:22, 19.85it/s]

{'loss': 0.4908, 'grad_norm': 2.4263522624969482, 'learning_rate': 4.506040268456376e-05, 'epoch': 0.31}


                                                    
 10%|█         | 7872/75000 [06:16<54:57, 20.36it/s]

{'loss': 0.3201, 'grad_norm': 3.899048328399658, 'learning_rate': 4.505369127516779e-05, 'epoch': 0.31}


                                                    
 11%|█         | 7884/75000 [06:16<54:54, 20.37it/s]

{'loss': 0.2665, 'grad_norm': 7.382803916931152, 'learning_rate': 4.5046979865771817e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7893/75000 [06:17<54:20, 20.58it/s]

{'loss': 0.4126, 'grad_norm': 3.918252944946289, 'learning_rate': 4.504026845637584e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7902/75000 [06:17<55:08, 20.28it/s]

{'loss': 0.3504, 'grad_norm': 5.056530952453613, 'learning_rate': 4.503355704697987e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7914/75000 [06:18<56:22, 19.83it/s]

{'loss': 0.335, 'grad_norm': 1.399867296218872, 'learning_rate': 4.5026845637583895e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7923/75000 [06:18<55:04, 20.30it/s]

{'loss': 0.4035, 'grad_norm': 1.3642672300338745, 'learning_rate': 4.5020134228187924e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7932/75000 [06:19<54:17, 20.59it/s]

{'loss': 0.4768, 'grad_norm': 2.3592529296875, 'learning_rate': 4.501342281879195e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7944/75000 [06:19<54:26, 20.53it/s]

{'loss': 0.3799, 'grad_norm': 3.7132225036621094, 'learning_rate': 4.5006711409395974e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7953/75000 [06:20<53:59, 20.70it/s]

{'loss': 0.3637, 'grad_norm': 1.3139568567276, 'learning_rate': 4.5e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7964/75000 [06:20<56:42, 19.70it/s]

{'loss': 0.4248, 'grad_norm': 5.362308979034424, 'learning_rate': 4.4993288590604025e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7973/75000 [06:21<55:04, 20.28it/s]

{'loss': 0.3378, 'grad_norm': 3.1261959075927734, 'learning_rate': 4.498657718120805e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7982/75000 [06:21<55:29, 20.13it/s]

{'loss': 0.3419, 'grad_norm': 14.808661460876465, 'learning_rate': 4.497986577181209e-05, 'epoch': 0.32}


                                                    
 11%|█         | 7994/75000 [06:22<53:33, 20.85it/s]

{'loss': 0.3445, 'grad_norm': 2.2009830474853516, 'learning_rate': 4.497315436241611e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8000/75000 [06:22<54:19, 20.55it/s]

{'loss': 0.4125, 'grad_norm': 2.3941895961761475, 'learning_rate': 4.496644295302014e-05, 'epoch': 0.32}


                                                      
 11%|█         | 8012/75000 [06:23<1:09:08, 16.15it/s]

{'loss': 0.3181, 'grad_norm': 1.1435118913650513, 'learning_rate': 4.495973154362416e-05, 'epoch': 0.32}


                                                      
 11%|█         | 8024/75000 [06:23<55:44, 20.02it/s]  

{'loss': 0.394, 'grad_norm': 22.473461151123047, 'learning_rate': 4.495302013422819e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8033/75000 [06:24<53:19, 20.93it/s]

{'loss': 0.3154, 'grad_norm': 0.7002191543579102, 'learning_rate': 4.494630872483222e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8042/75000 [06:24<52:10, 21.39it/s]

{'loss': 0.5147, 'grad_norm': 8.623794555664062, 'learning_rate': 4.4939597315436246e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8054/75000 [06:25<52:30, 21.25it/s]

{'loss': 0.2585, 'grad_norm': 1.1532413959503174, 'learning_rate': 4.4932885906040275e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8063/75000 [06:25<51:31, 21.65it/s]

{'loss': 0.2324, 'grad_norm': 2.639147996902466, 'learning_rate': 4.4926174496644297e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8072/75000 [06:26<51:42, 21.57it/s]

{'loss': 0.4622, 'grad_norm': 5.854421615600586, 'learning_rate': 4.4919463087248325e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8084/75000 [06:26<51:08, 21.80it/s]

{'loss': 0.2908, 'grad_norm': 4.1906280517578125, 'learning_rate': 4.491275167785235e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8093/75000 [06:27<55:23, 20.13it/s]

{'loss': 0.3825, 'grad_norm': 1.9685264825820923, 'learning_rate': 4.4906040268456375e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8102/75000 [06:27<54:53, 20.31it/s]

{'loss': 0.4742, 'grad_norm': 3.0492630004882812, 'learning_rate': 4.4899328859060404e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8114/75000 [06:28<53:48, 20.72it/s]

{'loss': 0.3134, 'grad_norm': 2.33758544921875, 'learning_rate': 4.489261744966443e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8123/75000 [06:28<54:13, 20.56it/s]

{'loss': 0.235, 'grad_norm': 4.175380706787109, 'learning_rate': 4.488590604026846e-05, 'epoch': 0.32}


                                                    
 11%|█         | 8132/75000 [06:29<54:07, 20.59it/s]

{'loss': 0.3277, 'grad_norm': 1.7290984392166138, 'learning_rate': 4.487919463087248e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8144/75000 [06:29<54:09, 20.58it/s]

{'loss': 0.3526, 'grad_norm': 3.3107714653015137, 'learning_rate': 4.487248322147651e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8153/75000 [06:30<54:39, 20.39it/s]

{'loss': 0.3885, 'grad_norm': 2.2493624687194824, 'learning_rate': 4.486577181208054e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8162/75000 [06:30<54:56, 20.28it/s]

{'loss': 0.4258, 'grad_norm': 5.389364719390869, 'learning_rate': 4.485906040268457e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8174/75000 [06:31<53:21, 20.87it/s]

{'loss': 0.4053, 'grad_norm': 3.5120904445648193, 'learning_rate': 4.48523489932886e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8183/75000 [06:31<59:01, 18.87it/s]

{'loss': 0.4051, 'grad_norm': 3.5942907333374023, 'learning_rate': 4.484563758389262e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8192/75000 [06:32<55:38, 20.01it/s]

{'loss': 0.4058, 'grad_norm': 4.969775199890137, 'learning_rate': 4.483892617449665e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8204/75000 [06:32<54:24, 20.46it/s]

{'loss': 0.3322, 'grad_norm': 9.693963050842285, 'learning_rate': 4.483221476510067e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8213/75000 [06:33<53:52, 20.66it/s]

{'loss': 0.3307, 'grad_norm': 1.642944097518921, 'learning_rate': 4.4825503355704704e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8222/75000 [06:33<54:24, 20.46it/s]

{'loss': 0.3375, 'grad_norm': 10.924424171447754, 'learning_rate': 4.4818791946308726e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8234/75000 [06:34<54:50, 20.29it/s]

{'loss': 0.343, 'grad_norm': 9.94165325164795, 'learning_rate': 4.4812080536912755e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8243/75000 [06:34<55:13, 20.15it/s]

{'loss': 0.3819, 'grad_norm': 7.9867353439331055, 'learning_rate': 4.480536912751678e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8252/75000 [06:34<55:09, 20.17it/s]

{'loss': 0.4098, 'grad_norm': 2.49477481842041, 'learning_rate': 4.4798657718120805e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8264/75000 [06:35<54:06, 20.56it/s]

{'loss': 0.423, 'grad_norm': 2.5930771827697754, 'learning_rate': 4.4791946308724834e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8270/75000 [06:35<53:34, 20.76it/s]

{'loss': 0.3447, 'grad_norm': 2.2136454582214355, 'learning_rate': 4.478523489932886e-05, 'epoch': 0.33}


                                                      
 11%|█         | 8284/75000 [06:36<55:49, 19.92it/s]

{'loss': 0.2151, 'grad_norm': 8.093342781066895, 'learning_rate': 4.477852348993289e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8293/75000 [06:36<54:01, 20.58it/s]

{'loss': 0.4003, 'grad_norm': 18.139707565307617, 'learning_rate': 4.477181208053691e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8302/75000 [06:37<54:10, 20.52it/s]

{'loss': 0.3745, 'grad_norm': 1.874172329902649, 'learning_rate': 4.476510067114094e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8314/75000 [06:38<54:03, 20.56it/s]

{'loss': 0.4143, 'grad_norm': 13.145030975341797, 'learning_rate': 4.475838926174497e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8322/75000 [06:38<56:34, 19.64it/s]

{'loss': 0.3357, 'grad_norm': 5.121534824371338, 'learning_rate': 4.475167785234899e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8334/75000 [06:39<54:25, 20.41it/s]

{'loss': 0.3505, 'grad_norm': 9.198314666748047, 'learning_rate': 4.474496644295303e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8343/75000 [06:39<55:14, 20.11it/s]

{'loss': 0.3157, 'grad_norm': 3.974574327468872, 'learning_rate': 4.473825503355705e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8352/75000 [06:39<54:36, 20.34it/s]

{'loss': 0.4324, 'grad_norm': 1.5703123807907104, 'learning_rate': 4.473154362416108e-05, 'epoch': 0.33}


                                                      
 11%|█         | 8361/75000 [06:40<58:02, 19.13it/s]  

{'loss': 0.4319, 'grad_norm': 1.7237240076065063, 'learning_rate': 4.4724832214765106e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8373/75000 [06:40<54:29, 20.38it/s]

{'loss': 0.4148, 'grad_norm': 5.268634796142578, 'learning_rate': 4.471812080536913e-05, 'epoch': 0.33}


                                                    
 11%|█         | 8382/75000 [06:41<54:31, 20.36it/s]

{'loss': 0.3572, 'grad_norm': 2.952427387237549, 'learning_rate': 4.4711409395973156e-05, 'epoch': 0.34}


                                                    
 11%|█         | 8394/75000 [06:41<53:45, 20.65it/s]

{'loss': 0.3638, 'grad_norm': 5.399343490600586, 'learning_rate': 4.4704697986577184e-05, 'epoch': 0.34}


                                                    
 11%|█         | 8403/75000 [06:42<54:28, 20.38it/s]

{'loss': 0.4521, 'grad_norm': 5.933408737182617, 'learning_rate': 4.469798657718121e-05, 'epoch': 0.34}


                                                      
 11%|█         | 8414/75000 [06:43<56:20, 19.70it/s]

{'loss': 0.2785, 'grad_norm': 6.127044677734375, 'learning_rate': 4.4691275167785235e-05, 'epoch': 0.34}


                                                    
 11%|█         | 8423/75000 [06:43<55:10, 20.11it/s]

{'loss': 0.3267, 'grad_norm': 6.984605312347412, 'learning_rate': 4.468456375838926e-05, 'epoch': 0.34}


                                                    
 11%|█         | 8432/75000 [06:43<54:35, 20.32it/s]

{'loss': 0.4055, 'grad_norm': 10.979036331176758, 'learning_rate': 4.467785234899329e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8444/75000 [06:44<53:51, 20.60it/s]

{'loss': 0.2821, 'grad_norm': 14.167398452758789, 'learning_rate': 4.4671140939597314e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8453/75000 [06:44<55:50, 19.86it/s]

{'loss': 0.2765, 'grad_norm': 3.9098803997039795, 'learning_rate': 4.466442953020135e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8462/75000 [06:45<53:59, 20.54it/s]

{'loss': 0.2914, 'grad_norm': 6.790858745574951, 'learning_rate': 4.465771812080537e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8474/75000 [06:45<53:47, 20.61it/s]

{'loss': 0.3407, 'grad_norm': 4.674802780151367, 'learning_rate': 4.46510067114094e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8483/75000 [06:46<53:24, 20.76it/s]

{'loss': 0.2498, 'grad_norm': 1.4094407558441162, 'learning_rate': 4.464429530201342e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8492/75000 [06:46<55:35, 19.94it/s]

{'loss': 0.3846, 'grad_norm': 5.871237277984619, 'learning_rate': 4.463758389261745e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8500/75000 [06:47<54:21, 20.39it/s]

{'loss': 0.2222, 'grad_norm': 11.018077850341797, 'learning_rate': 4.463087248322148e-05, 'epoch': 0.34}


                                                      
 11%|█▏        | 8512/75000 [06:48<1:05:33, 16.90it/s]

{'loss': 0.4272, 'grad_norm': 3.541952133178711, 'learning_rate': 4.4624161073825507e-05, 'epoch': 0.34}


                                                      
 11%|█▏        | 8524/75000 [06:48<56:38, 19.56it/s]  

{'loss': 0.3131, 'grad_norm': 1.5314931869506836, 'learning_rate': 4.4617449664429535e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8533/75000 [06:49<1:00:21, 18.35it/s]

{'loss': 0.2509, 'grad_norm': 1.0725646018981934, 'learning_rate': 4.461073825503356e-05, 'epoch': 0.34}


                                                      
 11%|█▏        | 8544/75000 [06:49<54:55, 20.16it/s]

{'loss': 0.2576, 'grad_norm': 3.351619243621826, 'learning_rate': 4.4604026845637585e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8553/75000 [06:50<54:12, 20.43it/s]

{'loss': 0.4143, 'grad_norm': 1.5050829648971558, 'learning_rate': 4.459731543624161e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8564/75000 [06:50<54:49, 20.20it/s]

{'loss': 0.3709, 'grad_norm': 10.98513412475586, 'learning_rate': 4.459060402684564e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8572/75000 [06:51<57:11, 19.36it/s]

{'loss': 0.3332, 'grad_norm': 1.8828121423721313, 'learning_rate': 4.458389261744967e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8584/75000 [06:51<54:21, 20.37it/s]

{'loss': 0.2617, 'grad_norm': 3.473083257675171, 'learning_rate': 4.457718120805369e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8593/75000 [06:52<53:47, 20.58it/s]

{'loss': 0.307, 'grad_norm': 12.201290130615234, 'learning_rate': 4.457046979865772e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8602/75000 [06:52<53:53, 20.54it/s]

{'loss': 0.2684, 'grad_norm': 11.160505294799805, 'learning_rate': 4.456375838926174e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8614/75000 [06:53<54:44, 20.21it/s]

{'loss': 0.4433, 'grad_norm': 5.151673316955566, 'learning_rate': 4.455704697986577e-05, 'epoch': 0.34}


                                                    
 11%|█▏        | 8623/75000 [06:53<54:07, 20.44it/s]

{'loss': 0.3783, 'grad_norm': 11.411554336547852, 'learning_rate': 4.45503355704698e-05, 'epoch': 0.34}


                                                    
 12%|█▏        | 8632/75000 [06:54<53:24, 20.71it/s]

{'loss': 0.3697, 'grad_norm': 2.030668258666992, 'learning_rate': 4.454362416107383e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8644/75000 [06:54<52:52, 20.92it/s]

{'loss': 0.3958, 'grad_norm': 2.6492693424224854, 'learning_rate': 4.453691275167786e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8653/75000 [06:55<53:16, 20.76it/s]

{'loss': 0.2862, 'grad_norm': 1.5759508609771729, 'learning_rate': 4.453020134228188e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8664/75000 [06:55<54:53, 20.14it/s]

{'loss': 0.3085, 'grad_norm': 2.83268141746521, 'learning_rate': 4.452348993288591e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8673/75000 [06:56<53:53, 20.51it/s]

{'loss': 0.2701, 'grad_norm': 4.137202262878418, 'learning_rate': 4.451677852348993e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8682/75000 [06:56<54:27, 20.30it/s]

{'loss': 0.3242, 'grad_norm': 6.970683574676514, 'learning_rate': 4.4510067114093965e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8694/75000 [06:57<53:28, 20.66it/s]

{'loss': 0.5241, 'grad_norm': 2.202010154724121, 'learning_rate': 4.450335570469799e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 8702/75000 [06:57<59:21, 18.62it/s]  

{'loss': 0.326, 'grad_norm': 3.3617379665374756, 'learning_rate': 4.4496644295302015e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8713/75000 [06:58<59:12, 18.66it/s]

{'loss': 0.4186, 'grad_norm': 0.7008519172668457, 'learning_rate': 4.4489932885906044e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 8723/75000 [06:58<1:01:04, 18.09it/s]

{'loss': 0.2941, 'grad_norm': 2.8706493377685547, 'learning_rate': 4.4483221476510065e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 8732/75000 [06:59<58:05, 19.01it/s]

{'loss': 0.2842, 'grad_norm': 1.7286309003829956, 'learning_rate': 4.4476510067114094e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 8744/75000 [06:59<57:50, 19.09it/s]  

{'loss': 0.3677, 'grad_norm': 13.862147331237793, 'learning_rate': 4.446979865771812e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8753/75000 [07:00<58:53, 18.75it/s]

{'loss': 0.3996, 'grad_norm': 2.022719621658325, 'learning_rate': 4.446308724832215e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 8763/75000 [07:01<1:08:33, 16.10it/s]

{'loss': 0.402, 'grad_norm': 4.085710525512695, 'learning_rate': 4.445637583892618e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 8773/75000 [07:01<59:24, 18.58it/s]  

{'loss': 0.3952, 'grad_norm': 7.535821914672852, 'learning_rate': 4.44496644295302e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8783/75000 [07:02<1:03:48, 17.30it/s]

{'loss': 0.3312, 'grad_norm': 3.0864317417144775, 'learning_rate': 4.444295302013423e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 8793/75000 [07:02<1:04:17, 17.16it/s]

{'loss': 0.4214, 'grad_norm': 3.3291494846343994, 'learning_rate': 4.443624161073825e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 8802/75000 [07:03<1:01:28, 17.95it/s]

{'loss': 0.3343, 'grad_norm': 4.162344932556152, 'learning_rate': 4.442953020134229e-05, 'epoch': 0.35}


                                                      
 12%|█▏        | 8814/75000 [07:03<53:42, 20.54it/s]

{'loss': 0.3468, 'grad_norm': 1.571069359779358, 'learning_rate': 4.4422818791946316e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8823/75000 [07:04<54:58, 20.06it/s]

{'loss': 0.4704, 'grad_norm': 14.328081130981445, 'learning_rate': 4.441610738255034e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8832/75000 [07:04<54:57, 20.07it/s]

{'loss': 0.4615, 'grad_norm': 2.1256909370422363, 'learning_rate': 4.4409395973154366e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8844/75000 [07:05<51:25, 21.44it/s]

{'loss': 0.3628, 'grad_norm': 1.3260246515274048, 'learning_rate': 4.440268456375839e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8853/75000 [07:05<51:32, 21.39it/s]

{'loss': 0.2565, 'grad_norm': 1.8790113925933838, 'learning_rate': 4.4395973154362416e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8862/75000 [07:06<50:59, 21.62it/s]

{'loss': 0.3873, 'grad_norm': 2.7596349716186523, 'learning_rate': 4.4389261744966445e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8874/75000 [07:06<51:38, 21.34it/s]

{'loss': 0.3599, 'grad_norm': 3.802093029022217, 'learning_rate': 4.438255033557047e-05, 'epoch': 0.35}


                                                    
 12%|█▏        | 8883/75000 [07:07<52:39, 20.93it/s]

{'loss': 0.4079, 'grad_norm': 3.31083607673645, 'learning_rate': 4.43758389261745e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8892/75000 [07:07<51:26, 21.42it/s]

{'loss': 0.3373, 'grad_norm': 3.5817344188690186, 'learning_rate': 4.4369127516778524e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8904/75000 [07:07<51:24, 21.43it/s]

{'loss': 0.3789, 'grad_norm': 1.3832228183746338, 'learning_rate': 4.436241610738255e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8913/75000 [07:08<53:44, 20.49it/s]

{'loss': 0.3424, 'grad_norm': 2.939614772796631, 'learning_rate': 4.435570469798658e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8922/75000 [07:08<52:47, 20.86it/s]

{'loss': 0.269, 'grad_norm': 1.8631150722503662, 'learning_rate': 4.434899328859061e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8934/75000 [07:09<50:47, 21.68it/s]

{'loss': 0.4298, 'grad_norm': 8.495457649230957, 'learning_rate': 4.434228187919463e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8943/75000 [07:09<53:36, 20.54it/s]

{'loss': 0.3435, 'grad_norm': 3.3552374839782715, 'learning_rate': 4.433557046979866e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8952/75000 [07:10<52:07, 21.12it/s]

{'loss': 0.3231, 'grad_norm': 4.459932804107666, 'learning_rate': 4.432885906040269e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8964/75000 [07:10<51:11, 21.50it/s]

{'loss': 0.3521, 'grad_norm': 3.0986382961273193, 'learning_rate': 4.432214765100671e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8973/75000 [07:11<51:08, 21.51it/s]

{'loss': 0.2523, 'grad_norm': 2.642141342163086, 'learning_rate': 4.431543624161074e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8982/75000 [07:11<52:16, 21.05it/s]

{'loss': 0.2857, 'grad_norm': 3.1503665447235107, 'learning_rate': 4.430872483221477e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 8994/75000 [07:12<50:04, 21.97it/s]

{'loss': 0.3016, 'grad_norm': 8.411314010620117, 'learning_rate': 4.4302013422818796e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9000/75000 [07:12<50:37, 21.73it/s]

{'loss': 0.3449, 'grad_norm': 5.575490474700928, 'learning_rate': 4.4295302013422824e-05, 'epoch': 0.36}


                                                      
 12%|█▏        | 9014/75000 [07:17<2:37:24,  6.99it/s]

{'loss': 0.4218, 'grad_norm': 1.2378017902374268, 'learning_rate': 4.4288590604026846e-05, 'epoch': 0.36}


                                                      
 12%|█▏        | 9023/75000 [07:17<1:23:15, 13.21it/s]

{'loss': 0.2808, 'grad_norm': 3.617659091949463, 'learning_rate': 4.4281879194630874e-05, 'epoch': 0.36}


                                                      
 12%|█▏        | 9032/75000 [07:17<1:02:08, 17.69it/s]

{'loss': 0.2965, 'grad_norm': 13.8612060546875, 'learning_rate': 4.42751677852349e-05, 'epoch': 0.36}


                                                      
 12%|█▏        | 9044/75000 [07:18<52:12, 21.06it/s]

{'loss': 0.4488, 'grad_norm': 8.415536880493164, 'learning_rate': 4.426845637583893e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9053/75000 [07:18<47:27, 23.16it/s]

{'loss': 0.2912, 'grad_norm': 12.34848403930664, 'learning_rate': 4.426174496644295e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9062/75000 [07:19<47:14, 23.26it/s]

{'loss': 0.5224, 'grad_norm': 8.119352340698242, 'learning_rate': 4.425503355704698e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9074/75000 [07:19<45:05, 24.37it/s]

{'loss': 0.2493, 'grad_norm': 5.283204078674316, 'learning_rate': 4.424832214765101e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9083/75000 [07:20<44:43, 24.57it/s]

{'loss': 0.2675, 'grad_norm': 4.804227828979492, 'learning_rate': 4.424161073825503e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9095/75000 [07:20<45:22, 24.21it/s]

{'loss': 0.2924, 'grad_norm': 1.8282488584518433, 'learning_rate': 4.423489932885906e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9104/75000 [07:20<45:59, 23.88it/s]

{'loss': 0.2745, 'grad_norm': 10.145812034606934, 'learning_rate': 4.422818791946309e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9113/75000 [07:21<46:38, 23.54it/s]

{'loss': 0.4155, 'grad_norm': 7.151334285736084, 'learning_rate': 4.422147651006712e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9125/75000 [07:21<46:48, 23.46it/s]

{'loss': 0.3245, 'grad_norm': 6.387355327606201, 'learning_rate': 4.421476510067114e-05, 'epoch': 0.36}


                                                    
 12%|█▏        | 9134/75000 [07:22<48:34, 22.60it/s]

{'loss': 0.3812, 'grad_norm': 5.640979290008545, 'learning_rate': 4.420805369127517e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9143/75000 [07:22<48:59, 22.40it/s]

{'loss': 0.2591, 'grad_norm': 4.0553460121154785, 'learning_rate': 4.42013422818792e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9152/75000 [07:23<52:56, 20.73it/s]

{'loss': 0.4165, 'grad_norm': 5.187581539154053, 'learning_rate': 4.4194630872483225e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9164/75000 [07:23<53:53, 20.36it/s]

{'loss': 0.3282, 'grad_norm': 6.500277519226074, 'learning_rate': 4.4187919463087254e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9173/75000 [07:24<54:32, 20.12it/s]

{'loss': 0.2585, 'grad_norm': 5.114272594451904, 'learning_rate': 4.4181208053691276e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9184/75000 [07:24<54:28, 20.14it/s]

{'loss': 0.3108, 'grad_norm': 1.8136194944381714, 'learning_rate': 4.4174496644295304e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9193/75000 [07:25<54:58, 19.95it/s]

{'loss': 0.313, 'grad_norm': 6.507150650024414, 'learning_rate': 4.416778523489933e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9202/75000 [07:25<53:21, 20.55it/s]

{'loss': 0.3624, 'grad_norm': 4.196324348449707, 'learning_rate': 4.4161073825503354e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9214/75000 [07:26<53:00, 20.68it/s]

{'loss': 0.2553, 'grad_norm': 4.048314571380615, 'learning_rate': 4.415436241610739e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9223/75000 [07:26<54:08, 20.25it/s]

{'loss': 0.3351, 'grad_norm': 4.111754417419434, 'learning_rate': 4.414765100671141e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9232/75000 [07:27<53:06, 20.64it/s]

{'loss': 0.3914, 'grad_norm': 3.152423858642578, 'learning_rate': 4.414093959731544e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9244/75000 [07:27<54:35, 20.08it/s]

{'loss': 0.3212, 'grad_norm': 4.542724609375, 'learning_rate': 4.413422818791946e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9253/75000 [07:28<51:58, 21.08it/s]

{'loss': 0.2714, 'grad_norm': 5.851998329162598, 'learning_rate': 4.412751677852349e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9262/75000 [07:28<51:40, 21.20it/s]

{'loss': 0.4208, 'grad_norm': 4.996169567108154, 'learning_rate': 4.412080536912752e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9274/75000 [07:29<53:21, 20.53it/s]

{'loss': 0.4831, 'grad_norm': 4.545636177062988, 'learning_rate': 4.411409395973155e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9283/75000 [07:29<50:53, 21.52it/s]

{'loss': 0.4281, 'grad_norm': 9.079461097717285, 'learning_rate': 4.4107382550335576e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9292/75000 [07:29<51:36, 21.22it/s]

{'loss': 0.3468, 'grad_norm': 3.0023653507232666, 'learning_rate': 4.41006711409396e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9304/75000 [07:30<52:06, 21.02it/s]

{'loss': 0.3705, 'grad_norm': 2.880181312561035, 'learning_rate': 4.4093959731543626e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9313/75000 [07:30<52:42, 20.77it/s]

{'loss': 0.3697, 'grad_norm': 2.806931495666504, 'learning_rate': 4.408724832214765e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9322/75000 [07:31<51:37, 21.20it/s]

{'loss': 0.4231, 'grad_norm': 2.398669719696045, 'learning_rate': 4.408053691275168e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9334/75000 [07:31<51:24, 21.29it/s]

{'loss': 0.3524, 'grad_norm': 3.193035364151001, 'learning_rate': 4.407382550335571e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9343/75000 [07:32<53:12, 20.57it/s]

{'loss': 0.2525, 'grad_norm': 2.0435683727264404, 'learning_rate': 4.4067114093959734e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9352/75000 [07:32<59:49, 18.29it/s]

{'loss': 0.223, 'grad_norm': 5.665548801422119, 'learning_rate': 4.406040268456376e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9364/75000 [07:33<55:33, 19.69it/s]

{'loss': 0.417, 'grad_norm': 5.328485012054443, 'learning_rate': 4.4053691275167784e-05, 'epoch': 0.37}


                                                    
 12%|█▏        | 9373/75000 [07:33<56:03, 19.51it/s]

{'loss': 0.363, 'grad_norm': 3.824078321456909, 'learning_rate': 4.404697986577181e-05, 'epoch': 0.37}


                                                    
 13%|█▎        | 9381/75000 [07:34<53:58, 20.26it/s]

{'loss': 0.3762, 'grad_norm': 3.356569290161133, 'learning_rate': 4.404026845637584e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 9394/75000 [07:35<57:58, 18.86it/s]  

{'loss': 0.3308, 'grad_norm': 3.032853364944458, 'learning_rate': 4.403355704697987e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9403/75000 [07:35<54:53, 19.92it/s]

{'loss': 0.3749, 'grad_norm': 3.2045416831970215, 'learning_rate': 4.40268456375839e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9413/75000 [07:36<56:34, 19.32it/s]

{'loss': 0.4128, 'grad_norm': 5.422750473022461, 'learning_rate': 4.402013422818792e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 9422/75000 [07:36<1:01:47, 17.69it/s]

{'loss': 0.2687, 'grad_norm': 1.1899285316467285, 'learning_rate': 4.401342281879195e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 9432/75000 [07:37<59:06, 18.49it/s]

{'loss': 0.4019, 'grad_norm': 3.7180447578430176, 'learning_rate': 4.400671140939597e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9443/75000 [07:37<57:56, 18.86it/s]

{'loss': 0.3425, 'grad_norm': 3.3650310039520264, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9453/75000 [07:38<1:02:03, 17.60it/s]

{'loss': 0.3775, 'grad_norm': 6.707165241241455, 'learning_rate': 4.3993288590604034e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 9463/75000 [07:38<1:02:22, 17.51it/s]

{'loss': 0.2397, 'grad_norm': 1.6700736284255981, 'learning_rate': 4.3986577181208056e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 9474/75000 [07:39<57:14, 19.08it/s]

{'loss': 0.1883, 'grad_norm': 3.775489330291748, 'learning_rate': 4.3979865771812084e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9482/75000 [07:39<58:34, 18.64it/s]

{'loss': 0.333, 'grad_norm': 3.3292224407196045, 'learning_rate': 4.3973154362416106e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9491/75000 [07:40<56:47, 19.23it/s]

{'loss': 0.315, 'grad_norm': 10.05744457244873, 'learning_rate': 4.3966442953020135e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9500/75000 [07:40<57:20, 19.04it/s]

{'loss': 0.4173, 'grad_norm': 4.208391189575195, 'learning_rate': 4.395973154362416e-05, 'epoch': 0.38}


                                                       
 13%|█▎        | 9514/75000 [07:46<2:58:46,  6.10it/s]

{'loss': 0.4775, 'grad_norm': 3.635068893432617, 'learning_rate': 4.395302013422819e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 9523/75000 [07:46<1:30:43, 12.03it/s]

{'loss': 0.2952, 'grad_norm': 5.895533561706543, 'learning_rate': 4.394630872483222e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 9532/75000 [07:46<1:04:59, 16.79it/s]

{'loss': 0.3978, 'grad_norm': 6.6930131912231445, 'learning_rate': 4.393959731543624e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 9544/75000 [07:47<54:21, 20.07it/s]

{'loss': 0.3813, 'grad_norm': 2.834989070892334, 'learning_rate': 4.393288590604027e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9553/75000 [07:47<53:41, 20.31it/s]

{'loss': 0.2467, 'grad_norm': 1.6637084484100342, 'learning_rate': 4.392617449664429e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9562/75000 [07:48<52:30, 20.77it/s]

{'loss': 0.2657, 'grad_norm': 5.549081325531006, 'learning_rate': 4.391946308724833e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9574/75000 [07:48<53:40, 20.32it/s]

{'loss': 0.4386, 'grad_norm': 8.54797649383545, 'learning_rate': 4.391275167785235e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9583/75000 [07:49<52:24, 20.80it/s]

{'loss': 0.42, 'grad_norm': 4.254324913024902, 'learning_rate': 4.390604026845638e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9592/75000 [07:49<51:23, 21.21it/s]

{'loss': 0.3126, 'grad_norm': 8.09814739227295, 'learning_rate': 4.389932885906041e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9604/75000 [07:50<51:14, 21.27it/s]

{'loss': 0.2813, 'grad_norm': 0.706695556640625, 'learning_rate': 4.389261744966443e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9613/75000 [07:50<50:56, 21.39it/s]

{'loss': 0.3957, 'grad_norm': 5.446882724761963, 'learning_rate': 4.388590604026846e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9622/75000 [07:51<50:21, 21.64it/s]

{'loss': 0.3502, 'grad_norm': 3.6717348098754883, 'learning_rate': 4.3879194630872486e-05, 'epoch': 0.38}


                                                    
 13%|█▎        | 9634/75000 [07:51<50:24, 21.61it/s]

{'loss': 0.2987, 'grad_norm': 2.383012056350708, 'learning_rate': 4.3872483221476514e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9643/75000 [07:52<52:15, 20.84it/s]

{'loss': 0.4204, 'grad_norm': 5.3281168937683105, 'learning_rate': 4.386577181208054e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9652/75000 [07:52<52:32, 20.73it/s]

{'loss': 0.187, 'grad_norm': 7.34962272644043, 'learning_rate': 4.3859060402684564e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9664/75000 [07:53<51:13, 21.26it/s]

{'loss': 0.3254, 'grad_norm': 3.2367372512817383, 'learning_rate': 4.385234899328859e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9673/75000 [07:53<52:04, 20.91it/s]

{'loss': 0.3539, 'grad_norm': 11.120865821838379, 'learning_rate': 4.3845637583892615e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9682/75000 [07:53<51:14, 21.24it/s]

{'loss': 0.2334, 'grad_norm': 2.4087131023406982, 'learning_rate': 4.383892617449665e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9694/75000 [07:54<50:55, 21.37it/s]

{'loss': 0.4854, 'grad_norm': 8.220930099487305, 'learning_rate': 4.383221476510067e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9703/75000 [07:54<51:56, 20.95it/s]

{'loss': 0.3774, 'grad_norm': 1.467145323753357, 'learning_rate': 4.38255033557047e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9712/75000 [07:55<50:24, 21.59it/s]

{'loss': 0.366, 'grad_norm': 3.0844297409057617, 'learning_rate': 4.381879194630873e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9724/75000 [07:55<49:38, 21.91it/s]

{'loss': 0.3631, 'grad_norm': 1.9119255542755127, 'learning_rate': 4.381208053691275e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9733/75000 [07:56<50:23, 21.59it/s]

{'loss': 0.4721, 'grad_norm': 3.9743964672088623, 'learning_rate': 4.380536912751678e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9742/75000 [07:56<51:10, 21.25it/s]

{'loss': 0.345, 'grad_norm': 4.025991439819336, 'learning_rate': 4.379865771812081e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9754/75000 [07:57<49:23, 22.02it/s]

{'loss': 0.4352, 'grad_norm': 3.3654255867004395, 'learning_rate': 4.3791946308724836e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9763/75000 [07:57<53:18, 20.39it/s]

{'loss': 0.3643, 'grad_norm': 1.7701002359390259, 'learning_rate': 4.378523489932886e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9772/75000 [07:58<51:53, 20.95it/s]

{'loss': 0.2323, 'grad_norm': 1.9239320755004883, 'learning_rate': 4.377852348993289e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9784/75000 [07:58<49:54, 21.78it/s]

{'loss': 0.2822, 'grad_norm': 4.808221340179443, 'learning_rate': 4.3771812080536915e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9793/75000 [07:59<50:30, 21.52it/s]

{'loss': 0.4197, 'grad_norm': 7.413628101348877, 'learning_rate': 4.3765100671140944e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9802/75000 [07:59<50:38, 21.46it/s]

{'loss': 0.3711, 'grad_norm': 2.094376564025879, 'learning_rate': 4.375838926174497e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9811/75000 [08:00<50:09, 21.66it/s]

{'loss': 0.2511, 'grad_norm': 3.7202892303466797, 'learning_rate': 4.3751677852348994e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9823/75000 [08:00<53:31, 20.29it/s]

{'loss': 0.3727, 'grad_norm': 2.5058279037475586, 'learning_rate': 4.374496644295302e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9832/75000 [08:01<52:45, 20.59it/s]

{'loss': 0.2612, 'grad_norm': 4.854253768920898, 'learning_rate': 4.373825503355705e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9844/75000 [08:01<51:42, 21.00it/s]

{'loss': 0.4999, 'grad_norm': 3.971696376800537, 'learning_rate': 4.373154362416107e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9853/75000 [08:02<51:32, 21.06it/s]

{'loss': 0.4732, 'grad_norm': 6.841648578643799, 'learning_rate': 4.37248322147651e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9862/75000 [08:02<52:56, 20.51it/s]

{'loss': 0.4066, 'grad_norm': 3.709105968475342, 'learning_rate': 4.371812080536913e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9874/75000 [08:03<52:42, 20.59it/s]

{'loss': 0.3648, 'grad_norm': 8.212066650390625, 'learning_rate': 4.371140939597316e-05, 'epoch': 0.39}


                                                    
 13%|█▎        | 9883/75000 [08:03<55:38, 19.50it/s]

{'loss': 0.3582, 'grad_norm': 5.388889312744141, 'learning_rate': 4.370469798657718e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9892/75000 [08:03<52:20, 20.73it/s]

{'loss': 0.3688, 'grad_norm': 5.429262638092041, 'learning_rate': 4.369798657718121e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9904/75000 [08:04<53:02, 20.45it/s]

{'loss': 0.247, 'grad_norm': 4.834745407104492, 'learning_rate': 4.369127516778524e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9913/75000 [08:04<52:00, 20.86it/s]

{'loss': 0.4374, 'grad_norm': 2.489501953125, 'learning_rate': 4.3684563758389266e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9922/75000 [08:05<51:33, 21.04it/s]

{'loss': 0.3341, 'grad_norm': 4.0743818283081055, 'learning_rate': 4.3677852348993295e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9934/75000 [08:05<51:43, 20.96it/s]

{'loss': 0.3312, 'grad_norm': 2.9496867656707764, 'learning_rate': 4.3671140939597316e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9943/75000 [08:06<51:31, 21.05it/s]

{'loss': 0.3079, 'grad_norm': 0.30571356415748596, 'learning_rate': 4.3664429530201345e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9954/75000 [08:06<52:45, 20.55it/s]

{'loss': 0.3764, 'grad_norm': 1.5553102493286133, 'learning_rate': 4.365771812080537e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9963/75000 [08:07<53:00, 20.45it/s]

{'loss': 0.366, 'grad_norm': 1.773290991783142, 'learning_rate': 4.3651006711409395e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9972/75000 [08:07<52:40, 20.58it/s]

{'loss': 0.4171, 'grad_norm': 2.8689827919006348, 'learning_rate': 4.3644295302013424e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9984/75000 [08:08<52:22, 20.69it/s]

{'loss': 0.2909, 'grad_norm': 6.6257123947143555, 'learning_rate': 4.363758389261745e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 9993/75000 [08:08<54:49, 19.76it/s]

{'loss': 0.3222, 'grad_norm': 1.599170207977295, 'learning_rate': 4.363087248322148e-05, 'epoch': 0.4}


                                                    
 13%|█▎        | 10000/75000 [08:09<53:20, 20.31it/s]

{'loss': 0.2708, 'grad_norm': 6.852495193481445, 'learning_rate': 4.36241610738255e-05, 'epoch': 0.4}


                                                       
 13%|█▎        | 10014/75000 [08:10<1:01:51, 17.51it/s]

{'loss': 0.2781, 'grad_norm': 6.676795959472656, 'learning_rate': 4.361744966442953e-05, 'epoch': 0.4}


                                                       
 13%|█▎        | 10023/75000 [08:10<55:52, 19.38it/s]

{'loss': 0.3504, 'grad_norm': 2.5212621688842773, 'learning_rate': 4.361073825503356e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10032/75000 [08:11<54:02, 20.04it/s]

{'loss': 0.4225, 'grad_norm': 8.596327781677246, 'learning_rate': 4.360402684563759e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10044/75000 [08:11<52:57, 20.44it/s]

{'loss': 0.3322, 'grad_norm': 1.5686415433883667, 'learning_rate': 4.359731543624162e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10053/75000 [08:12<52:28, 20.63it/s]

{'loss': 0.3081, 'grad_norm': 4.963226318359375, 'learning_rate': 4.359060402684564e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10062/75000 [08:12<53:32, 20.21it/s]

{'loss': 0.4016, 'grad_norm': 6.2912445068359375, 'learning_rate': 4.358389261744967e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10074/75000 [08:13<53:34, 20.20it/s]

{'loss': 0.2691, 'grad_norm': 1.1901932954788208, 'learning_rate': 4.357718120805369e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10083/75000 [08:13<53:22, 20.27it/s]

{'loss': 0.3624, 'grad_norm': 4.308547496795654, 'learning_rate': 4.357046979865772e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10092/75000 [08:14<52:13, 20.71it/s]

{'loss': 0.3344, 'grad_norm': 1.9254932403564453, 'learning_rate': 4.356375838926175e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10104/75000 [08:14<52:01, 20.79it/s]

{'loss': 0.3836, 'grad_norm': 6.6784162521362305, 'learning_rate': 4.3557046979865775e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10113/75000 [08:15<52:37, 20.55it/s]

{'loss': 0.3427, 'grad_norm': 3.428640365600586, 'learning_rate': 4.35503355704698e-05, 'epoch': 0.4}


                                                     
 13%|█▎        | 10122/75000 [08:15<52:16, 20.69it/s]

{'loss': 0.2675, 'grad_norm': 3.7962191104888916, 'learning_rate': 4.3543624161073825e-05, 'epoch': 0.4}


                                                     
 14%|█▎        | 10133/75000 [08:16<53:11, 20.33it/s]

{'loss': 0.3866, 'grad_norm': 7.962785720825195, 'learning_rate': 4.3536912751677853e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10142/75000 [08:16<53:14, 20.30it/s]

{'loss': 0.4248, 'grad_norm': 4.358555316925049, 'learning_rate': 4.353020134228188e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10154/75000 [08:17<51:38, 20.93it/s]

{'loss': 0.2889, 'grad_norm': 4.7335968017578125, 'learning_rate': 4.352348993288591e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10163/75000 [08:17<52:26, 20.61it/s]

{'loss': 0.352, 'grad_norm': 1.649888277053833, 'learning_rate': 4.351677852348994e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10174/75000 [08:18<52:25, 20.61it/s]

{'loss': 0.3705, 'grad_norm': 1.3508667945861816, 'learning_rate': 4.351006711409396e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10183/75000 [08:18<53:42, 20.11it/s]

{'loss': 0.2737, 'grad_norm': 2.404106855392456, 'learning_rate': 4.350335570469799e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10192/75000 [08:19<53:54, 20.03it/s]

{'loss': 0.3324, 'grad_norm': 5.439242362976074, 'learning_rate': 4.349664429530201e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10204/75000 [08:19<55:06, 19.59it/s]

{'loss': 0.2593, 'grad_norm': 7.175330638885498, 'learning_rate': 4.348993288590604e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10213/75000 [08:20<53:08, 20.32it/s]

{'loss': 0.3622, 'grad_norm': 7.8352437019348145, 'learning_rate': 4.348322147651007e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10222/75000 [08:20<52:34, 20.53it/s]

{'loss': 0.3814, 'grad_norm': 1.6023309230804443, 'learning_rate': 4.34765100671141e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10231/75000 [08:20<51:54, 20.80it/s]

{'loss': 0.3482, 'grad_norm': 7.137657165527344, 'learning_rate': 4.3469798657718125e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10243/75000 [08:21<54:22, 19.85it/s]

{'loss': 0.3187, 'grad_norm': 1.021475076675415, 'learning_rate': 4.346308724832215e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10252/75000 [08:22<52:30, 20.55it/s]

{'loss': 0.4657, 'grad_norm': 7.505364894866943, 'learning_rate': 4.3456375838926176e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10261/75000 [08:22<52:58, 20.37it/s]

{'loss': 0.257, 'grad_norm': 3.9381234645843506, 'learning_rate': 4.3449664429530204e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10273/75000 [08:23<51:53, 20.79it/s]

{'loss': 0.3995, 'grad_norm': 5.436544418334961, 'learning_rate': 4.344295302013423e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10282/75000 [08:23<54:07, 19.93it/s]

{'loss': 0.2762, 'grad_norm': 5.022751331329346, 'learning_rate': 4.343624161073826e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10294/75000 [08:24<51:36, 20.89it/s]

{'loss': 0.4382, 'grad_norm': 5.6881585121154785, 'learning_rate': 4.342953020134228e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10303/75000 [08:24<52:18, 20.61it/s]

{'loss': 0.2853, 'grad_norm': 2.8602890968322754, 'learning_rate': 4.342281879194631e-05, 'epoch': 0.41}


                                                     
 14%|█▎        | 10312/75000 [08:24<52:49, 20.41it/s]

{'loss': 0.2331, 'grad_norm': 1.2124921083450317, 'learning_rate': 4.341610738255033e-05, 'epoch': 0.41}


                                                     
 14%|█▍        | 10321/75000 [08:25<53:26, 20.17it/s]

{'loss': 0.2973, 'grad_norm': 4.852487087249756, 'learning_rate': 4.340939597315437e-05, 'epoch': 0.41}


                                                     
 14%|█▍        | 10333/75000 [08:25<52:16, 20.62it/s]

{'loss': 0.415, 'grad_norm': 1.8593169450759888, 'learning_rate': 4.340268456375839e-05, 'epoch': 0.41}


                                                     
 14%|█▍        | 10342/75000 [08:26<52:34, 20.50it/s]

{'loss': 0.3749, 'grad_norm': 13.37352180480957, 'learning_rate': 4.339597315436242e-05, 'epoch': 0.41}


                                                     
 14%|█▍        | 10354/75000 [08:26<50:48, 21.20it/s]

{'loss': 0.4432, 'grad_norm': 1.6837174892425537, 'learning_rate': 4.338926174496645e-05, 'epoch': 0.41}


                                                     
 14%|█▍        | 10363/75000 [08:27<54:13, 19.87it/s]

{'loss': 0.2958, 'grad_norm': 2.417045831680298, 'learning_rate': 4.338255033557047e-05, 'epoch': 0.41}


                                                     
 14%|█▍        | 10372/75000 [08:27<52:45, 20.42it/s]

{'loss': 0.4134, 'grad_norm': 3.438448429107666, 'learning_rate': 4.33758389261745e-05, 'epoch': 0.41}


                                                     
 14%|█▍        | 10381/75000 [08:28<53:12, 20.24it/s]

{'loss': 0.2334, 'grad_norm': 1.2348990440368652, 'learning_rate': 4.3369127516778526e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10394/75000 [08:29<54:26, 19.78it/s]

{'loss': 0.3081, 'grad_norm': 3.4077630043029785, 'learning_rate': 4.3362416107382555e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10403/75000 [08:29<52:22, 20.55it/s]

{'loss': 0.2875, 'grad_norm': 3.005944013595581, 'learning_rate': 4.335570469798658e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10411/75000 [08:29<58:46, 18.31it/s]

{'loss': 0.3072, 'grad_norm': 1.9958124160766602, 'learning_rate': 4.3348993288590605e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10421/75000 [08:30<57:24, 18.75it/s]

{'loss': 0.2913, 'grad_norm': 6.138485908508301, 'learning_rate': 4.3342281879194634e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10433/75000 [08:31<55:27, 19.40it/s]

{'loss': 0.3287, 'grad_norm': 6.548513412475586, 'learning_rate': 4.3335570469798656e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10442/75000 [08:31<56:08, 19.17it/s]

{'loss': 0.2778, 'grad_norm': 6.973734378814697, 'learning_rate': 4.332885906040269e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10452/75000 [08:32<55:01, 19.55it/s]

{'loss': 0.268, 'grad_norm': 10.058760643005371, 'learning_rate': 4.332214765100671e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10462/75000 [08:32<55:28, 19.39it/s]

{'loss': 0.4694, 'grad_norm': 1.0504798889160156, 'learning_rate': 4.331543624161074e-05, 'epoch': 0.42}


                                                       
 14%|█▍        | 10472/75000 [08:33<58:36, 18.35it/s]

{'loss': 0.2834, 'grad_norm': 5.08612585067749, 'learning_rate': 4.330872483221477e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10481/75000 [08:33<57:40, 18.64it/s]

{'loss': 0.3555, 'grad_norm': 8.420550346374512, 'learning_rate': 4.330201342281879e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10492/75000 [08:34<56:03, 19.18it/s]

{'loss': 0.3451, 'grad_norm': 1.1984590291976929, 'learning_rate': 4.329530201342282e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10500/75000 [08:34<56:14, 19.11it/s]

{'loss': 0.3263, 'grad_norm': 6.157227993011475, 'learning_rate': 4.328859060402685e-05, 'epoch': 0.42}


                                                       
 14%|█▍        | 10512/75000 [08:35<1:15:02, 14.32it/s]

{'loss': 0.3431, 'grad_norm': 4.487570762634277, 'learning_rate': 4.328187919463088e-05, 'epoch': 0.42}


                                                       
 14%|█▍        | 10523/75000 [08:36<58:43, 18.30it/s]  

{'loss': 0.3393, 'grad_norm': 2.532972812652588, 'learning_rate': 4.32751677852349e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10531/75000 [08:36<57:17, 18.76it/s]

{'loss': 0.5042, 'grad_norm': 2.6444332599639893, 'learning_rate': 4.326845637583893e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10541/75000 [08:37<55:40, 19.30it/s]

{'loss': 0.4161, 'grad_norm': 5.815715789794922, 'learning_rate': 4.3261744966442956e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10552/75000 [08:37<55:47, 19.25it/s]

{'loss': 0.3386, 'grad_norm': 2.6984705924987793, 'learning_rate': 4.325503355704698e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10561/75000 [08:38<56:58, 18.85it/s]

{'loss': 0.4224, 'grad_norm': 3.841050624847412, 'learning_rate': 4.324832214765101e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10574/75000 [08:38<55:20, 19.40it/s]

{'loss': 0.3639, 'grad_norm': 2.11444091796875, 'learning_rate': 4.3241610738255035e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10581/75000 [08:39<56:46, 18.91it/s]

{'loss': 0.3967, 'grad_norm': 4.527768611907959, 'learning_rate': 4.3234899328859063e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10594/75000 [08:39<54:24, 19.73it/s]

{'loss': 0.4169, 'grad_norm': 1.9562867879867554, 'learning_rate': 4.3228187919463085e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10602/75000 [08:40<55:01, 19.51it/s]

{'loss': 0.3292, 'grad_norm': 1.9477896690368652, 'learning_rate': 4.3221476510067114e-05, 'epoch': 0.42}


                                                       
 14%|█▍        | 10613/75000 [08:40<56:03, 19.14it/s]

{'loss': 0.3209, 'grad_norm': 6.0027995109558105, 'learning_rate': 4.321476510067114e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10622/75000 [08:41<53:43, 19.97it/s]

{'loss': 0.3157, 'grad_norm': 7.0776686668396, 'learning_rate': 4.320805369127517e-05, 'epoch': 0.42}


                                                     
 14%|█▍        | 10634/75000 [08:41<52:21, 20.49it/s]

{'loss': 0.3114, 'grad_norm': 4.944102764129639, 'learning_rate': 4.32013422818792e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10643/75000 [08:42<51:56, 20.65it/s]

{'loss': 0.2751, 'grad_norm': 7.013740062713623, 'learning_rate': 4.319463087248322e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10652/75000 [08:42<53:59, 19.86it/s]

{'loss': 0.2821, 'grad_norm': 5.172975063323975, 'learning_rate': 4.318791946308725e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10664/75000 [08:43<51:28, 20.83it/s]

{'loss': 0.3399, 'grad_norm': 7.02781343460083, 'learning_rate': 4.318120805369128e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10673/75000 [08:43<53:09, 20.17it/s]

{'loss': 0.4109, 'grad_norm': 1.9910870790481567, 'learning_rate': 4.317449664429531e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10682/75000 [08:44<53:00, 20.22it/s]

{'loss': 0.3328, 'grad_norm': 5.82194709777832, 'learning_rate': 4.3167785234899335e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10694/75000 [08:44<52:15, 20.51it/s]

{'loss': 0.3753, 'grad_norm': 1.7849738597869873, 'learning_rate': 4.316107382550336e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10703/75000 [08:45<52:44, 20.32it/s]

{'loss': 0.291, 'grad_norm': 5.490069389343262, 'learning_rate': 4.3154362416107386e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10712/75000 [08:45<55:34, 19.28it/s]

{'loss': 0.2943, 'grad_norm': 2.131314754486084, 'learning_rate': 4.314765100671141e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10724/75000 [08:46<51:51, 20.66it/s]

{'loss': 0.3486, 'grad_norm': 4.256993293762207, 'learning_rate': 4.3140939597315436e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10733/75000 [08:46<52:23, 20.44it/s]

{'loss': 0.4592, 'grad_norm': 6.437543869018555, 'learning_rate': 4.3134228187919465e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10742/75000 [08:47<52:13, 20.51it/s]

{'loss': 0.3807, 'grad_norm': 3.530520439147949, 'learning_rate': 4.312751677852349e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10753/75000 [08:47<55:33, 19.27it/s]

{'loss': 0.2973, 'grad_norm': 4.652132034301758, 'learning_rate': 4.312080536912752e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10763/75000 [08:48<54:52, 19.51it/s]

{'loss': 0.3887, 'grad_norm': 6.550787448883057, 'learning_rate': 4.3114093959731543e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10773/75000 [08:48<52:43, 20.30it/s]

{'loss': 0.3638, 'grad_norm': 13.582076072692871, 'learning_rate': 4.310738255033557e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10782/75000 [08:49<52:13, 20.49it/s]

{'loss': 0.3163, 'grad_norm': 4.876334190368652, 'learning_rate': 4.3100671140939594e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10792/75000 [08:49<54:24, 19.67it/s]

{'loss': 0.3135, 'grad_norm': 2.153226852416992, 'learning_rate': 4.309395973154363e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10804/75000 [08:50<51:21, 20.84it/s]

{'loss': 0.3406, 'grad_norm': 3.5810720920562744, 'learning_rate': 4.308724832214766e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10813/75000 [08:50<52:35, 20.34it/s]

{'loss': 0.2995, 'grad_norm': 4.77401876449585, 'learning_rate': 4.308053691275168e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10822/75000 [08:51<52:05, 20.53it/s]

{'loss': 0.3147, 'grad_norm': 3.426701545715332, 'learning_rate': 4.307382550335571e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10834/75000 [08:51<54:04, 19.78it/s]

{'loss': 0.4599, 'grad_norm': 1.8808010816574097, 'learning_rate': 4.306711409395973e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10843/75000 [08:52<53:11, 20.10it/s]

{'loss': 0.343, 'grad_norm': 5.303218364715576, 'learning_rate': 4.306040268456376e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10852/75000 [08:52<53:26, 20.01it/s]

{'loss': 0.2732, 'grad_norm': 5.835056781768799, 'learning_rate': 4.305369127516779e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10864/75000 [08:53<52:11, 20.48it/s]

{'loss': 0.3558, 'grad_norm': 5.497226238250732, 'learning_rate': 4.3046979865771815e-05, 'epoch': 0.43}


                                                     
 14%|█▍        | 10872/75000 [08:53<54:52, 19.47it/s]

{'loss': 0.3544, 'grad_norm': 8.335705757141113, 'learning_rate': 4.3040268456375844e-05, 'epoch': 0.43}


                                                     
 15%|█▍        | 10884/75000 [08:54<51:37, 20.70it/s]

{'loss': 0.2889, 'grad_norm': 8.131464958190918, 'learning_rate': 4.3033557046979866e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10893/75000 [08:54<52:57, 20.18it/s]

{'loss': 0.3904, 'grad_norm': 1.2656916379928589, 'learning_rate': 4.3026845637583894e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10902/75000 [08:55<53:48, 19.85it/s]

{'loss': 0.3674, 'grad_norm': 7.730669021606445, 'learning_rate': 4.3020134228187916e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10914/75000 [08:55<52:13, 20.45it/s]

{'loss': 0.3917, 'grad_norm': 8.57336139678955, 'learning_rate': 4.301342281879195e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10923/75000 [08:56<51:49, 20.61it/s]

{'loss': 0.4555, 'grad_norm': 2.675302267074585, 'learning_rate': 4.300671140939598e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10932/75000 [08:56<52:06, 20.49it/s]

{'loss': 0.4116, 'grad_norm': 3.653402328491211, 'learning_rate': 4.3e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10944/75000 [08:57<53:23, 20.00it/s]

{'loss': 0.4163, 'grad_norm': 5.881168842315674, 'learning_rate': 4.299328859060403e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10952/75000 [08:57<54:48, 19.48it/s]

{'loss': 0.2856, 'grad_norm': 4.413259029388428, 'learning_rate': 4.298657718120805e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10963/75000 [08:58<52:49, 20.20it/s]

{'loss': 0.2828, 'grad_norm': 2.548135757446289, 'learning_rate': 4.297986577181208e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10974/75000 [08:58<52:31, 20.32it/s]

{'loss': 0.3539, 'grad_norm': 2.14176607131958, 'learning_rate': 4.297315436241611e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10982/75000 [08:59<55:05, 19.37it/s]

{'loss': 0.2549, 'grad_norm': 4.940471172332764, 'learning_rate': 4.296644295302014e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 10993/75000 [08:59<52:47, 20.21it/s]

{'loss': 0.3137, 'grad_norm': 1.9197323322296143, 'learning_rate': 4.2959731543624166e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11000/75000 [09:00<51:50, 20.57it/s]

{'loss': 0.321, 'grad_norm': 1.1604111194610596, 'learning_rate': 4.295302013422819e-05, 'epoch': 0.44}


                                                       
 15%|█▍        | 11014/75000 [09:01<1:01:23, 17.37it/s]

{'loss': 0.3342, 'grad_norm': 1.5667762756347656, 'learning_rate': 4.2946308724832216e-05, 'epoch': 0.44}


                                                       
 15%|█▍        | 11023/75000 [09:01<54:33, 19.54it/s]

{'loss': 0.3583, 'grad_norm': 1.5435590744018555, 'learning_rate': 4.2939597315436245e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11032/75000 [09:02<53:50, 19.80it/s]

{'loss': 0.4681, 'grad_norm': 6.2833356857299805, 'learning_rate': 4.2932885906040274e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11044/75000 [09:02<51:50, 20.56it/s]

{'loss': 0.3373, 'grad_norm': 4.934515476226807, 'learning_rate': 4.2926174496644295e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11053/75000 [09:03<54:24, 19.59it/s]

{'loss': 0.2389, 'grad_norm': 4.41079044342041, 'learning_rate': 4.2919463087248324e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11062/75000 [09:03<52:38, 20.24it/s]

{'loss': 0.4276, 'grad_norm': 4.600069046020508, 'learning_rate': 4.291275167785235e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11074/75000 [09:04<51:41, 20.61it/s]

{'loss': 0.3591, 'grad_norm': 2.8633697032928467, 'learning_rate': 4.2906040268456374e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11083/75000 [09:04<52:25, 20.32it/s]

{'loss': 0.3153, 'grad_norm': 4.182108402252197, 'learning_rate': 4.28993288590604e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11092/75000 [09:05<54:15, 19.63it/s]

{'loss': 0.3056, 'grad_norm': 6.282683849334717, 'learning_rate': 4.289261744966443e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11104/75000 [09:05<52:01, 20.47it/s]

{'loss': 0.3688, 'grad_norm': 1.9695501327514648, 'learning_rate': 4.288590604026846e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11113/75000 [09:06<51:19, 20.74it/s]

{'loss': 0.2881, 'grad_norm': 4.838911533355713, 'learning_rate': 4.287919463087249e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11122/75000 [09:06<52:17, 20.36it/s]

{'loss': 0.3664, 'grad_norm': 5.460537433624268, 'learning_rate': 4.287248322147651e-05, 'epoch': 0.44}


                                                     
 15%|█▍        | 11134/75000 [09:07<52:09, 20.41it/s]

{'loss': 0.3655, 'grad_norm': 6.40912389755249, 'learning_rate': 4.286577181208054e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11143/75000 [09:07<52:47, 20.16it/s]

{'loss': 0.3533, 'grad_norm': 2.9824132919311523, 'learning_rate': 4.285906040268457e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11152/75000 [09:08<51:10, 20.79it/s]

{'loss': 0.3565, 'grad_norm': 1.3034522533416748, 'learning_rate': 4.2852348993288596e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11161/75000 [09:08<57:56, 18.36it/s]

{'loss': 0.3429, 'grad_norm': 3.095097064971924, 'learning_rate': 4.284563758389262e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11172/75000 [09:09<54:19, 19.58it/s]

{'loss': 0.3238, 'grad_norm': 1.6994706392288208, 'learning_rate': 4.2838926174496646e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11183/75000 [09:09<53:23, 19.92it/s]

{'loss': 0.2881, 'grad_norm': 3.1008129119873047, 'learning_rate': 4.2832214765100675e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11192/75000 [09:10<55:49, 19.05it/s]

{'loss': 0.2232, 'grad_norm': 3.275310516357422, 'learning_rate': 4.2825503355704696e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11204/75000 [09:10<55:44, 19.07it/s]

{'loss': 0.329, 'grad_norm': 2.0709822177886963, 'learning_rate': 4.2818791946308725e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11213/75000 [09:11<53:41, 19.80it/s]

{'loss': 0.3488, 'grad_norm': 4.467651844024658, 'learning_rate': 4.2812080536912754e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11222/75000 [09:11<52:28, 20.26it/s]

{'loss': 0.3823, 'grad_norm': 10.307868957519531, 'learning_rate': 4.280536912751678e-05, 'epoch': 0.45}


                                                     
 15%|█▍        | 11234/75000 [09:12<50:59, 20.84it/s]

{'loss': 0.3296, 'grad_norm': 1.4937306642532349, 'learning_rate': 4.2798657718120804e-05, 'epoch': 0.45}


                                                       
 15%|█▍        | 11242/75000 [09:12<57:56, 18.34it/s]

{'loss': 0.3285, 'grad_norm': 12.624750137329102, 'learning_rate': 4.279194630872483e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11253/75000 [09:13<53:48, 19.75it/s]

{'loss': 0.2905, 'grad_norm': 1.646875023841858, 'learning_rate': 4.278523489932886e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11262/75000 [09:13<52:47, 20.12it/s]

{'loss': 0.4764, 'grad_norm': 1.4438129663467407, 'learning_rate': 4.277852348993289e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11274/75000 [09:14<52:51, 20.09it/s]

{'loss': 0.2967, 'grad_norm': 2.1080007553100586, 'learning_rate': 4.277181208053692e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11283/75000 [09:14<52:48, 20.11it/s]

{'loss': 0.3123, 'grad_norm': 4.7271728515625, 'learning_rate': 4.276510067114094e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11292/75000 [09:15<52:04, 20.39it/s]

{'loss': 0.2288, 'grad_norm': 0.9494515061378479, 'learning_rate': 4.275838926174497e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11304/75000 [09:15<51:17, 20.69it/s]

{'loss': 0.2585, 'grad_norm': 1.2985285520553589, 'learning_rate': 4.2751677852349e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11312/75000 [09:16<53:49, 19.72it/s]

{'loss': 0.3763, 'grad_norm': 17.699527740478516, 'learning_rate': 4.274496644295302e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11324/75000 [09:16<51:26, 20.63it/s]

{'loss': 0.3705, 'grad_norm': 1.8719916343688965, 'learning_rate': 4.2738255033557054e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11333/75000 [09:17<52:14, 20.31it/s]

{'loss': 0.4654, 'grad_norm': 5.167030334472656, 'learning_rate': 4.2731543624161076e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11342/75000 [09:17<51:51, 20.46it/s]

{'loss': 0.2622, 'grad_norm': 4.372213363647461, 'learning_rate': 4.2724832214765104e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11354/75000 [09:18<52:43, 20.12it/s]

{'loss': 0.316, 'grad_norm': 4.432784557342529, 'learning_rate': 4.2718120805369126e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11363/75000 [09:18<53:07, 19.97it/s]

{'loss': 0.3558, 'grad_norm': 6.845252990722656, 'learning_rate': 4.2711409395973155e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11372/75000 [09:19<51:51, 20.45it/s]

{'loss': 0.4003, 'grad_norm': 4.591927528381348, 'learning_rate': 4.270469798657718e-05, 'epoch': 0.45}


                                                     
 15%|█▌        | 11384/75000 [09:19<51:33, 20.57it/s]

{'loss': 0.3319, 'grad_norm': 3.465400218963623, 'learning_rate': 4.269798657718121e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11393/75000 [09:20<52:04, 20.36it/s]

{'loss': 0.3688, 'grad_norm': 3.6651113033294678, 'learning_rate': 4.269127516778524e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11402/75000 [09:20<53:53, 19.67it/s]

{'loss': 0.3796, 'grad_norm': 8.473069190979004, 'learning_rate': 4.268456375838926e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11414/75000 [09:21<52:06, 20.34it/s]

{'loss': 0.4037, 'grad_norm': 8.73176097869873, 'learning_rate': 4.267785234899329e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11423/75000 [09:21<52:56, 20.02it/s]

{'loss': 0.2968, 'grad_norm': 5.681346416473389, 'learning_rate': 4.267114093959731e-05, 'epoch': 0.46}


                                                       
 15%|█▌        | 11434/75000 [09:22<56:54, 18.62it/s]  

{'loss': 0.4077, 'grad_norm': 3.4838173389434814, 'learning_rate': 4.266442953020134e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11444/75000 [09:22<53:18, 19.87it/s]

{'loss': 0.377, 'grad_norm': 6.409562110900879, 'learning_rate': 4.2657718120805376e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11453/75000 [09:23<51:58, 20.38it/s]

{'loss': 0.4129, 'grad_norm': 4.4428791999816895, 'learning_rate': 4.26510067114094e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11462/75000 [09:23<51:59, 20.37it/s]

{'loss': 0.3468, 'grad_norm': 5.012782573699951, 'learning_rate': 4.2644295302013427e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11474/75000 [09:24<52:53, 20.02it/s]

{'loss': 0.2945, 'grad_norm': 8.740097999572754, 'learning_rate': 4.263758389261745e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11483/75000 [09:24<52:14, 20.26it/s]

{'loss': 0.3415, 'grad_norm': 1.8916642665863037, 'learning_rate': 4.263087248322148e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11492/75000 [09:25<52:02, 20.34it/s]

{'loss': 0.2654, 'grad_norm': 3.214876174926758, 'learning_rate': 4.2624161073825505e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11500/75000 [09:25<55:11, 19.17it/s]

{'loss': 0.3075, 'grad_norm': 0.624968409538269, 'learning_rate': 4.2617449664429534e-05, 'epoch': 0.46}


                                                       
 15%|█▌        | 11512/75000 [09:26<1:04:26, 16.42it/s]

{'loss': 0.4312, 'grad_norm': 4.547519683837891, 'learning_rate': 4.261073825503356e-05, 'epoch': 0.46}


                                                       
 15%|█▌        | 11524/75000 [09:27<53:19, 19.84it/s]

{'loss': 0.2328, 'grad_norm': 3.3800899982452393, 'learning_rate': 4.2604026845637584e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11533/75000 [09:27<55:18, 19.13it/s]

{'loss': 0.3861, 'grad_norm': 6.0124192237854, 'learning_rate': 4.259731543624161e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11542/75000 [09:28<52:41, 20.07it/s]

{'loss': 0.3437, 'grad_norm': 10.636514663696289, 'learning_rate': 4.2590604026845635e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11554/75000 [09:28<51:25, 20.56it/s]

{'loss': 0.3903, 'grad_norm': 2.1288299560546875, 'learning_rate': 4.258389261744967e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11563/75000 [09:29<51:24, 20.56it/s]

{'loss': 0.2114, 'grad_norm': 3.32794451713562, 'learning_rate': 4.25771812080537e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11574/75000 [09:29<53:00, 19.94it/s]

{'loss': 0.4583, 'grad_norm': 3.568337917327881, 'learning_rate': 4.257046979865772e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11583/75000 [09:30<51:40, 20.46it/s]

{'loss': 0.4858, 'grad_norm': 4.100667476654053, 'learning_rate': 4.256375838926175e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11592/75000 [09:30<51:51, 20.38it/s]

{'loss': 0.4048, 'grad_norm': 10.044126510620117, 'learning_rate': 4.255704697986577e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11604/75000 [09:31<52:30, 20.12it/s]

{'loss': 0.4392, 'grad_norm': 12.394405364990234, 'learning_rate': 4.25503355704698e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11613/75000 [09:31<52:43, 20.04it/s]

{'loss': 0.3307, 'grad_norm': 3.697211503982544, 'learning_rate': 4.254362416107383e-05, 'epoch': 0.46}


                                                     
 15%|█▌        | 11622/75000 [09:31<51:43, 20.42it/s]

{'loss': 0.3068, 'grad_norm': 0.8775922656059265, 'learning_rate': 4.2536912751677856e-05, 'epoch': 0.46}


                                                     
 16%|█▌        | 11631/75000 [09:32<54:12, 19.49it/s]

{'loss': 0.2803, 'grad_norm': 4.761105060577393, 'learning_rate': 4.2530201342281885e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11643/75000 [09:33<51:46, 20.39it/s]

{'loss': 0.4449, 'grad_norm': 1.169443130493164, 'learning_rate': 4.2523489932885907e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11652/75000 [09:33<51:42, 20.42it/s]

{'loss': 0.3212, 'grad_norm': 4.764565944671631, 'learning_rate': 4.2516778523489935e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11664/75000 [09:34<50:56, 20.72it/s]

{'loss': 0.4913, 'grad_norm': 3.231842279434204, 'learning_rate': 4.251006711409396e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11673/75000 [09:34<54:36, 19.33it/s]

{'loss': 0.4002, 'grad_norm': 5.802701473236084, 'learning_rate': 4.250335570469799e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11682/75000 [09:34<51:43, 20.40it/s]

{'loss': 0.3494, 'grad_norm': 4.408793926239014, 'learning_rate': 4.2496644295302014e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11694/75000 [09:35<51:03, 20.66it/s]

{'loss': 0.3485, 'grad_norm': 3.268629312515259, 'learning_rate': 4.248993288590604e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11702/75000 [09:35<55:03, 19.16it/s]

{'loss': 0.4005, 'grad_norm': 9.51813793182373, 'learning_rate': 4.248322147651007e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11713/75000 [09:36<52:44, 20.00it/s]

{'loss': 0.2325, 'grad_norm': 2.6750502586364746, 'learning_rate': 4.247651006711409e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11722/75000 [09:36<51:50, 20.35it/s]

{'loss': 0.3303, 'grad_norm': 2.2241146564483643, 'learning_rate': 4.246979865771812e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11734/75000 [09:37<50:59, 20.68it/s]

{'loss': 0.3449, 'grad_norm': 11.686131477355957, 'learning_rate': 4.246308724832215e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11742/75000 [09:38<54:32, 19.33it/s]

{'loss': 0.3262, 'grad_norm': 6.2329325675964355, 'learning_rate': 4.245637583892618e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11754/75000 [09:38<52:31, 20.07it/s]

{'loss': 0.476, 'grad_norm': 4.1234235763549805, 'learning_rate': 4.244966442953021e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11763/75000 [09:39<51:29, 20.47it/s]

{'loss': 0.3173, 'grad_norm': 2.2939441204071045, 'learning_rate': 4.244295302013423e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11772/75000 [09:39<54:10, 19.45it/s]

{'loss': 0.3125, 'grad_norm': 20.08871078491211, 'learning_rate': 4.243624161073826e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11783/75000 [09:40<51:51, 20.32it/s]

{'loss': 0.4261, 'grad_norm': 5.833044052124023, 'learning_rate': 4.242953020134228e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11792/75000 [09:40<52:16, 20.15it/s]

{'loss': 0.3126, 'grad_norm': 3.429748773574829, 'learning_rate': 4.2422818791946314e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11800/75000 [09:40<51:37, 20.40it/s]

{'loss': 0.4514, 'grad_norm': 3.736544132232666, 'learning_rate': 4.2416107382550336e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11812/75000 [09:41<54:48, 19.21it/s]

{'loss': 0.2733, 'grad_norm': 3.1302640438079834, 'learning_rate': 4.2409395973154365e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11823/75000 [09:42<51:40, 20.38it/s]

{'loss': 0.2472, 'grad_norm': 12.980754852294922, 'learning_rate': 4.240268456375839e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11832/75000 [09:42<51:08, 20.59it/s]

{'loss': 0.2477, 'grad_norm': 5.586056232452393, 'learning_rate': 4.2395973154362415e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11844/75000 [09:43<52:01, 20.23it/s]

{'loss': 0.3806, 'grad_norm': 8.858567237854004, 'learning_rate': 4.2389261744966444e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11853/75000 [09:43<52:25, 20.07it/s]

{'loss': 0.2696, 'grad_norm': 5.407845497131348, 'learning_rate': 4.238255033557047e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11862/75000 [09:44<51:38, 20.38it/s]

{'loss': 0.3182, 'grad_norm': 2.1256749629974365, 'learning_rate': 4.23758389261745e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11871/75000 [09:44<55:33, 18.94it/s]

{'loss': 0.4147, 'grad_norm': 8.92938232421875, 'learning_rate': 4.236912751677852e-05, 'epoch': 0.47}


                                                     
 16%|█▌        | 11883/75000 [09:45<51:43, 20.34it/s]

{'loss': 0.2988, 'grad_norm': 5.0411376953125, 'learning_rate': 4.236241610738255e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11892/75000 [09:45<51:55, 20.26it/s]

{'loss': 0.265, 'grad_norm': 3.949585199356079, 'learning_rate': 4.235570469798658e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11901/75000 [09:45<54:37, 19.25it/s]

{'loss': 0.3046, 'grad_norm': 5.290776252746582, 'learning_rate': 4.234899328859061e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11913/75000 [09:46<52:15, 20.12it/s]

{'loss': 0.3731, 'grad_norm': 10.339157104492188, 'learning_rate': 4.234228187919464e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11922/75000 [09:46<51:09, 20.55it/s]

{'loss': 0.3175, 'grad_norm': 5.101390361785889, 'learning_rate': 4.233557046979866e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11931/75000 [09:47<51:03, 20.59it/s]

{'loss': 0.45, 'grad_norm': 4.94503116607666, 'learning_rate': 4.232885906040269e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11943/75000 [09:48<51:34, 20.38it/s]

{'loss': 0.2993, 'grad_norm': 6.101895809173584, 'learning_rate': 4.2322147651006716e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11952/75000 [09:48<52:34, 19.99it/s]

{'loss': 0.3821, 'grad_norm': 1.9599775075912476, 'learning_rate': 4.231543624161074e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11964/75000 [09:49<50:52, 20.65it/s]

{'loss': 0.4327, 'grad_norm': 5.803922653198242, 'learning_rate': 4.2308724832214766e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11972/75000 [09:49<57:25, 18.29it/s]

{'loss': 0.297, 'grad_norm': 3.84218430519104, 'learning_rate': 4.2302013422818794e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11984/75000 [09:50<51:17, 20.47it/s]

{'loss': 0.3292, 'grad_norm': 3.0542829036712646, 'learning_rate': 4.229530201342282e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 11993/75000 [09:50<52:04, 20.16it/s]

{'loss': 0.3065, 'grad_norm': 2.1162140369415283, 'learning_rate': 4.2288590604026845e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12000/75000 [09:50<51:24, 20.42it/s]

{'loss': 0.4221, 'grad_norm': 4.196789264678955, 'learning_rate': 4.228187919463087e-05, 'epoch': 0.48}


                                                       
 16%|█▌        | 12013/75000 [09:51<1:03:25, 16.55it/s]

{'loss': 0.2774, 'grad_norm': 3.170372724533081, 'learning_rate': 4.22751677852349e-05, 'epoch': 0.48}


                                                       
 16%|█▌        | 12022/75000 [09:52<55:08, 19.03it/s]

{'loss': 0.2922, 'grad_norm': 7.65371561050415, 'learning_rate': 4.226845637583893e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12034/75000 [09:53<51:30, 20.38it/s]

{'loss': 0.2349, 'grad_norm': 2.624361515045166, 'learning_rate': 4.226174496644296e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12042/75000 [09:53<54:46, 19.16it/s]

{'loss': 0.4229, 'grad_norm': 5.698714256286621, 'learning_rate': 4.225503355704698e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12052/75000 [09:53<51:53, 20.22it/s]

{'loss': 0.4076, 'grad_norm': 11.706616401672363, 'learning_rate': 4.224832214765101e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12064/75000 [09:54<50:51, 20.62it/s]

{'loss': 0.4317, 'grad_norm': 5.523987770080566, 'learning_rate': 4.224161073825503e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12073/75000 [09:54<50:45, 20.67it/s]

{'loss': 0.3361, 'grad_norm': 3.5466175079345703, 'learning_rate': 4.223489932885906e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12084/75000 [09:55<53:20, 19.66it/s]

{'loss': 0.3835, 'grad_norm': 3.579726457595825, 'learning_rate': 4.222818791946309e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12093/75000 [09:55<52:36, 19.93it/s]

{'loss': 0.4328, 'grad_norm': 3.0766115188598633, 'learning_rate': 4.2221476510067117e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12102/75000 [09:56<52:07, 20.11it/s]

{'loss': 0.3859, 'grad_norm': 1.6647753715515137, 'learning_rate': 4.2214765100671145e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12113/75000 [09:57<53:43, 19.51it/s]

{'loss': 0.4096, 'grad_norm': 2.2943685054779053, 'learning_rate': 4.220805369127517e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12124/75000 [09:57<52:02, 20.14it/s]

{'loss': 0.4792, 'grad_norm': 5.379279136657715, 'learning_rate': 4.2201342281879195e-05, 'epoch': 0.48}


                                                     
 16%|█▌        | 12133/75000 [09:58<51:41, 20.27it/s]

{'loss': 0.3436, 'grad_norm': 3.7730872631073, 'learning_rate': 4.2194630872483224e-05, 'epoch': 0.49}


                                                     
 16%|█▌        | 12144/75000 [09:58<53:16, 19.66it/s]

{'loss': 0.2737, 'grad_norm': 3.0681984424591064, 'learning_rate': 4.218791946308725e-05, 'epoch': 0.49}


                                                     
 16%|█▌        | 12152/75000 [09:58<52:15, 20.04it/s]

{'loss': 0.2297, 'grad_norm': 6.142174243927002, 'learning_rate': 4.218120805369128e-05, 'epoch': 0.49}


                                                     
 16%|█▌        | 12164/75000 [09:59<52:12, 20.06it/s]

{'loss': 0.4158, 'grad_norm': 3.2114343643188477, 'learning_rate': 4.21744966442953e-05, 'epoch': 0.49}


                                                     
 16%|█▌        | 12172/75000 [10:00<57:47, 18.12it/s]

{'loss': 0.2349, 'grad_norm': 6.274258136749268, 'learning_rate': 4.216778523489933e-05, 'epoch': 0.49}


                                                     
 16%|█▌        | 12183/75000 [10:00<52:05, 20.10it/s]

{'loss': 0.4197, 'grad_norm': 4.323951244354248, 'learning_rate': 4.216107382550335e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12192/75000 [10:01<51:30, 20.32it/s]

{'loss': 0.3656, 'grad_norm': 2.766986131668091, 'learning_rate': 4.215436241610738e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12204/75000 [10:01<55:35, 18.83it/s]

{'loss': 0.2854, 'grad_norm': 1.685981035232544, 'learning_rate': 4.214765100671142e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12214/75000 [10:02<51:45, 20.22it/s]

{'loss': 0.3309, 'grad_norm': 2.3512063026428223, 'learning_rate': 4.214093959731544e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12223/75000 [10:02<52:53, 19.78it/s]

{'loss': 0.3396, 'grad_norm': 7.374061584472656, 'learning_rate': 4.213422818791947e-05, 'epoch': 0.49}


                                                       
 16%|█▋        | 12232/75000 [10:03<57:26, 18.21it/s]  

{'loss': 0.2658, 'grad_norm': 6.170891284942627, 'learning_rate': 4.212751677852349e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12242/75000 [10:03<53:38, 19.50it/s]

{'loss': 0.3398, 'grad_norm': 9.210684776306152, 'learning_rate': 4.212080536912752e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12253/75000 [10:04<51:45, 20.21it/s]

{'loss': 0.3703, 'grad_norm': 0.814426064491272, 'learning_rate': 4.2114093959731546e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12262/75000 [10:04<54:10, 19.30it/s]

{'loss': 0.3597, 'grad_norm': 1.8101407289505005, 'learning_rate': 4.2107382550335575e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12274/75000 [10:05<51:10, 20.43it/s]

{'loss': 0.3056, 'grad_norm': 3.1556315422058105, 'learning_rate': 4.21006711409396e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12283/75000 [10:05<51:32, 20.28it/s]

{'loss': 0.4102, 'grad_norm': 2.339052200317383, 'learning_rate': 4.2093959731543625e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12292/75000 [10:06<53:53, 19.39it/s]

{'loss': 0.4285, 'grad_norm': 4.1832756996154785, 'learning_rate': 4.2087248322147654e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12304/75000 [10:06<51:16, 20.38it/s]

{'loss': 0.2868, 'grad_norm': 1.8983733654022217, 'learning_rate': 4.2080536912751675e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12313/75000 [10:07<51:01, 20.48it/s]

{'loss': 0.3333, 'grad_norm': 2.7163352966308594, 'learning_rate': 4.2073825503355704e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12324/75000 [10:07<52:02, 20.07it/s]

{'loss': 0.3662, 'grad_norm': 2.9738996028900146, 'learning_rate': 4.206711409395973e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12333/75000 [10:08<51:31, 20.27it/s]

{'loss': 0.3868, 'grad_norm': 2.1413097381591797, 'learning_rate': 4.206040268456376e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12342/75000 [10:08<51:24, 20.31it/s]

{'loss': 0.3099, 'grad_norm': 8.379034042358398, 'learning_rate': 4.205369127516779e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12351/75000 [10:09<51:24, 20.31it/s]

{'loss': 0.2458, 'grad_norm': 5.195741176605225, 'learning_rate': 4.204697986577181e-05, 'epoch': 0.49}


                                                       
 16%|█▋        | 12362/75000 [10:09<56:04, 18.62it/s]

{'loss': 0.2768, 'grad_norm': 4.299569606781006, 'learning_rate': 4.204026845637584e-05, 'epoch': 0.49}


                                                     
 16%|█▋        | 12373/75000 [10:10<52:09, 20.01it/s]

{'loss': 0.33, 'grad_norm': 3.332101345062256, 'learning_rate': 4.203355704697987e-05, 'epoch': 0.49}


                                                     
 17%|█▋        | 12382/75000 [10:10<53:22, 19.55it/s]

{'loss': 0.3976, 'grad_norm': 9.6181640625, 'learning_rate': 4.20268456375839e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12394/75000 [10:11<50:33, 20.64it/s]

{'loss': 0.2879, 'grad_norm': 3.5366485118865967, 'learning_rate': 4.2020134228187926e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12403/75000 [10:11<50:26, 20.68it/s]

{'loss': 0.248, 'grad_norm': 0.4571656882762909, 'learning_rate': 4.201342281879195e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12412/75000 [10:12<52:38, 19.81it/s]

{'loss': 0.3668, 'grad_norm': 1.475835919380188, 'learning_rate': 4.2006711409395976e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12424/75000 [10:12<51:27, 20.27it/s]

{'loss': 0.3, 'grad_norm': 1.7458158731460571, 'learning_rate': 4.2e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12433/75000 [10:13<51:31, 20.24it/s]

{'loss': 0.3546, 'grad_norm': 16.428144454956055, 'learning_rate': 4.199328859060403e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12444/75000 [10:13<53:34, 19.46it/s]

{'loss': 0.4136, 'grad_norm': 2.4377238750457764, 'learning_rate': 4.1986577181208055e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12451/75000 [10:14<53:50, 19.36it/s]

{'loss': 0.3106, 'grad_norm': 12.205897331237793, 'learning_rate': 4.197986577181208e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12462/75000 [10:14<54:16, 19.20it/s]

{'loss': 0.3567, 'grad_norm': 12.489831924438477, 'learning_rate': 4.197315436241611e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12473/75000 [10:15<52:46, 19.75it/s]

{'loss': 0.4124, 'grad_norm': 6.968006610870361, 'learning_rate': 4.1966442953020134e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12484/75000 [10:15<54:12, 19.22it/s]

{'loss': 0.3067, 'grad_norm': 1.4091882705688477, 'learning_rate': 4.195973154362416e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12494/75000 [10:16<52:30, 19.84it/s]

{'loss': 0.3776, 'grad_norm': 11.168617248535156, 'learning_rate': 4.195302013422819e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12500/75000 [10:16<51:56, 20.06it/s]

{'loss': 0.4245, 'grad_norm': 6.773682117462158, 'learning_rate': 4.194630872483222e-05, 'epoch': 0.5}


                                                       
 17%|█▋        | 12514/75000 [10:17<1:02:17, 16.72it/s]

{'loss': 0.2407, 'grad_norm': 4.872757911682129, 'learning_rate': 4.193959731543624e-05, 'epoch': 0.5}


                                                       
 17%|█▋        | 12524/75000 [10:18<53:41, 19.39it/s]

{'loss': 0.3662, 'grad_norm': 3.4788177013397217, 'learning_rate': 4.193288590604027e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12533/75000 [10:18<51:46, 20.11it/s]

{'loss': 0.325, 'grad_norm': 3.2141318321228027, 'learning_rate': 4.19261744966443e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12542/75000 [10:19<53:09, 19.58it/s]

{'loss': 0.3328, 'grad_norm': 4.73023796081543, 'learning_rate': 4.191946308724832e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12553/75000 [10:19<52:01, 20.01it/s]

{'loss': 0.2635, 'grad_norm': 1.367421269416809, 'learning_rate': 4.1912751677852355e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12562/75000 [10:20<51:23, 20.25it/s]

{'loss': 0.3949, 'grad_norm': 2.0573372840881348, 'learning_rate': 4.190604026845638e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12574/75000 [10:20<52:27, 19.84it/s]

{'loss': 0.2821, 'grad_norm': 8.719853401184082, 'learning_rate': 4.1899328859060406e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12583/75000 [10:21<51:11, 20.32it/s]

{'loss': 0.2369, 'grad_norm': 5.117643356323242, 'learning_rate': 4.1892617449664434e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12592/75000 [10:21<51:09, 20.33it/s]

{'loss': 0.4345, 'grad_norm': 4.571170330047607, 'learning_rate': 4.1885906040268456e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12604/75000 [10:22<50:24, 20.63it/s]

{'loss': 0.3962, 'grad_norm': 1.6622235774993896, 'learning_rate': 4.1879194630872484e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12613/75000 [10:22<53:46, 19.34it/s]

{'loss': 0.3085, 'grad_norm': 2.641254425048828, 'learning_rate': 4.187248322147651e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12622/75000 [10:23<51:50, 20.05it/s]

{'loss': 0.3376, 'grad_norm': 2.206044912338257, 'learning_rate': 4.186577181208054e-05, 'epoch': 0.5}


                                                     
 17%|█▋        | 12634/75000 [10:23<51:18, 20.26it/s]

{'loss': 0.3481, 'grad_norm': 1.7668735980987549, 'learning_rate': 4.185906040268456e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12642/75000 [10:24<54:48, 18.96it/s]

{'loss': 0.28, 'grad_norm': 2.0998754501342773, 'learning_rate': 4.185234899328859e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12653/75000 [10:24<51:11, 20.30it/s]

{'loss': 0.2865, 'grad_norm': 2.5923197269439697, 'learning_rate': 4.184563758389262e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12662/75000 [10:25<51:06, 20.33it/s]

{'loss': 0.3647, 'grad_norm': 9.148785591125488, 'learning_rate': 4.183892617449664e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12673/75000 [10:25<53:36, 19.38it/s]

{'loss': 0.385, 'grad_norm': 3.924668550491333, 'learning_rate': 4.183221476510068e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12684/75000 [10:26<51:06, 20.32it/s]

{'loss': 0.3465, 'grad_norm': 1.645249843597412, 'learning_rate': 4.18255033557047e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12693/75000 [10:26<50:55, 20.39it/s]

{'loss': 0.4347, 'grad_norm': 4.13007926940918, 'learning_rate': 4.181879194630873e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12704/75000 [10:27<52:50, 19.65it/s]

{'loss': 0.3988, 'grad_norm': 7.147805213928223, 'learning_rate': 4.181208053691275e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12712/75000 [10:27<52:17, 19.85it/s]

{'loss': 0.2605, 'grad_norm': 5.971029281616211, 'learning_rate': 4.180536912751678e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12724/75000 [10:28<50:42, 20.47it/s]

{'loss': 0.321, 'grad_norm': 4.4554057121276855, 'learning_rate': 4.179865771812081e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12734/75000 [10:28<52:23, 19.81it/s]

{'loss': 0.3934, 'grad_norm': 7.325737953186035, 'learning_rate': 4.1791946308724835e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12742/75000 [10:29<52:10, 19.89it/s]

{'loss': 0.2479, 'grad_norm': 3.8493971824645996, 'learning_rate': 4.1785234899328864e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12752/75000 [10:29<52:45, 19.66it/s]

{'loss': 0.3316, 'grad_norm': 0.5413467884063721, 'learning_rate': 4.1778523489932886e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12764/75000 [10:30<51:08, 20.28it/s]

{'loss': 0.2901, 'grad_norm': 0.5208136439323425, 'learning_rate': 4.1771812080536914e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12772/75000 [10:30<52:38, 19.70it/s]

{'loss': 0.3846, 'grad_norm': 9.758766174316406, 'learning_rate': 4.176510067114094e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12784/75000 [10:31<50:22, 20.59it/s]

{'loss': 0.3336, 'grad_norm': 1.5252513885498047, 'learning_rate': 4.175838926174497e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12794/75000 [10:31<52:49, 19.63it/s]

{'loss': 0.3311, 'grad_norm': 3.104390859603882, 'learning_rate': 4.1751677852349e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12804/75000 [10:32<50:54, 20.36it/s]

{'loss': 0.2775, 'grad_norm': 4.0761919021606445, 'learning_rate': 4.174496644295302e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12813/75000 [10:32<51:06, 20.28it/s]

{'loss': 0.3683, 'grad_norm': 3.4551689624786377, 'learning_rate': 4.173825503355705e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12821/75000 [10:33<56:47, 18.25it/s]

{'loss': 0.3373, 'grad_norm': 1.9526458978652954, 'learning_rate': 4.173154362416107e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12832/75000 [10:33<51:56, 19.95it/s]

{'loss': 0.3938, 'grad_norm': 2.5586953163146973, 'learning_rate': 4.17248322147651e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12841/75000 [10:34<51:37, 20.06it/s]

{'loss': 0.4623, 'grad_norm': 2.082472324371338, 'learning_rate': 4.171812080536913e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12852/75000 [10:34<50:37, 20.46it/s]

{'loss': 0.3502, 'grad_norm': 6.905227184295654, 'learning_rate': 4.171140939597316e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12864/75000 [10:35<52:18, 19.80it/s]

{'loss': 0.2666, 'grad_norm': 6.789488792419434, 'learning_rate': 4.1704697986577186e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12874/75000 [10:36<51:18, 20.18it/s]

{'loss': 0.3424, 'grad_norm': 5.891643047332764, 'learning_rate': 4.169798657718121e-05, 'epoch': 0.51}


                                                     
 17%|█▋        | 12883/75000 [10:36<51:30, 20.10it/s]

{'loss': 0.3421, 'grad_norm': 6.487085342407227, 'learning_rate': 4.1691275167785236e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12894/75000 [10:37<53:08, 19.48it/s]

{'loss': 0.317, 'grad_norm': 4.023744106292725, 'learning_rate': 4.168456375838926e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12903/75000 [10:37<51:34, 20.07it/s]

{'loss': 0.4584, 'grad_norm': 5.014733791351318, 'learning_rate': 4.1677852348993293e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12912/75000 [10:37<51:27, 20.11it/s]

{'loss': 0.2614, 'grad_norm': 1.6440802812576294, 'learning_rate': 4.167114093959732e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12921/75000 [10:38<55:33, 18.62it/s]

{'loss': 0.2545, 'grad_norm': 7.102799892425537, 'learning_rate': 4.1664429530201344e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12933/75000 [10:38<51:06, 20.24it/s]

{'loss': 0.3375, 'grad_norm': 2.2450363636016846, 'learning_rate': 4.165771812080537e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12942/75000 [10:39<51:10, 20.21it/s]

{'loss': 0.1903, 'grad_norm': 6.39703893661499, 'learning_rate': 4.1651006711409394e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12954/75000 [10:40<50:33, 20.46it/s]

{'loss': 0.2724, 'grad_norm': 2.897138833999634, 'learning_rate': 4.164429530201342e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12963/75000 [10:40<54:22, 19.01it/s]

{'loss': 0.2675, 'grad_norm': 0.5658611059188843, 'learning_rate': 4.163758389261745e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12974/75000 [10:41<50:40, 20.40it/s]

{'loss': 0.4274, 'grad_norm': 7.31039571762085, 'learning_rate': 4.163087248322148e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12983/75000 [10:41<50:47, 20.35it/s]

{'loss': 0.429, 'grad_norm': 3.335653066635132, 'learning_rate': 4.162416107382551e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 12994/75000 [10:42<52:00, 19.87it/s]

{'loss': 0.2341, 'grad_norm': 2.873832941055298, 'learning_rate': 4.161744966442953e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13000/75000 [10:42<51:37, 20.02it/s]

{'loss': 0.3356, 'grad_norm': 2.578355073928833, 'learning_rate': 4.161073825503356e-05, 'epoch': 0.52}


                                                       
 17%|█▋        | 13014/75000 [10:43<1:01:04, 16.92it/s]

{'loss': 0.3062, 'grad_norm': 2.6770875453948975, 'learning_rate': 4.160402684563758e-05, 'epoch': 0.52}


                                                       
 17%|█▋        | 13021/75000 [10:43<56:21, 18.33it/s]

{'loss': 0.3189, 'grad_norm': 4.757335186004639, 'learning_rate': 4.1597315436241616e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13033/75000 [10:44<51:12, 20.17it/s]

{'loss': 0.3841, 'grad_norm': 1.781838059425354, 'learning_rate': 4.1590604026845644e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13042/75000 [10:44<50:54, 20.28it/s]

{'loss': 0.2887, 'grad_norm': 3.27763032913208, 'learning_rate': 4.1583892617449666e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13054/75000 [10:45<50:20, 20.51it/s]

{'loss': 0.3505, 'grad_norm': 0.9221369624137878, 'learning_rate': 4.1577181208053695e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13063/75000 [10:45<52:35, 19.63it/s]

{'loss': 0.4685, 'grad_norm': 9.920032501220703, 'learning_rate': 4.1570469798657716e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13072/75000 [10:46<50:18, 20.52it/s]

{'loss': 0.4569, 'grad_norm': 2.76920485496521, 'learning_rate': 4.1563758389261745e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13083/75000 [10:46<50:20, 20.50it/s]

{'loss': 0.3411, 'grad_norm': 1.4939974546432495, 'learning_rate': 4.155704697986577e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13094/75000 [10:47<54:01, 19.10it/s]

{'loss': 0.3528, 'grad_norm': 5.629795074462891, 'learning_rate': 4.15503355704698e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13104/75000 [10:47<50:50, 20.29it/s]

{'loss': 0.2147, 'grad_norm': 2.9039597511291504, 'learning_rate': 4.154362416107383e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13114/75000 [10:48<51:10, 20.15it/s]

{'loss': 0.2827, 'grad_norm': 4.422791957855225, 'learning_rate': 4.153691275167785e-05, 'epoch': 0.52}


                                                     
 17%|█▋        | 13122/75000 [10:48<53:38, 19.23it/s]

{'loss': 0.285, 'grad_norm': 6.931211471557617, 'learning_rate': 4.153020134228188e-05, 'epoch': 0.52}


                                                     
 18%|█▊        | 13131/75000 [10:49<50:44, 20.32it/s]

{'loss': 0.3596, 'grad_norm': 7.4305419921875, 'learning_rate': 4.152348993288591e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13143/75000 [10:49<50:25, 20.45it/s]

{'loss': 0.2414, 'grad_norm': 3.4463913440704346, 'learning_rate': 4.151677852348994e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13152/75000 [10:50<51:02, 20.19it/s]

{'loss': 0.3684, 'grad_norm': 15.860556602478027, 'learning_rate': 4.151006711409396e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13164/75000 [10:50<50:58, 20.22it/s]

{'loss': 0.3827, 'grad_norm': 10.003800392150879, 'learning_rate': 4.150335570469799e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13173/75000 [10:51<51:17, 20.09it/s]

{'loss': 0.4194, 'grad_norm': 0.7525684833526611, 'learning_rate': 4.149664429530202e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13182/75000 [10:51<50:49, 20.27it/s]

{'loss': 0.3943, 'grad_norm': 6.772779941558838, 'learning_rate': 4.148993288590604e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13194/75000 [10:52<53:06, 19.40it/s]

{'loss': 0.3022, 'grad_norm': 4.486734867095947, 'learning_rate': 4.148322147651007e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13203/75000 [10:52<51:19, 20.07it/s]

{'loss': 0.3278, 'grad_norm': 1.9105640649795532, 'learning_rate': 4.1476510067114096e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13212/75000 [10:53<51:07, 20.14it/s]

{'loss': 0.313, 'grad_norm': 2.489710807800293, 'learning_rate': 4.1469798657718124e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13221/75000 [10:53<50:53, 20.23it/s]

{'loss': 0.4294, 'grad_norm': 10.73095989227295, 'learning_rate': 4.146308724832215e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13233/75000 [10:54<54:19, 18.95it/s]

{'loss': 0.3318, 'grad_norm': 5.630512237548828, 'learning_rate': 4.1456375838926174e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13242/75000 [10:54<51:02, 20.16it/s]

{'loss': 0.3769, 'grad_norm': 5.8038530349731445, 'learning_rate': 4.14496644295302e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13253/75000 [10:55<54:09, 19.00it/s]

{'loss': 0.2906, 'grad_norm': 3.107084035873413, 'learning_rate': 4.144295302013423e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13264/75000 [10:56<50:56, 20.20it/s]

{'loss': 0.3757, 'grad_norm': 3.6558032035827637, 'learning_rate': 4.143624161073826e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13273/75000 [10:56<52:14, 19.69it/s]

{'loss': 0.2167, 'grad_norm': 1.5149067640304565, 'learning_rate': 4.142953020134228e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13282/75000 [10:56<51:02, 20.15it/s]

{'loss': 0.3363, 'grad_norm': 7.718579292297363, 'learning_rate': 4.142281879194631e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13293/75000 [10:57<52:27, 19.60it/s]

{'loss': 0.418, 'grad_norm': 5.07843017578125, 'learning_rate': 4.141610738255034e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13302/75000 [10:57<50:46, 20.25it/s]

{'loss': 0.2864, 'grad_norm': 3.9954020977020264, 'learning_rate': 4.140939597315436e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13311/75000 [10:58<51:00, 20.15it/s]

{'loss': 0.3398, 'grad_norm': 1.3929851055145264, 'learning_rate': 4.140268456375839e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13322/75000 [10:59<52:22, 19.63it/s]

{'loss': 0.2992, 'grad_norm': 8.40833568572998, 'learning_rate': 4.139597315436242e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13332/75000 [10:59<51:23, 20.00it/s]

{'loss': 0.4391, 'grad_norm': 1.5685365200042725, 'learning_rate': 4.1389261744966446e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13344/75000 [11:00<49:41, 20.68it/s]

{'loss': 0.2925, 'grad_norm': 3.47572660446167, 'learning_rate': 4.138255033557047e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13352/75000 [11:00<52:18, 19.64it/s]

{'loss': 0.3833, 'grad_norm': 2.7548470497131348, 'learning_rate': 4.13758389261745e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13363/75000 [11:01<51:29, 19.95it/s]

{'loss': 0.3175, 'grad_norm': 10.197078704833984, 'learning_rate': 4.1369127516778525e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13373/75000 [11:01<50:58, 20.15it/s]

{'loss': 0.3404, 'grad_norm': 3.494300127029419, 'learning_rate': 4.1362416107382554e-05, 'epoch': 0.53}


                                                     
 18%|█▊        | 13384/75000 [11:02<51:52, 19.80it/s]

{'loss': 0.2562, 'grad_norm': 2.9785256385803223, 'learning_rate': 4.135570469798658e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13393/75000 [11:02<50:45, 20.23it/s]

{'loss': 0.3958, 'grad_norm': 5.097799301147461, 'learning_rate': 4.1348993288590604e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13402/75000 [11:02<49:54, 20.57it/s]

{'loss': 0.3416, 'grad_norm': 2.9130520820617676, 'learning_rate': 4.134228187919463e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13414/75000 [11:03<51:16, 20.02it/s]

{'loss': 0.3487, 'grad_norm': 2.8563530445098877, 'learning_rate': 4.133557046979866e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13423/75000 [11:04<50:23, 20.37it/s]

{'loss': 0.2979, 'grad_norm': 2.5111513137817383, 'learning_rate': 4.132885906040268e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13434/75000 [11:04<50:58, 20.13it/s]

{'loss': 0.3611, 'grad_norm': 4.166898250579834, 'learning_rate': 4.132214765100672e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13444/75000 [11:05<51:25, 19.95it/s]

{'loss': 0.3413, 'grad_norm': 0.9248095750808716, 'learning_rate': 4.131543624161074e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13452/75000 [11:05<50:29, 20.32it/s]

{'loss': 0.2595, 'grad_norm': 3.887421131134033, 'learning_rate': 4.130872483221477e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13461/75000 [11:05<50:08, 20.46it/s]

{'loss': 0.3778, 'grad_norm': 3.9414889812469482, 'learning_rate': 4.130201342281879e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13473/75000 [11:06<50:26, 20.33it/s]

{'loss': 0.3453, 'grad_norm': 1.8963907957077026, 'learning_rate': 4.129530201342282e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13484/75000 [11:07<50:25, 20.34it/s]

{'loss': 0.3946, 'grad_norm': 11.639106750488281, 'learning_rate': 4.128859060402685e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13493/75000 [11:07<50:51, 20.15it/s]

{'loss': 0.2394, 'grad_norm': 4.027426242828369, 'learning_rate': 4.1281879194630876e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13500/75000 [11:07<50:12, 20.41it/s]

{'loss': 0.4606, 'grad_norm': 2.9566752910614014, 'learning_rate': 4.1275167785234905e-05, 'epoch': 0.54}


                                                       
 18%|█▊        | 13512/75000 [11:08<1:03:36, 16.11it/s]

{'loss': 0.3808, 'grad_norm': 7.488801002502441, 'learning_rate': 4.1268456375838926e-05, 'epoch': 0.54}


                                                       
 18%|█▊        | 13522/75000 [11:09<54:13, 18.90it/s]

{'loss': 0.2611, 'grad_norm': 1.6185475587844849, 'learning_rate': 4.1261744966442955e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13533/75000 [11:09<51:06, 20.05it/s]

{'loss': 0.3737, 'grad_norm': 5.826373100280762, 'learning_rate': 4.125503355704698e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13543/75000 [11:10<54:15, 18.88it/s]

{'loss': 0.4773, 'grad_norm': 7.823537826538086, 'learning_rate': 4.1248322147651005e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13553/75000 [11:10<51:19, 19.95it/s]

{'loss': 0.328, 'grad_norm': 3.5743796825408936, 'learning_rate': 4.124161073825504e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13562/75000 [11:11<50:50, 20.14it/s]

{'loss': 0.2633, 'grad_norm': 6.963767051696777, 'learning_rate': 4.123489932885906e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13571/75000 [11:11<50:36, 20.23it/s]

{'loss': 0.3806, 'grad_norm': 1.630834698677063, 'learning_rate': 4.122818791946309e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13584/75000 [11:12<51:15, 19.97it/s]

{'loss': 0.3296, 'grad_norm': 4.6882734298706055, 'learning_rate': 4.122147651006711e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13594/75000 [11:13<49:43, 20.58it/s]

{'loss': 0.4231, 'grad_norm': 4.2333879470825195, 'learning_rate': 4.121476510067114e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13602/75000 [11:13<54:36, 18.74it/s]

{'loss': 0.2833, 'grad_norm': 4.179349422454834, 'learning_rate': 4.120805369127517e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13613/75000 [11:13<50:45, 20.16it/s]

{'loss': 0.3809, 'grad_norm': 5.088588237762451, 'learning_rate': 4.12013422818792e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13622/75000 [11:14<49:59, 20.46it/s]

{'loss': 0.4783, 'grad_norm': 4.739264488220215, 'learning_rate': 4.119463087248323e-05, 'epoch': 0.54}


                                                     
 18%|█▊        | 13634/75000 [11:15<50:08, 20.40it/s]

{'loss': 0.3955, 'grad_norm': 12.934510231018066, 'learning_rate': 4.118791946308725e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13643/75000 [11:15<50:23, 20.29it/s]

{'loss': 0.2908, 'grad_norm': 2.921053647994995, 'learning_rate': 4.118120805369128e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13652/75000 [11:15<50:14, 20.35it/s]

{'loss': 0.2876, 'grad_norm': 3.906660795211792, 'learning_rate': 4.11744966442953e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13664/75000 [11:16<52:03, 19.64it/s]

{'loss': 0.3715, 'grad_norm': 6.50790548324585, 'learning_rate': 4.1167785234899334e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13673/75000 [11:16<50:34, 20.21it/s]

{'loss': 0.3323, 'grad_norm': 6.492505073547363, 'learning_rate': 4.116107382550336e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13684/75000 [11:17<50:04, 20.41it/s]

{'loss': 0.2923, 'grad_norm': 3.1856939792633057, 'learning_rate': 4.1154362416107385e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13692/75000 [11:17<53:28, 19.11it/s]

{'loss': 0.3249, 'grad_norm': 9.692687034606934, 'learning_rate': 4.114765100671141e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13704/75000 [11:18<49:38, 20.58it/s]

{'loss': 0.2499, 'grad_norm': 7.134243488311768, 'learning_rate': 4.1140939597315435e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13713/75000 [11:18<50:10, 20.36it/s]

{'loss': 0.3519, 'grad_norm': 5.933728218078613, 'learning_rate': 4.1134228187919463e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13722/75000 [11:19<50:33, 20.20it/s]

{'loss': 0.3605, 'grad_norm': 3.5549826622009277, 'learning_rate': 4.112751677852349e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13734/75000 [11:20<51:50, 19.70it/s]

{'loss': 0.267, 'grad_norm': 2.045936107635498, 'learning_rate': 4.112080536912752e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13743/75000 [11:20<51:55, 19.66it/s]

{'loss': 0.2832, 'grad_norm': 1.85068678855896, 'learning_rate': 4.111409395973155e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13753/75000 [11:20<50:07, 20.36it/s]

{'loss': 0.2775, 'grad_norm': 1.9692691564559937, 'learning_rate': 4.110738255033557e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13762/75000 [11:21<50:39, 20.15it/s]

{'loss': 0.3196, 'grad_norm': 6.861074924468994, 'learning_rate': 4.11006711409396e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13774/75000 [11:22<49:27, 20.63it/s]

{'loss': 0.3987, 'grad_norm': 5.416213035583496, 'learning_rate': 4.109395973154362e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13783/75000 [11:22<50:04, 20.37it/s]

{'loss': 0.386, 'grad_norm': 2.537015199661255, 'learning_rate': 4.1087248322147656e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13792/75000 [11:22<49:44, 20.51it/s]

{'loss': 0.3069, 'grad_norm': 1.0657743215560913, 'learning_rate': 4.1080536912751685e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13804/75000 [11:23<50:15, 20.29it/s]

{'loss': 0.2466, 'grad_norm': 1.2923705577850342, 'learning_rate': 4.107382550335571e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13813/75000 [11:23<50:07, 20.34it/s]

{'loss': 0.2778, 'grad_norm': 3.0985796451568604, 'learning_rate': 4.1067114093959735e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13822/75000 [11:24<50:17, 20.27it/s]

{'loss': 0.3851, 'grad_norm': 6.6485724449157715, 'learning_rate': 4.106040268456376e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13834/75000 [11:24<49:56, 20.42it/s]

{'loss': 0.2573, 'grad_norm': 1.4512807130813599, 'learning_rate': 4.1053691275167786e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13843/75000 [11:25<49:38, 20.53it/s]

{'loss': 0.393, 'grad_norm': 0.6575513482093811, 'learning_rate': 4.1046979865771814e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13852/75000 [11:25<49:54, 20.42it/s]

{'loss': 0.3198, 'grad_norm': 4.477200508117676, 'learning_rate': 4.104026845637584e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13864/75000 [11:26<52:36, 19.37it/s]

{'loss': 0.3632, 'grad_norm': 3.050647497177124, 'learning_rate': 4.103355704697987e-05, 'epoch': 0.55}


                                                     
 18%|█▊        | 13874/75000 [11:26<49:59, 20.38it/s]

{'loss': 0.22, 'grad_norm': 5.239838123321533, 'learning_rate': 4.102684563758389e-05, 'epoch': 0.55}


                                                     
 19%|█▊        | 13883/75000 [11:27<51:10, 19.90it/s]

{'loss': 0.385, 'grad_norm': 3.1880786418914795, 'learning_rate': 4.102013422818792e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13894/75000 [11:27<50:27, 20.19it/s]

{'loss': 0.3034, 'grad_norm': 4.526909828186035, 'learning_rate': 4.101342281879194e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13902/75000 [11:28<51:11, 19.89it/s]

{'loss': 0.2463, 'grad_norm': 6.788011074066162, 'learning_rate': 4.100671140939598e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13914/75000 [11:28<49:31, 20.56it/s]

{'loss': 0.4026, 'grad_norm': 2.172961473464966, 'learning_rate': 4.1e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13923/75000 [11:29<52:53, 19.25it/s]

{'loss': 0.3254, 'grad_norm': 5.470468521118164, 'learning_rate': 4.099328859060403e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13934/75000 [11:30<50:10, 20.29it/s]

{'loss': 0.2539, 'grad_norm': 4.544030666351318, 'learning_rate': 4.098657718120806e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13943/75000 [11:30<50:10, 20.28it/s]

{'loss': 0.4221, 'grad_norm': 2.5902974605560303, 'learning_rate': 4.097986577181208e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13952/75000 [11:30<51:02, 19.94it/s]

{'loss': 0.3729, 'grad_norm': 4.25151252746582, 'learning_rate': 4.097315436241611e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13964/75000 [11:31<50:17, 20.23it/s]

{'loss': 0.3094, 'grad_norm': 0.6948392391204834, 'learning_rate': 4.0966442953020136e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13973/75000 [11:31<50:05, 20.31it/s]

{'loss': 0.2315, 'grad_norm': 2.617551565170288, 'learning_rate': 4.0959731543624165e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13982/75000 [11:32<49:33, 20.52it/s]

{'loss': 0.3989, 'grad_norm': 3.197385549545288, 'learning_rate': 4.095302013422819e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 13994/75000 [11:32<49:27, 20.56it/s]

{'loss': 0.2651, 'grad_norm': 4.909792423248291, 'learning_rate': 4.0946308724832215e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 14000/75000 [11:33<49:51, 20.39it/s]

{'loss': 0.2722, 'grad_norm': 4.595253944396973, 'learning_rate': 4.0939597315436244e-05, 'epoch': 0.56}


                                                       
 19%|█▊        | 14012/75000 [11:34<1:06:02, 15.39it/s]

{'loss': 0.3711, 'grad_norm': 2.785468101501465, 'learning_rate': 4.093288590604027e-05, 'epoch': 0.56}


                                                       
 19%|█▊        | 14023/75000 [11:34<52:50, 19.23it/s]

{'loss': 0.393, 'grad_norm': 4.841879844665527, 'learning_rate': 4.09261744966443e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 14032/75000 [11:35<50:48, 20.00it/s]

{'loss': 0.3412, 'grad_norm': 2.520235061645508, 'learning_rate': 4.091946308724832e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 14043/75000 [11:35<52:35, 19.32it/s]

{'loss': 0.3797, 'grad_norm': 4.510630130767822, 'learning_rate': 4.091275167785235e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 14054/75000 [11:36<50:25, 20.15it/s]

{'loss': 0.3551, 'grad_norm': 2.563715934753418, 'learning_rate': 4.090604026845638e-05, 'epoch': 0.56}


                                                     
 19%|█▊        | 14062/75000 [11:36<50:24, 20.15it/s]

{'loss': 0.3697, 'grad_norm': 1.4830529689788818, 'learning_rate': 4.08993288590604e-05, 'epoch': 0.56}


                                                     
 19%|█▉        | 14072/75000 [11:37<52:34, 19.31it/s]

{'loss': 0.3861, 'grad_norm': 4.036438941955566, 'learning_rate': 4.089261744966443e-05, 'epoch': 0.56}


                                                     
 19%|█▉        | 14084/75000 [11:37<49:38, 20.45it/s]

{'loss': 0.2177, 'grad_norm': 6.087649345397949, 'learning_rate': 4.088590604026846e-05, 'epoch': 0.56}


                                                     
 19%|█▉        | 14093/75000 [11:38<50:21, 20.16it/s]

{'loss': 0.4007, 'grad_norm': 2.3346996307373047, 'learning_rate': 4.087919463087249e-05, 'epoch': 0.56}


                                                     
 19%|█▉        | 14103/75000 [11:38<51:45, 19.61it/s]

{'loss': 0.2645, 'grad_norm': 8.106633186340332, 'learning_rate': 4.087248322147651e-05, 'epoch': 0.56}


                                                     
 19%|█▉        | 14114/75000 [11:39<50:07, 20.25it/s]

{'loss': 0.453, 'grad_norm': 5.865232944488525, 'learning_rate': 4.086577181208054e-05, 'epoch': 0.56}


                                                     
 19%|█▉        | 14123/75000 [11:39<50:06, 20.25it/s]

{'loss': 0.3077, 'grad_norm': 3.7137064933776855, 'learning_rate': 4.0859060402684566e-05, 'epoch': 0.56}


                                                     
 19%|█▉        | 14132/75000 [11:40<56:28, 17.96it/s]

{'loss': 0.3904, 'grad_norm': 2.3060970306396484, 'learning_rate': 4.0852348993288595e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14143/75000 [11:40<51:19, 19.76it/s]

{'loss': 0.3596, 'grad_norm': 2.9303343296051025, 'learning_rate': 4.084563758389262e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14153/75000 [11:41<49:53, 20.33it/s]

{'loss': 0.3698, 'grad_norm': 1.8677504062652588, 'learning_rate': 4.0838926174496645e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14162/75000 [11:41<58:53, 17.22it/s]

{'loss': 0.3394, 'grad_norm': 8.920985221862793, 'learning_rate': 4.0832214765100673e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14174/75000 [11:42<51:32, 19.67it/s]

{'loss': 0.4939, 'grad_norm': 4.649264335632324, 'learning_rate': 4.0825503355704695e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14182/75000 [11:42<50:16, 20.16it/s]

{'loss': 0.2887, 'grad_norm': 6.873055458068848, 'learning_rate': 4.0818791946308724e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14194/75000 [11:43<51:46, 19.57it/s]

{'loss': 0.3148, 'grad_norm': 0.9543959498405457, 'learning_rate': 4.081208053691275e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14203/75000 [11:44<50:00, 20.26it/s]

{'loss': 0.3269, 'grad_norm': 3.024585008621216, 'learning_rate': 4.080536912751678e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14212/75000 [11:44<50:02, 20.25it/s]

{'loss': 0.3548, 'grad_norm': 2.580336570739746, 'learning_rate': 4.079865771812081e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14223/75000 [11:45<51:29, 19.67it/s]

{'loss': 0.4752, 'grad_norm': 3.6570262908935547, 'learning_rate': 4.079194630872483e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14232/75000 [11:45<49:25, 20.49it/s]

{'loss': 0.295, 'grad_norm': 3.118887186050415, 'learning_rate': 4.078523489932886e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14244/75000 [11:46<49:26, 20.48it/s]

{'loss': 0.2563, 'grad_norm': 3.0354723930358887, 'learning_rate': 4.077852348993289e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14252/75000 [11:46<52:35, 19.25it/s]

{'loss': 0.3202, 'grad_norm': 1.8113919496536255, 'learning_rate': 4.077181208053692e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14262/75000 [11:46<51:06, 19.81it/s]

{'loss': 0.3637, 'grad_norm': 6.18478536605835, 'learning_rate': 4.0765100671140945e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14271/75000 [11:47<49:43, 20.36it/s]

{'loss': 0.2806, 'grad_norm': 2.3825459480285645, 'learning_rate': 4.075838926174497e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14283/75000 [11:48<49:03, 20.62it/s]

{'loss': 0.4, 'grad_norm': 11.800895690917969, 'learning_rate': 4.0751677852348996e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14294/75000 [11:48<50:26, 20.06it/s]

{'loss': 0.365, 'grad_norm': 1.255170226097107, 'learning_rate': 4.074496644295302e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14303/75000 [11:49<50:12, 20.15it/s]

{'loss': 0.3115, 'grad_norm': 2.407456636428833, 'learning_rate': 4.0738255033557046e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14312/75000 [11:49<50:40, 19.96it/s]

{'loss': 0.3777, 'grad_norm': 7.349166393280029, 'learning_rate': 4.073154362416108e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14324/75000 [11:50<52:09, 19.39it/s]

{'loss': 0.4278, 'grad_norm': 4.854926109313965, 'learning_rate': 4.07248322147651e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14331/75000 [11:50<51:02, 19.81it/s]

{'loss': 0.3028, 'grad_norm': 1.330561637878418, 'learning_rate': 4.071812080536913e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14343/75000 [11:51<50:23, 20.06it/s]

{'loss': 0.3807, 'grad_norm': 8.155773162841797, 'learning_rate': 4.0711409395973153e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14354/75000 [11:51<54:26, 18.57it/s]

{'loss': 0.4432, 'grad_norm': 1.9519875049591064, 'learning_rate': 4.070469798657718e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14364/75000 [11:52<52:13, 19.35it/s]

{'loss': 0.3183, 'grad_norm': 8.51119613647461, 'learning_rate': 4.069798657718121e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14374/75000 [11:52<50:59, 19.81it/s]

{'loss': 0.3852, 'grad_norm': 5.278854846954346, 'learning_rate': 4.069127516778524e-05, 'epoch': 0.57}


                                                     
 19%|█▉        | 14381/75000 [11:53<54:08, 18.66it/s]

{'loss': 0.2868, 'grad_norm': 11.644121170043945, 'learning_rate': 4.068456375838927e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14392/75000 [11:53<50:10, 20.13it/s]

{'loss': 0.2702, 'grad_norm': 1.285983920097351, 'learning_rate': 4.067785234899329e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14404/75000 [11:54<49:33, 20.38it/s]

{'loss': 0.3664, 'grad_norm': 7.619176864624023, 'learning_rate': 4.067114093959732e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14412/75000 [11:54<55:26, 18.21it/s]

{'loss': 0.3659, 'grad_norm': 9.12100601196289, 'learning_rate': 4.066442953020134e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14424/75000 [11:55<49:47, 20.28it/s]

{'loss': 0.232, 'grad_norm': 10.667359352111816, 'learning_rate': 4.065771812080537e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14433/75000 [11:55<50:32, 19.97it/s]

{'loss': 0.3796, 'grad_norm': 5.395374774932861, 'learning_rate': 4.0651006711409404e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14442/75000 [11:56<52:26, 19.25it/s]

{'loss': 0.3529, 'grad_norm': 2.387796401977539, 'learning_rate': 4.0644295302013425e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14454/75000 [11:56<49:50, 20.25it/s]

{'loss': 0.2563, 'grad_norm': 1.099453330039978, 'learning_rate': 4.0637583892617454e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14463/75000 [11:57<49:39, 20.32it/s]

{'loss': 0.2582, 'grad_norm': 0.4766344130039215, 'learning_rate': 4.0630872483221476e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14472/75000 [11:57<49:43, 20.29it/s]

{'loss': 0.3466, 'grad_norm': 23.00381851196289, 'learning_rate': 4.0624161073825504e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14484/75000 [11:58<51:30, 19.58it/s]

{'loss': 0.4576, 'grad_norm': 4.352818965911865, 'learning_rate': 4.061744966442953e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14494/75000 [11:58<50:05, 20.13it/s]

{'loss': 0.4148, 'grad_norm': 4.562699317932129, 'learning_rate': 4.061073825503356e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14500/75000 [11:59<51:34, 19.55it/s]

{'loss': 0.2508, 'grad_norm': 2.983530044555664, 'learning_rate': 4.060402684563759e-05, 'epoch': 0.58}


                                                       
 19%|█▉        | 14514/75000 [12:00<57:52, 17.42it/s]  

{'loss': 0.4416, 'grad_norm': 4.121598720550537, 'learning_rate': 4.059731543624161e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14524/75000 [12:00<51:16, 19.66it/s]

{'loss': 0.3898, 'grad_norm': 2.751340866088867, 'learning_rate': 4.059060402684564e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14533/75000 [12:01<50:35, 19.92it/s]

{'loss': 0.322, 'grad_norm': 11.016300201416016, 'learning_rate': 4.058389261744966e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14544/75000 [12:01<51:16, 19.65it/s]

{'loss': 0.2476, 'grad_norm': 10.888396263122559, 'learning_rate': 4.05771812080537e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14553/75000 [12:02<50:01, 20.14it/s]

{'loss': 0.3485, 'grad_norm': 6.528381824493408, 'learning_rate': 4.057046979865772e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14562/75000 [12:02<50:27, 19.96it/s]

{'loss': 0.2682, 'grad_norm': 5.8736114501953125, 'learning_rate': 4.056375838926175e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14572/75000 [12:03<51:22, 19.60it/s]

{'loss': 0.3275, 'grad_norm': 2.2520251274108887, 'learning_rate': 4.0557046979865776e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14581/75000 [12:03<50:29, 19.94it/s]

{'loss': 0.2202, 'grad_norm': 8.332718849182129, 'learning_rate': 4.05503355704698e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14593/75000 [12:04<49:00, 20.55it/s]

{'loss': 0.3531, 'grad_norm': 3.697913885116577, 'learning_rate': 4.0543624161073826e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14604/75000 [12:04<49:58, 20.14it/s]

{'loss': 0.2717, 'grad_norm': 6.656491756439209, 'learning_rate': 4.0536912751677855e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14613/75000 [12:05<49:51, 20.18it/s]

{'loss': 0.3229, 'grad_norm': 6.4826250076293945, 'learning_rate': 4.0530201342281884e-05, 'epoch': 0.58}


                                                     
 19%|█▉        | 14622/75000 [12:05<50:38, 19.87it/s]

{'loss': 0.3503, 'grad_norm': 9.017016410827637, 'learning_rate': 4.0523489932885905e-05, 'epoch': 0.58}


                                                     
 20%|█▉        | 14634/75000 [12:06<51:15, 19.63it/s]

{'loss': 0.2773, 'grad_norm': 12.939746856689453, 'learning_rate': 4.0516778523489934e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14643/75000 [12:06<50:12, 20.04it/s]

{'loss': 0.418, 'grad_norm': 10.703990936279297, 'learning_rate': 4.051006711409396e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14652/75000 [12:07<49:18, 20.40it/s]

{'loss': 0.4351, 'grad_norm': 5.285466194152832, 'learning_rate': 4.0503355704697984e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14661/75000 [12:07<52:35, 19.12it/s]

{'loss': 0.3464, 'grad_norm': 3.344521999359131, 'learning_rate': 4.049664429530202e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14673/75000 [12:08<49:13, 20.43it/s]

{'loss': 0.3312, 'grad_norm': 4.642972946166992, 'learning_rate': 4.048993288590604e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14682/75000 [12:08<50:00, 20.10it/s]

{'loss': 0.4311, 'grad_norm': 1.5932049751281738, 'learning_rate': 4.048322147651007e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14694/75000 [12:09<50:17, 19.99it/s]

{'loss': 0.254, 'grad_norm': 3.046809434890747, 'learning_rate': 4.04765100671141e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14703/75000 [12:09<50:01, 20.09it/s]

{'loss': 0.49, 'grad_norm': 3.7385129928588867, 'learning_rate': 4.046979865771812e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14712/75000 [12:10<48:31, 20.71it/s]

{'loss': 0.2991, 'grad_norm': 7.990893363952637, 'learning_rate': 4.046308724832215e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14721/75000 [12:10<51:35, 19.47it/s]

{'loss': 0.2726, 'grad_norm': 2.1544198989868164, 'learning_rate': 4.045637583892618e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14732/75000 [12:11<51:32, 19.49it/s]

{'loss': 0.3534, 'grad_norm': 1.9370324611663818, 'learning_rate': 4.0449664429530206e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14744/75000 [12:11<49:54, 20.12it/s]

{'loss': 0.3878, 'grad_norm': 7.024440765380859, 'learning_rate': 4.044295302013423e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14753/75000 [12:12<53:39, 18.71it/s]

{'loss': 0.3864, 'grad_norm': 2.4681196212768555, 'learning_rate': 4.0436241610738256e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14762/75000 [12:12<53:30, 18.76it/s]

{'loss': 0.3513, 'grad_norm': 2.501647710800171, 'learning_rate': 4.0429530201342285e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14773/75000 [12:13<49:42, 20.20it/s]

{'loss': 0.3151, 'grad_norm': 10.391976356506348, 'learning_rate': 4.0422818791946306e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14783/75000 [12:13<53:16, 18.84it/s]

{'loss': 0.3245, 'grad_norm': 0.7857038974761963, 'learning_rate': 4.041610738255034e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14792/75000 [12:14<50:34, 19.84it/s]

{'loss': 0.3178, 'grad_norm': 5.780692100524902, 'learning_rate': 4.0409395973154364e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14804/75000 [12:14<49:18, 20.35it/s]

{'loss': 0.2318, 'grad_norm': 2.9564273357391357, 'learning_rate': 4.040268456375839e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14812/75000 [12:15<52:23, 19.15it/s]

{'loss': 0.3841, 'grad_norm': 11.239081382751465, 'learning_rate': 4.0395973154362414e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14822/75000 [12:15<50:08, 20.00it/s]

{'loss': 0.3467, 'grad_norm': 2.8564417362213135, 'learning_rate': 4.038926174496644e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14834/75000 [12:16<48:45, 20.56it/s]

{'loss': 0.3354, 'grad_norm': 4.5323405265808105, 'learning_rate': 4.038255033557047e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14844/75000 [12:16<49:59, 20.05it/s]

{'loss': 0.301, 'grad_norm': 11.068188667297363, 'learning_rate': 4.03758389261745e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14853/75000 [12:17<49:22, 20.30it/s]

{'loss': 0.2953, 'grad_norm': 1.6684668064117432, 'learning_rate': 4.036912751677853e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14862/75000 [12:17<49:50, 20.11it/s]

{'loss': 0.3531, 'grad_norm': 0.6287243962287903, 'learning_rate': 4.036241610738255e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14873/75000 [12:18<49:19, 20.32it/s]

{'loss': 0.352, 'grad_norm': 1.6790151596069336, 'learning_rate': 4.035570469798658e-05, 'epoch': 0.59}


                                                     
 20%|█▉        | 14882/75000 [12:18<49:13, 20.35it/s]

{'loss': 0.3187, 'grad_norm': 1.7056198120117188, 'learning_rate': 4.034899328859061e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14894/75000 [12:19<51:42, 19.37it/s]

{'loss': 0.2758, 'grad_norm': 8.77098560333252, 'learning_rate': 4.0342281879194635e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14904/75000 [12:19<49:20, 20.30it/s]

{'loss': 0.4307, 'grad_norm': 5.9266357421875, 'learning_rate': 4.0335570469798664e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14913/75000 [12:20<49:45, 20.13it/s]

{'loss': 0.3468, 'grad_norm': 4.034334659576416, 'learning_rate': 4.0328859060402686e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14924/75000 [12:20<51:39, 19.38it/s]

{'loss': 0.3759, 'grad_norm': 6.006995677947998, 'learning_rate': 4.0322147651006714e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14934/75000 [12:21<50:26, 19.84it/s]

{'loss': 0.2506, 'grad_norm': 7.127678394317627, 'learning_rate': 4.0315436241610736e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14942/75000 [12:21<50:05, 19.98it/s]

{'loss': 0.2679, 'grad_norm': 5.107149600982666, 'learning_rate': 4.0308724832214765e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14954/75000 [12:22<51:18, 19.51it/s]

{'loss': 0.4763, 'grad_norm': 3.382833957672119, 'learning_rate': 4.030201342281879e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14964/75000 [12:22<49:47, 20.10it/s]

{'loss': 0.5259, 'grad_norm': 6.264445781707764, 'learning_rate': 4.029530201342282e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14973/75000 [12:23<49:13, 20.32it/s]

{'loss': 0.4007, 'grad_norm': 1.5440350770950317, 'learning_rate': 4.028859060402685e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14983/75000 [12:23<57:05, 17.52it/s]

{'loss': 0.3049, 'grad_norm': 2.768218755722046, 'learning_rate': 4.028187919463087e-05, 'epoch': 0.6}


                                                     
 20%|█▉        | 14992/75000 [12:24<50:35, 19.77it/s]

{'loss': 0.3659, 'grad_norm': 6.53612756729126, 'learning_rate': 4.02751677852349e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15000/75000 [12:24<49:31, 20.19it/s]

{'loss': 0.3467, 'grad_norm': 2.123286485671997, 'learning_rate': 4.026845637583892e-05, 'epoch': 0.6}


                                                       
 20%|██        | 15014/75000 [12:25<1:01:15, 16.32it/s]

{'loss': 0.3417, 'grad_norm': 5.035614013671875, 'learning_rate': 4.026174496644296e-05, 'epoch': 0.6}


                                                       
 20%|██        | 15022/75000 [12:26<52:55, 18.89it/s]

{'loss': 0.2197, 'grad_norm': 0.3877228796482086, 'learning_rate': 4.0255033557046986e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15032/75000 [12:26<50:53, 19.64it/s]

{'loss': 0.409, 'grad_norm': 2.1580607891082764, 'learning_rate': 4.024832214765101e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15042/75000 [12:27<53:54, 18.54it/s]

{'loss': 0.3499, 'grad_norm': 4.081728458404541, 'learning_rate': 4.0241610738255037e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15053/75000 [12:27<49:55, 20.01it/s]

{'loss': 0.2947, 'grad_norm': 0.8988337516784668, 'learning_rate': 4.023489932885906e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15062/75000 [12:28<55:30, 17.99it/s]

{'loss': 0.2372, 'grad_norm': 1.4816985130310059, 'learning_rate': 4.022818791946309e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15073/75000 [12:29<53:15, 18.75it/s]

{'loss': 0.3799, 'grad_norm': 1.7731542587280273, 'learning_rate': 4.0221476510067115e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15083/75000 [12:29<52:15, 19.11it/s]

{'loss': 0.413, 'grad_norm': 1.6969273090362549, 'learning_rate': 4.0214765100671144e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15091/75000 [12:30<55:27, 18.00it/s]

{'loss': 0.3945, 'grad_norm': 3.840691089630127, 'learning_rate': 4.020805369127517e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15101/75000 [12:30<53:58, 18.50it/s]

{'loss': 0.2426, 'grad_norm': 3.7840676307678223, 'learning_rate': 4.0201342281879194e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15111/75000 [12:31<52:45, 18.92it/s]

{'loss': 0.3535, 'grad_norm': 0.7836093306541443, 'learning_rate': 4.019463087248322e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15122/75000 [12:31<52:07, 19.14it/s]

{'loss': 0.278, 'grad_norm': 5.304882526397705, 'learning_rate': 4.0187919463087245e-05, 'epoch': 0.6}


                                                     
 20%|██        | 15131/75000 [12:32<57:04, 17.48it/s]

{'loss': 0.4284, 'grad_norm': 0.9518545866012573, 'learning_rate': 4.018120805369128e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15142/75000 [12:32<54:20, 18.36it/s]

{'loss': 0.3745, 'grad_norm': 4.247186183929443, 'learning_rate': 4.017449664429531e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15152/75000 [12:33<53:02, 18.80it/s]

{'loss': 0.3091, 'grad_norm': 3.764542579650879, 'learning_rate': 4.016778523489933e-05, 'epoch': 0.61}


                                                       
 20%|██        | 15163/75000 [12:33<56:12, 17.74it/s]  

{'loss': 0.408, 'grad_norm': 4.588578701019287, 'learning_rate': 4.016107382550336e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15173/75000 [12:34<54:39, 18.24it/s]

{'loss': 0.5092, 'grad_norm': 2.4213027954101562, 'learning_rate': 4.015436241610738e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15182/75000 [12:34<52:47, 18.88it/s]

{'loss': 0.4484, 'grad_norm': 6.671946048736572, 'learning_rate': 4.014765100671141e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15193/75000 [12:35<54:48, 18.18it/s]

{'loss': 0.3283, 'grad_norm': 2.3459041118621826, 'learning_rate': 4.014093959731544e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15204/75000 [12:36<53:47, 18.53it/s]

{'loss': 0.3947, 'grad_norm': 2.5010101795196533, 'learning_rate': 4.0134228187919466e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15212/75000 [12:36<56:25, 17.66it/s]

{'loss': 0.3723, 'grad_norm': 0.9466451406478882, 'learning_rate': 4.0127516778523495e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15222/75000 [12:37<56:31, 17.62it/s]

{'loss': 0.3212, 'grad_norm': 6.629948616027832, 'learning_rate': 4.0120805369127517e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15232/75000 [12:37<54:16, 18.35it/s]

{'loss': 0.3596, 'grad_norm': 1.3244191408157349, 'learning_rate': 4.0114093959731545e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15243/75000 [12:38<52:06, 19.11it/s]

{'loss': 0.5094, 'grad_norm': 2.8620522022247314, 'learning_rate': 4.0107382550335574e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15254/75000 [12:38<53:36, 18.57it/s]

{'loss': 0.3438, 'grad_norm': 0.9897592067718506, 'learning_rate': 4.01006711409396e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15261/75000 [12:39<53:13, 18.71it/s]

{'loss': 0.3335, 'grad_norm': 2.2938148975372314, 'learning_rate': 4.009395973154363e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15273/75000 [12:39<52:02, 19.13it/s]

{'loss': 0.2961, 'grad_norm': 4.528709411621094, 'learning_rate': 4.008724832214765e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15283/75000 [12:40<54:05, 18.40it/s]

{'loss': 0.3547, 'grad_norm': 7.4999799728393555, 'learning_rate': 4.008053691275168e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15294/75000 [12:41<51:56, 19.16it/s]

{'loss': 0.3324, 'grad_norm': 4.231001377105713, 'learning_rate': 4.00738255033557e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15303/75000 [12:41<52:17, 19.03it/s]

{'loss': 0.294, 'grad_norm': 9.164750099182129, 'learning_rate': 4.006711409395973e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15313/75000 [12:42<53:36, 18.56it/s]

{'loss': 0.2864, 'grad_norm': 2.6160004138946533, 'learning_rate': 4.006040268456376e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15322/75000 [12:42<53:02, 18.75it/s]

{'loss': 0.156, 'grad_norm': 4.638306617736816, 'learning_rate': 4.005369127516779e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15331/75000 [12:43<52:03, 19.10it/s]

{'loss': 0.4377, 'grad_norm': 4.251157760620117, 'learning_rate': 4.004697986577182e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15342/75000 [12:43<51:25, 19.33it/s]

{'loss': 0.2896, 'grad_norm': 4.941143989562988, 'learning_rate': 4.004026845637584e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15353/75000 [12:44<52:57, 18.77it/s]

{'loss': 0.4208, 'grad_norm': 1.3143293857574463, 'learning_rate': 4.003355704697987e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15362/75000 [12:44<52:36, 18.89it/s]

{'loss': 0.3328, 'grad_norm': 9.791487693786621, 'learning_rate': 4.0026845637583896e-05, 'epoch': 0.61}


                                                     
 20%|██        | 15373/75000 [12:45<51:49, 19.17it/s]

{'loss': 0.2727, 'grad_norm': 21.18840217590332, 'learning_rate': 4.0020134228187924e-05, 'epoch': 0.61}


                                                     
 21%|██        | 15383/75000 [12:45<57:19, 17.33it/s]

{'loss': 0.2263, 'grad_norm': 0.9216557741165161, 'learning_rate': 4.0013422818791946e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15392/75000 [12:46<53:38, 18.52it/s]

{'loss': 0.3947, 'grad_norm': 5.014216899871826, 'learning_rate': 4.0006711409395975e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15402/75000 [12:46<52:16, 19.00it/s]

{'loss': 0.3282, 'grad_norm': 1.445137619972229, 'learning_rate': 4e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15413/75000 [12:47<52:09, 19.04it/s]

{'loss': 0.3558, 'grad_norm': 3.3808305263519287, 'learning_rate': 3.9993288590604025e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15424/75000 [12:47<52:11, 19.03it/s]

{'loss': 0.3311, 'grad_norm': 3.6362643241882324, 'learning_rate': 3.9986577181208054e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15434/75000 [12:48<50:09, 19.79it/s]

{'loss': 0.2558, 'grad_norm': 1.790626049041748, 'learning_rate': 3.997986577181208e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15444/75000 [12:48<50:11, 19.77it/s]

{'loss': 0.3723, 'grad_norm': 3.5962600708007812, 'learning_rate': 3.997315436241611e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15454/75000 [12:49<52:08, 19.03it/s]

{'loss': 0.4162, 'grad_norm': 5.3116865158081055, 'learning_rate': 3.996644295302013e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15462/75000 [12:49<50:00, 19.84it/s]

{'loss': 0.4482, 'grad_norm': 1.6754770278930664, 'learning_rate': 3.995973154362416e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15474/75000 [12:50<49:36, 20.00it/s]

{'loss': 0.2513, 'grad_norm': 3.2166290283203125, 'learning_rate': 3.995302013422819e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15484/75000 [12:51<49:07, 20.19it/s]

{'loss': 0.3998, 'grad_norm': 2.4913461208343506, 'learning_rate': 3.994630872483222e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15493/75000 [12:51<49:16, 20.13it/s]

{'loss': 0.2696, 'grad_norm': 7.35338020324707, 'learning_rate': 3.993959731543625e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15500/75000 [12:51<48:52, 20.29it/s]

{'loss': 0.3602, 'grad_norm': 2.7935988903045654, 'learning_rate': 3.993288590604027e-05, 'epoch': 0.62}


                                                       
 21%|██        | 15514/75000 [12:53<58:46, 16.87it/s]  

{'loss': 0.2469, 'grad_norm': 4.184317111968994, 'learning_rate': 3.99261744966443e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15524/75000 [12:53<51:53, 19.10it/s]

{'loss': 0.3677, 'grad_norm': 6.516612529754639, 'learning_rate': 3.9919463087248326e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15532/75000 [12:53<51:25, 19.27it/s]

{'loss': 0.3867, 'grad_norm': 2.023193359375, 'learning_rate': 3.991275167785235e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15544/75000 [12:54<49:38, 19.96it/s]

{'loss': 0.2792, 'grad_norm': 3.5681354999542236, 'learning_rate': 3.990604026845638e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15553/75000 [12:54<49:19, 20.08it/s]

{'loss': 0.3188, 'grad_norm': 2.8629932403564453, 'learning_rate': 3.9899328859060404e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15563/75000 [12:55<50:42, 19.53it/s]

{'loss': 0.2719, 'grad_norm': 2.3084592819213867, 'learning_rate': 3.989261744966443e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15574/75000 [12:56<49:00, 20.21it/s]

{'loss': 0.2844, 'grad_norm': 5.564430236816406, 'learning_rate': 3.9885906040268455e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15584/75000 [12:56<52:53, 18.72it/s]

{'loss': 0.3081, 'grad_norm': 2.586744785308838, 'learning_rate': 3.987919463087248e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15591/75000 [12:56<51:58, 19.05it/s]

{'loss': 0.3488, 'grad_norm': 1.0137277841567993, 'learning_rate': 3.987248322147651e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15603/75000 [12:57<50:13, 19.71it/s]

{'loss': 0.445, 'grad_norm': 8.198582649230957, 'learning_rate': 3.986577181208054e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15613/75000 [12:58<50:32, 19.58it/s]

{'loss': 0.3316, 'grad_norm': 7.701908111572266, 'learning_rate': 3.985906040268457e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15624/75000 [12:58<49:07, 20.14it/s]

{'loss': 0.3176, 'grad_norm': 2.531716823577881, 'learning_rate': 3.985234899328859e-05, 'epoch': 0.62}


                                                     
 21%|██        | 15633/75000 [12:59<48:29, 20.40it/s]

{'loss': 0.414, 'grad_norm': 4.719576835632324, 'learning_rate': 3.984563758389262e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15641/75000 [12:59<53:42, 18.42it/s]

{'loss': 0.3261, 'grad_norm': 1.2913744449615479, 'learning_rate': 3.983892617449664e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15652/75000 [13:00<49:35, 19.94it/s]

{'loss': 0.3278, 'grad_norm': 3.478421449661255, 'learning_rate': 3.983221476510067e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15664/75000 [13:00<49:05, 20.15it/s]

{'loss': 0.2891, 'grad_norm': 1.8649847507476807, 'learning_rate': 3.9825503355704705e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15672/75000 [13:01<50:09, 19.71it/s]

{'loss': 0.3417, 'grad_norm': 4.319827079772949, 'learning_rate': 3.9818791946308727e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15682/75000 [13:01<49:34, 19.94it/s]

{'loss': 0.3818, 'grad_norm': 2.1838016510009766, 'learning_rate': 3.9812080536912755e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15694/75000 [13:02<49:02, 20.15it/s]

{'loss': 0.3675, 'grad_norm': 2.4612696170806885, 'learning_rate': 3.980536912751678e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15703/75000 [13:02<53:54, 18.33it/s]

{'loss': 0.3841, 'grad_norm': 4.264288425445557, 'learning_rate': 3.9798657718120805e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15712/75000 [13:03<53:16, 18.55it/s]

{'loss': 0.4353, 'grad_norm': 3.6729395389556885, 'learning_rate': 3.9791946308724834e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15723/75000 [13:03<53:03, 18.62it/s]

{'loss': 0.3396, 'grad_norm': 3.779757499694824, 'learning_rate': 3.978523489932886e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15733/75000 [13:04<54:25, 18.15it/s]

{'loss': 0.3539, 'grad_norm': 6.531002998352051, 'learning_rate': 3.977852348993289e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15744/75000 [13:04<53:00, 18.63it/s]

{'loss': 0.298, 'grad_norm': 3.977166175842285, 'learning_rate': 3.977181208053691e-05, 'epoch': 0.63}


                                                       
 21%|██        | 15754/75000 [13:05<54:23, 18.15it/s]

{'loss': 0.3984, 'grad_norm': 3.1363987922668457, 'learning_rate': 3.976510067114094e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15764/75000 [13:06<49:46, 19.83it/s]

{'loss': 0.3651, 'grad_norm': 5.594920635223389, 'learning_rate': 3.975838926174496e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15771/75000 [13:06<50:00, 19.74it/s]

{'loss': 0.3459, 'grad_norm': 4.901349067687988, 'learning_rate': 3.9751677852349e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15783/75000 [13:07<49:47, 19.82it/s]

{'loss': 0.3016, 'grad_norm': 1.3088228702545166, 'learning_rate': 3.974496644295303e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15793/75000 [13:07<49:25, 19.96it/s]

{'loss': 0.3937, 'grad_norm': 6.582912921905518, 'learning_rate': 3.973825503355705e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15802/75000 [13:07<48:48, 20.21it/s]

{'loss': 0.3158, 'grad_norm': 4.645223140716553, 'learning_rate': 3.973154362416108e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15812/75000 [13:08<56:08, 17.57it/s]

{'loss': 0.2824, 'grad_norm': 8.054442405700684, 'learning_rate': 3.97248322147651e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15824/75000 [13:09<51:31, 19.14it/s]

{'loss': 0.2856, 'grad_norm': 3.3586747646331787, 'learning_rate': 3.971812080536913e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15832/75000 [13:09<1:03:02, 15.64it/s]

{'loss': 0.4255, 'grad_norm': 5.993734359741211, 'learning_rate': 3.9711409395973156e-05, 'epoch': 0.63}


                                                       
 21%|██        | 15843/75000 [13:10<55:02, 17.91it/s]  

{'loss': 0.3377, 'grad_norm': 4.603627681732178, 'learning_rate': 3.9704697986577185e-05, 'epoch': 0.63}


                                                     
 21%|██        | 15853/75000 [13:10<58:15, 16.92it/s]

{'loss': 0.3346, 'grad_norm': 1.4522215127944946, 'learning_rate': 3.969798657718121e-05, 'epoch': 0.63}


                                                       
 21%|██        | 15863/75000 [13:11<1:00:25, 16.31it/s]

{'loss': 0.3615, 'grad_norm': 3.21789813041687, 'learning_rate': 3.9691275167785235e-05, 'epoch': 0.63}


                                                       
 21%|██        | 15873/75000 [13:12<1:06:03, 14.92it/s]

{'loss': 0.4199, 'grad_norm': 1.2316102981567383, 'learning_rate': 3.9684563758389264e-05, 'epoch': 0.63}


                                                       
 21%|██        | 15883/75000 [13:12<59:16, 16.62it/s]  

{'loss': 0.3131, 'grad_norm': 9.309048652648926, 'learning_rate': 3.9677852348993285e-05, 'epoch': 0.64}


                                                     
 21%|██        | 15894/75000 [13:13<50:55, 19.34it/s]

{'loss': 0.3382, 'grad_norm': 3.057199478149414, 'learning_rate': 3.967114093959732e-05, 'epoch': 0.64}


                                                       
 21%|██        | 15903/75000 [13:14<57:16, 17.20it/s]  

{'loss': 0.3029, 'grad_norm': 1.6494677066802979, 'learning_rate': 3.966442953020135e-05, 'epoch': 0.64}


                                                     
 21%|██        | 15913/75000 [13:14<52:13, 18.86it/s]

{'loss': 0.2938, 'grad_norm': 5.50629186630249, 'learning_rate': 3.965771812080537e-05, 'epoch': 0.64}


                                                     
 21%|██        | 15921/75000 [13:15<55:48, 17.64it/s]

{'loss': 0.2758, 'grad_norm': 1.030747413635254, 'learning_rate': 3.96510067114094e-05, 'epoch': 0.64}


                                                       
 21%|██        | 15934/75000 [13:15<53:00, 18.57it/s]  

{'loss': 0.238, 'grad_norm': 2.1604130268096924, 'learning_rate': 3.964429530201342e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 15941/75000 [13:16<51:45, 19.02it/s]

{'loss': 0.2391, 'grad_norm': 2.129812240600586, 'learning_rate': 3.963758389261745e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 15953/75000 [13:16<51:08, 19.24it/s]

{'loss': 0.2346, 'grad_norm': 6.649864673614502, 'learning_rate': 3.963087248322148e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 15962/75000 [13:17<52:21, 18.79it/s]

{'loss': 0.1889, 'grad_norm': 1.508144497871399, 'learning_rate': 3.962416107382551e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 15974/75000 [13:17<49:13, 19.98it/s]

{'loss': 0.2883, 'grad_norm': 5.452333450317383, 'learning_rate': 3.9617449664429536e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 15984/75000 [13:18<48:06, 20.45it/s]

{'loss': 0.5178, 'grad_norm': 8.441828727722168, 'learning_rate': 3.961073825503356e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 15992/75000 [13:18<52:01, 18.90it/s]

{'loss': 0.2069, 'grad_norm': 6.162206172943115, 'learning_rate': 3.9604026845637586e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16000/75000 [13:19<49:25, 19.90it/s]

{'loss': 0.2722, 'grad_norm': 12.308004379272461, 'learning_rate': 3.959731543624161e-05, 'epoch': 0.64}


                                                       
 21%|██▏       | 16014/75000 [13:20<58:39, 16.76it/s]  

{'loss': 0.3174, 'grad_norm': 12.98705005645752, 'learning_rate': 3.959060402684564e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16023/75000 [13:20<54:01, 18.19it/s]

{'loss': 0.338, 'grad_norm': 7.087746620178223, 'learning_rate': 3.9583892617449665e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16031/75000 [13:21<50:33, 19.44it/s]

{'loss': 0.4855, 'grad_norm': 5.134706974029541, 'learning_rate': 3.957718120805369e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16042/75000 [13:21<49:31, 19.84it/s]

{'loss': 0.492, 'grad_norm': 4.536740779876709, 'learning_rate': 3.957046979865772e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16054/75000 [13:22<51:02, 19.25it/s]

{'loss': 0.3389, 'grad_norm': 4.765866279602051, 'learning_rate': 3.9563758389261744e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16064/75000 [13:22<48:57, 20.06it/s]

{'loss': 0.3272, 'grad_norm': 3.6820340156555176, 'learning_rate': 3.955704697986577e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16074/75000 [13:23<48:10, 20.38it/s]

{'loss': 0.2482, 'grad_norm': 2.738131046295166, 'learning_rate': 3.95503355704698e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16081/75000 [13:23<53:03, 18.51it/s]

{'loss': 0.2791, 'grad_norm': 4.098954200744629, 'learning_rate': 3.954362416107383e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16093/75000 [13:24<49:19, 19.90it/s]

{'loss': 0.3679, 'grad_norm': 10.027972221374512, 'learning_rate': 3.953691275167785e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16104/75000 [13:24<48:23, 20.28it/s]

{'loss': 0.383, 'grad_norm': 8.220602035522461, 'learning_rate': 3.953020134228188e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16114/75000 [13:25<49:48, 19.70it/s]

{'loss': 0.3886, 'grad_norm': 6.589776515960693, 'learning_rate': 3.952348993288591e-05, 'epoch': 0.64}


                                                     
 21%|██▏       | 16124/75000 [13:25<48:28, 20.24it/s]

{'loss': 0.3346, 'grad_norm': 3.5613083839416504, 'learning_rate': 3.951677852348994e-05, 'epoch': 0.64}


                                                     
 22%|██▏       | 16132/75000 [13:26<52:40, 18.62it/s]

{'loss': 0.4234, 'grad_norm': 4.0619025230407715, 'learning_rate': 3.9510067114093965e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16142/75000 [13:26<50:03, 19.60it/s]

{'loss': 0.2592, 'grad_norm': 6.5808939933776855, 'learning_rate': 3.950335570469799e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16154/75000 [13:27<48:12, 20.34it/s]

{'loss': 0.3417, 'grad_norm': 2.2188680171966553, 'learning_rate': 3.9496644295302016e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16164/75000 [13:27<50:19, 19.48it/s]

{'loss': 0.335, 'grad_norm': 3.0604867935180664, 'learning_rate': 3.9489932885906044e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16173/75000 [13:28<48:19, 20.29it/s]

{'loss': 0.335, 'grad_norm': 2.2393174171447754, 'learning_rate': 3.9483221476510066e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16182/75000 [13:28<51:47, 18.93it/s]

{'loss': 0.3243, 'grad_norm': 1.9399398565292358, 'learning_rate': 3.9476510067114094e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16193/75000 [13:29<48:29, 20.21it/s]

{'loss': 0.3907, 'grad_norm': 2.5706398487091064, 'learning_rate': 3.946979865771812e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16204/75000 [13:29<48:52, 20.05it/s]

{'loss': 0.361, 'grad_norm': 11.642511367797852, 'learning_rate': 3.946308724832215e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16212/75000 [13:30<50:32, 19.38it/s]

{'loss': 0.3643, 'grad_norm': 12.507087707519531, 'learning_rate': 3.945637583892617e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16223/75000 [13:30<49:17, 19.87it/s]

{'loss': 0.2872, 'grad_norm': 4.1618876457214355, 'learning_rate': 3.94496644295302e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16232/75000 [13:31<50:06, 19.55it/s]

{'loss': 0.3247, 'grad_norm': 6.088038921356201, 'learning_rate': 3.944295302013423e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16243/75000 [13:31<48:37, 20.14it/s]

{'loss': 0.3354, 'grad_norm': 1.5475482940673828, 'learning_rate': 3.943624161073826e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16252/75000 [13:32<48:06, 20.36it/s]

{'loss': 0.3862, 'grad_norm': 5.5179829597473145, 'learning_rate': 3.942953020134229e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16264/75000 [13:33<48:53, 20.03it/s]

{'loss': 0.3882, 'grad_norm': 2.9865365028381348, 'learning_rate': 3.942281879194631e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16273/75000 [13:33<48:04, 20.36it/s]

{'loss': 0.4484, 'grad_norm': 6.101324558258057, 'learning_rate': 3.941610738255034e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16282/75000 [13:34<57:47, 16.94it/s]

{'loss': 0.3834, 'grad_norm': 2.5093233585357666, 'learning_rate': 3.940939597315436e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16294/75000 [13:34<49:58, 19.58it/s]

{'loss': 0.4506, 'grad_norm': 6.423631191253662, 'learning_rate': 3.940268456375839e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16302/75000 [13:34<48:58, 19.98it/s]

{'loss': 0.2681, 'grad_norm': 2.004823684692383, 'learning_rate': 3.939597315436242e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16313/75000 [13:35<50:14, 19.47it/s]

{'loss': 0.3363, 'grad_norm': 2.2129628658294678, 'learning_rate': 3.9389261744966445e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16324/75000 [13:36<47:55, 20.41it/s]

{'loss': 0.3934, 'grad_norm': 1.9093002080917358, 'learning_rate': 3.9382550335570474e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16333/75000 [13:36<50:25, 19.39it/s]

{'loss': 0.262, 'grad_norm': 3.3386545181274414, 'learning_rate': 3.9375838926174496e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16343/75000 [13:37<48:46, 20.04it/s]

{'loss': 0.365, 'grad_norm': 3.7058968544006348, 'learning_rate': 3.9369127516778524e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16354/75000 [13:37<47:59, 20.37it/s]

{'loss': 0.2723, 'grad_norm': 10.99953842163086, 'learning_rate': 3.936241610738255e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16362/75000 [13:38<49:32, 19.73it/s]

{'loss': 0.3052, 'grad_norm': 6.672173500061035, 'learning_rate': 3.935570469798658e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16373/75000 [13:38<48:18, 20.23it/s]

{'loss': 0.389, 'grad_norm': 11.196643829345703, 'learning_rate': 3.934899328859061e-05, 'epoch': 0.65}


                                                     
 22%|██▏       | 16382/75000 [13:39<51:11, 19.08it/s]

{'loss': 0.3673, 'grad_norm': 4.925920486450195, 'learning_rate': 3.934228187919463e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16391/75000 [13:39<48:33, 20.12it/s]

{'loss': 0.368, 'grad_norm': 3.5340235233306885, 'learning_rate': 3.933557046979866e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16403/75000 [13:40<48:59, 19.93it/s]

{'loss': 0.3284, 'grad_norm': 3.0820653438568115, 'learning_rate': 3.932885906040268e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16412/75000 [13:40<49:54, 19.56it/s]

{'loss': 0.2583, 'grad_norm': 2.227452039718628, 'learning_rate': 3.932214765100671e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16424/75000 [13:41<48:04, 20.31it/s]

{'loss': 0.3137, 'grad_norm': 0.8397834300994873, 'learning_rate': 3.9315436241610746e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16433/75000 [13:41<48:38, 20.06it/s]

{'loss': 0.3839, 'grad_norm': 4.966935634613037, 'learning_rate': 3.930872483221477e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16444/75000 [13:42<48:18, 20.20it/s]

{'loss': 0.3749, 'grad_norm': 3.465841054916382, 'learning_rate': 3.9302013422818796e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16453/75000 [13:42<48:46, 20.01it/s]

{'loss': 0.3793, 'grad_norm': 4.815028667449951, 'learning_rate': 3.929530201342282e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16462/75000 [13:43<48:17, 20.20it/s]

{'loss': 0.3628, 'grad_norm': 4.934194564819336, 'learning_rate': 3.9288590604026846e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16473/75000 [13:43<51:52, 18.80it/s]

{'loss': 0.2056, 'grad_norm': 5.891908645629883, 'learning_rate': 3.9281879194630875e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16481/75000 [13:44<49:34, 19.68it/s]

{'loss': 0.3389, 'grad_norm': 8.161933898925781, 'learning_rate': 3.9275167785234903e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16494/75000 [13:44<49:05, 19.86it/s]

{'loss': 0.4867, 'grad_norm': 15.639304161071777, 'learning_rate': 3.926845637583893e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16500/75000 [13:44<48:51, 19.95it/s]

{'loss': 0.4303, 'grad_norm': 1.4986820220947266, 'learning_rate': 3.9261744966442954e-05, 'epoch': 0.66}


                                                       
 22%|██▏       | 16511/75000 [13:48<2:16:24,  7.15it/s]

{'loss': 0.4055, 'grad_norm': 2.3253955841064453, 'learning_rate': 3.925503355704698e-05, 'epoch': 0.66}


                                                       
 22%|██▏       | 16522/75000 [13:49<1:07:40, 14.40it/s]

{'loss': 0.2289, 'grad_norm': 4.755637168884277, 'learning_rate': 3.9248322147651004e-05, 'epoch': 0.66}


                                                       
 22%|██▏       | 16531/75000 [13:49<56:48, 17.15it/s]

{'loss': 0.2905, 'grad_norm': 4.479889392852783, 'learning_rate': 3.924161073825503e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16541/75000 [13:50<54:50, 17.77it/s]

{'loss': 0.4299, 'grad_norm': 8.45938491821289, 'learning_rate': 3.923489932885907e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16554/75000 [13:50<48:54, 19.92it/s]

{'loss': 0.3145, 'grad_norm': 3.718578338623047, 'learning_rate': 3.922818791946309e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16562/75000 [13:51<48:20, 20.15it/s]

{'loss': 0.3238, 'grad_norm': 1.981581211090088, 'learning_rate': 3.922147651006712e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16574/75000 [13:51<51:49, 18.79it/s]

{'loss': 0.2902, 'grad_norm': 10.651285171508789, 'learning_rate': 3.921476510067114e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16581/75000 [13:52<50:31, 19.27it/s]

{'loss': 0.3452, 'grad_norm': 5.558358669281006, 'learning_rate': 3.920805369127517e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16594/75000 [13:52<48:19, 20.14it/s]

{'loss': 0.2547, 'grad_norm': 3.7824742794036865, 'learning_rate': 3.92013422818792e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16602/75000 [13:53<52:04, 18.69it/s]

{'loss': 0.3691, 'grad_norm': 1.471171259880066, 'learning_rate': 3.9194630872483226e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16613/75000 [13:53<53:28, 18.20it/s]

{'loss': 0.2877, 'grad_norm': 1.933104395866394, 'learning_rate': 3.9187919463087254e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16624/75000 [13:54<51:36, 18.85it/s]

{'loss': 0.3177, 'grad_norm': 3.234875440597534, 'learning_rate': 3.9181208053691276e-05, 'epoch': 0.66}


                                                     
 22%|██▏       | 16634/75000 [13:54<50:36, 19.22it/s]

{'loss': 0.3808, 'grad_norm': 7.066781520843506, 'learning_rate': 3.9174496644295305e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16644/75000 [13:55<48:19, 20.12it/s]

{'loss': 0.3512, 'grad_norm': 4.1660895347595215, 'learning_rate': 3.9167785234899326e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16654/75000 [13:56<50:44, 19.16it/s]

{'loss': 0.4843, 'grad_norm': 7.90034294128418, 'learning_rate': 3.916107382550336e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16664/75000 [13:56<48:59, 19.85it/s]

{'loss': 0.251, 'grad_norm': 15.658304214477539, 'learning_rate': 3.915436241610738e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16674/75000 [13:57<48:01, 20.24it/s]

{'loss': 0.2844, 'grad_norm': 9.128264427185059, 'learning_rate': 3.914765100671141e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16682/75000 [13:57<50:06, 19.39it/s]

{'loss': 0.2957, 'grad_norm': 0.30004191398620605, 'learning_rate': 3.914093959731544e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16694/75000 [13:58<47:48, 20.33it/s]

{'loss': 0.3303, 'grad_norm': 3.1478068828582764, 'learning_rate': 3.913422818791946e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16703/75000 [13:58<57:20, 16.95it/s]

{'loss': 0.4042, 'grad_norm': 3.9249136447906494, 'learning_rate': 3.912751677852349e-05, 'epoch': 0.67}


                                                       
 22%|██▏       | 16714/75000 [13:59<51:38, 18.81it/s]

{'loss': 0.3569, 'grad_norm': 4.544726371765137, 'learning_rate': 3.912080536912752e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16722/75000 [13:59<49:26, 19.64it/s]

{'loss': 0.439, 'grad_norm': 1.5954807996749878, 'learning_rate': 3.911409395973155e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16733/75000 [14:00<47:56, 20.26it/s]

{'loss': 0.3481, 'grad_norm': 3.3136045932769775, 'learning_rate': 3.9107382550335576e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16744/75000 [14:00<48:43, 19.93it/s]

{'loss': 0.3337, 'grad_norm': 6.086696624755859, 'learning_rate': 3.91006711409396e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16752/75000 [14:01<47:23, 20.49it/s]

{'loss': 0.3767, 'grad_norm': 3.250441789627075, 'learning_rate': 3.909395973154363e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16764/75000 [14:01<48:10, 20.15it/s]

{'loss': 0.3229, 'grad_norm': 9.2476224899292, 'learning_rate': 3.908724832214765e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16774/75000 [14:02<48:49, 19.87it/s]

{'loss': 0.3072, 'grad_norm': 3.1663146018981934, 'learning_rate': 3.9080536912751684e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16782/75000 [14:02<48:52, 19.86it/s]

{'loss': 0.3894, 'grad_norm': 1.3215420246124268, 'learning_rate': 3.9073825503355706e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16793/75000 [14:03<50:01, 19.39it/s]

{'loss': 0.238, 'grad_norm': 1.4183218479156494, 'learning_rate': 3.9067114093959734e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16804/75000 [14:03<52:20, 18.53it/s]

{'loss': 0.2652, 'grad_norm': 2.1301181316375732, 'learning_rate': 3.906040268456376e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16812/75000 [14:04<54:45, 17.71it/s]

{'loss': 0.284, 'grad_norm': 12.616991996765137, 'learning_rate': 3.9053691275167784e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16824/75000 [14:04<48:53, 19.83it/s]

{'loss': 0.4592, 'grad_norm': 1.2733479738235474, 'learning_rate': 3.904697986577181e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16833/75000 [14:05<53:53, 17.99it/s]

{'loss': 0.2971, 'grad_norm': 3.5769190788269043, 'learning_rate': 3.904026845637584e-05, 'epoch': 0.67}


                                                       
 22%|██▏       | 16843/75000 [14:06<58:20, 16.62it/s]

{'loss': 0.3901, 'grad_norm': 24.84037208557129, 'learning_rate': 3.903355704697987e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16853/75000 [14:06<53:49, 18.00it/s]

{'loss': 0.4738, 'grad_norm': 5.040326118469238, 'learning_rate': 3.902684563758389e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16863/75000 [14:07<51:37, 18.77it/s]

{'loss': 0.396, 'grad_norm': 3.2828822135925293, 'learning_rate': 3.902013422818792e-05, 'epoch': 0.67}


                                                     
 22%|██▏       | 16873/75000 [14:07<48:47, 19.86it/s]

{'loss': 0.4319, 'grad_norm': 3.2345950603485107, 'learning_rate': 3.901342281879195e-05, 'epoch': 0.67}


                                                     
 23%|██▎       | 16884/75000 [14:08<48:21, 20.03it/s]

{'loss': 0.3032, 'grad_norm': 6.15878963470459, 'learning_rate': 3.900671140939597e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16894/75000 [14:08<50:00, 19.36it/s]

{'loss': 0.3718, 'grad_norm': 2.8874828815460205, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16904/75000 [14:09<47:36, 20.34it/s]

{'loss': 0.4714, 'grad_norm': 4.992934703826904, 'learning_rate': 3.899328859060403e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16913/75000 [14:09<53:20, 18.15it/s]

{'loss': 0.2771, 'grad_norm': 9.845329284667969, 'learning_rate': 3.8986577181208056e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16921/75000 [14:10<50:24, 19.21it/s]

{'loss': 0.3127, 'grad_norm': 3.8628034591674805, 'learning_rate': 3.897986577181208e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16932/75000 [14:10<48:14, 20.06it/s]

{'loss': 0.2621, 'grad_norm': 2.174117088317871, 'learning_rate': 3.897315436241611e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16941/75000 [14:11<50:27, 19.18it/s]

{'loss': 0.3075, 'grad_norm': 3.555669069290161, 'learning_rate': 3.8966442953020135e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16954/75000 [14:11<48:00, 20.15it/s]

{'loss': 0.432, 'grad_norm': 2.2092363834381104, 'learning_rate': 3.8959731543624164e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16963/75000 [14:12<47:31, 20.35it/s]

{'loss': 0.3595, 'grad_norm': 2.5311264991760254, 'learning_rate': 3.895302013422819e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16973/75000 [14:12<53:18, 18.14it/s]

{'loss': 0.3095, 'grad_norm': 2.341256618499756, 'learning_rate': 3.8946308724832214e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16982/75000 [14:13<50:13, 19.25it/s]

{'loss': 0.301, 'grad_norm': 1.4221383333206177, 'learning_rate': 3.893959731543624e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 16994/75000 [14:13<48:44, 19.84it/s]

{'loss': 0.2842, 'grad_norm': 3.270024299621582, 'learning_rate': 3.893288590604027e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 17000/75000 [14:14<58:40, 16.48it/s]

{'loss': 0.2734, 'grad_norm': 2.9190242290496826, 'learning_rate': 3.89261744966443e-05, 'epoch': 0.68}


                                                       
 23%|██▎       | 17014/75000 [14:15<59:44, 16.18it/s]  

{'loss': 0.2794, 'grad_norm': 1.7570960521697998, 'learning_rate': 3.891946308724833e-05, 'epoch': 0.68}


                                                       
 23%|██▎       | 17024/75000 [14:16<53:07, 18.19it/s]  

{'loss': 0.3451, 'grad_norm': 4.790459632873535, 'learning_rate': 3.891275167785235e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 17032/75000 [14:16<57:34, 16.78it/s]

{'loss': 0.3614, 'grad_norm': 4.44948673248291, 'learning_rate': 3.890604026845638e-05, 'epoch': 0.68}


                                                       
 23%|██▎       | 17042/75000 [14:17<57:59, 16.66it/s]

{'loss': 0.2674, 'grad_norm': 1.1422181129455566, 'learning_rate': 3.88993288590604e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 17052/75000 [14:17<54:21, 17.77it/s]

{'loss': 0.2172, 'grad_norm': 8.305304527282715, 'learning_rate': 3.889261744966443e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 17062/75000 [14:18<57:02, 16.93it/s]

{'loss': 0.4285, 'grad_norm': 3.941624402999878, 'learning_rate': 3.888590604026846e-05, 'epoch': 0.68}


                                                       
 23%|██▎       | 17072/75000 [14:19<1:05:20, 14.77it/s]

{'loss': 0.3578, 'grad_norm': 4.210165977478027, 'learning_rate': 3.8879194630872486e-05, 'epoch': 0.68}


                                                       
 23%|██▎       | 17082/75000 [14:19<1:04:49, 14.89it/s]

{'loss': 0.3322, 'grad_norm': 3.785470962524414, 'learning_rate': 3.8872483221476515e-05, 'epoch': 0.68}


                                                       
 23%|██▎       | 17092/75000 [14:20<1:09:26, 13.90it/s]

{'loss': 0.363, 'grad_norm': 1.8478246927261353, 'learning_rate': 3.8865771812080536e-05, 'epoch': 0.68}


                                                       
 23%|██▎       | 17102/75000 [14:21<1:08:37, 14.06it/s]

{'loss': 0.2936, 'grad_norm': 1.8038527965545654, 'learning_rate': 3.8859060402684565e-05, 'epoch': 0.68}


                                                       
 23%|██▎       | 17112/75000 [14:21<57:28, 16.79it/s]

{'loss': 0.4723, 'grad_norm': 1.4882562160491943, 'learning_rate': 3.885234899328859e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 17124/75000 [14:22<48:54, 19.72it/s]

{'loss': 0.2458, 'grad_norm': 2.20849609375, 'learning_rate': 3.884563758389262e-05, 'epoch': 0.68}


                                                     
 23%|██▎       | 17134/75000 [14:22<49:46, 19.38it/s]

{'loss': 0.3992, 'grad_norm': 1.875380039215088, 'learning_rate': 3.883892617449665e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17144/75000 [14:23<49:40, 19.41it/s]

{'loss': 0.3837, 'grad_norm': 6.097884654998779, 'learning_rate': 3.883221476510067e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17152/75000 [14:23<51:58, 18.55it/s]

{'loss': 0.4445, 'grad_norm': 4.854001522064209, 'learning_rate': 3.88255033557047e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17162/75000 [14:24<48:32, 19.86it/s]

{'loss': 0.3829, 'grad_norm': 2.996053695678711, 'learning_rate': 3.881879194630872e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17174/75000 [14:24<47:23, 20.33it/s]

{'loss': 0.2436, 'grad_norm': 1.6274008750915527, 'learning_rate': 3.881208053691275e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17184/75000 [14:25<49:18, 19.55it/s]

{'loss': 0.3741, 'grad_norm': 3.3184120655059814, 'learning_rate': 3.880536912751678e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17194/75000 [14:25<47:38, 20.22it/s]

{'loss': 0.3098, 'grad_norm': 2.1776909828186035, 'learning_rate': 3.879865771812081e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17202/75000 [14:26<49:28, 19.47it/s]

{'loss': 0.4157, 'grad_norm': 1.5506213903427124, 'learning_rate': 3.879194630872484e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17212/75000 [14:26<48:25, 19.89it/s]

{'loss': 0.3604, 'grad_norm': 0.9591404795646667, 'learning_rate': 3.878523489932886e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17224/75000 [14:27<47:01, 20.48it/s]

{'loss': 0.2741, 'grad_norm': 3.2018299102783203, 'learning_rate': 3.877852348993289e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17234/75000 [14:27<48:46, 19.74it/s]

{'loss': 0.2729, 'grad_norm': 3.5051257610321045, 'learning_rate': 3.877181208053691e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17244/75000 [14:28<47:14, 20.37it/s]

{'loss': 0.3937, 'grad_norm': 3.769132137298584, 'learning_rate': 3.8765100671140944e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17252/75000 [14:28<49:13, 19.55it/s]

{'loss': 0.4621, 'grad_norm': 11.024796485900879, 'learning_rate': 3.875838926174497e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17262/75000 [14:29<47:57, 20.06it/s]

{'loss': 0.341, 'grad_norm': 4.599066734313965, 'learning_rate': 3.8751677852348995e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17274/75000 [14:29<47:24, 20.29it/s]

{'loss': 0.2591, 'grad_norm': 1.663162350654602, 'learning_rate': 3.874496644295302e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17283/75000 [14:30<49:52, 19.29it/s]

{'loss': 0.4283, 'grad_norm': 3.6457884311676025, 'learning_rate': 3.8738255033557045e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17293/75000 [14:30<48:18, 19.91it/s]

{'loss': 0.2682, 'grad_norm': 2.7389144897460938, 'learning_rate': 3.8731543624161073e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17303/75000 [14:31<48:32, 19.81it/s]

{'loss': 0.2689, 'grad_norm': 8.391770362854004, 'learning_rate': 3.87248322147651e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17313/75000 [14:31<49:47, 19.31it/s]

{'loss': 0.291, 'grad_norm': 3.0845980644226074, 'learning_rate': 3.871812080536913e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17321/75000 [14:32<48:32, 19.81it/s]

{'loss': 0.3589, 'grad_norm': 1.8735462427139282, 'learning_rate': 3.871140939597316e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17332/75000 [14:32<50:45, 18.94it/s]

{'loss': 0.3054, 'grad_norm': 2.0824949741363525, 'learning_rate': 3.870469798657718e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17343/75000 [14:33<47:44, 20.13it/s]

{'loss': 0.3496, 'grad_norm': 2.226006031036377, 'learning_rate': 3.869798657718121e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17352/75000 [14:33<50:07, 19.17it/s]

{'loss': 0.3835, 'grad_norm': 2.0633044242858887, 'learning_rate': 3.869127516778524e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17362/75000 [14:34<50:53, 18.87it/s]

{'loss': 0.3735, 'grad_norm': 3.5885753631591797, 'learning_rate': 3.8684563758389266e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17372/75000 [14:35<50:11, 19.14it/s]

{'loss': 0.2832, 'grad_norm': 2.444938898086548, 'learning_rate': 3.8677852348993295e-05, 'epoch': 0.69}


                                                     
 23%|██▎       | 17382/75000 [14:35<48:07, 19.96it/s]

{'loss': 0.3391, 'grad_norm': 5.045252323150635, 'learning_rate': 3.867114093959732e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17394/75000 [14:36<49:58, 19.21it/s]

{'loss': 0.2534, 'grad_norm': 4.266450881958008, 'learning_rate': 3.8664429530201345e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17403/75000 [14:36<49:19, 19.46it/s]

{'loss': 0.4145, 'grad_norm': 1.9511327743530273, 'learning_rate': 3.865771812080537e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17413/75000 [14:37<47:36, 20.16it/s]

{'loss': 0.4045, 'grad_norm': 8.9890718460083, 'learning_rate': 3.8651006711409396e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17422/75000 [14:37<49:46, 19.28it/s]

{'loss': 0.3005, 'grad_norm': 4.90692663192749, 'learning_rate': 3.8644295302013424e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17433/75000 [14:38<47:13, 20.32it/s]

{'loss': 0.4216, 'grad_norm': 5.820308685302734, 'learning_rate': 3.863758389261745e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17442/75000 [14:38<47:32, 20.18it/s]

{'loss': 0.2566, 'grad_norm': 6.972092628479004, 'learning_rate': 3.863087248322148e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17454/75000 [14:39<48:27, 19.79it/s]

{'loss': 0.4228, 'grad_norm': 0.6584563255310059, 'learning_rate': 3.86241610738255e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17462/75000 [14:39<49:07, 19.52it/s]

{'loss': 0.3053, 'grad_norm': 3.355687379837036, 'learning_rate': 3.861744966442953e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17472/75000 [14:40<49:59, 19.18it/s]

{'loss': 0.4383, 'grad_norm': 0.37376585602760315, 'learning_rate': 3.861073825503356e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17484/75000 [14:40<49:46, 19.26it/s]

{'loss': 0.3537, 'grad_norm': 3.189310312271118, 'learning_rate': 3.860402684563759e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17492/75000 [14:41<47:51, 20.03it/s]

{'loss': 0.3264, 'grad_norm': 1.1062699556350708, 'learning_rate': 3.859731543624161e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17500/75000 [14:41<48:14, 19.86it/s]

{'loss': 0.3244, 'grad_norm': 2.110797643661499, 'learning_rate': 3.859060402684564e-05, 'epoch': 0.7}


                                                       
 23%|██▎       | 17514/75000 [14:42<53:40, 17.85it/s]  

{'loss': 0.3142, 'grad_norm': 19.05284309387207, 'learning_rate': 3.858389261744967e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17524/75000 [14:43<48:07, 19.90it/s]

{'loss': 0.3331, 'grad_norm': 2.461458444595337, 'learning_rate': 3.857718120805369e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17533/75000 [14:43<48:45, 19.65it/s]

{'loss': 0.2984, 'grad_norm': 9.518694877624512, 'learning_rate': 3.857046979865772e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17543/75000 [14:44<49:17, 19.43it/s]

{'loss': 0.4681, 'grad_norm': 2.4530506134033203, 'learning_rate': 3.8563758389261746e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17554/75000 [14:44<49:38, 19.29it/s]

{'loss': 0.4024, 'grad_norm': 4.883635520935059, 'learning_rate': 3.8557046979865775e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17562/75000 [14:45<47:47, 20.03it/s]

{'loss': 0.2467, 'grad_norm': 3.878225088119507, 'learning_rate': 3.8550335570469804e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17574/75000 [14:45<47:26, 20.18it/s]

{'loss': 0.5138, 'grad_norm': 6.448050498962402, 'learning_rate': 3.8543624161073825e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17583/75000 [14:46<48:26, 19.75it/s]

{'loss': 0.3318, 'grad_norm': 7.013440132141113, 'learning_rate': 3.8536912751677854e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17591/75000 [14:46<48:36, 19.69it/s]

{'loss': 0.2654, 'grad_norm': 4.461535453796387, 'learning_rate': 3.853020134228188e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17603/75000 [14:47<46:42, 20.48it/s]

{'loss': 0.2524, 'grad_norm': 13.464466094970703, 'learning_rate': 3.852348993288591e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17612/75000 [14:47<52:15, 18.31it/s]

{'loss': 0.3287, 'grad_norm': 4.223664283752441, 'learning_rate': 3.851677852348993e-05, 'epoch': 0.7}


                                                     
 23%|██▎       | 17622/75000 [14:48<47:54, 19.96it/s]

{'loss': 0.3212, 'grad_norm': 3.7311296463012695, 'learning_rate': 3.851006711409396e-05, 'epoch': 0.7}


                                                     
 24%|██▎       | 17634/75000 [14:48<48:42, 19.63it/s]

{'loss': 0.3555, 'grad_norm': 14.90192985534668, 'learning_rate': 3.850335570469799e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17644/75000 [14:49<49:47, 19.20it/s]

{'loss': 0.4179, 'grad_norm': 3.001377820968628, 'learning_rate': 3.849664429530201e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17654/75000 [14:49<49:16, 19.40it/s]

{'loss': 0.3925, 'grad_norm': 2.016097068786621, 'learning_rate': 3.848993288590605e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17662/75000 [14:50<51:54, 18.41it/s]

{'loss': 0.3224, 'grad_norm': 4.026790618896484, 'learning_rate': 3.848322147651007e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17674/75000 [14:50<47:53, 19.95it/s]

{'loss': 0.3672, 'grad_norm': 8.55871868133545, 'learning_rate': 3.84765100671141e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17684/75000 [14:51<48:40, 19.63it/s]

{'loss': 0.3679, 'grad_norm': 5.287322998046875, 'learning_rate': 3.846979865771812e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17691/75000 [14:51<50:02, 19.09it/s]

{'loss': 0.2729, 'grad_norm': 3.172351598739624, 'learning_rate': 3.846308724832215e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17704/75000 [14:52<47:41, 20.02it/s]

{'loss': 0.3166, 'grad_norm': 9.833014488220215, 'learning_rate': 3.8456375838926176e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17711/75000 [14:52<49:35, 19.26it/s]

{'loss': 0.3335, 'grad_norm': 7.004562854766846, 'learning_rate': 3.8449664429530205e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17724/75000 [14:53<48:26, 19.70it/s]

{'loss': 0.3476, 'grad_norm': 1.8134154081344604, 'learning_rate': 3.844295302013423e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17734/75000 [14:54<47:22, 20.15it/s]

{'loss': 0.3977, 'grad_norm': 4.256462097167969, 'learning_rate': 3.8436241610738255e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17743/75000 [14:54<48:40, 19.60it/s]

{'loss': 0.4162, 'grad_norm': 2.4513003826141357, 'learning_rate': 3.8429530201342283e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17754/75000 [14:55<49:50, 19.14it/s]

{'loss': 0.3609, 'grad_norm': 5.443058013916016, 'learning_rate': 3.8422818791946305e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17764/75000 [14:55<48:27, 19.68it/s]

{'loss': 0.2637, 'grad_norm': 1.487196922302246, 'learning_rate': 3.8416107382550334e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17774/75000 [14:56<48:00, 19.87it/s]

{'loss': 0.2962, 'grad_norm': 7.180136680603027, 'learning_rate': 3.840939597315437e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17784/75000 [14:56<49:00, 19.46it/s]

{'loss': 0.3079, 'grad_norm': 12.984745025634766, 'learning_rate': 3.840268456375839e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17792/75000 [14:57<47:15, 20.17it/s]

{'loss': 0.2837, 'grad_norm': 5.793390274047852, 'learning_rate': 3.839597315436242e-05, 'epoch': 0.71}


                                                     
 24%|██▎       | 17803/75000 [14:57<47:47, 19.94it/s]

{'loss': 0.2652, 'grad_norm': 6.771416664123535, 'learning_rate': 3.838926174496644e-05, 'epoch': 0.71}


                                                     
 24%|██▍       | 17813/75000 [14:58<47:57, 19.88it/s]

{'loss': 0.3685, 'grad_norm': 5.6757636070251465, 'learning_rate': 3.838255033557047e-05, 'epoch': 0.71}


                                                     
 24%|██▍       | 17822/75000 [14:58<47:06, 20.23it/s]

{'loss': 0.3908, 'grad_norm': 5.10586404800415, 'learning_rate': 3.83758389261745e-05, 'epoch': 0.71}


                                                     
 24%|██▍       | 17834/75000 [14:59<53:20, 17.86it/s]

{'loss': 0.2749, 'grad_norm': 2.8936848640441895, 'learning_rate': 3.836912751677853e-05, 'epoch': 0.71}


                                                     
 24%|██▍       | 17844/75000 [14:59<48:42, 19.55it/s]

{'loss': 0.3511, 'grad_norm': 4.279341697692871, 'learning_rate': 3.8362416107382555e-05, 'epoch': 0.71}


                                                     
 24%|██▍       | 17852/75000 [15:00<47:16, 20.15it/s]

{'loss': 0.43, 'grad_norm': 7.702054500579834, 'learning_rate': 3.835570469798658e-05, 'epoch': 0.71}


                                                     
 24%|██▍       | 17864/75000 [15:00<49:14, 19.34it/s]

{'loss': 0.2569, 'grad_norm': 3.802255630493164, 'learning_rate': 3.8348993288590606e-05, 'epoch': 0.71}


                                                     
 24%|██▍       | 17874/75000 [15:01<47:02, 20.24it/s]

{'loss': 0.383, 'grad_norm': 4.997523784637451, 'learning_rate': 3.834228187919463e-05, 'epoch': 0.71}


                                                     
 24%|██▍       | 17883/75000 [15:01<47:22, 20.09it/s]

{'loss': 0.2859, 'grad_norm': 9.732651710510254, 'learning_rate': 3.833557046979866e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17894/75000 [15:02<48:16, 19.72it/s]

{'loss': 0.3216, 'grad_norm': 7.359094142913818, 'learning_rate': 3.832885906040269e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17904/75000 [15:02<47:14, 20.14it/s]

{'loss': 0.2879, 'grad_norm': 4.631434917449951, 'learning_rate': 3.832214765100671e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17913/75000 [15:03<47:04, 20.21it/s]

{'loss': 0.354, 'grad_norm': 13.937335968017578, 'learning_rate': 3.831543624161074e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17924/75000 [15:03<50:03, 19.00it/s]

{'loss': 0.3185, 'grad_norm': 6.323152542114258, 'learning_rate': 3.8308724832214763e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17933/75000 [15:04<49:52, 19.07it/s]

{'loss': 0.2248, 'grad_norm': 6.263065338134766, 'learning_rate': 3.830201342281879e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17944/75000 [15:04<48:21, 19.67it/s]

{'loss': 0.3885, 'grad_norm': 7.285624027252197, 'learning_rate': 3.829530201342282e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17954/75000 [15:05<48:26, 19.63it/s]

{'loss': 0.2911, 'grad_norm': 3.5779969692230225, 'learning_rate': 3.828859060402685e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17964/75000 [15:06<49:41, 19.13it/s]

{'loss': 0.3506, 'grad_norm': 4.258570194244385, 'learning_rate': 3.828187919463088e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17971/75000 [15:06<50:31, 18.81it/s]

{'loss': 0.3467, 'grad_norm': 1.471948504447937, 'learning_rate': 3.82751677852349e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17984/75000 [15:07<47:43, 19.91it/s]

{'loss': 0.2578, 'grad_norm': 2.752133846282959, 'learning_rate': 3.826845637583893e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 17993/75000 [15:07<48:03, 19.77it/s]

{'loss': 0.4123, 'grad_norm': 2.2070603370666504, 'learning_rate': 3.826174496644295e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18000/75000 [15:07<49:51, 19.06it/s]

{'loss': 0.2775, 'grad_norm': 5.696508407592773, 'learning_rate': 3.8255033557046985e-05, 'epoch': 0.72}


                                                       
 24%|██▍       | 18014/75000 [15:09<59:59, 15.83it/s]  

{'loss': 0.4128, 'grad_norm': 4.964457035064697, 'learning_rate': 3.8248322147651014e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18022/75000 [15:09<53:44, 17.67it/s]

{'loss': 0.3614, 'grad_norm': 2.0170257091522217, 'learning_rate': 3.8241610738255035e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18034/75000 [15:10<49:06, 19.33it/s]

{'loss': 0.3569, 'grad_norm': 11.404814720153809, 'learning_rate': 3.8234899328859064e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18044/75000 [15:10<46:54, 20.24it/s]

{'loss': 0.2603, 'grad_norm': 3.1302027702331543, 'learning_rate': 3.8228187919463086e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18054/75000 [15:11<49:03, 19.35it/s]

{'loss': 0.3829, 'grad_norm': 1.6566506624221802, 'learning_rate': 3.8221476510067114e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18061/75000 [15:11<48:29, 19.57it/s]

{'loss': 0.3004, 'grad_norm': 8.178460121154785, 'learning_rate': 3.821476510067114e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18072/75000 [15:12<46:47, 20.28it/s]

{'loss': 0.2567, 'grad_norm': 3.7955451011657715, 'learning_rate': 3.820805369127517e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18083/75000 [15:12<48:29, 19.56it/s]

{'loss': 0.3433, 'grad_norm': 2.800917148590088, 'learning_rate': 3.82013422818792e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18094/75000 [15:13<49:19, 19.23it/s]

{'loss': 0.4613, 'grad_norm': 2.52075457572937, 'learning_rate': 3.819463087248322e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18104/75000 [15:13<48:59, 19.36it/s]

{'loss': 0.4043, 'grad_norm': 4.351677417755127, 'learning_rate': 3.818791946308725e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18114/75000 [15:14<48:12, 19.67it/s]

{'loss': 0.4377, 'grad_norm': 2.9639792442321777, 'learning_rate': 3.818120805369127e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18124/75000 [15:15<49:10, 19.28it/s]

{'loss': 0.2889, 'grad_norm': 2.664248466491699, 'learning_rate': 3.817449664429531e-05, 'epoch': 0.72}


                                                     
 24%|██▍       | 18132/75000 [15:15<48:30, 19.54it/s]

{'loss': 0.3359, 'grad_norm': 2.098451614379883, 'learning_rate': 3.816778523489933e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18141/75000 [15:15<50:32, 18.75it/s]

{'loss': 0.3385, 'grad_norm': 2.100832939147949, 'learning_rate': 3.816107382550336e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18153/75000 [15:16<49:48, 19.02it/s]

{'loss': 0.3257, 'grad_norm': 3.820016622543335, 'learning_rate': 3.8154362416107386e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18162/75000 [15:17<49:23, 19.18it/s]

{'loss': 0.3911, 'grad_norm': 5.4792561531066895, 'learning_rate': 3.814765100671141e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18171/75000 [15:17<47:53, 19.78it/s]

{'loss': 0.2928, 'grad_norm': 3.17362904548645, 'learning_rate': 3.8140939597315436e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18184/75000 [15:18<47:01, 20.14it/s]

{'loss': 0.2832, 'grad_norm': 1.3039791584014893, 'learning_rate': 3.8134228187919465e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18193/75000 [15:18<47:33, 19.91it/s]

{'loss': 0.3556, 'grad_norm': 1.894869327545166, 'learning_rate': 3.8127516778523494e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18204/75000 [15:19<48:36, 19.48it/s]

{'loss': 0.2428, 'grad_norm': 0.47219860553741455, 'learning_rate': 3.812080536912752e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18213/75000 [15:19<47:14, 20.03it/s]

{'loss': 0.3467, 'grad_norm': 35.66596984863281, 'learning_rate': 3.8114093959731544e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18222/75000 [15:20<46:49, 20.21it/s]

{'loss': 0.4408, 'grad_norm': 4.540276050567627, 'learning_rate': 3.810738255033557e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18232/75000 [15:20<48:43, 19.42it/s]

{'loss': 0.4832, 'grad_norm': 1.5877526998519897, 'learning_rate': 3.81006711409396e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18244/75000 [15:21<47:59, 19.71it/s]

{'loss': 0.4058, 'grad_norm': 2.419186592102051, 'learning_rate': 3.809395973154363e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18252/75000 [15:21<47:46, 19.79it/s]

{'loss': 0.3394, 'grad_norm': 1.842240810394287, 'learning_rate': 3.808724832214765e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18261/75000 [15:22<51:42, 18.29it/s]

{'loss': 0.3498, 'grad_norm': 2.861509084701538, 'learning_rate': 3.808053691275168e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18273/75000 [15:22<47:18, 19.98it/s]

{'loss': 0.4094, 'grad_norm': 18.85981559753418, 'learning_rate': 3.807382550335571e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18282/75000 [15:23<46:46, 20.21it/s]

{'loss': 0.3312, 'grad_norm': 0.9873117208480835, 'learning_rate': 3.806711409395973e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18293/75000 [15:23<51:43, 18.27it/s]

{'loss': 0.3468, 'grad_norm': 5.074483871459961, 'learning_rate': 3.806040268456376e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18304/75000 [15:24<47:26, 19.92it/s]

{'loss': 0.3925, 'grad_norm': 5.010201930999756, 'learning_rate': 3.805369127516779e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18312/75000 [15:24<52:42, 17.93it/s]

{'loss': 0.2954, 'grad_norm': 5.160764217376709, 'learning_rate': 3.8046979865771816e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18324/75000 [15:25<50:15, 18.79it/s]

{'loss': 0.4045, 'grad_norm': 7.16989278793335, 'learning_rate': 3.804026845637584e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18332/75000 [15:25<47:36, 19.84it/s]

{'loss': 0.4534, 'grad_norm': 1.6118545532226562, 'learning_rate': 3.8033557046979866e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18342/75000 [15:26<48:42, 19.38it/s]

{'loss': 0.3247, 'grad_norm': 3.5059423446655273, 'learning_rate': 3.8026845637583895e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18353/75000 [15:27<49:15, 19.17it/s]

{'loss': 0.3627, 'grad_norm': 2.0956549644470215, 'learning_rate': 3.802013422818792e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18364/75000 [15:27<47:07, 20.03it/s]

{'loss': 0.3231, 'grad_norm': 2.045278787612915, 'learning_rate': 3.801342281879195e-05, 'epoch': 0.73}


                                                     
 24%|██▍       | 18373/75000 [15:28<47:40, 19.79it/s]

{'loss': 0.2974, 'grad_norm': 1.919677972793579, 'learning_rate': 3.8006711409395974e-05, 'epoch': 0.73}


                                                     
 25%|██▍       | 18384/75000 [15:28<46:41, 20.21it/s]

{'loss': 0.2399, 'grad_norm': 5.91465425491333, 'learning_rate': 3.8e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18394/75000 [15:29<50:53, 18.54it/s]

{'loss': 0.3426, 'grad_norm': 9.50733470916748, 'learning_rate': 3.7993288590604024e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18404/75000 [15:29<48:00, 19.65it/s]

{'loss': 0.4189, 'grad_norm': 1.1964595317840576, 'learning_rate': 3.798657718120805e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18414/75000 [15:30<47:29, 19.86it/s]

{'loss': 0.3123, 'grad_norm': 2.6139395236968994, 'learning_rate': 3.797986577181208e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18423/75000 [15:30<50:37, 18.63it/s]

{'loss': 0.3916, 'grad_norm': 4.271893501281738, 'learning_rate': 3.797315436241611e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18432/75000 [15:31<49:12, 19.16it/s]

{'loss': 0.2879, 'grad_norm': 3.162911891937256, 'learning_rate': 3.796644295302014e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18442/75000 [15:31<49:20, 19.11it/s]

{'loss': 0.2451, 'grad_norm': 5.365337371826172, 'learning_rate': 3.795973154362416e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18454/75000 [15:32<50:07, 18.80it/s]

{'loss': 0.4719, 'grad_norm': 6.395249366760254, 'learning_rate': 3.795302013422819e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18462/75000 [15:32<49:34, 19.01it/s]

{'loss': 0.3969, 'grad_norm': 5.769904613494873, 'learning_rate': 3.794630872483222e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18473/75000 [15:33<51:09, 18.41it/s]

{'loss': 0.2251, 'grad_norm': 4.63784646987915, 'learning_rate': 3.7939597315436245e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18484/75000 [15:33<49:05, 19.19it/s]

{'loss': 0.298, 'grad_norm': 9.73653507232666, 'learning_rate': 3.7932885906040274e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18492/75000 [15:34<56:51, 16.56it/s]

{'loss': 0.2589, 'grad_norm': 6.396312236785889, 'learning_rate': 3.7926174496644296e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18500/75000 [15:34<50:57, 18.48it/s]

{'loss': 0.279, 'grad_norm': 6.683307647705078, 'learning_rate': 3.7919463087248324e-05, 'epoch': 0.74}


                                                       
 25%|██▍       | 18513/75000 [15:36<59:36, 15.79it/s]  

{'loss': 0.42, 'grad_norm': 4.6974310874938965, 'learning_rate': 3.7912751677852346e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18522/75000 [15:36<50:57, 18.47it/s]

{'loss': 0.4182, 'grad_norm': 3.8495757579803467, 'learning_rate': 3.7906040268456375e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18534/75000 [15:37<49:50, 18.88it/s]

{'loss': 0.2485, 'grad_norm': 2.4989867210388184, 'learning_rate': 3.789932885906041e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18544/75000 [15:37<49:22, 19.06it/s]

{'loss': 0.2867, 'grad_norm': 3.5380876064300537, 'learning_rate': 3.789261744966443e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18554/75000 [15:38<47:20, 19.87it/s]

{'loss': 0.3109, 'grad_norm': 5.152469635009766, 'learning_rate': 3.788590604026846e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18563/75000 [15:38<52:38, 17.87it/s]

{'loss': 0.3644, 'grad_norm': 2.196301221847534, 'learning_rate': 3.787919463087248e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18573/75000 [15:39<49:09, 19.13it/s]

{'loss': 0.3017, 'grad_norm': 2.8412699699401855, 'learning_rate': 3.787248322147651e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18582/75000 [15:39<48:45, 19.28it/s]

{'loss': 0.267, 'grad_norm': 1.5184500217437744, 'learning_rate': 3.786577181208054e-05, 'epoch': 0.74}


                                                       
 25%|██▍       | 18594/75000 [15:40<49:34, 18.96it/s]

{'loss': 0.3541, 'grad_norm': 7.5086350440979, 'learning_rate': 3.785906040268457e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18603/75000 [15:40<50:55, 18.45it/s]

{'loss': 0.3239, 'grad_norm': 2.811075448989868, 'learning_rate': 3.7852348993288596e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18614/75000 [15:41<50:19, 18.68it/s]

{'loss': 0.2672, 'grad_norm': 6.974973201751709, 'learning_rate': 3.784563758389262e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18623/75000 [15:41<49:21, 19.04it/s]

{'loss': 0.2238, 'grad_norm': 5.9094767570495605, 'learning_rate': 3.7838926174496647e-05, 'epoch': 0.74}


                                                     
 25%|██▍       | 18634/75000 [15:42<47:36, 19.74it/s]

{'loss': 0.2901, 'grad_norm': 1.8379192352294922, 'learning_rate': 3.783221476510067e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18642/75000 [15:42<52:46, 17.80it/s]

{'loss': 0.3344, 'grad_norm': 3.7620368003845215, 'learning_rate': 3.78255033557047e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18653/75000 [15:43<49:55, 18.81it/s]

{'loss': 0.4214, 'grad_norm': 4.239704608917236, 'learning_rate': 3.781879194630873e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18664/75000 [15:44<50:07, 18.73it/s]

{'loss': 0.3093, 'grad_norm': 5.370096683502197, 'learning_rate': 3.7812080536912754e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18673/75000 [15:44<52:09, 18.00it/s]

{'loss': 0.3369, 'grad_norm': 5.577511787414551, 'learning_rate': 3.780536912751678e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18681/75000 [15:45<48:56, 19.18it/s]

{'loss': 0.3754, 'grad_norm': 2.4831390380859375, 'learning_rate': 3.7798657718120804e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18691/75000 [15:45<51:03, 18.38it/s]

{'loss': 0.3446, 'grad_norm': 1.754118800163269, 'learning_rate': 3.779194630872483e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18703/75000 [15:46<50:11, 18.69it/s]

{'loss': 0.3809, 'grad_norm': 2.831063747406006, 'learning_rate': 3.778523489932886e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18712/75000 [15:46<49:46, 18.85it/s]

{'loss': 0.3597, 'grad_norm': 1.75877046585083, 'learning_rate': 3.777852348993289e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18722/75000 [15:47<47:15, 19.85it/s]

{'loss': 0.3734, 'grad_norm': 2.807724952697754, 'learning_rate': 3.777181208053692e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18733/75000 [15:47<47:30, 19.74it/s]

{'loss': 0.364, 'grad_norm': 15.863030433654785, 'learning_rate': 3.776510067114094e-05, 'epoch': 0.75}


                                                     
 25%|██▍       | 18744/75000 [15:48<46:11, 20.30it/s]

{'loss': 0.3984, 'grad_norm': 1.944108009338379, 'learning_rate': 3.775838926174497e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18753/75000 [15:48<46:31, 20.15it/s]

{'loss': 0.3015, 'grad_norm': 3.4116477966308594, 'learning_rate': 3.775167785234899e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18763/75000 [15:49<49:37, 18.89it/s]

{'loss': 0.3637, 'grad_norm': 9.858861923217773, 'learning_rate': 3.7744966442953026e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18774/75000 [15:49<46:33, 20.12it/s]

{'loss': 0.3779, 'grad_norm': 5.068782329559326, 'learning_rate': 3.773825503355705e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18783/75000 [15:50<46:23, 20.20it/s]

{'loss': 0.3017, 'grad_norm': 2.1033618450164795, 'learning_rate': 3.7731543624161076e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18792/75000 [15:50<48:46, 19.20it/s]

{'loss': 0.3176, 'grad_norm': 3.9077084064483643, 'learning_rate': 3.7724832214765105e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18802/75000 [15:51<46:58, 19.94it/s]

{'loss': 0.3036, 'grad_norm': 3.428455114364624, 'learning_rate': 3.7718120805369127e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18813/75000 [15:51<49:07, 19.07it/s]

{'loss': 0.3266, 'grad_norm': 3.0088446140289307, 'learning_rate': 3.7711409395973155e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18823/75000 [15:52<48:03, 19.48it/s]

{'loss': 0.2945, 'grad_norm': 0.8234065175056458, 'learning_rate': 3.7704697986577184e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18833/75000 [15:52<47:07, 19.87it/s]

{'loss': 0.3808, 'grad_norm': 1.5842573642730713, 'learning_rate': 3.769798657718121e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18844/75000 [15:53<48:17, 19.38it/s]

{'loss': 0.2873, 'grad_norm': 0.9961212277412415, 'learning_rate': 3.769127516778524e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18854/75000 [15:53<49:14, 19.00it/s]

{'loss': 0.3561, 'grad_norm': 1.9859133958816528, 'learning_rate': 3.768456375838926e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18862/75000 [15:54<54:34, 17.14it/s]

{'loss': 0.4299, 'grad_norm': 23.948875427246094, 'learning_rate': 3.767785234899329e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18872/75000 [15:54<48:28, 19.30it/s]

{'loss': 0.3408, 'grad_norm': 2.977423667907715, 'learning_rate': 3.767114093959731e-05, 'epoch': 0.75}


                                                     
 25%|██▌       | 18883/75000 [15:55<46:13, 20.23it/s]

{'loss': 0.2422, 'grad_norm': 1.6002099514007568, 'learning_rate': 3.766442953020135e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18892/75000 [15:55<49:38, 18.84it/s]

{'loss': 0.4124, 'grad_norm': 6.482565879821777, 'learning_rate': 3.765771812080537e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18904/75000 [15:56<46:05, 20.28it/s]

{'loss': 0.3312, 'grad_norm': 2.249972105026245, 'learning_rate': 3.76510067114094e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18914/75000 [15:57<49:18, 18.96it/s]

{'loss': 0.4638, 'grad_norm': 6.869584560394287, 'learning_rate': 3.764429530201343e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18924/75000 [15:57<46:57, 19.90it/s]

{'loss': 0.3131, 'grad_norm': 10.103538513183594, 'learning_rate': 3.763758389261745e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18931/75000 [15:58<49:43, 18.79it/s]

{'loss': 0.1973, 'grad_norm': 2.3430845737457275, 'learning_rate': 3.763087248322148e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18942/75000 [15:58<48:50, 19.13it/s]

{'loss': 0.3735, 'grad_norm': 3.866208076477051, 'learning_rate': 3.7624161073825506e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18953/75000 [15:59<46:08, 20.25it/s]

{'loss': 0.4571, 'grad_norm': 5.144628047943115, 'learning_rate': 3.7617449664429534e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18962/75000 [15:59<50:24, 18.53it/s]

{'loss': 0.2434, 'grad_norm': 1.0181790590286255, 'learning_rate': 3.7610738255033556e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18973/75000 [16:00<49:00, 19.05it/s]

{'loss': 0.3677, 'grad_norm': 9.283795356750488, 'learning_rate': 3.7604026845637585e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18982/75000 [16:00<49:12, 18.97it/s]

{'loss': 0.2408, 'grad_norm': 7.009024620056152, 'learning_rate': 3.759731543624161e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 18992/75000 [16:01<46:50, 19.93it/s]

{'loss': 0.2793, 'grad_norm': 7.2116851806640625, 'learning_rate': 3.7590604026845635e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19000/75000 [16:01<47:38, 19.59it/s]

{'loss': 0.388, 'grad_norm': 7.680290222167969, 'learning_rate': 3.758389261744967e-05, 'epoch': 0.76}


                                                       
 25%|██▌       | 19013/75000 [16:02<56:58, 16.38it/s]  

{'loss': 0.4393, 'grad_norm': 2.9025278091430664, 'learning_rate': 3.757718120805369e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19022/75000 [16:03<49:34, 18.82it/s]

{'loss': 0.3328, 'grad_norm': 2.503861665725708, 'learning_rate': 3.757046979865772e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19032/75000 [16:03<50:45, 18.38it/s]

{'loss': 0.4314, 'grad_norm': 1.6993716955184937, 'learning_rate': 3.756375838926175e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19041/75000 [16:04<49:32, 18.83it/s]

{'loss': 0.3199, 'grad_norm': 1.9746594429016113, 'learning_rate': 3.755704697986577e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19054/75000 [16:04<46:23, 20.10it/s]

{'loss': 0.3168, 'grad_norm': 2.2066640853881836, 'learning_rate': 3.75503355704698e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19064/75000 [16:05<46:34, 20.02it/s]

{'loss': 0.2663, 'grad_norm': 4.981716156005859, 'learning_rate': 3.754362416107383e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19074/75000 [16:05<46:38, 19.98it/s]

{'loss': 0.3875, 'grad_norm': 5.136756420135498, 'learning_rate': 3.753691275167786e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19082/75000 [16:06<51:36, 18.06it/s]

{'loss': 0.3034, 'grad_norm': 2.905704975128174, 'learning_rate': 3.753020134228188e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19092/75000 [16:06<48:34, 19.18it/s]

{'loss': 0.3525, 'grad_norm': 2.523744821548462, 'learning_rate': 3.752348993288591e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19103/75000 [16:07<48:05, 19.37it/s]

{'loss': 0.378, 'grad_norm': 5.1367902755737305, 'learning_rate': 3.7516778523489936e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19114/75000 [16:08<47:22, 19.66it/s]

{'loss': 0.3319, 'grad_norm': 1.2948755025863647, 'learning_rate': 3.7510067114093964e-05, 'epoch': 0.76}


                                                     
 25%|██▌       | 19123/75000 [16:08<48:59, 19.01it/s]

{'loss': 0.3256, 'grad_norm': 4.796845436096191, 'learning_rate': 3.750335570469799e-05, 'epoch': 0.76}


                                                     
 26%|██▌       | 19134/75000 [16:09<46:10, 20.17it/s]

{'loss': 0.3191, 'grad_norm': 3.9911975860595703, 'learning_rate': 3.7496644295302014e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19143/75000 [16:09<45:41, 20.37it/s]

{'loss': 0.3283, 'grad_norm': 4.674686431884766, 'learning_rate': 3.748993288590604e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19153/75000 [16:10<49:41, 18.73it/s]

{'loss': 0.2867, 'grad_norm': 0.9802409410476685, 'learning_rate': 3.7483221476510065e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19162/75000 [16:10<48:34, 19.16it/s]

{'loss': 0.2129, 'grad_norm': 1.756300926208496, 'learning_rate': 3.747651006711409e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19174/75000 [16:11<46:10, 20.15it/s]

{'loss': 0.3342, 'grad_norm': 6.581733703613281, 'learning_rate': 3.746979865771812e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19183/75000 [16:11<49:55, 18.64it/s]

{'loss': 0.2808, 'grad_norm': 3.180345058441162, 'learning_rate': 3.746308724832215e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19194/75000 [16:12<47:18, 19.66it/s]

{'loss': 0.4411, 'grad_norm': 4.414345741271973, 'learning_rate': 3.745637583892618e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19201/75000 [16:12<52:41, 17.65it/s]

{'loss': 0.2293, 'grad_norm': 7.3566670417785645, 'learning_rate': 3.74496644295302e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19214/75000 [16:13<46:26, 20.02it/s]

{'loss': 0.4919, 'grad_norm': 3.2773778438568115, 'learning_rate': 3.744295302013423e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19222/75000 [16:13<49:46, 18.68it/s]

{'loss': 0.2736, 'grad_norm': 3.1482086181640625, 'learning_rate': 3.743624161073825e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19233/75000 [16:14<52:59, 17.54it/s]

{'loss': 0.2235, 'grad_norm': 2.6717982292175293, 'learning_rate': 3.7429530201342286e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19241/75000 [16:14<54:30, 17.05it/s]

{'loss': 0.3317, 'grad_norm': 16.01699447631836, 'learning_rate': 3.7422818791946315e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19254/75000 [16:15<48:05, 19.32it/s]

{'loss': 0.252, 'grad_norm': 8.46262264251709, 'learning_rate': 3.741610738255034e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19262/75000 [16:15<53:20, 17.41it/s]

{'loss': 0.2912, 'grad_norm': 3.0755550861358643, 'learning_rate': 3.7409395973154365e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19273/75000 [16:16<51:34, 18.01it/s]

{'loss': 0.2289, 'grad_norm': 1.7916678190231323, 'learning_rate': 3.740268456375839e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19281/75000 [16:17<52:19, 17.75it/s]

{'loss': 0.3624, 'grad_norm': 4.087769508361816, 'learning_rate': 3.7395973154362415e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19292/75000 [16:17<52:46, 17.59it/s]

{'loss': 0.2948, 'grad_norm': 8.974455833435059, 'learning_rate': 3.7389261744966444e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19302/75000 [16:18<50:07, 18.52it/s]

{'loss': 0.3648, 'grad_norm': 1.7174415588378906, 'learning_rate': 3.738255033557047e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19312/75000 [16:18<50:32, 18.36it/s]

{'loss': 0.3557, 'grad_norm': 4.739006996154785, 'learning_rate': 3.73758389261745e-05, 'epoch': 0.77}


                                                       
 26%|██▌       | 19322/75000 [16:19<54:46, 16.94it/s]

{'loss': 0.2061, 'grad_norm': 1.5187938213348389, 'learning_rate': 3.736912751677852e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19333/75000 [16:19<51:02, 18.18it/s]

{'loss': 0.2497, 'grad_norm': 1.867364525794983, 'learning_rate': 3.736241610738255e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19343/75000 [16:20<50:43, 18.29it/s]

{'loss': 0.5238, 'grad_norm': 9.814973831176758, 'learning_rate': 3.735570469798657e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19353/75000 [16:21<51:04, 18.16it/s]

{'loss': 0.3629, 'grad_norm': 5.788078308105469, 'learning_rate': 3.734899328859061e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19363/75000 [16:21<51:02, 18.17it/s]

{'loss': 0.3081, 'grad_norm': 6.704917907714844, 'learning_rate': 3.734228187919464e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19373/75000 [16:22<52:33, 17.64it/s]

{'loss': 0.2743, 'grad_norm': 4.270913600921631, 'learning_rate': 3.733557046979866e-05, 'epoch': 0.77}


                                                     
 26%|██▌       | 19383/75000 [16:22<51:37, 17.96it/s]

{'loss': 0.2841, 'grad_norm': 8.160116195678711, 'learning_rate': 3.732885906040269e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19393/75000 [16:23<49:41, 18.65it/s]

{'loss': 0.2873, 'grad_norm': 2.968381404876709, 'learning_rate': 3.732214765100671e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19404/75000 [16:23<52:20, 17.70it/s]

{'loss': 0.3419, 'grad_norm': 7.2248125076293945, 'learning_rate': 3.731543624161074e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19412/75000 [16:24<51:41, 17.92it/s]

{'loss': 0.3745, 'grad_norm': 8.90439510345459, 'learning_rate': 3.7308724832214766e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19421/75000 [16:24<49:13, 18.82it/s]

{'loss': 0.3123, 'grad_norm': 2.3768973350524902, 'learning_rate': 3.7302013422818795e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19433/75000 [16:25<50:05, 18.49it/s]

{'loss': 0.3534, 'grad_norm': 4.395715236663818, 'learning_rate': 3.729530201342282e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19443/75000 [16:26<51:10, 18.09it/s]

{'loss': 0.365, 'grad_norm': 3.3288350105285645, 'learning_rate': 3.7288590604026845e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19453/75000 [16:26<55:19, 16.73it/s]

{'loss': 0.3386, 'grad_norm': 5.437639236450195, 'learning_rate': 3.7281879194630874e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19463/75000 [16:27<52:30, 17.63it/s]

{'loss': 0.3501, 'grad_norm': 6.667710781097412, 'learning_rate': 3.72751677852349e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19472/75000 [16:27<55:09, 16.78it/s]

{'loss': 0.4066, 'grad_norm': 11.78032112121582, 'learning_rate': 3.726845637583893e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19482/75000 [16:28<52:32, 17.61it/s]

{'loss': 0.3845, 'grad_norm': 4.30267858505249, 'learning_rate': 3.726174496644296e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19493/75000 [16:29<54:21, 17.02it/s]

{'loss': 0.2832, 'grad_norm': 2.338592767715454, 'learning_rate': 3.725503355704698e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19500/75000 [16:29<53:07, 17.41it/s]

{'loss': 0.2543, 'grad_norm': 1.3359493017196655, 'learning_rate': 3.724832214765101e-05, 'epoch': 0.78}


                                                       
 26%|██▌       | 19513/75000 [16:30<59:18, 15.59it/s]  

{'loss': 0.3761, 'grad_norm': 1.647425889968872, 'learning_rate': 3.724161073825503e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19524/75000 [16:31<49:54, 18.52it/s]

{'loss': 0.3938, 'grad_norm': 8.850756645202637, 'learning_rate': 3.723489932885906e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19532/75000 [16:31<54:27, 16.98it/s]

{'loss': 0.3721, 'grad_norm': 3.969438076019287, 'learning_rate': 3.722818791946309e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19542/75000 [16:32<49:53, 18.53it/s]

{'loss': 0.3881, 'grad_norm': 2.8342318534851074, 'learning_rate': 3.722147651006712e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19552/75000 [16:32<49:46, 18.57it/s]

{'loss': 0.239, 'grad_norm': 0.5437233448028564, 'learning_rate': 3.7214765100671146e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19562/75000 [16:33<52:40, 17.54it/s]

{'loss': 0.3064, 'grad_norm': 3.61464262008667, 'learning_rate': 3.720805369127517e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19573/75000 [16:33<49:36, 18.62it/s]

{'loss': 0.3925, 'grad_norm': 1.0912196636199951, 'learning_rate': 3.7201342281879196e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19584/75000 [16:34<48:06, 19.20it/s]

{'loss': 0.2564, 'grad_norm': 2.4639041423797607, 'learning_rate': 3.7194630872483224e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19592/75000 [16:35<55:36, 16.60it/s]

{'loss': 0.3188, 'grad_norm': 4.523283958435059, 'learning_rate': 3.718791946308725e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19603/75000 [16:35<50:37, 18.24it/s]

{'loss': 0.3478, 'grad_norm': 17.094711303710938, 'learning_rate': 3.7181208053691275e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19613/75000 [16:36<53:17, 17.32it/s]

{'loss': 0.3219, 'grad_norm': 5.693119049072266, 'learning_rate': 3.71744966442953e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19621/75000 [16:36<54:18, 17.00it/s]

{'loss': 0.2318, 'grad_norm': 19.134265899658203, 'learning_rate': 3.716778523489933e-05, 'epoch': 0.78}


                                                     
 26%|██▌       | 19633/75000 [16:37<49:29, 18.65it/s]

{'loss': 0.3204, 'grad_norm': 2.1399309635162354, 'learning_rate': 3.7161073825503354e-05, 'epoch': 0.79}


                                                     
 26%|██▌       | 19643/75000 [16:37<52:24, 17.60it/s]

{'loss': 0.4984, 'grad_norm': 5.124155044555664, 'learning_rate': 3.715436241610738e-05, 'epoch': 0.79}


                                                     
 26%|██▌       | 19653/75000 [16:38<50:56, 18.11it/s]

{'loss': 0.3591, 'grad_norm': 1.4693597555160522, 'learning_rate': 3.714765100671141e-05, 'epoch': 0.79}


                                                     
 26%|██▌       | 19661/75000 [16:39<58:19, 15.81it/s]

{'loss': 0.2789, 'grad_norm': 0.9713961482048035, 'learning_rate': 3.714093959731544e-05, 'epoch': 0.79}


                                                     
 26%|██▌       | 19672/75000 [16:39<51:07, 18.04it/s]

{'loss': 0.3219, 'grad_norm': 2.329300880432129, 'learning_rate': 3.713422818791947e-05, 'epoch': 0.79}


                                                     
 26%|██▌       | 19682/75000 [16:40<51:34, 17.88it/s]

{'loss': 0.3281, 'grad_norm': 2.284247875213623, 'learning_rate': 3.712751677852349e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19692/75000 [16:40<53:25, 17.25it/s]

{'loss': 0.2814, 'grad_norm': 5.537312984466553, 'learning_rate': 3.712080536912752e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19704/75000 [16:41<49:01, 18.80it/s]

{'loss': 0.3108, 'grad_norm': 6.319669246673584, 'learning_rate': 3.711409395973155e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19712/75000 [16:41<52:41, 17.49it/s]

{'loss': 0.4459, 'grad_norm': 5.364607334136963, 'learning_rate': 3.7107382550335575e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19722/75000 [16:42<51:39, 17.83it/s]

{'loss': 0.3028, 'grad_norm': 3.400193691253662, 'learning_rate': 3.71006711409396e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19732/75000 [16:43<56:15, 16.37it/s]

{'loss': 0.3546, 'grad_norm': 6.705654621124268, 'learning_rate': 3.7093959731543626e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19743/75000 [16:43<49:58, 18.43it/s]

{'loss': 0.3151, 'grad_norm': 4.352637767791748, 'learning_rate': 3.7087248322147654e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19751/75000 [16:44<49:51, 18.47it/s]

{'loss': 0.3317, 'grad_norm': 4.115171432495117, 'learning_rate': 3.7080536912751676e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19761/75000 [16:44<50:21, 18.28it/s]

{'loss': 0.3412, 'grad_norm': 3.065175771713257, 'learning_rate': 3.707382550335571e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19772/75000 [16:45<48:31, 18.97it/s]

{'loss': 0.2811, 'grad_norm': 3.7832846641540527, 'learning_rate': 3.706711409395973e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19783/75000 [16:45<49:37, 18.55it/s]

{'loss': 0.3533, 'grad_norm': 2.1332781314849854, 'learning_rate': 3.706040268456376e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19793/75000 [16:46<48:31, 18.96it/s]

{'loss': 0.2462, 'grad_norm': 2.6161603927612305, 'learning_rate': 3.705369127516778e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19801/75000 [16:46<53:12, 17.29it/s]

{'loss': 0.2761, 'grad_norm': 6.477229118347168, 'learning_rate': 3.704697986577181e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19813/75000 [16:47<48:29, 18.97it/s]

{'loss': 0.2558, 'grad_norm': 1.1908026933670044, 'learning_rate': 3.704026845637584e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19823/75000 [16:47<48:56, 18.79it/s]

{'loss': 0.3451, 'grad_norm': 2.2592053413391113, 'learning_rate': 3.703355704697987e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19831/75000 [16:48<53:58, 17.04it/s]

{'loss': 0.4089, 'grad_norm': 3.2647829055786133, 'learning_rate': 3.70268456375839e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19844/75000 [16:49<47:14, 19.46it/s]

{'loss': 0.1974, 'grad_norm': 5.54542875289917, 'learning_rate': 3.702013422818792e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19852/75000 [16:49<48:12, 19.06it/s]

{'loss': 0.4048, 'grad_norm': 2.4840893745422363, 'learning_rate': 3.701342281879195e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19861/75000 [16:50<51:19, 17.91it/s]

{'loss': 0.4399, 'grad_norm': 3.8387608528137207, 'learning_rate': 3.7006711409395976e-05, 'epoch': 0.79}


                                                     
 26%|██▋       | 19873/75000 [16:50<47:44, 19.24it/s]

{'loss': 0.327, 'grad_norm': 4.015624046325684, 'learning_rate': 3.7e-05, 'epoch': 0.79}


                                                     
 27%|██▋       | 19882/75000 [16:51<47:48, 19.21it/s]

{'loss': 0.3302, 'grad_norm': 2.1622140407562256, 'learning_rate': 3.6993288590604033e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19892/75000 [16:51<51:53, 17.70it/s]

{'loss': 0.4006, 'grad_norm': 1.9700552225112915, 'learning_rate': 3.6986577181208055e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19901/75000 [16:52<49:07, 18.69it/s]

{'loss': 0.3581, 'grad_norm': 3.017085313796997, 'learning_rate': 3.6979865771812084e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19911/75000 [16:52<48:41, 18.86it/s]

{'loss': 0.3989, 'grad_norm': 1.1899462938308716, 'learning_rate': 3.6973154362416106e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19921/75000 [16:53<50:36, 18.14it/s]

{'loss': 0.2162, 'grad_norm': 0.9655351042747498, 'learning_rate': 3.6966442953020134e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19934/75000 [16:53<46:33, 19.71it/s]

{'loss': 0.3424, 'grad_norm': 5.280961513519287, 'learning_rate': 3.695973154362416e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19942/75000 [16:54<47:03, 19.50it/s]

{'loss': 0.3811, 'grad_norm': 4.511744022369385, 'learning_rate': 3.695302013422819e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19953/75000 [16:54<47:38, 19.26it/s]

{'loss': 0.3397, 'grad_norm': 4.609085559844971, 'learning_rate': 3.694630872483222e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19962/75000 [16:55<47:17, 19.40it/s]

{'loss': 0.3865, 'grad_norm': 4.73822546005249, 'learning_rate': 3.693959731543624e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19974/75000 [16:55<47:49, 19.18it/s]

{'loss': 0.2378, 'grad_norm': 5.764449596405029, 'learning_rate': 3.693288590604027e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19984/75000 [16:56<45:46, 20.03it/s]

{'loss': 0.3515, 'grad_norm': 2.9642646312713623, 'learning_rate': 3.692617449664429e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 19993/75000 [16:56<48:37, 18.86it/s]

{'loss': 0.4216, 'grad_norm': 2.873798370361328, 'learning_rate': 3.691946308724833e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20000/75000 [16:57<47:08, 19.45it/s]

{'loss': 0.2627, 'grad_norm': 1.0778192281723022, 'learning_rate': 3.6912751677852356e-05, 'epoch': 0.8}


                                                       
 27%|██▋       | 20014/75000 [16:58<55:25, 16.54it/s]  

{'loss': 0.4158, 'grad_norm': 5.080774784088135, 'learning_rate': 3.690604026845638e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20024/75000 [16:59<47:37, 19.24it/s]

{'loss': 0.3037, 'grad_norm': 4.058197021484375, 'learning_rate': 3.6899328859060406e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20034/75000 [16:59<46:00, 19.91it/s]

{'loss': 0.314, 'grad_norm': 1.7448197603225708, 'learning_rate': 3.689261744966443e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20042/75000 [16:59<48:52, 18.74it/s]

{'loss': 0.3806, 'grad_norm': 2.891603469848633, 'learning_rate': 3.6885906040268456e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20052/75000 [17:00<46:09, 19.84it/s]

{'loss': 0.4016, 'grad_norm': 3.5531837940216064, 'learning_rate': 3.6879194630872485e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20062/75000 [17:01<48:17, 18.96it/s]

{'loss': 0.3269, 'grad_norm': 2.9848344326019287, 'learning_rate': 3.6872483221476513e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20072/75000 [17:01<46:24, 19.73it/s]

{'loss': 0.394, 'grad_norm': 5.525209426879883, 'learning_rate': 3.686577181208054e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20082/75000 [17:02<49:35, 18.46it/s]

{'loss': 0.2595, 'grad_norm': 6.559577465057373, 'learning_rate': 3.6859060402684564e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20094/75000 [17:02<45:06, 20.29it/s]

{'loss': 0.2851, 'grad_norm': 2.9173970222473145, 'learning_rate': 3.685234899328859e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20103/75000 [17:03<45:31, 20.09it/s]

{'loss': 0.3448, 'grad_norm': 1.469944715499878, 'learning_rate': 3.6845637583892614e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20112/75000 [17:03<48:51, 18.72it/s]

{'loss': 0.2888, 'grad_norm': 2.4790146350860596, 'learning_rate': 3.683892617449665e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20124/75000 [17:04<47:05, 19.42it/s]

{'loss': 0.3893, 'grad_norm': 4.0471296310424805, 'learning_rate': 3.683221476510068e-05, 'epoch': 0.8}


                                                     
 27%|██▋       | 20132/75000 [17:04<50:06, 18.25it/s]

{'loss': 0.391, 'grad_norm': 4.276144504547119, 'learning_rate': 3.68255033557047e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20143/75000 [17:05<46:35, 19.63it/s]

{'loss': 0.3631, 'grad_norm': 3.8350045680999756, 'learning_rate': 3.681879194630873e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20152/75000 [17:05<48:25, 18.88it/s]

{'loss': 0.3928, 'grad_norm': 4.318090438842773, 'learning_rate': 3.681208053691275e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20162/75000 [17:06<46:19, 19.73it/s]

{'loss': 0.336, 'grad_norm': 2.5947234630584717, 'learning_rate': 3.680536912751678e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20173/75000 [17:06<47:42, 19.15it/s]

{'loss': 0.3485, 'grad_norm': 10.89255428314209, 'learning_rate': 3.679865771812081e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20184/75000 [17:07<48:44, 18.75it/s]

{'loss': 0.4066, 'grad_norm': 16.939552307128906, 'learning_rate': 3.6791946308724836e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20193/75000 [17:07<46:58, 19.44it/s]

{'loss': 0.4162, 'grad_norm': 6.186652183532715, 'learning_rate': 3.6785234899328864e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20202/75000 [17:08<49:47, 18.34it/s]

{'loss': 0.3088, 'grad_norm': 4.287817001342773, 'learning_rate': 3.6778523489932886e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20212/75000 [17:08<47:17, 19.31it/s]

{'loss': 0.2858, 'grad_norm': 1.2666914463043213, 'learning_rate': 3.6771812080536915e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20224/75000 [17:09<47:33, 19.20it/s]

{'loss': 0.4033, 'grad_norm': 1.2135918140411377, 'learning_rate': 3.6765100671140936e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20234/75000 [17:10<46:31, 19.62it/s]

{'loss': 0.3734, 'grad_norm': 8.654213905334473, 'learning_rate': 3.675838926174497e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20242/75000 [17:10<49:05, 18.59it/s]

{'loss': 0.4051, 'grad_norm': 8.673152923583984, 'learning_rate': 3.675167785234899e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20254/75000 [17:11<46:16, 19.72it/s]

{'loss': 0.3661, 'grad_norm': 1.4895238876342773, 'learning_rate': 3.674496644295302e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20263/75000 [17:11<46:46, 19.50it/s]

{'loss': 0.2716, 'grad_norm': 5.159267425537109, 'learning_rate': 3.673825503355705e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20272/75000 [17:12<47:06, 19.36it/s]

{'loss': 0.4109, 'grad_norm': 4.198546409606934, 'learning_rate': 3.673154362416107e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20282/75000 [17:12<48:45, 18.70it/s]

{'loss': 0.3375, 'grad_norm': 1.0072494745254517, 'learning_rate': 3.67248322147651e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20292/75000 [17:13<46:57, 19.42it/s]

{'loss': 0.3394, 'grad_norm': 2.2476816177368164, 'learning_rate': 3.671812080536913e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20304/75000 [17:13<47:04, 19.36it/s]

{'loss': 0.2861, 'grad_norm': 2.3045427799224854, 'learning_rate': 3.671140939597316e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20314/75000 [17:14<45:24, 20.07it/s]

{'loss': 0.2859, 'grad_norm': 2.650463819503784, 'learning_rate': 3.6704697986577186e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20322/75000 [17:14<48:51, 18.65it/s]

{'loss': 0.3447, 'grad_norm': 4.888253211975098, 'learning_rate': 3.669798657718121e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20333/75000 [17:15<47:19, 19.25it/s]

{'loss': 0.2094, 'grad_norm': 1.0891526937484741, 'learning_rate': 3.669127516778524e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20344/75000 [17:15<50:31, 18.03it/s]

{'loss': 0.3998, 'grad_norm': 21.757375717163086, 'learning_rate': 3.6684563758389265e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20354/75000 [17:16<45:20, 20.09it/s]

{'loss': 0.2607, 'grad_norm': 4.799154758453369, 'learning_rate': 3.6677852348993294e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20361/75000 [17:16<51:03, 17.84it/s]

{'loss': 0.2602, 'grad_norm': 2.6530139446258545, 'learning_rate': 3.6671140939597316e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20374/75000 [17:17<45:43, 19.91it/s]

{'loss': 0.439, 'grad_norm': 13.843019485473633, 'learning_rate': 3.6664429530201344e-05, 'epoch': 0.81}


                                                     
 27%|██▋       | 20384/75000 [17:17<45:01, 20.21it/s]

{'loss': 0.268, 'grad_norm': 7.059041976928711, 'learning_rate': 3.665771812080537e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20393/75000 [17:18<47:19, 19.23it/s]

{'loss': 0.2949, 'grad_norm': 1.1471455097198486, 'learning_rate': 3.6651006711409394e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20402/75000 [17:18<46:57, 19.38it/s]

{'loss': 0.3979, 'grad_norm': 4.871835231781006, 'learning_rate': 3.664429530201342e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20414/75000 [17:19<47:55, 18.98it/s]

{'loss': 0.3693, 'grad_norm': 14.70044231414795, 'learning_rate': 3.663758389261745e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20424/75000 [17:19<46:00, 19.77it/s]

{'loss': 0.3357, 'grad_norm': 2.98610258102417, 'learning_rate': 3.663087248322148e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20434/75000 [17:20<45:35, 19.94it/s]

{'loss': 0.3586, 'grad_norm': 2.597579002380371, 'learning_rate': 3.66241610738255e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20441/75000 [17:20<48:04, 18.92it/s]

{'loss': 0.3481, 'grad_norm': 1.324188232421875, 'learning_rate': 3.661744966442953e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20454/75000 [17:21<45:24, 20.02it/s]

{'loss': 0.3679, 'grad_norm': 1.633871078491211, 'learning_rate': 3.661073825503356e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20462/75000 [17:21<50:22, 18.05it/s]

{'loss': 0.3366, 'grad_norm': 5.762320041656494, 'learning_rate': 3.660402684563759e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20473/75000 [17:22<45:32, 19.96it/s]

{'loss': 0.395, 'grad_norm': 4.302221298217773, 'learning_rate': 3.6597315436241616e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20483/75000 [17:23<45:39, 19.90it/s]

{'loss': 0.4685, 'grad_norm': 5.389719009399414, 'learning_rate': 3.659060402684564e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20494/75000 [17:23<46:09, 19.68it/s]

{'loss': 0.3916, 'grad_norm': 2.3996315002441406, 'learning_rate': 3.6583892617449666e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20500/75000 [17:23<46:45, 19.43it/s]

{'loss': 0.3384, 'grad_norm': 2.514270544052124, 'learning_rate': 3.6577181208053695e-05, 'epoch': 0.82}


                                                       
 27%|██▋       | 20514/75000 [17:25<52:52, 17.17it/s]  

{'loss': 0.3105, 'grad_norm': 5.849871635437012, 'learning_rate': 3.657046979865772e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20523/75000 [17:25<48:39, 18.66it/s]

{'loss': 0.3402, 'grad_norm': 5.361521244049072, 'learning_rate': 3.6563758389261745e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20534/75000 [17:26<46:24, 19.56it/s]

{'loss': 0.3345, 'grad_norm': 2.2270774841308594, 'learning_rate': 3.6557046979865774e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20541/75000 [17:26<46:04, 19.70it/s]

{'loss': 0.3905, 'grad_norm': 6.142821311950684, 'learning_rate': 3.65503355704698e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20552/75000 [17:27<44:32, 20.38it/s]

{'loss': 0.4056, 'grad_norm': 1.8921045064926147, 'learning_rate': 3.6543624161073824e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20562/75000 [17:27<48:05, 18.87it/s]

{'loss': 0.2843, 'grad_norm': 2.8308844566345215, 'learning_rate': 3.653691275167785e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20573/75000 [17:28<44:38, 20.32it/s]

{'loss': 0.3668, 'grad_norm': 3.469144821166992, 'learning_rate': 3.653020134228188e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20583/75000 [17:28<45:53, 19.76it/s]

{'loss': 0.3708, 'grad_norm': 2.8560752868652344, 'learning_rate': 3.652348993288591e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20593/75000 [17:29<45:32, 19.91it/s]

{'loss': 0.3179, 'grad_norm': 1.750449299812317, 'learning_rate': 3.651677852348994e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20604/75000 [17:29<45:06, 20.10it/s]

{'loss': 0.3104, 'grad_norm': 1.5484685897827148, 'learning_rate': 3.651006711409396e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20612/75000 [17:30<50:10, 18.07it/s]

{'loss': 0.3707, 'grad_norm': 4.252328872680664, 'learning_rate': 3.650335570469799e-05, 'epoch': 0.82}


                                                     
 27%|██▋       | 20622/75000 [17:30<47:05, 19.25it/s]

{'loss': 0.3284, 'grad_norm': 10.852068901062012, 'learning_rate': 3.649664429530201e-05, 'epoch': 0.82}


                                                     
 28%|██▊       | 20631/75000 [17:31<52:38, 17.22it/s]

{'loss': 0.28, 'grad_norm': 1.68356192111969, 'learning_rate': 3.648993288590604e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20642/75000 [17:31<46:21, 19.54it/s]

{'loss': 0.2891, 'grad_norm': 1.1351462602615356, 'learning_rate': 3.6483221476510074e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20654/75000 [17:32<44:42, 20.26it/s]

{'loss': 0.2629, 'grad_norm': 2.118403911590576, 'learning_rate': 3.6476510067114096e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20664/75000 [17:32<46:47, 19.36it/s]

{'loss': 0.41, 'grad_norm': 1.5441147089004517, 'learning_rate': 3.6469798657718125e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20674/75000 [17:33<45:01, 20.11it/s]

{'loss': 0.3251, 'grad_norm': 4.433177947998047, 'learning_rate': 3.6463087248322146e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20682/75000 [17:33<48:48, 18.55it/s]

{'loss': 0.3138, 'grad_norm': 2.5783944129943848, 'learning_rate': 3.6456375838926175e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20694/75000 [17:34<45:37, 19.83it/s]

{'loss': 0.2551, 'grad_norm': 3.83978533744812, 'learning_rate': 3.6449664429530203e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20702/75000 [17:34<50:38, 17.87it/s]

{'loss': 0.5325, 'grad_norm': 3.7609684467315674, 'learning_rate': 3.644295302013423e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20714/75000 [17:35<45:15, 19.99it/s]

{'loss': 0.3726, 'grad_norm': 2.6196720600128174, 'learning_rate': 3.643624161073826e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20721/75000 [17:35<49:59, 18.09it/s]

{'loss': 0.3344, 'grad_norm': 2.411113739013672, 'learning_rate': 3.642953020134228e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20732/75000 [17:36<45:38, 19.82it/s]

{'loss': 0.3628, 'grad_norm': 3.3181440830230713, 'learning_rate': 3.642281879194631e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20743/75000 [17:36<46:30, 19.44it/s]

{'loss': 0.4453, 'grad_norm': 9.248571395874023, 'learning_rate': 3.641610738255033e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20752/75000 [17:37<45:22, 19.92it/s]

{'loss': 0.4268, 'grad_norm': 2.405910015106201, 'learning_rate': 3.640939597315436e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20764/75000 [17:38<47:18, 19.10it/s]

{'loss': 0.4446, 'grad_norm': 3.3025455474853516, 'learning_rate': 3.6402684563758397e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20772/75000 [17:38<48:43, 18.55it/s]

{'loss': 0.4551, 'grad_norm': 3.5477304458618164, 'learning_rate': 3.639597315436242e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20781/75000 [17:39<50:30, 17.89it/s]

{'loss': 0.3208, 'grad_norm': 4.5999531745910645, 'learning_rate': 3.638926174496645e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20792/75000 [17:39<45:44, 19.75it/s]

{'loss': 0.2715, 'grad_norm': 2.0831613540649414, 'learning_rate': 3.638255033557047e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20802/75000 [17:40<45:34, 19.82it/s]

{'loss': 0.2387, 'grad_norm': 2.925168514251709, 'learning_rate': 3.63758389261745e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20812/75000 [17:40<46:55, 19.25it/s]

{'loss': 0.369, 'grad_norm': 6.7721052169799805, 'learning_rate': 3.6369127516778526e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20822/75000 [17:41<46:10, 19.55it/s]

{'loss': 0.2028, 'grad_norm': 4.586540699005127, 'learning_rate': 3.6362416107382554e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20834/75000 [17:41<46:22, 19.46it/s]

{'loss': 0.2367, 'grad_norm': 0.4599560797214508, 'learning_rate': 3.635570469798658e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20844/75000 [17:42<44:56, 20.08it/s]

{'loss': 0.2821, 'grad_norm': 2.9683656692504883, 'learning_rate': 3.6348993288590605e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20852/75000 [17:42<47:05, 19.17it/s]

{'loss': 0.304, 'grad_norm': 11.844438552856445, 'learning_rate': 3.634228187919463e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20863/75000 [17:43<44:48, 20.14it/s]

{'loss': 0.306, 'grad_norm': 7.768498420715332, 'learning_rate': 3.6335570469798655e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20873/75000 [17:43<48:03, 18.77it/s]

{'loss': 0.3688, 'grad_norm': 19.012985229492188, 'learning_rate': 3.632885906040269e-05, 'epoch': 0.83}


                                                     
 28%|██▊       | 20884/75000 [17:44<45:14, 19.94it/s]

{'loss': 0.3695, 'grad_norm': 1.701973557472229, 'learning_rate': 3.632214765100671e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20892/75000 [17:44<48:35, 18.56it/s]

{'loss': 0.2987, 'grad_norm': 3.167447566986084, 'learning_rate': 3.631543624161074e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20904/75000 [17:45<45:20, 19.88it/s]

{'loss': 0.3036, 'grad_norm': 0.7149181962013245, 'learning_rate': 3.630872483221477e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20913/75000 [17:45<45:59, 19.60it/s]

{'loss': 0.3469, 'grad_norm': 3.524991512298584, 'learning_rate': 3.630201342281879e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20924/75000 [17:46<45:10, 19.95it/s]

{'loss': 0.32, 'grad_norm': 3.0231409072875977, 'learning_rate': 3.629530201342282e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20931/75000 [17:46<48:02, 18.75it/s]

{'loss': 0.3904, 'grad_norm': 5.439685344696045, 'learning_rate': 3.628859060402685e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20942/75000 [17:47<45:14, 19.92it/s]

{'loss': 0.3442, 'grad_norm': 2.6134397983551025, 'learning_rate': 3.6281879194630876e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20954/75000 [17:47<44:37, 20.19it/s]

{'loss': 0.3013, 'grad_norm': 2.1348023414611816, 'learning_rate': 3.6275167785234905e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20962/75000 [17:48<47:30, 18.96it/s]

{'loss': 0.3072, 'grad_norm': 0.5557781457901001, 'learning_rate': 3.626845637583893e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20972/75000 [17:48<45:57, 19.59it/s]

{'loss': 0.3073, 'grad_norm': 1.912589192390442, 'learning_rate': 3.6261744966442955e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20984/75000 [17:49<46:35, 19.33it/s]

{'loss': 0.3116, 'grad_norm': 4.043051719665527, 'learning_rate': 3.625503355704698e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 20992/75000 [17:49<47:21, 19.01it/s]

{'loss': 0.4575, 'grad_norm': 10.838125228881836, 'learning_rate': 3.624832214765101e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21000/75000 [17:50<50:04, 17.97it/s]

{'loss': 0.3535, 'grad_norm': 9.87940788269043, 'learning_rate': 3.6241610738255034e-05, 'epoch': 0.84}


                                                       
 28%|██▊       | 21014/75000 [17:51<52:32, 17.13it/s]  

{'loss': 0.4185, 'grad_norm': 3.9553561210632324, 'learning_rate': 3.623489932885906e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21024/75000 [17:52<49:04, 18.33it/s]

{'loss': 0.3703, 'grad_norm': 1.502233862876892, 'learning_rate': 3.622818791946309e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21034/75000 [17:52<45:15, 19.88it/s]

{'loss': 0.2846, 'grad_norm': 8.59956169128418, 'learning_rate': 3.622147651006711e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21042/75000 [17:53<49:41, 18.10it/s]

{'loss': 0.3191, 'grad_norm': 2.4330215454101562, 'learning_rate': 3.621476510067114e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21051/75000 [17:53<46:47, 19.21it/s]

{'loss': 0.3818, 'grad_norm': 2.2578351497650146, 'learning_rate': 3.620805369127517e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21064/75000 [17:54<44:25, 20.24it/s]

{'loss': 0.3053, 'grad_norm': 7.097170829772949, 'learning_rate': 3.62013422818792e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21074/75000 [17:54<46:14, 19.44it/s]

{'loss': 0.2476, 'grad_norm': 3.78397536277771, 'learning_rate': 3.619463087248322e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21084/75000 [17:55<44:40, 20.11it/s]

{'loss': 0.4633, 'grad_norm': 1.7631821632385254, 'learning_rate': 3.618791946308725e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21093/75000 [17:55<47:06, 19.07it/s]

{'loss': 0.421, 'grad_norm': 3.1707539558410645, 'learning_rate': 3.618120805369128e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21102/75000 [17:56<45:20, 19.81it/s]

{'loss': 0.4254, 'grad_norm': 4.65762996673584, 'learning_rate': 3.61744966442953e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21112/75000 [17:56<47:45, 18.81it/s]

{'loss': 0.3793, 'grad_norm': 1.7054201364517212, 'learning_rate': 3.6167785234899335e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21123/75000 [17:57<44:57, 19.97it/s]

{'loss': 0.3819, 'grad_norm': 2.207557201385498, 'learning_rate': 3.6161073825503356e-05, 'epoch': 0.84}


                                                     
 28%|██▊       | 21132/75000 [17:57<48:21, 18.57it/s]

{'loss': 0.2347, 'grad_norm': 3.139030933380127, 'learning_rate': 3.6154362416107385e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21144/75000 [17:58<44:42, 20.08it/s]

{'loss': 0.3213, 'grad_norm': 1.4530394077301025, 'learning_rate': 3.6147651006711414e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21153/75000 [17:58<48:20, 18.57it/s]

{'loss': 0.3746, 'grad_norm': 2.1856701374053955, 'learning_rate': 3.6140939597315435e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21163/75000 [17:59<45:33, 19.70it/s]

{'loss': 0.3159, 'grad_norm': 2.3254599571228027, 'learning_rate': 3.6134228187919464e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21172/75000 [17:59<49:54, 17.97it/s]

{'loss': 0.3039, 'grad_norm': 5.14227294921875, 'learning_rate': 3.612751677852349e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21183/75000 [18:00<44:53, 19.98it/s]

{'loss': 0.3061, 'grad_norm': 4.038543224334717, 'learning_rate': 3.612080536912752e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21194/75000 [18:00<48:19, 18.56it/s]

{'loss': 0.258, 'grad_norm': 6.418686866760254, 'learning_rate': 3.611409395973154e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21204/75000 [18:01<45:04, 19.89it/s]

{'loss': 0.1998, 'grad_norm': 3.9903321266174316, 'learning_rate': 3.610738255033557e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21213/75000 [18:01<44:28, 20.16it/s]

{'loss': 0.3925, 'grad_norm': 1.834726095199585, 'learning_rate': 3.61006711409396e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21224/75000 [18:02<46:01, 19.47it/s]

{'loss': 0.2376, 'grad_norm': 2.761141777038574, 'learning_rate': 3.609395973154363e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21234/75000 [18:02<44:38, 20.07it/s]

{'loss': 0.3415, 'grad_norm': 9.289974212646484, 'learning_rate': 3.608724832214766e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21242/75000 [18:03<47:03, 19.04it/s]

{'loss': 0.2778, 'grad_norm': 4.738995552062988, 'learning_rate': 3.608053691275168e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21254/75000 [18:04<45:00, 19.90it/s]

{'loss': 0.4095, 'grad_norm': 1.9628831148147583, 'learning_rate': 3.607382550335571e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21264/75000 [18:04<45:34, 19.65it/s]

{'loss': 0.5708, 'grad_norm': 2.275676727294922, 'learning_rate': 3.606711409395973e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21274/75000 [18:05<44:26, 20.15it/s]

{'loss': 0.3521, 'grad_norm': 1.4758914709091187, 'learning_rate': 3.606040268456376e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21284/75000 [18:05<46:36, 19.21it/s]

{'loss': 0.4391, 'grad_norm': 5.0930094718933105, 'learning_rate': 3.6053691275167786e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21292/75000 [18:06<45:02, 19.88it/s]

{'loss': 0.3321, 'grad_norm': 6.7456159591674805, 'learning_rate': 3.6046979865771815e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21301/75000 [18:06<48:15, 18.54it/s]

{'loss': 0.351, 'grad_norm': 1.319446325302124, 'learning_rate': 3.604026845637584e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21313/75000 [18:07<44:57, 19.90it/s]

{'loss': 0.302, 'grad_norm': 4.153785705566406, 'learning_rate': 3.6033557046979865e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21323/75000 [18:07<44:50, 19.95it/s]

{'loss': 0.3587, 'grad_norm': 3.0111989974975586, 'learning_rate': 3.6026845637583893e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21334/75000 [18:08<46:07, 19.39it/s]

{'loss': 0.2568, 'grad_norm': 2.6633105278015137, 'learning_rate': 3.602013422818792e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21341/75000 [18:08<46:09, 19.37it/s]

{'loss': 0.3547, 'grad_norm': 4.921359539031982, 'learning_rate': 3.601342281879195e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21353/75000 [18:09<43:57, 20.34it/s]

{'loss': 0.2028, 'grad_norm': 3.1309666633605957, 'learning_rate': 3.600671140939598e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21364/75000 [18:09<46:06, 19.38it/s]

{'loss': 0.3611, 'grad_norm': 2.010547399520874, 'learning_rate': 3.6e-05, 'epoch': 0.85}


                                                     
 28%|██▊       | 21374/75000 [18:10<44:08, 20.24it/s]

{'loss': 0.2927, 'grad_norm': 7.569654941558838, 'learning_rate': 3.599328859060403e-05, 'epoch': 0.85}


                                                     
 29%|██▊       | 21384/75000 [18:10<45:41, 19.56it/s]

{'loss': 0.3374, 'grad_norm': 2.3135905265808105, 'learning_rate': 3.598657718120805e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21394/75000 [18:11<44:42, 19.98it/s]

{'loss': 0.3273, 'grad_norm': 1.5475478172302246, 'learning_rate': 3.597986577181208e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21404/75000 [18:11<45:06, 19.81it/s]

{'loss': 0.4166, 'grad_norm': 10.251960754394531, 'learning_rate': 3.597315436241611e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21412/75000 [18:12<45:38, 19.57it/s]

{'loss': 0.4801, 'grad_norm': 5.031583786010742, 'learning_rate': 3.596644295302014e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21422/75000 [18:12<48:31, 18.40it/s]

{'loss': 0.3364, 'grad_norm': 1.9076381921768188, 'learning_rate': 3.5959731543624165e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21432/75000 [18:13<45:37, 19.57it/s]

{'loss': 0.3315, 'grad_norm': 1.4944543838500977, 'learning_rate': 3.595302013422819e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21444/75000 [18:13<46:58, 19.00it/s]

{'loss': 0.3141, 'grad_norm': 3.929218292236328, 'learning_rate': 3.5946308724832216e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21454/75000 [18:14<45:27, 19.63it/s]

{'loss': 0.2484, 'grad_norm': 2.6018948554992676, 'learning_rate': 3.593959731543624e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21462/75000 [18:14<45:38, 19.55it/s]

{'loss': 0.3483, 'grad_norm': 13.39341926574707, 'learning_rate': 3.593288590604027e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21473/75000 [18:15<44:13, 20.18it/s]

{'loss': 0.4857, 'grad_norm': 5.736502647399902, 'learning_rate': 3.59261744966443e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21484/75000 [18:15<45:30, 19.60it/s]

{'loss': 0.4197, 'grad_norm': 4.2641072273254395, 'learning_rate': 3.591946308724832e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21492/75000 [18:16<49:07, 18.15it/s]

{'loss': 0.4123, 'grad_norm': 5.194202423095703, 'learning_rate': 3.591275167785235e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21500/75000 [18:16<45:47, 19.48it/s]

{'loss': 0.2628, 'grad_norm': 1.3598159551620483, 'learning_rate': 3.5906040268456373e-05, 'epoch': 0.86}


                                                       
 29%|██▊       | 21514/75000 [18:17<53:43, 16.59it/s]  

{'loss': 0.3811, 'grad_norm': 4.133301258087158, 'learning_rate': 3.58993288590604e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21524/75000 [18:18<46:17, 19.25it/s]

{'loss': 0.3493, 'grad_norm': 2.1238107681274414, 'learning_rate': 3.589261744966443e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21533/75000 [18:18<46:25, 19.20it/s]

{'loss': 0.3101, 'grad_norm': 0.5665575861930847, 'learning_rate': 3.588590604026846e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21544/75000 [18:19<44:09, 20.18it/s]

{'loss': 0.4232, 'grad_norm': 3.399686813354492, 'learning_rate': 3.587919463087249e-05, 'epoch': 0.86}


                                                     
 29%|██▊       | 21553/75000 [18:20<49:39, 17.94it/s]

{'loss': 0.2338, 'grad_norm': 3.14269757270813, 'learning_rate': 3.587248322147651e-05, 'epoch': 0.86}


                                                     
 29%|██▉       | 21564/75000 [18:20<45:47, 19.45it/s]

{'loss': 0.2218, 'grad_norm': 1.235278844833374, 'learning_rate': 3.586577181208054e-05, 'epoch': 0.86}


                                                     
 29%|██▉       | 21572/75000 [18:21<53:47, 16.55it/s]

{'loss': 0.251, 'grad_norm': 2.1125428676605225, 'learning_rate': 3.5859060402684567e-05, 'epoch': 0.86}


                                                     
 29%|██▉       | 21582/75000 [18:21<46:54, 18.98it/s]

{'loss': 0.2285, 'grad_norm': 5.029492378234863, 'learning_rate': 3.5852348993288595e-05, 'epoch': 0.86}


                                                     
 29%|██▉       | 21594/75000 [18:22<44:18, 20.09it/s]

{'loss': 0.348, 'grad_norm': 5.926544666290283, 'learning_rate': 3.5845637583892624e-05, 'epoch': 0.86}


                                                     
 29%|██▉       | 21602/75000 [18:22<46:28, 19.15it/s]

{'loss': 0.3182, 'grad_norm': 4.778042316436768, 'learning_rate': 3.5838926174496645e-05, 'epoch': 0.86}


                                                     
 29%|██▉       | 21614/75000 [18:23<43:40, 20.37it/s]

{'loss': 0.3022, 'grad_norm': 3.117744207382202, 'learning_rate': 3.5832214765100674e-05, 'epoch': 0.86}


                                                     
 29%|██▉       | 21624/75000 [18:23<46:29, 19.14it/s]

{'loss': 0.4438, 'grad_norm': 0.39377957582473755, 'learning_rate': 3.5825503355704696e-05, 'epoch': 0.86}


                                                     
 29%|██▉       | 21634/75000 [18:24<44:31, 19.97it/s]

{'loss': 0.3797, 'grad_norm': 1.24778413772583, 'learning_rate': 3.5818791946308724e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21642/75000 [18:24<48:19, 18.40it/s]

{'loss': 0.3576, 'grad_norm': 9.057132720947266, 'learning_rate': 3.581208053691275e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21652/75000 [18:25<45:08, 19.70it/s]

{'loss': 0.307, 'grad_norm': 1.7765240669250488, 'learning_rate': 3.580536912751678e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21664/75000 [18:25<47:15, 18.81it/s]

{'loss': 0.2857, 'grad_norm': 0.8804162740707397, 'learning_rate': 3.579865771812081e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21674/75000 [18:26<44:36, 19.92it/s]

{'loss': 0.4513, 'grad_norm': 4.862271308898926, 'learning_rate': 3.579194630872483e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21683/75000 [18:26<46:38, 19.05it/s]

{'loss': 0.3115, 'grad_norm': 3.023056983947754, 'learning_rate': 3.578523489932886e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21693/75000 [18:27<45:15, 19.63it/s]

{'loss': 0.332, 'grad_norm': 2.852388381958008, 'learning_rate': 3.577852348993289e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21703/75000 [18:27<47:28, 18.71it/s]

{'loss': 0.3149, 'grad_norm': 4.742152214050293, 'learning_rate': 3.577181208053692e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21714/75000 [18:28<44:29, 19.96it/s]

{'loss': 0.2847, 'grad_norm': 3.077428102493286, 'learning_rate': 3.576510067114094e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21723/75000 [18:28<46:43, 19.01it/s]

{'loss': 0.4581, 'grad_norm': 9.829171180725098, 'learning_rate': 3.575838926174497e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21734/75000 [18:29<45:20, 19.58it/s]

{'loss': 0.4041, 'grad_norm': 4.304623126983643, 'learning_rate': 3.5751677852348996e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21743/75000 [18:29<45:24, 19.55it/s]

{'loss': 0.3517, 'grad_norm': 18.104736328125, 'learning_rate': 3.574496644295302e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21754/75000 [18:30<45:06, 19.67it/s]

{'loss': 0.3507, 'grad_norm': 7.769227027893066, 'learning_rate': 3.5738255033557046e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21762/75000 [18:30<44:30, 19.93it/s]

{'loss': 0.326, 'grad_norm': 3.0315842628479004, 'learning_rate': 3.5731543624161075e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21773/75000 [18:31<47:09, 18.81it/s]

{'loss': 0.4173, 'grad_norm': 2.5044472217559814, 'learning_rate': 3.5724832214765104e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21784/75000 [18:32<43:55, 20.19it/s]

{'loss': 0.4034, 'grad_norm': 5.054470539093018, 'learning_rate': 3.571812080536913e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21794/75000 [18:32<45:30, 19.49it/s]

{'loss': 0.2895, 'grad_norm': 3.4963037967681885, 'learning_rate': 3.5711409395973154e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21804/75000 [18:33<44:23, 19.97it/s]

{'loss': 0.4189, 'grad_norm': 2.826387643814087, 'learning_rate': 3.570469798657718e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21813/75000 [18:33<47:48, 18.54it/s]

{'loss': 0.2868, 'grad_norm': 3.3242697715759277, 'learning_rate': 3.569798657718121e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21824/75000 [18:34<44:21, 19.98it/s]

{'loss': 0.43, 'grad_norm': 2.286121368408203, 'learning_rate': 3.569127516778524e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21833/75000 [18:34<45:53, 19.31it/s]

{'loss': 0.3517, 'grad_norm': 5.4164910316467285, 'learning_rate': 3.568456375838926e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21844/75000 [18:35<43:51, 20.20it/s]

{'loss': 0.2979, 'grad_norm': 5.15717887878418, 'learning_rate': 3.567785234899329e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21852/75000 [18:35<46:52, 18.90it/s]

{'loss': 0.3814, 'grad_norm': 4.000735282897949, 'learning_rate': 3.567114093959732e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21864/75000 [18:36<44:25, 19.94it/s]

{'loss': 0.3517, 'grad_norm': 2.918534517288208, 'learning_rate': 3.566442953020134e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21874/75000 [18:36<44:34, 19.87it/s]

{'loss': 0.3517, 'grad_norm': 0.9516729712486267, 'learning_rate': 3.5657718120805375e-05, 'epoch': 0.87}


                                                     
 29%|██▉       | 21884/75000 [18:37<43:46, 20.22it/s]

{'loss': 0.3464, 'grad_norm': 1.4246622323989868, 'learning_rate': 3.56510067114094e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21892/75000 [18:37<47:58, 18.45it/s]

{'loss': 0.3233, 'grad_norm': 19.110321044921875, 'learning_rate': 3.5644295302013426e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21902/75000 [18:38<44:56, 19.69it/s]

{'loss': 0.3359, 'grad_norm': 1.3140621185302734, 'learning_rate': 3.563758389261745e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21911/75000 [18:38<46:10, 19.16it/s]

{'loss': 0.3194, 'grad_norm': 3.9168035984039307, 'learning_rate': 3.5630872483221476e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21922/75000 [18:39<43:50, 20.18it/s]

{'loss': 0.3438, 'grad_norm': 2.2079224586486816, 'learning_rate': 3.5624161073825505e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21934/75000 [18:39<43:26, 20.36it/s]

{'loss': 0.3915, 'grad_norm': 3.918752431869507, 'learning_rate': 3.561744966442953e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21944/75000 [18:40<44:26, 19.89it/s]

{'loss': 0.345, 'grad_norm': 9.66451358795166, 'learning_rate': 3.561073825503356e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21954/75000 [18:40<43:20, 20.40it/s]

{'loss': 0.331, 'grad_norm': 6.992809295654297, 'learning_rate': 3.5604026845637584e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21964/75000 [18:41<45:25, 19.46it/s]

{'loss': 0.3619, 'grad_norm': 4.886369705200195, 'learning_rate': 3.559731543624161e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21974/75000 [18:41<43:29, 20.32it/s]

{'loss': 0.3386, 'grad_norm': 1.5744833946228027, 'learning_rate': 3.559060402684564e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21983/75000 [18:42<45:19, 19.50it/s]

{'loss': 0.2291, 'grad_norm': 2.212397575378418, 'learning_rate': 3.558389261744966e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 21992/75000 [18:42<45:11, 19.55it/s]

{'loss': 0.2894, 'grad_norm': 2.6852200031280518, 'learning_rate': 3.55771812080537e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22000/75000 [18:43<43:59, 20.08it/s]

{'loss': 0.2619, 'grad_norm': 7.394527435302734, 'learning_rate': 3.557046979865772e-05, 'epoch': 0.88}


                                                       
 29%|██▉       | 22012/75000 [18:47<2:28:45,  5.94it/s]

{'loss': 0.3283, 'grad_norm': 3.4023146629333496, 'learning_rate': 3.556375838926175e-05, 'epoch': 0.88}


                                                       
 29%|██▉       | 22022/75000 [18:48<1:09:45, 12.66it/s]

{'loss': 0.4282, 'grad_norm': 3.7200541496276855, 'learning_rate': 3.555704697986577e-05, 'epoch': 0.88}


                                                       
 29%|██▉       | 22034/75000 [18:48<50:05, 17.63it/s]

{'loss': 0.2318, 'grad_norm': 8.645034790039062, 'learning_rate': 3.55503355704698e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22044/75000 [18:49<44:21, 19.89it/s]

{'loss': 0.3545, 'grad_norm': 3.5269253253936768, 'learning_rate': 3.554362416107383e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22053/75000 [18:49<47:49, 18.45it/s]

{'loss': 0.346, 'grad_norm': 5.662933349609375, 'learning_rate': 3.5536912751677855e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22063/75000 [18:50<45:12, 19.52it/s]

{'loss': 0.3916, 'grad_norm': 43.99885559082031, 'learning_rate': 3.5530201342281884e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22072/75000 [18:50<51:17, 17.20it/s]

{'loss': 0.2793, 'grad_norm': 3.175309419631958, 'learning_rate': 3.5523489932885906e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22082/75000 [18:51<45:40, 19.31it/s]

{'loss': 0.3339, 'grad_norm': 1.424433946609497, 'learning_rate': 3.5516778523489934e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22092/75000 [18:51<48:06, 18.33it/s]

{'loss': 0.3834, 'grad_norm': 3.489978790283203, 'learning_rate': 3.5510067114093956e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22104/75000 [18:52<43:40, 20.18it/s]

{'loss': 0.3861, 'grad_norm': 6.814484119415283, 'learning_rate': 3.550335570469799e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22112/75000 [18:52<45:00, 19.59it/s]

{'loss': 0.3466, 'grad_norm': 4.775671005249023, 'learning_rate': 3.549664429530202e-05, 'epoch': 0.88}


                                                     
 29%|██▉       | 22124/75000 [18:53<45:02, 19.56it/s]

{'loss': 0.4023, 'grad_norm': 0.8540894985198975, 'learning_rate': 3.548993288590604e-05, 'epoch': 0.88}


                                                     
 30%|██▉       | 22133/75000 [18:54<45:48, 19.24it/s]

{'loss': 0.2159, 'grad_norm': 6.911706447601318, 'learning_rate': 3.548322147651007e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22144/75000 [18:54<44:16, 19.89it/s]

{'loss': 0.1684, 'grad_norm': 0.8378949165344238, 'learning_rate': 3.547651006711409e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22152/75000 [18:55<46:57, 18.76it/s]

{'loss': 0.5364, 'grad_norm': 9.070066452026367, 'learning_rate': 3.546979865771812e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22162/75000 [18:55<44:35, 19.75it/s]

{'loss': 0.2302, 'grad_norm': 8.577186584472656, 'learning_rate': 3.546308724832215e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22174/75000 [18:56<43:38, 20.17it/s]

{'loss': 0.3032, 'grad_norm': 3.0710480213165283, 'learning_rate': 3.545637583892618e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22182/75000 [18:56<47:51, 18.39it/s]

{'loss': 0.344, 'grad_norm': 0.9479368925094604, 'learning_rate': 3.5449664429530206e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22193/75000 [18:57<44:26, 19.80it/s]

{'loss': 0.2677, 'grad_norm': 5.042255878448486, 'learning_rate': 3.544295302013423e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22202/75000 [18:57<47:51, 18.39it/s]

{'loss': 0.4373, 'grad_norm': 4.022744655609131, 'learning_rate': 3.5436241610738257e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22214/75000 [18:58<44:23, 19.82it/s]

{'loss': 0.3042, 'grad_norm': 4.009057521820068, 'learning_rate': 3.542953020134228e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22223/75000 [18:58<46:26, 18.94it/s]

{'loss': 0.4099, 'grad_norm': 9.9661283493042, 'learning_rate': 3.5422818791946314e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22234/75000 [18:59<44:09, 19.91it/s]

{'loss': 0.3388, 'grad_norm': 3.974107503890991, 'learning_rate': 3.541610738255034e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22242/75000 [18:59<48:08, 18.27it/s]

{'loss': 0.4531, 'grad_norm': 3.8196871280670166, 'learning_rate': 3.5409395973154364e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22254/75000 [19:00<44:08, 19.91it/s]

{'loss': 0.3307, 'grad_norm': 2.234015703201294, 'learning_rate': 3.540268456375839e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22263/75000 [19:00<45:41, 19.24it/s]

{'loss': 0.3506, 'grad_norm': 3.4418797492980957, 'learning_rate': 3.5395973154362414e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22272/75000 [19:01<46:11, 19.02it/s]

{'loss': 0.3668, 'grad_norm': 2.2645628452301025, 'learning_rate': 3.538926174496644e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22284/75000 [19:01<47:17, 18.58it/s]

{'loss': 0.2978, 'grad_norm': 1.3299647569656372, 'learning_rate': 3.538255033557047e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22291/75000 [19:02<46:37, 18.84it/s]

{'loss': 0.2668, 'grad_norm': 6.372917175292969, 'learning_rate': 3.53758389261745e-05, 'epoch': 0.89}


                                                       
 30%|██▉       | 22303/75000 [19:03<1:01:27, 14.29it/s]

{'loss': 0.5305, 'grad_norm': 1.870529055595398, 'learning_rate': 3.536912751677853e-05, 'epoch': 0.89}


                                                       
 30%|██▉       | 22314/75000 [19:03<47:36, 18.45it/s]

{'loss': 0.3742, 'grad_norm': 4.5025811195373535, 'learning_rate': 3.536241610738255e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22323/75000 [19:04<47:13, 18.59it/s]

{'loss': 0.3127, 'grad_norm': 5.4925079345703125, 'learning_rate': 3.535570469798658e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22332/75000 [19:04<45:15, 19.40it/s]

{'loss': 0.2998, 'grad_norm': 16.2377872467041, 'learning_rate': 3.53489932885906e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22344/75000 [19:05<46:22, 18.92it/s]

{'loss': 0.4066, 'grad_norm': 5.018377780914307, 'learning_rate': 3.5342281879194636e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22354/75000 [19:05<44:28, 19.73it/s]

{'loss': 0.3261, 'grad_norm': 2.491539239883423, 'learning_rate': 3.533557046979866e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22363/75000 [19:06<47:35, 18.44it/s]

{'loss': 0.2807, 'grad_norm': 2.453385353088379, 'learning_rate': 3.5328859060402686e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22374/75000 [19:07<44:45, 19.60it/s]

{'loss': 0.3602, 'grad_norm': 4.6301069259643555, 'learning_rate': 3.5322147651006715e-05, 'epoch': 0.89}


                                                     
 30%|██▉       | 22382/75000 [19:07<46:44, 18.76it/s]

{'loss': 0.3039, 'grad_norm': 1.7707154750823975, 'learning_rate': 3.5315436241610737e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22392/75000 [19:08<44:27, 19.72it/s]

{'loss': 0.2482, 'grad_norm': 2.046114444732666, 'learning_rate': 3.5308724832214765e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22404/75000 [19:08<44:11, 19.84it/s]

{'loss': 0.366, 'grad_norm': 1.109495997428894, 'learning_rate': 3.5302013422818794e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22413/75000 [19:09<44:22, 19.75it/s]

{'loss': 0.3993, 'grad_norm': 2.547776699066162, 'learning_rate': 3.529530201342282e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22424/75000 [19:09<44:43, 19.59it/s]

{'loss': 0.4076, 'grad_norm': 3.58410382270813, 'learning_rate': 3.528859060402685e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22432/75000 [19:10<44:28, 19.70it/s]

{'loss': 0.2597, 'grad_norm': 2.0108277797698975, 'learning_rate': 3.528187919463087e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22444/75000 [19:10<43:28, 20.15it/s]

{'loss': 0.3535, 'grad_norm': 7.649786472320557, 'learning_rate': 3.52751677852349e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22454/75000 [19:11<45:59, 19.04it/s]

{'loss': 0.1855, 'grad_norm': 2.4468026161193848, 'learning_rate': 3.526845637583893e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22462/75000 [19:11<46:19, 18.90it/s]

{'loss': 0.3931, 'grad_norm': 3.240691661834717, 'learning_rate': 3.526174496644296e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22472/75000 [19:12<49:30, 17.69it/s]

{'loss': 0.4484, 'grad_norm': 3.5943901538848877, 'learning_rate': 3.525503355704698e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22483/75000 [19:12<48:22, 18.10it/s]

{'loss': 0.3574, 'grad_norm': 5.83281135559082, 'learning_rate': 3.524832214765101e-05, 'epoch': 0.9}


                                                     
 30%|██▉       | 22492/75000 [19:13<53:51, 16.25it/s]

{'loss': 0.2762, 'grad_norm': 11.356606483459473, 'learning_rate': 3.524161073825504e-05, 'epoch': 0.9}


                                                     
 30%|███       | 22500/75000 [19:13<53:43, 16.29it/s]

{'loss': 0.2847, 'grad_norm': 1.9463961124420166, 'learning_rate': 3.523489932885906e-05, 'epoch': 0.9}


                                                       
 30%|███       | 22514/75000 [19:15<57:41, 15.16it/s]  

{'loss': 0.3087, 'grad_norm': 7.1172966957092285, 'learning_rate': 3.522818791946309e-05, 'epoch': 0.9}


                                                       
 30%|███       | 22522/75000 [19:15<1:00:01, 14.57it/s]

{'loss': 0.3346, 'grad_norm': 3.131826162338257, 'learning_rate': 3.5221476510067116e-05, 'epoch': 0.9}


                                                       
 30%|███       | 22531/75000 [19:16<50:37, 17.27it/s]

{'loss': 0.2911, 'grad_norm': 0.6879244446754456, 'learning_rate': 3.5214765100671144e-05, 'epoch': 0.9}


                                                     
 30%|███       | 22542/75000 [19:16<49:38, 17.61it/s]

{'loss': 0.3357, 'grad_norm': 9.667174339294434, 'learning_rate': 3.5208053691275166e-05, 'epoch': 0.9}


                                                     
 30%|███       | 22551/75000 [19:17<1:16:10, 11.48it/s]

{'loss': 0.3071, 'grad_norm': 3.068552017211914, 'learning_rate': 3.5201342281879195e-05, 'epoch': 0.9}


                                                       
 30%|███       | 22563/75000 [19:18<1:01:49, 14.14it/s]

{'loss': 0.2843, 'grad_norm': 4.650752544403076, 'learning_rate': 3.519463087248322e-05, 'epoch': 0.9}


                                                       
 30%|███       | 22574/75000 [19:19<53:43, 16.26it/s]  

{'loss': 0.2783, 'grad_norm': 3.9664382934570312, 'learning_rate': 3.518791946308725e-05, 'epoch': 0.9}


                                                     
 30%|███       | 22581/75000 [19:19<54:05, 16.15it/s]

{'loss': 0.4192, 'grad_norm': 29.87946319580078, 'learning_rate': 3.518120805369128e-05, 'epoch': 0.9}


                                                       
 30%|███       | 22593/75000 [19:20<58:06, 15.03it/s]  

{'loss': 0.4533, 'grad_norm': 0.8607239127159119, 'learning_rate': 3.51744966442953e-05, 'epoch': 0.9}


                                                     
 30%|███       | 22603/75000 [19:21<53:43, 16.25it/s]

{'loss': 0.2593, 'grad_norm': 6.493424892425537, 'learning_rate': 3.516778523489933e-05, 'epoch': 0.9}


                                                     
 30%|███       | 22613/75000 [19:21<52:49, 16.53it/s]

{'loss': 0.2535, 'grad_norm': 3.016585111618042, 'learning_rate': 3.516107382550336e-05, 'epoch': 0.9}


                                                     
 30%|███       | 22622/75000 [19:22<51:09, 17.07it/s]

{'loss': 0.2561, 'grad_norm': 12.06583309173584, 'learning_rate': 3.515436241610738e-05, 'epoch': 0.9}


                                                     
 30%|███       | 22631/75000 [19:22<47:31, 18.37it/s]

{'loss': 0.3059, 'grad_norm': 2.9702744483947754, 'learning_rate': 3.514765100671141e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22644/75000 [19:23<45:09, 19.32it/s]

{'loss': 0.3609, 'grad_norm': 2.7729759216308594, 'learning_rate': 3.514093959731544e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22654/75000 [19:24<45:06, 19.34it/s]

{'loss': 0.2413, 'grad_norm': 8.965312957763672, 'learning_rate': 3.513422818791947e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22664/75000 [19:24<45:10, 19.31it/s]

{'loss': 0.3787, 'grad_norm': 1.5468522310256958, 'learning_rate': 3.512751677852349e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22674/75000 [19:25<44:41, 19.52it/s]

{'loss': 0.4314, 'grad_norm': 7.5505194664001465, 'learning_rate': 3.512080536912752e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22682/75000 [19:25<45:53, 19.00it/s]

{'loss': 0.3062, 'grad_norm': 5.7918901443481445, 'learning_rate': 3.5114093959731546e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22692/75000 [19:25<44:11, 19.72it/s]

{'loss': 0.3166, 'grad_norm': 2.647191286087036, 'learning_rate': 3.5107382550335574e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22703/75000 [19:26<44:18, 19.67it/s]

{'loss': 0.2819, 'grad_norm': 7.266722679138184, 'learning_rate': 3.51006711409396e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22712/75000 [19:27<44:57, 19.38it/s]

{'loss': 0.3578, 'grad_norm': 2.316720962524414, 'learning_rate': 3.5093959731543624e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22724/75000 [19:27<46:23, 18.78it/s]

{'loss': 0.3058, 'grad_norm': 2.2934677600860596, 'learning_rate': 3.508724832214765e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22732/75000 [19:28<44:47, 19.45it/s]

{'loss': 0.3274, 'grad_norm': 1.5075268745422363, 'learning_rate': 3.5080536912751675e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22743/75000 [19:28<46:35, 18.69it/s]

{'loss': 0.3379, 'grad_norm': 3.459062099456787, 'learning_rate': 3.50738255033557e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22751/75000 [19:29<44:55, 19.38it/s]

{'loss': 0.2701, 'grad_norm': 1.7680572271347046, 'learning_rate': 3.506711409395974e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22762/75000 [19:29<43:00, 20.24it/s]

{'loss': 0.2809, 'grad_norm': 5.611964225769043, 'learning_rate': 3.506040268456376e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22772/75000 [19:30<45:37, 19.08it/s]

{'loss': 0.2743, 'grad_norm': 1.6150437593460083, 'learning_rate': 3.505369127516779e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22784/75000 [19:30<43:32, 19.99it/s]

{'loss': 0.3089, 'grad_norm': 7.60753870010376, 'learning_rate': 3.504697986577181e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22792/75000 [19:31<47:31, 18.31it/s]

{'loss': 0.3432, 'grad_norm': 5.738984107971191, 'learning_rate': 3.504026845637584e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22804/75000 [19:31<46:14, 18.81it/s]

{'loss': 0.3083, 'grad_norm': 11.336997985839844, 'learning_rate': 3.503355704697987e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22813/75000 [19:32<46:24, 18.74it/s]

{'loss': 0.364, 'grad_norm': 0.3501393795013428, 'learning_rate': 3.5026845637583896e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22823/75000 [19:32<44:41, 19.46it/s]

{'loss': 0.3024, 'grad_norm': 3.562217950820923, 'learning_rate': 3.5020134228187925e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22831/75000 [19:33<49:50, 17.45it/s]

{'loss': 0.3128, 'grad_norm': 3.1226751804351807, 'learning_rate': 3.501342281879195e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22841/75000 [19:33<48:08, 18.06it/s]

{'loss': 0.3399, 'grad_norm': 8.206711769104004, 'learning_rate': 3.5006711409395975e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22853/75000 [19:34<45:06, 19.27it/s]

{'loss': 0.2861, 'grad_norm': 1.6626760959625244, 'learning_rate': 3.5e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22862/75000 [19:34<47:16, 18.38it/s]

{'loss': 0.3073, 'grad_norm': 4.664999485015869, 'learning_rate': 3.4993288590604025e-05, 'epoch': 0.91}


                                                     
 30%|███       | 22874/75000 [19:35<43:20, 20.04it/s]

{'loss': 0.357, 'grad_norm': 2.4899284839630127, 'learning_rate': 3.498657718120806e-05, 'epoch': 0.91}


                                                     
 31%|███       | 22882/75000 [19:35<47:10, 18.41it/s]

{'loss': 0.2492, 'grad_norm': 2.89040207862854, 'learning_rate': 3.497986577181208e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22894/75000 [19:36<44:46, 19.40it/s]

{'loss': 0.4013, 'grad_norm': 3.249823808670044, 'learning_rate': 3.497315436241611e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22902/75000 [19:37<46:05, 18.84it/s]

{'loss': 0.393, 'grad_norm': 3.5085673332214355, 'learning_rate': 3.496644295302013e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22911/75000 [19:37<45:26, 19.10it/s]

{'loss': 0.4084, 'grad_norm': 3.248887300491333, 'learning_rate': 3.495973154362416e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22923/75000 [19:38<42:42, 20.32it/s]

{'loss': 0.2863, 'grad_norm': 3.8221466541290283, 'learning_rate': 3.495302013422819e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22934/75000 [19:38<45:07, 19.23it/s]

{'loss': 0.3822, 'grad_norm': 1.7172819375991821, 'learning_rate': 3.494630872483222e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22943/75000 [19:39<43:43, 19.84it/s]

{'loss': 0.3761, 'grad_norm': 4.319121837615967, 'learning_rate': 3.493959731543625e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22954/75000 [19:39<44:44, 19.39it/s]

{'loss': 0.2533, 'grad_norm': 4.522177219390869, 'learning_rate': 3.493288590604027e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22964/75000 [19:40<43:20, 20.01it/s]

{'loss': 0.2086, 'grad_norm': 0.7377649545669556, 'learning_rate': 3.49261744966443e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22974/75000 [19:40<45:17, 19.15it/s]

{'loss': 0.2512, 'grad_norm': 4.806401252746582, 'learning_rate': 3.491946308724832e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22984/75000 [19:41<43:50, 19.77it/s]

{'loss': 0.3686, 'grad_norm': 2.1256628036499023, 'learning_rate': 3.4912751677852354e-05, 'epoch': 0.92}


                                                     
 31%|███       | 22992/75000 [19:41<48:18, 17.94it/s]

{'loss': 0.3527, 'grad_norm': 0.9514924883842468, 'learning_rate': 3.4906040268456376e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23000/75000 [19:42<44:43, 19.38it/s]

{'loss': 0.329, 'grad_norm': 3.782592296600342, 'learning_rate': 3.4899328859060405e-05, 'epoch': 0.92}


                                                       
 31%|███       | 23014/75000 [19:43<51:45, 16.74it/s]  

{'loss': 0.318, 'grad_norm': 2.3984386920928955, 'learning_rate': 3.489261744966443e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23022/75000 [19:43<46:19, 18.70it/s]

{'loss': 0.3522, 'grad_norm': 2.163386344909668, 'learning_rate': 3.4885906040268455e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23034/75000 [19:44<46:52, 18.48it/s]

{'loss': 0.3634, 'grad_norm': 2.7200756072998047, 'learning_rate': 3.4879194630872484e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23043/75000 [19:44<44:24, 19.50it/s]

{'loss': 0.3194, 'grad_norm': 1.8269636631011963, 'learning_rate': 3.487248322147651e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23052/75000 [19:45<47:21, 18.28it/s]

{'loss': 0.2656, 'grad_norm': 4.589477062225342, 'learning_rate': 3.486577181208054e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23064/75000 [19:45<43:18, 19.99it/s]

{'loss': 0.2688, 'grad_norm': 6.134091377258301, 'learning_rate': 3.485906040268457e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23072/75000 [19:46<47:30, 18.22it/s]

{'loss': 0.4115, 'grad_norm': 6.554184436798096, 'learning_rate': 3.485234899328859e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23084/75000 [19:47<44:01, 19.66it/s]

{'loss': 0.3276, 'grad_norm': 5.59920072555542, 'learning_rate': 3.484563758389262e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23093/75000 [19:47<43:37, 19.83it/s]

{'loss': 0.3767, 'grad_norm': 9.480875015258789, 'learning_rate': 3.483892617449664e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23102/75000 [19:47<44:00, 19.66it/s]

{'loss': 0.3818, 'grad_norm': 4.114144802093506, 'learning_rate': 3.483221476510068e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23111/75000 [19:48<46:04, 18.77it/s]

{'loss': 0.3528, 'grad_norm': 2.96536922454834, 'learning_rate': 3.48255033557047e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23124/75000 [19:49<43:21, 19.94it/s]

{'loss': 0.4149, 'grad_norm': 4.630558967590332, 'learning_rate': 3.481879194630873e-05, 'epoch': 0.92}


                                                     
 31%|███       | 23134/75000 [19:49<42:11, 20.49it/s]

{'loss': 0.2575, 'grad_norm': 2.8153247833251953, 'learning_rate': 3.4812080536912756e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23144/75000 [19:50<44:37, 19.37it/s]

{'loss': 0.4695, 'grad_norm': 5.89690637588501, 'learning_rate': 3.480536912751678e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23152/75000 [19:50<43:37, 19.81it/s]

{'loss': 0.3476, 'grad_norm': 4.332207679748535, 'learning_rate': 3.4798657718120806e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23164/75000 [19:51<43:53, 19.69it/s]

{'loss': 0.2324, 'grad_norm': 9.037586212158203, 'learning_rate': 3.4791946308724834e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23173/75000 [19:51<43:40, 19.78it/s]

{'loss': 0.3101, 'grad_norm': 1.5573163032531738, 'learning_rate': 3.478523489932886e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23184/75000 [19:52<43:36, 19.80it/s]

{'loss': 0.3063, 'grad_norm': 3.7114875316619873, 'learning_rate': 3.4778523489932885e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23194/75000 [19:52<42:14, 20.44it/s]

{'loss': 0.4131, 'grad_norm': 7.631359100341797, 'learning_rate': 3.477181208053691e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23203/75000 [19:53<44:57, 19.20it/s]

{'loss': 0.3688, 'grad_norm': 6.066418170928955, 'learning_rate': 3.476510067114094e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23214/75000 [19:53<43:29, 19.84it/s]

{'loss': 0.2346, 'grad_norm': 16.907474517822266, 'learning_rate': 3.4758389261744964e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23223/75000 [19:54<46:37, 18.51it/s]

{'loss': 0.4008, 'grad_norm': 2.8597946166992188, 'learning_rate': 3.4751677852349e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23234/75000 [19:54<44:01, 19.60it/s]

{'loss': 0.425, 'grad_norm': 1.4890038967132568, 'learning_rate': 3.474496644295302e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23241/75000 [19:55<46:15, 18.65it/s]

{'loss': 0.3358, 'grad_norm': 5.787749290466309, 'learning_rate': 3.473825503355705e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23254/75000 [19:55<43:02, 20.03it/s]

{'loss': 0.3674, 'grad_norm': 3.1928935050964355, 'learning_rate': 3.473154362416108e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23263/75000 [19:56<44:26, 19.40it/s]

{'loss': 0.329, 'grad_norm': 4.366880893707275, 'learning_rate': 3.47248322147651e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23271/75000 [19:56<51:48, 16.64it/s]

{'loss': 0.2482, 'grad_norm': 2.706749439239502, 'learning_rate': 3.471812080536913e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23283/75000 [19:57<45:34, 18.91it/s]

{'loss': 0.3081, 'grad_norm': 1.9127625226974487, 'learning_rate': 3.471140939597316e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23293/75000 [19:57<50:30, 17.06it/s]

{'loss': 0.3928, 'grad_norm': 1.4275844097137451, 'learning_rate': 3.4704697986577185e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23301/75000 [19:58<46:39, 18.47it/s]

{'loss': 0.2342, 'grad_norm': 4.497165679931641, 'learning_rate': 3.469798657718121e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23311/75000 [19:58<47:02, 18.31it/s]

{'loss': 0.3553, 'grad_norm': 9.702229499816895, 'learning_rate': 3.4691275167785236e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23324/75000 [19:59<44:54, 19.18it/s]

{'loss': 0.4891, 'grad_norm': 3.085452079772949, 'learning_rate': 3.4684563758389264e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23334/75000 [20:00<43:27, 19.81it/s]

{'loss': 0.2519, 'grad_norm': 3.0135438442230225, 'learning_rate': 3.467785234899329e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23343/75000 [20:00<45:01, 19.12it/s]

{'loss': 0.3801, 'grad_norm': 5.066722393035889, 'learning_rate': 3.467114093959732e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23353/75000 [20:01<44:00, 19.56it/s]

{'loss': 0.2447, 'grad_norm': 3.85192608833313, 'learning_rate': 3.466442953020134e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23363/75000 [20:01<45:32, 18.90it/s]

{'loss': 0.227, 'grad_norm': 2.0275371074676514, 'learning_rate': 3.465771812080537e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23372/75000 [20:02<43:57, 19.58it/s]

{'loss': 0.3432, 'grad_norm': 8.170185089111328, 'learning_rate': 3.465100671140939e-05, 'epoch': 0.93}


                                                     
 31%|███       | 23383/75000 [20:02<49:49, 17.26it/s]

{'loss': 0.3264, 'grad_norm': 6.362321376800537, 'learning_rate': 3.464429530201342e-05, 'epoch': 0.94}


                                                     
 31%|███       | 23391/75000 [20:03<45:56, 18.72it/s]

{'loss': 0.4121, 'grad_norm': 9.61887264251709, 'learning_rate': 3.463758389261745e-05, 'epoch': 0.94}


                                                     
 31%|███       | 23402/75000 [20:03<44:52, 19.16it/s]

{'loss': 0.3107, 'grad_norm': 1.478806734085083, 'learning_rate': 3.463087248322148e-05, 'epoch': 0.94}


                                                     
 31%|███       | 23413/75000 [20:04<44:44, 19.22it/s]

{'loss': 0.4331, 'grad_norm': 2.169914484024048, 'learning_rate': 3.462416107382551e-05, 'epoch': 0.94}


                                                     
 31%|███       | 23421/75000 [20:04<44:51, 19.16it/s]

{'loss': 0.2667, 'grad_norm': 0.9240216016769409, 'learning_rate': 3.461744966442953e-05, 'epoch': 0.94}


                                                     
 31%|███       | 23434/75000 [20:05<43:42, 19.66it/s]

{'loss': 0.2579, 'grad_norm': 2.5911741256713867, 'learning_rate': 3.461073825503356e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23442/75000 [20:05<48:11, 17.83it/s]

{'loss': 0.4673, 'grad_norm': 2.4800703525543213, 'learning_rate': 3.4604026845637586e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23454/75000 [20:06<44:39, 19.24it/s]

{'loss': 0.1884, 'grad_norm': 6.256768226623535, 'learning_rate': 3.4597315436241615e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23462/75000 [20:06<50:52, 16.89it/s]

{'loss': 0.3721, 'grad_norm': 4.489422798156738, 'learning_rate': 3.4590604026845643e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23471/75000 [20:07<45:44, 18.78it/s]

{'loss': 0.2853, 'grad_norm': 3.1633129119873047, 'learning_rate': 3.4583892617449665e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23482/75000 [20:07<43:39, 19.67it/s]

{'loss': 0.382, 'grad_norm': 5.852922439575195, 'learning_rate': 3.4577181208053694e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23493/75000 [20:08<47:06, 18.22it/s]

{'loss': 0.3873, 'grad_norm': 4.836709499359131, 'learning_rate': 3.4570469798657716e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23500/75000 [20:08<44:19, 19.37it/s]

{'loss': 0.4277, 'grad_norm': 1.1158698797225952, 'learning_rate': 3.4563758389261744e-05, 'epoch': 0.94}


                                                       
 31%|███▏      | 23512/75000 [20:10<1:00:40, 14.14it/s]

{'loss': 0.2938, 'grad_norm': 2.6592392921447754, 'learning_rate': 3.455704697986577e-05, 'epoch': 0.94}


                                                       
 31%|███▏      | 23522/75000 [20:10<48:32, 17.68it/s]

{'loss': 0.3337, 'grad_norm': 4.902001857757568, 'learning_rate': 3.45503355704698e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23533/75000 [20:11<50:56, 16.84it/s]

{'loss': 0.4166, 'grad_norm': 6.486978054046631, 'learning_rate': 3.454362416107383e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23543/75000 [20:12<49:13, 17.42it/s]

{'loss': 0.3696, 'grad_norm': 2.796180486679077, 'learning_rate': 3.453691275167785e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23553/75000 [20:12<51:46, 16.56it/s]

{'loss': 0.4436, 'grad_norm': 3.254276752471924, 'learning_rate': 3.453020134228188e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23564/75000 [20:13<45:27, 18.86it/s]

{'loss': 0.3741, 'grad_norm': 2.4811758995056152, 'learning_rate': 3.45234899328859e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23571/75000 [20:13<45:59, 18.63it/s]

{'loss': 0.359, 'grad_norm': 4.323030948638916, 'learning_rate': 3.451677852348994e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23583/75000 [20:14<45:30, 18.83it/s]

{'loss': 0.4405, 'grad_norm': 2.953462839126587, 'learning_rate': 3.4510067114093966e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23594/75000 [20:14<45:57, 18.64it/s]

{'loss': 0.3085, 'grad_norm': 7.8767595291137695, 'learning_rate': 3.450335570469799e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23602/75000 [20:15<47:59, 17.85it/s]

{'loss': 0.3539, 'grad_norm': 3.252863883972168, 'learning_rate': 3.4496644295302016e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23613/75000 [20:15<46:23, 18.46it/s]

{'loss': 0.2809, 'grad_norm': 4.377573013305664, 'learning_rate': 3.448993288590604e-05, 'epoch': 0.94}


                                                     
 31%|███▏      | 23623/75000 [20:16<48:35, 17.62it/s]

{'loss': 0.3149, 'grad_norm': 5.2741007804870605, 'learning_rate': 3.4483221476510066e-05, 'epoch': 0.94}


                                                     
 32%|███▏      | 23633/75000 [20:17<56:25, 15.17it/s]  

{'loss': 0.1586, 'grad_norm': 1.8779895305633545, 'learning_rate': 3.44765100671141e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23641/75000 [20:17<53:23, 16.03it/s]

{'loss': 0.4567, 'grad_norm': 5.646131992340088, 'learning_rate': 3.4469798657718123e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23653/75000 [20:18<49:37, 17.24it/s]

{'loss': 0.5111, 'grad_norm': 8.470161437988281, 'learning_rate': 3.446308724832215e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23663/75000 [20:18<51:59, 16.46it/s]

{'loss': 0.3699, 'grad_norm': 1.1823952198028564, 'learning_rate': 3.4456375838926174e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23673/75000 [20:19<46:45, 18.29it/s]

{'loss': 0.433, 'grad_norm': 4.311926364898682, 'learning_rate': 3.44496644295302e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23681/75000 [20:19<47:26, 18.03it/s]

{'loss': 0.2924, 'grad_norm': 3.612531900405884, 'learning_rate': 3.444295302013423e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23692/75000 [20:20<47:47, 17.89it/s]

{'loss': 0.3964, 'grad_norm': 2.7000510692596436, 'learning_rate': 3.443624161073826e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23703/75000 [20:21<45:53, 18.63it/s]

{'loss': 0.366, 'grad_norm': 3.4609477519989014, 'learning_rate': 3.442953020134229e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23711/75000 [20:21<45:05, 18.96it/s]

{'loss': 0.3467, 'grad_norm': 11.273368835449219, 'learning_rate': 3.442281879194631e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23722/75000 [20:22<45:35, 18.75it/s]

{'loss': 0.2317, 'grad_norm': 1.7361705303192139, 'learning_rate': 3.441610738255034e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23731/75000 [20:22<45:03, 18.96it/s]

{'loss': 0.3427, 'grad_norm': 2.8822052478790283, 'learning_rate': 3.440939597315436e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23742/75000 [20:23<46:04, 18.54it/s]

{'loss': 0.4324, 'grad_norm': 3.777435064315796, 'learning_rate': 3.440268456375839e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23751/75000 [20:23<46:10, 18.50it/s]

{'loss': 0.3448, 'grad_norm': 4.939582824707031, 'learning_rate': 3.439597315436242e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23763/75000 [20:24<41:58, 20.34it/s]

{'loss': 0.4281, 'grad_norm': 6.851086139678955, 'learning_rate': 3.4389261744966446e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23772/75000 [20:24<42:36, 20.04it/s]

{'loss': 0.376, 'grad_norm': 4.087108612060547, 'learning_rate': 3.4382550335570474e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23784/75000 [20:25<43:54, 19.44it/s]

{'loss': 0.5542, 'grad_norm': 6.070222854614258, 'learning_rate': 3.4375838926174496e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23794/75000 [20:25<42:20, 20.16it/s]

{'loss': 0.3437, 'grad_norm': 0.9999216794967651, 'learning_rate': 3.4369127516778525e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23804/75000 [20:26<43:15, 19.73it/s]

{'loss': 0.3247, 'grad_norm': 17.15315818786621, 'learning_rate': 3.436241610738255e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23814/75000 [20:26<42:57, 19.86it/s]

{'loss': 0.3717, 'grad_norm': 10.112462997436523, 'learning_rate': 3.435570469798658e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23822/75000 [20:27<42:21, 20.14it/s]

{'loss': 0.322, 'grad_norm': 1.3988564014434814, 'learning_rate': 3.43489932885906e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23833/75000 [20:27<43:35, 19.56it/s]

{'loss': 0.3036, 'grad_norm': 3.07499361038208, 'learning_rate': 3.434228187919463e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23843/75000 [20:28<42:43, 19.96it/s]

{'loss': 0.4027, 'grad_norm': 4.1256256103515625, 'learning_rate': 3.433557046979866e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23854/75000 [20:29<44:11, 19.29it/s]

{'loss': 0.4045, 'grad_norm': 2.4953277111053467, 'learning_rate': 3.432885906040268e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23862/75000 [20:29<42:44, 19.94it/s]

{'loss': 0.3631, 'grad_norm': 4.467448711395264, 'learning_rate': 3.432214765100671e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23874/75000 [20:30<44:14, 19.26it/s]

{'loss': 0.4103, 'grad_norm': 2.648972749710083, 'learning_rate': 3.431543624161074e-05, 'epoch': 0.95}


                                                     
 32%|███▏      | 23884/75000 [20:30<43:03, 19.78it/s]

{'loss': 0.3782, 'grad_norm': 3.164951801300049, 'learning_rate': 3.430872483221477e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23892/75000 [20:31<47:01, 18.11it/s]

{'loss': 0.2408, 'grad_norm': 3.1390419006347656, 'learning_rate': 3.4302013422818796e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23903/75000 [20:31<44:02, 19.34it/s]

{'loss': 0.3372, 'grad_norm': 2.245286226272583, 'learning_rate': 3.429530201342282e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23912/75000 [20:32<44:54, 18.96it/s]

{'loss': 0.4323, 'grad_norm': 1.136357307434082, 'learning_rate': 3.428859060402685e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23922/75000 [20:32<43:31, 19.56it/s]

{'loss': 0.3003, 'grad_norm': 1.8778440952301025, 'learning_rate': 3.4281879194630875e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23934/75000 [20:33<43:25, 19.60it/s]

{'loss': 0.3141, 'grad_norm': 2.785853624343872, 'learning_rate': 3.4275167785234904e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23943/75000 [20:33<42:27, 20.04it/s]

{'loss': 0.5019, 'grad_norm': 5.912792205810547, 'learning_rate': 3.4268456375838926e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23953/75000 [20:34<44:24, 19.15it/s]

{'loss': 0.3344, 'grad_norm': 3.6515541076660156, 'learning_rate': 3.4261744966442954e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23964/75000 [20:34<42:23, 20.06it/s]

{'loss': 0.3422, 'grad_norm': 3.19105863571167, 'learning_rate': 3.425503355704698e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23972/75000 [20:35<42:05, 20.20it/s]

{'loss': 0.3189, 'grad_norm': 1.503936529159546, 'learning_rate': 3.4248322147651004e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23982/75000 [20:35<45:25, 18.72it/s]

{'loss': 0.3724, 'grad_norm': 3.1554481983184814, 'learning_rate': 3.424161073825504e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 23993/75000 [20:36<45:17, 18.77it/s]

{'loss': 0.4845, 'grad_norm': 5.500491142272949, 'learning_rate': 3.423489932885906e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24000/75000 [20:36<43:47, 19.41it/s]

{'loss': 0.3886, 'grad_norm': 1.9106501340866089, 'learning_rate': 3.422818791946309e-05, 'epoch': 0.96}


                                                       
 32%|███▏      | 24011/75000 [20:37<53:41, 15.83it/s]

{'loss': 0.3921, 'grad_norm': 5.636026859283447, 'learning_rate': 3.422147651006711e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24022/75000 [20:38<43:40, 19.45it/s]

{'loss': 0.3118, 'grad_norm': 4.251883506774902, 'learning_rate': 3.421476510067114e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24034/75000 [20:38<42:10, 20.14it/s]

{'loss': 0.3367, 'grad_norm': 4.506594181060791, 'learning_rate': 3.420805369127517e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24044/75000 [20:39<43:16, 19.62it/s]

{'loss': 0.2703, 'grad_norm': 3.9831833839416504, 'learning_rate': 3.42013422818792e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24052/75000 [20:39<42:59, 19.75it/s]

{'loss': 0.4132, 'grad_norm': 3.135535955429077, 'learning_rate': 3.4194630872483226e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24064/75000 [20:40<43:30, 19.51it/s]

{'loss': 0.4608, 'grad_norm': 0.49514809250831604, 'learning_rate': 3.418791946308725e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24074/75000 [20:40<42:04, 20.17it/s]

{'loss': 0.3894, 'grad_norm': 8.671494483947754, 'learning_rate': 3.4181208053691276e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24084/75000 [20:41<45:02, 18.84it/s]

{'loss': 0.3291, 'grad_norm': 4.4032883644104, 'learning_rate': 3.4174496644295305e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24094/75000 [20:41<43:45, 19.39it/s]

{'loss': 0.3648, 'grad_norm': 2.0215811729431152, 'learning_rate': 3.416778523489933e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24101/75000 [20:42<49:58, 16.98it/s]

{'loss': 0.3794, 'grad_norm': 5.724201202392578, 'learning_rate': 3.416107382550336e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24114/75000 [20:43<43:42, 19.40it/s]

{'loss': 0.3025, 'grad_norm': 2.6928045749664307, 'learning_rate': 3.4154362416107384e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24122/75000 [20:43<42:18, 20.04it/s]

{'loss': 0.3151, 'grad_norm': 4.531656265258789, 'learning_rate': 3.414765100671141e-05, 'epoch': 0.96}


                                                     
 32%|███▏      | 24133/75000 [20:44<44:58, 18.85it/s]

{'loss': 0.2777, 'grad_norm': 2.382920503616333, 'learning_rate': 3.4140939597315434e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24144/75000 [20:44<42:04, 20.14it/s]

{'loss': 0.3058, 'grad_norm': 7.309771537780762, 'learning_rate': 3.413422818791946e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24153/75000 [20:45<43:30, 19.48it/s]

{'loss': 0.2719, 'grad_norm': 4.149312973022461, 'learning_rate': 3.412751677852349e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24163/75000 [20:45<43:07, 19.65it/s]

{'loss': 0.4154, 'grad_norm': 2.1615278720855713, 'learning_rate': 3.412080536912752e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24174/75000 [20:46<45:46, 18.51it/s]

{'loss': 0.2212, 'grad_norm': 0.9105130434036255, 'learning_rate': 3.411409395973155e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24182/75000 [20:46<45:30, 18.61it/s]

{'loss': 0.4273, 'grad_norm': 3.5455875396728516, 'learning_rate': 3.410738255033557e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24194/75000 [20:47<45:20, 18.67it/s]

{'loss': 0.2767, 'grad_norm': 0.9168621301651001, 'learning_rate': 3.41006711409396e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24201/75000 [20:47<47:21, 17.87it/s]

{'loss': 0.2917, 'grad_norm': 6.9601945877075195, 'learning_rate': 3.409395973154362e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24214/75000 [20:48<43:05, 19.64it/s]

{'loss': 0.3331, 'grad_norm': 4.932528495788574, 'learning_rate': 3.4087248322147656e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24222/75000 [20:48<47:59, 17.64it/s]

{'loss': 0.3462, 'grad_norm': 1.4830594062805176, 'learning_rate': 3.4080536912751684e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24232/75000 [20:49<43:58, 19.24it/s]

{'loss': 0.227, 'grad_norm': 1.3080004453659058, 'learning_rate': 3.4073825503355706e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24243/75000 [20:49<45:09, 18.73it/s]

{'loss': 0.3487, 'grad_norm': 1.5702506303787231, 'learning_rate': 3.4067114093959735e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24254/75000 [20:50<41:52, 20.20it/s]

{'loss': 0.2155, 'grad_norm': 5.560152053833008, 'learning_rate': 3.4060402684563756e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24263/75000 [20:50<41:38, 20.31it/s]

{'loss': 0.4693, 'grad_norm': 4.625579357147217, 'learning_rate': 3.4053691275167785e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24274/75000 [20:51<43:04, 19.63it/s]

{'loss': 0.3353, 'grad_norm': 9.334378242492676, 'learning_rate': 3.4046979865771813e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24284/75000 [20:52<42:24, 19.93it/s]

{'loss': 0.3911, 'grad_norm': 2.6482043266296387, 'learning_rate': 3.404026845637584e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24294/75000 [20:52<41:42, 20.27it/s]

{'loss': 0.3471, 'grad_norm': 1.955917239189148, 'learning_rate': 3.403355704697987e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24303/75000 [20:53<44:56, 18.80it/s]

{'loss': 0.3521, 'grad_norm': 0.8126436471939087, 'learning_rate': 3.402684563758389e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24314/75000 [20:53<41:53, 20.16it/s]

{'loss': 0.3765, 'grad_norm': 1.022485613822937, 'learning_rate': 3.402013422818792e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24324/75000 [20:54<43:49, 19.27it/s]

{'loss': 0.211, 'grad_norm': 3.778726100921631, 'learning_rate': 3.401342281879194e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24333/75000 [20:54<42:05, 20.06it/s]

{'loss': 0.4504, 'grad_norm': 1.4563708305358887, 'learning_rate': 3.400671140939598e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24342/75000 [20:55<45:47, 18.44it/s]

{'loss': 0.2823, 'grad_norm': 1.2821437120437622, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24353/75000 [20:55<42:35, 19.82it/s]

{'loss': 0.3906, 'grad_norm': 2.361788511276245, 'learning_rate': 3.399328859060403e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24362/75000 [20:56<47:34, 17.74it/s]

{'loss': 0.3583, 'grad_norm': 1.0695394277572632, 'learning_rate': 3.398657718120806e-05, 'epoch': 0.97}


                                                     
 32%|███▏      | 24374/75000 [20:56<42:28, 19.87it/s]

{'loss': 0.4304, 'grad_norm': 4.358999252319336, 'learning_rate': 3.397986577181208e-05, 'epoch': 0.97}


                                                     
 33%|███▎      | 24383/75000 [20:57<45:38, 18.48it/s]

{'loss': 0.3099, 'grad_norm': 0.742055356502533, 'learning_rate': 3.397315436241611e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24394/75000 [20:57<41:48, 20.17it/s]

{'loss': 0.2539, 'grad_norm': 1.745041847229004, 'learning_rate': 3.3966442953020136e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24403/75000 [20:58<45:19, 18.60it/s]

{'loss': 0.3997, 'grad_norm': 4.652807712554932, 'learning_rate': 3.3959731543624164e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24414/75000 [20:58<42:02, 20.05it/s]

{'loss': 0.3125, 'grad_norm': 3.2521324157714844, 'learning_rate': 3.395302013422819e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24423/75000 [20:59<42:24, 19.87it/s]

{'loss': 0.3451, 'grad_norm': 1.6302683353424072, 'learning_rate': 3.3946308724832215e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24432/75000 [20:59<44:46, 18.83it/s]

{'loss': 0.3808, 'grad_norm': 3.9101452827453613, 'learning_rate': 3.393959731543624e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24442/75000 [21:00<42:21, 19.89it/s]

{'loss': 0.2385, 'grad_norm': 4.897675514221191, 'learning_rate': 3.3932885906040265e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24452/75000 [21:00<43:50, 19.22it/s]

{'loss': 0.315, 'grad_norm': 7.947039604187012, 'learning_rate': 3.39261744966443e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24462/75000 [21:01<42:06, 20.01it/s]

{'loss': 0.2774, 'grad_norm': 1.4184279441833496, 'learning_rate': 3.391946308724832e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24471/75000 [21:01<42:15, 19.93it/s]

{'loss': 0.335, 'grad_norm': 5.516419410705566, 'learning_rate': 3.391275167785235e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24484/75000 [21:02<42:43, 19.70it/s]

{'loss': 0.2972, 'grad_norm': 5.691671371459961, 'learning_rate': 3.390604026845638e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24494/75000 [21:02<41:26, 20.31it/s]

{'loss': 0.2628, 'grad_norm': 7.857404708862305, 'learning_rate': 3.38993288590604e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24500/75000 [21:03<44:38, 18.85it/s]

{'loss': 0.3349, 'grad_norm': 3.3383677005767822, 'learning_rate': 3.389261744966443e-05, 'epoch': 0.98}


                                                       
 33%|███▎      | 24514/75000 [21:04<50:55, 16.52it/s]  

{'loss': 0.4558, 'grad_norm': 4.909794807434082, 'learning_rate': 3.388590604026846e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24521/75000 [21:04<52:31, 16.02it/s]

{'loss': 0.445, 'grad_norm': 1.065360188484192, 'learning_rate': 3.3879194630872486e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24534/75000 [21:05<42:33, 19.76it/s]

{'loss': 0.2634, 'grad_norm': 0.977385938167572, 'learning_rate': 3.3872483221476515e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24543/75000 [21:06<43:51, 19.17it/s]

{'loss': 0.4215, 'grad_norm': 7.857161045074463, 'learning_rate': 3.386577181208054e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24552/75000 [21:06<43:39, 19.26it/s]

{'loss': 0.3055, 'grad_norm': 0.45375871658325195, 'learning_rate': 3.3859060402684565e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24562/75000 [21:06<42:16, 19.89it/s]

{'loss': 0.3106, 'grad_norm': 11.333861351013184, 'learning_rate': 3.3852348993288594e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24572/75000 [21:07<45:09, 18.61it/s]

{'loss': 0.4219, 'grad_norm': 0.7779865860939026, 'learning_rate': 3.384563758389262e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24583/75000 [21:08<41:49, 20.09it/s]

{'loss': 0.274, 'grad_norm': 8.020099639892578, 'learning_rate': 3.3838926174496644e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24592/75000 [21:08<43:45, 19.20it/s]

{'loss': 0.3697, 'grad_norm': 10.20114803314209, 'learning_rate': 3.383221476510067e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24602/75000 [21:09<42:25, 19.80it/s]

{'loss': 0.3004, 'grad_norm': 2.810364007949829, 'learning_rate': 3.38255033557047e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24613/75000 [21:09<43:55, 19.12it/s]

{'loss': 0.2871, 'grad_norm': 6.46722936630249, 'learning_rate': 3.381879194630872e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24622/75000 [21:10<41:59, 19.99it/s]

{'loss': 0.3241, 'grad_norm': 16.157440185546875, 'learning_rate': 3.381208053691275e-05, 'epoch': 0.98}


                                                     
 33%|███▎      | 24633/75000 [21:10<45:00, 18.65it/s]

{'loss': 0.2311, 'grad_norm': 5.078071117401123, 'learning_rate': 3.380536912751678e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24644/75000 [21:11<42:02, 19.96it/s]

{'loss': 0.3104, 'grad_norm': 1.9135096073150635, 'learning_rate': 3.379865771812081e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24651/75000 [21:11<43:56, 19.09it/s]

{'loss': 0.3225, 'grad_norm': 5.485080718994141, 'learning_rate': 3.379194630872483e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24663/75000 [21:12<41:06, 20.41it/s]

{'loss': 0.4595, 'grad_norm': 4.447214126586914, 'learning_rate': 3.378523489932886e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24672/75000 [21:12<41:34, 20.17it/s]

{'loss': 0.4333, 'grad_norm': 3.0888564586639404, 'learning_rate': 3.377852348993289e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24684/75000 [21:13<42:48, 19.59it/s]

{'loss': 0.4083, 'grad_norm': 7.140542507171631, 'learning_rate': 3.3771812080536916e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24692/75000 [21:13<42:14, 19.85it/s]

{'loss': 0.3239, 'grad_norm': 3.785548210144043, 'learning_rate': 3.3765100671140945e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24702/75000 [21:14<44:32, 18.82it/s]

{'loss': 0.4309, 'grad_norm': 4.4617180824279785, 'learning_rate': 3.3758389261744966e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24713/75000 [21:14<44:29, 18.84it/s]

{'loss': 0.4967, 'grad_norm': 1.9005135297775269, 'learning_rate': 3.3751677852348995e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24722/75000 [21:15<47:12, 17.75it/s]

{'loss': 0.3845, 'grad_norm': 2.546095371246338, 'learning_rate': 3.3744966442953024e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24734/75000 [21:15<42:53, 19.53it/s]

{'loss': 0.3229, 'grad_norm': 2.46501088142395, 'learning_rate': 3.3738255033557045e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24744/75000 [21:16<41:37, 20.12it/s]

{'loss': 0.2376, 'grad_norm': 8.203276634216309, 'learning_rate': 3.3731543624161074e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24751/75000 [21:16<46:44, 17.91it/s]

{'loss': 0.3281, 'grad_norm': 12.794214248657227, 'learning_rate': 3.37248322147651e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24763/75000 [21:17<41:32, 20.16it/s]

{'loss': 0.3219, 'grad_norm': 6.0087127685546875, 'learning_rate': 3.371812080536913e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24773/75000 [21:17<49:40, 16.85it/s]

{'loss': 0.3898, 'grad_norm': 1.7438998222351074, 'learning_rate': 3.371140939597315e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24782/75000 [21:18<47:13, 17.72it/s]

{'loss': 0.3291, 'grad_norm': 3.136643886566162, 'learning_rate': 3.370469798657718e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24794/75000 [21:19<46:20, 18.06it/s]

{'loss': 0.2376, 'grad_norm': 1.7779896259307861, 'learning_rate': 3.369798657718121e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24802/75000 [21:19<45:32, 18.37it/s]

{'loss': 0.2892, 'grad_norm': 1.177158236503601, 'learning_rate': 3.369127516778524e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24814/75000 [21:20<46:37, 17.94it/s]

{'loss': 0.4485, 'grad_norm': 1.3927853107452393, 'learning_rate': 3.368456375838927e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24822/75000 [21:20<45:18, 18.46it/s]

{'loss': 0.2984, 'grad_norm': 1.8824564218521118, 'learning_rate': 3.367785234899329e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24833/75000 [21:21<45:31, 18.37it/s]

{'loss': 0.3693, 'grad_norm': 0.7312426567077637, 'learning_rate': 3.367114093959732e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24843/75000 [21:21<44:59, 18.58it/s]

{'loss': 0.395, 'grad_norm': 6.228332996368408, 'learning_rate': 3.366442953020134e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24853/75000 [21:22<43:41, 19.13it/s]

{'loss': 0.3105, 'grad_norm': 3.5125725269317627, 'learning_rate': 3.365771812080537e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24863/75000 [21:22<47:18, 17.67it/s]

{'loss': 0.4153, 'grad_norm': 2.31390643119812, 'learning_rate': 3.36510067114094e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24871/75000 [21:23<44:46, 18.66it/s]

{'loss': 0.259, 'grad_norm': 3.770001173019409, 'learning_rate': 3.3644295302013425e-05, 'epoch': 0.99}


                                                     
 33%|███▎      | 24883/75000 [21:24<46:34, 17.94it/s]

{'loss': 0.3834, 'grad_norm': 4.022974491119385, 'learning_rate': 3.363758389261745e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24891/75000 [21:24<44:48, 18.64it/s]

{'loss': 0.4512, 'grad_norm': 2.7361221313476562, 'learning_rate': 3.3630872483221475e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24903/75000 [21:25<48:06, 17.36it/s]

{'loss': 0.3134, 'grad_norm': 2.192650079727173, 'learning_rate': 3.3624161073825504e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24913/75000 [21:25<46:11, 18.07it/s]

{'loss': 0.354, 'grad_norm': 1.6227668523788452, 'learning_rate': 3.361744966442953e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24923/75000 [21:26<47:33, 17.55it/s]

{'loss': 0.4376, 'grad_norm': 4.831364154815674, 'learning_rate': 3.361073825503356e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24931/75000 [21:26<45:28, 18.35it/s]

{'loss': 0.3985, 'grad_norm': 8.444580078125, 'learning_rate': 3.360402684563759e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24941/75000 [21:27<50:22, 16.56it/s]

{'loss': 0.3356, 'grad_norm': 4.15702486038208, 'learning_rate': 3.359731543624161e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24951/75000 [21:27<46:43, 17.85it/s]

{'loss': 0.3659, 'grad_norm': 8.354841232299805, 'learning_rate': 3.359060402684564e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24963/75000 [21:28<46:03, 18.11it/s]

{'loss': 0.2497, 'grad_norm': 3.2640938758850098, 'learning_rate': 3.358389261744966e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24972/75000 [21:28<43:58, 18.96it/s]

{'loss': 0.2893, 'grad_norm': 1.3715202808380127, 'learning_rate': 3.357718120805369e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24983/75000 [21:29<48:09, 17.31it/s]

{'loss': 0.3424, 'grad_norm': 2.982422351837158, 'learning_rate': 3.3570469798657725e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 24993/75000 [21:30<47:08, 17.68it/s]

{'loss': 0.35, 'grad_norm': 1.6741150617599487, 'learning_rate': 3.356375838926175e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 25000/75000 [21:30<46:03, 18.09it/s]

{'loss': 0.3002, 'grad_norm': 3.2707135677337646, 'learning_rate': 3.3557046979865775e-05, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
 33%|███▎      | 25000/75000 [21:40<46:03, 18.09it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.33587247133255005, 'eval_runtime': 75.6604, 'eval_samples_per_second': 1321.695, 'eval_steps_per_second': 20.658, 'epoch': 1.0}


                                                         
 33%|███▎      | 25014/75000 [22:47<21:23:47,  1.54s/it]

{'loss': 0.4155, 'grad_norm': 0.4545600414276123, 'learning_rate': 3.35503355704698e-05, 'epoch': 1.0}


                                                        
 33%|███▎      | 25022/75000 [22:47<6:42:27,  2.07it/s]

{'loss': 0.1981, 'grad_norm': 1.6170146465301514, 'learning_rate': 3.3543624161073826e-05, 'epoch': 1.0}


                                                       
 33%|███▎      | 25031/75000 [22:48<2:10:14,  6.39it/s]

{'loss': 0.2989, 'grad_norm': 3.8283255100250244, 'learning_rate': 3.3536912751677854e-05, 'epoch': 1.0}


                                                       
 33%|███▎      | 25044/75000 [22:49<53:04, 15.69it/s]  

{'loss': 0.3034, 'grad_norm': 6.020969390869141, 'learning_rate': 3.353020134228188e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 25053/75000 [22:49<47:54, 17.38it/s]

{'loss': 0.3802, 'grad_norm': 3.4258363246917725, 'learning_rate': 3.352348993288591e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 25063/75000 [22:50<43:10, 19.27it/s]

{'loss': 0.3526, 'grad_norm': 7.440670013427734, 'learning_rate': 3.351677852348993e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 25074/75000 [22:50<41:23, 20.10it/s]

{'loss': 0.3153, 'grad_norm': 5.750247955322266, 'learning_rate': 3.351006711409396e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 25082/75000 [22:51<45:23, 18.33it/s]

{'loss': 0.2846, 'grad_norm': 6.913243770599365, 'learning_rate': 3.3503355704697983e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 25094/75000 [22:51<41:58, 19.81it/s]

{'loss': 0.2747, 'grad_norm': 1.6707026958465576, 'learning_rate': 3.349664429530202e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 25104/75000 [22:52<41:07, 20.22it/s]

{'loss': 0.3621, 'grad_norm': 5.475949764251709, 'learning_rate': 3.348993288590605e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 25113/75000 [22:52<41:38, 19.97it/s]

{'loss': 0.2787, 'grad_norm': 3.762699842453003, 'learning_rate': 3.348322147651007e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 25122/75000 [22:53<43:59, 18.89it/s]

{'loss': 0.3372, 'grad_norm': 5.211741924285889, 'learning_rate': 3.34765100671141e-05, 'epoch': 1.0}


                                                     
 34%|███▎      | 25132/75000 [22:53<42:16, 19.66it/s]

{'loss': 0.2748, 'grad_norm': 2.6597275733947754, 'learning_rate': 3.346979865771812e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25144/75000 [22:54<41:08, 20.20it/s]

{'loss': 0.3887, 'grad_norm': 6.037569999694824, 'learning_rate': 3.346308724832215e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25154/75000 [22:54<42:34, 19.51it/s]

{'loss': 0.1902, 'grad_norm': 1.7733736038208008, 'learning_rate': 3.3456375838926177e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25164/75000 [22:55<40:54, 20.30it/s]

{'loss': 0.3261, 'grad_norm': 4.272397041320801, 'learning_rate': 3.3449664429530205e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25172/75000 [22:55<42:17, 19.64it/s]

{'loss': 0.3367, 'grad_norm': 2.9419918060302734, 'learning_rate': 3.3442953020134234e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25184/75000 [22:56<42:54, 19.35it/s]

{'loss': 0.3863, 'grad_norm': 1.8931735754013062, 'learning_rate': 3.3436241610738255e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25194/75000 [22:56<41:45, 19.88it/s]

{'loss': 0.2763, 'grad_norm': 3.4906041622161865, 'learning_rate': 3.3429530201342284e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25202/75000 [22:57<46:35, 17.82it/s]

{'loss': 0.344, 'grad_norm': 10.14109992980957, 'learning_rate': 3.3422818791946306e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25213/75000 [22:57<45:10, 18.37it/s]

{'loss': 0.2892, 'grad_norm': 0.678592324256897, 'learning_rate': 3.341610738255034e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25222/75000 [22:58<43:25, 19.11it/s]

{'loss': 0.3499, 'grad_norm': 4.7040324211120605, 'learning_rate': 3.340939597315436e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25234/75000 [22:58<41:44, 19.87it/s]

{'loss': 0.2521, 'grad_norm': 4.993129730224609, 'learning_rate': 3.340268456375839e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25242/75000 [22:59<45:42, 18.15it/s]

{'loss': 0.3792, 'grad_norm': 2.734025001525879, 'learning_rate': 3.339597315436242e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25252/75000 [22:59<43:06, 19.23it/s]

{'loss': 0.2394, 'grad_norm': 2.8771495819091797, 'learning_rate': 3.338926174496644e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25262/75000 [23:00<42:25, 19.54it/s]

{'loss': 0.3664, 'grad_norm': 2.017756700515747, 'learning_rate': 3.338255033557047e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25274/75000 [23:01<41:57, 19.75it/s]

{'loss': 0.224, 'grad_norm': 3.6581614017486572, 'learning_rate': 3.33758389261745e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25282/75000 [23:01<46:22, 17.87it/s]

{'loss': 0.3818, 'grad_norm': 8.292593002319336, 'learning_rate': 3.336912751677853e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25292/75000 [23:02<44:18, 18.70it/s]

{'loss': 0.2351, 'grad_norm': 7.248081207275391, 'learning_rate': 3.336241610738255e-05, 'epoch': 1.01}


                                                     
 34%|███▎      | 25303/75000 [23:02<46:32, 17.80it/s]

{'loss': 0.4521, 'grad_norm': 29.886022567749023, 'learning_rate': 3.335570469798658e-05, 'epoch': 1.01}


                                                     
 34%|███▍      | 25313/75000 [23:03<46:18, 17.88it/s]

{'loss': 0.2852, 'grad_norm': 4.720621109008789, 'learning_rate': 3.3348993288590606e-05, 'epoch': 1.01}


                                                     
 34%|███▍      | 25322/75000 [23:03<46:39, 17.74it/s]

{'loss': 0.2823, 'grad_norm': 2.802905797958374, 'learning_rate': 3.334228187919463e-05, 'epoch': 1.01}


                                                     
 34%|███▍      | 25331/75000 [23:04<45:09, 18.33it/s]

{'loss': 0.2949, 'grad_norm': 2.4394497871398926, 'learning_rate': 3.333557046979866e-05, 'epoch': 1.01}


                                                     
 34%|███▍      | 25342/75000 [23:04<49:05, 16.86it/s]

{'loss': 0.1816, 'grad_norm': 5.228691101074219, 'learning_rate': 3.3328859060402685e-05, 'epoch': 1.01}


                                                     
 34%|███▍      | 25353/75000 [23:05<44:29, 18.60it/s]

{'loss': 0.4672, 'grad_norm': 3.280620813369751, 'learning_rate': 3.3322147651006714e-05, 'epoch': 1.01}


                                                     
 34%|███▍      | 25361/75000 [23:05<46:46, 17.69it/s]

{'loss': 0.1718, 'grad_norm': 11.89080810546875, 'learning_rate': 3.331543624161074e-05, 'epoch': 1.01}


                                                     
 34%|███▍      | 25373/75000 [23:06<43:10, 19.16it/s]

{'loss': 0.3396, 'grad_norm': 2.4993741512298584, 'learning_rate': 3.3308724832214764e-05, 'epoch': 1.01}


                                                     
 34%|███▍      | 25383/75000 [23:07<46:46, 17.68it/s]

{'loss': 0.3997, 'grad_norm': 9.545867919921875, 'learning_rate': 3.330201342281879e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25393/75000 [23:07<44:59, 18.37it/s]

{'loss': 0.3593, 'grad_norm': 2.9622199535369873, 'learning_rate': 3.329530201342282e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25401/75000 [23:08<47:44, 17.32it/s]

{'loss': 0.2382, 'grad_norm': 3.204116106033325, 'learning_rate': 3.328859060402685e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25413/75000 [23:08<43:56, 18.81it/s]

{'loss': 0.2775, 'grad_norm': 3.5924298763275146, 'learning_rate': 3.328187919463087e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25423/75000 [23:09<48:25, 17.06it/s]

{'loss': 0.2576, 'grad_norm': 3.905472993850708, 'learning_rate': 3.32751677852349e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25432/75000 [23:09<46:59, 17.58it/s]

{'loss': 0.3033, 'grad_norm': 1.8626753091812134, 'learning_rate': 3.326845637583893e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25442/75000 [23:10<49:22, 16.73it/s]

{'loss': 0.3919, 'grad_norm': 1.5793356895446777, 'learning_rate': 3.326174496644296e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25452/75000 [23:11<46:21, 17.82it/s]

{'loss': 0.2534, 'grad_norm': 0.4839254915714264, 'learning_rate': 3.3255033557046986e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25463/75000 [23:11<45:31, 18.14it/s]

{'loss': 0.2974, 'grad_norm': 1.4758470058441162, 'learning_rate': 3.324832214765101e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25473/75000 [23:12<43:50, 18.83it/s]

{'loss': 0.2975, 'grad_norm': 37.702274322509766, 'learning_rate': 3.3241610738255036e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25483/75000 [23:12<51:21, 16.07it/s]

{'loss': 0.4026, 'grad_norm': 1.66439688205719, 'learning_rate': 3.323489932885906e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25491/75000 [23:13<48:40, 16.95it/s]

{'loss': 0.2862, 'grad_norm': 2.5414962768554688, 'learning_rate': 3.3228187919463086e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25500/75000 [23:13<54:36, 15.11it/s]

{'loss': 0.4208, 'grad_norm': 5.438117504119873, 'learning_rate': 3.3221476510067115e-05, 'epoch': 1.02}


                                                       
 34%|███▍      | 25514/75000 [23:15<53:51, 15.31it/s]  

{'loss': 0.2702, 'grad_norm': 2.9258174896240234, 'learning_rate': 3.321476510067114e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25522/75000 [23:15<50:11, 16.43it/s]

{'loss': 0.4657, 'grad_norm': 2.085108518600464, 'learning_rate': 3.320805369127517e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25534/75000 [23:16<43:23, 19.00it/s]

{'loss': 0.4249, 'grad_norm': 1.8117705583572388, 'learning_rate': 3.3201342281879194e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25542/75000 [23:16<47:58, 17.18it/s]

{'loss': 0.3245, 'grad_norm': 6.873403549194336, 'learning_rate': 3.319463087248322e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25554/75000 [23:17<42:52, 19.22it/s]

{'loss': 0.3234, 'grad_norm': 1.874563455581665, 'learning_rate': 3.318791946308725e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25562/75000 [23:17<46:58, 17.54it/s]

{'loss': 0.2859, 'grad_norm': 5.289458751678467, 'learning_rate': 3.318120805369128e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25574/75000 [23:18<41:20, 19.92it/s]

{'loss': 0.2925, 'grad_norm': 6.203437805175781, 'learning_rate': 3.317449664429531e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25583/75000 [23:19<43:40, 18.86it/s]

{'loss': 0.3087, 'grad_norm': 1.1552895307540894, 'learning_rate': 3.316778523489933e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25591/75000 [23:19<42:46, 19.25it/s]

{'loss': 0.3455, 'grad_norm': 4.715610504150391, 'learning_rate': 3.316107382550336e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25601/75000 [23:20<45:02, 18.28it/s]

{'loss': 0.3357, 'grad_norm': 5.8357157707214355, 'learning_rate': 3.315436241610738e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25612/75000 [23:20<41:22, 19.90it/s]

{'loss': 0.4401, 'grad_norm': 3.3899149894714355, 'learning_rate': 3.314765100671141e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25624/75000 [23:21<40:19, 20.41it/s]

{'loss': 0.3074, 'grad_norm': 4.361510276794434, 'learning_rate': 3.314093959731544e-05, 'epoch': 1.02}


                                                     
 34%|███▍      | 25633/75000 [23:21<41:55, 19.63it/s]

{'loss': 0.3008, 'grad_norm': 5.38762092590332, 'learning_rate': 3.3134228187919465e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25643/75000 [23:22<43:18, 18.99it/s]

{'loss': 0.3227, 'grad_norm': 1.0394715070724487, 'learning_rate': 3.3127516778523494e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25654/75000 [23:22<40:53, 20.11it/s]

{'loss': 0.43, 'grad_norm': 7.040756702423096, 'learning_rate': 3.3120805369127516e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25662/75000 [23:23<44:18, 18.56it/s]

{'loss': 0.3046, 'grad_norm': 3.0544586181640625, 'learning_rate': 3.3114093959731544e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25672/75000 [23:23<42:04, 19.54it/s]

{'loss': 0.4739, 'grad_norm': 4.316178321838379, 'learning_rate': 3.3107382550335566e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25682/75000 [23:24<46:00, 17.86it/s]

{'loss': 0.2766, 'grad_norm': 1.8904348611831665, 'learning_rate': 3.31006711409396e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25694/75000 [23:24<41:42, 19.70it/s]

{'loss': 0.3386, 'grad_norm': 0.849003255367279, 'learning_rate': 3.309395973154363e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25702/75000 [23:25<46:30, 17.67it/s]

{'loss': 0.2752, 'grad_norm': 2.0578370094299316, 'learning_rate': 3.308724832214765e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25714/75000 [23:25<41:50, 19.64it/s]

{'loss': 0.2257, 'grad_norm': 1.9989672899246216, 'learning_rate': 3.308053691275168e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25724/75000 [23:26<42:46, 19.20it/s]

{'loss': 0.3216, 'grad_norm': 5.4528021812438965, 'learning_rate': 3.30738255033557e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25734/75000 [23:26<42:57, 19.11it/s]

{'loss': 0.2686, 'grad_norm': 3.3335859775543213, 'learning_rate': 3.306711409395973e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25742/75000 [23:27<44:30, 18.44it/s]

{'loss': 0.2422, 'grad_norm': 5.731210708618164, 'learning_rate': 3.3060402684563766e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25753/75000 [23:28<43:34, 18.84it/s]

{'loss': 0.2666, 'grad_norm': 2.3313138484954834, 'learning_rate': 3.305369127516779e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25764/75000 [23:28<40:55, 20.05it/s]

{'loss': 0.2954, 'grad_norm': 3.9884536266326904, 'learning_rate': 3.3046979865771816e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25772/75000 [23:29<44:55, 18.26it/s]

{'loss': 0.3569, 'grad_norm': 3.9018001556396484, 'learning_rate': 3.304026845637584e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25784/75000 [23:29<40:46, 20.12it/s]

{'loss': 0.3196, 'grad_norm': 4.052947998046875, 'learning_rate': 3.3033557046979867e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25794/75000 [23:30<42:57, 19.09it/s]

{'loss': 0.3042, 'grad_norm': 4.715742588043213, 'learning_rate': 3.3026845637583895e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25804/75000 [23:30<41:50, 19.60it/s]

{'loss': 0.2874, 'grad_norm': 7.518764972686768, 'learning_rate': 3.3020134228187924e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25813/75000 [23:31<42:31, 19.27it/s]

{'loss': 0.3911, 'grad_norm': 10.83942699432373, 'learning_rate': 3.301342281879195e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25823/75000 [23:31<40:54, 20.04it/s]

{'loss': 0.2887, 'grad_norm': 3.6222827434539795, 'learning_rate': 3.3006711409395974e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25834/75000 [23:32<41:37, 19.68it/s]

{'loss': 0.2565, 'grad_norm': 3.2623653411865234, 'learning_rate': 3.3e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25842/75000 [23:32<42:19, 19.36it/s]

{'loss': 0.3057, 'grad_norm': 14.412848472595215, 'learning_rate': 3.2993288590604024e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25854/75000 [23:33<43:56, 18.64it/s]

{'loss': 0.3272, 'grad_norm': 35.03278732299805, 'learning_rate': 3.298657718120805e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25864/75000 [23:33<42:15, 19.38it/s]

{'loss': 0.3498, 'grad_norm': 10.09533405303955, 'learning_rate': 3.297986577181208e-05, 'epoch': 1.03}


                                                     
 34%|███▍      | 25872/75000 [23:34<44:30, 18.40it/s]

{'loss': 0.2883, 'grad_norm': 1.8159033060073853, 'learning_rate': 3.297315436241611e-05, 'epoch': 1.03}


                                                     
 35%|███▍      | 25883/75000 [23:34<42:52, 19.09it/s]

{'loss': 0.3351, 'grad_norm': 8.144397735595703, 'learning_rate': 3.296644295302014e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25892/75000 [23:35<42:39, 19.19it/s]

{'loss': 0.2426, 'grad_norm': 2.6702911853790283, 'learning_rate': 3.295973154362416e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25904/75000 [23:35<42:58, 19.04it/s]

{'loss': 0.21, 'grad_norm': 1.6796132326126099, 'learning_rate': 3.295302013422819e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25911/75000 [23:36<44:16, 18.48it/s]

{'loss': 0.3981, 'grad_norm': 3.4250481128692627, 'learning_rate': 3.294630872483222e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25922/75000 [23:37<48:42, 16.79it/s]

{'loss': 0.2542, 'grad_norm': 2.4078826904296875, 'learning_rate': 3.2939597315436246e-05, 'epoch': 1.04}


                                                       
 35%|███▍      | 25932/75000 [23:37<59:21, 13.78it/s]

{'loss': 0.2951, 'grad_norm': 1.4624342918395996, 'learning_rate': 3.2932885906040274e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25944/75000 [23:38<45:21, 18.02it/s]

{'loss': 0.2599, 'grad_norm': 2.88420033454895, 'learning_rate': 3.2926174496644296e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25953/75000 [23:38<48:43, 16.78it/s]

{'loss': 0.2082, 'grad_norm': 3.0414514541625977, 'learning_rate': 3.2919463087248325e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25964/75000 [23:39<42:42, 19.14it/s]

{'loss': 0.3655, 'grad_norm': 7.774754047393799, 'learning_rate': 3.2912751677852347e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25972/75000 [23:40<49:19, 16.57it/s]

{'loss': 0.3054, 'grad_norm': 3.271427869796753, 'learning_rate': 3.2906040268456375e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25983/75000 [23:40<42:00, 19.45it/s]

{'loss': 0.3716, 'grad_norm': 4.168834209442139, 'learning_rate': 3.2899328859060404e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 25994/75000 [23:41<42:36, 19.17it/s]

{'loss': 0.2822, 'grad_norm': 4.843328952789307, 'learning_rate': 3.289261744966443e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26000/75000 [23:41<41:49, 19.52it/s]

{'loss': 0.3877, 'grad_norm': 5.234481334686279, 'learning_rate': 3.288590604026846e-05, 'epoch': 1.04}


                                                       
 35%|███▍      | 26013/75000 [23:42<52:41, 15.50it/s]  

{'loss': 0.2946, 'grad_norm': 1.195618987083435, 'learning_rate': 3.287919463087248e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26021/75000 [23:43<49:07, 16.62it/s]

{'loss': 0.2199, 'grad_norm': 4.150463104248047, 'learning_rate': 3.287248322147651e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26033/75000 [23:43<46:03, 17.72it/s]

{'loss': 0.1982, 'grad_norm': 5.231598377227783, 'learning_rate': 3.286577181208054e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26041/75000 [23:44<54:58, 14.84it/s]

{'loss': 0.4586, 'grad_norm': 5.384536266326904, 'learning_rate': 3.285906040268457e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26052/75000 [23:44<45:15, 18.03it/s]

{'loss': 0.4124, 'grad_norm': 2.3137505054473877, 'learning_rate': 3.285234899328859e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26064/75000 [23:45<40:53, 19.95it/s]

{'loss': 0.4608, 'grad_norm': 2.313209056854248, 'learning_rate': 3.284563758389262e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26073/75000 [23:46<44:39, 18.26it/s]

{'loss': 0.3292, 'grad_norm': 4.753259658813477, 'learning_rate': 3.283892617449665e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26082/75000 [23:46<43:26, 18.77it/s]

{'loss': 0.2948, 'grad_norm': 3.703011989593506, 'learning_rate': 3.283221476510067e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26094/75000 [23:47<43:10, 18.88it/s]

{'loss': 0.431, 'grad_norm': 9.399909973144531, 'learning_rate': 3.2825503355704704e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26102/75000 [23:47<47:37, 17.11it/s]

{'loss': 0.3468, 'grad_norm': 4.436871528625488, 'learning_rate': 3.2818791946308726e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26114/75000 [23:48<42:09, 19.33it/s]

{'loss': 0.4289, 'grad_norm': 4.57253360748291, 'learning_rate': 3.2812080536912754e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26124/75000 [23:48<42:56, 18.97it/s]

{'loss': 0.2562, 'grad_norm': 2.9531972408294678, 'learning_rate': 3.2805369127516776e-05, 'epoch': 1.04}


                                                     
 35%|███▍      | 26134/75000 [23:49<41:52, 19.45it/s]

{'loss': 0.491, 'grad_norm': 5.157522201538086, 'learning_rate': 3.2798657718120805e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26142/75000 [23:49<44:26, 18.32it/s]

{'loss': 0.212, 'grad_norm': 4.979520320892334, 'learning_rate': 3.279194630872483e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26153/75000 [23:50<42:54, 18.97it/s]

{'loss': 0.4079, 'grad_norm': 4.061404228210449, 'learning_rate': 3.278523489932886e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26162/75000 [23:50<46:24, 17.54it/s]

{'loss': 0.3079, 'grad_norm': 5.44260835647583, 'learning_rate': 3.277852348993289e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26174/75000 [23:51<40:43, 19.98it/s]

{'loss': 0.2412, 'grad_norm': 2.8724865913391113, 'learning_rate': 3.277181208053691e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26182/75000 [23:52<51:06, 15.92it/s]

{'loss': 0.2321, 'grad_norm': 4.048152923583984, 'learning_rate': 3.276510067114094e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26192/75000 [23:52<46:52, 17.35it/s]

{'loss': 0.5148, 'grad_norm': 2.633713722229004, 'learning_rate': 3.275838926174497e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26202/75000 [23:53<50:54, 15.98it/s]

{'loss': 0.1906, 'grad_norm': 8.458895683288574, 'learning_rate': 3.275167785234899e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26214/75000 [23:53<41:42, 19.50it/s]

{'loss': 0.4131, 'grad_norm': 4.6488728523254395, 'learning_rate': 3.2744966442953026e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26222/75000 [23:54<44:14, 18.38it/s]

{'loss': 0.332, 'grad_norm': 2.903610944747925, 'learning_rate': 3.273825503355705e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26234/75000 [23:54<40:59, 19.82it/s]

{'loss': 0.3719, 'grad_norm': 25.530452728271484, 'learning_rate': 3.273154362416108e-05, 'epoch': 1.05}


                                                     
 35%|███▍      | 26244/75000 [23:55<42:58, 18.91it/s]

{'loss': 0.4486, 'grad_norm': 2.7301015853881836, 'learning_rate': 3.27248322147651e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26254/75000 [23:55<40:46, 19.93it/s]

{'loss': 0.4191, 'grad_norm': 14.387514114379883, 'learning_rate': 3.271812080536913e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26262/75000 [23:56<43:27, 18.69it/s]

{'loss': 0.3266, 'grad_norm': 2.074258804321289, 'learning_rate': 3.2711409395973156e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26272/75000 [23:56<41:57, 19.35it/s]

{'loss': 0.2787, 'grad_norm': 2.1442482471466064, 'learning_rate': 3.2704697986577184e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26281/75000 [23:57<44:04, 18.42it/s]

{'loss': 0.3046, 'grad_norm': 2.452254056930542, 'learning_rate': 3.269798657718121e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26292/75000 [23:58<44:26, 18.26it/s]

{'loss': 0.2808, 'grad_norm': 1.549651026725769, 'learning_rate': 3.2691275167785234e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26303/75000 [23:58<48:23, 16.77it/s]

{'loss': 0.4277, 'grad_norm': 4.221560478210449, 'learning_rate': 3.268456375838926e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26313/75000 [23:59<43:44, 18.55it/s]

{'loss': 0.3256, 'grad_norm': 0.7138247489929199, 'learning_rate': 3.2677852348993285e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26323/75000 [23:59<45:15, 17.93it/s]

{'loss': 0.3201, 'grad_norm': 1.0559773445129395, 'learning_rate': 3.267114093959732e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26332/75000 [24:00<43:25, 18.68it/s]

{'loss': 0.2764, 'grad_norm': 4.290213584899902, 'learning_rate': 3.266442953020135e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26342/75000 [24:00<45:40, 17.76it/s]

{'loss': 0.3405, 'grad_norm': 5.510313034057617, 'learning_rate': 3.265771812080537e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26353/75000 [24:01<45:13, 17.93it/s]

{'loss': 0.3567, 'grad_norm': 1.4929317235946655, 'learning_rate': 3.26510067114094e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26363/75000 [24:02<48:05, 16.86it/s]

{'loss': 0.3762, 'grad_norm': 5.90070104598999, 'learning_rate': 3.264429530201342e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26371/75000 [24:02<47:42, 16.99it/s]

{'loss': 0.2075, 'grad_norm': 17.526805877685547, 'learning_rate': 3.263758389261745e-05, 'epoch': 1.05}


                                                     
 35%|███▌      | 26382/75000 [24:03<47:04, 17.22it/s]

{'loss': 0.2669, 'grad_norm': 3.883221387863159, 'learning_rate': 3.263087248322148e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26392/75000 [24:03<45:02, 17.98it/s]

{'loss': 0.3953, 'grad_norm': 5.89410400390625, 'learning_rate': 3.2624161073825506e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26402/75000 [24:04<56:00, 14.46it/s]

{'loss': 0.3218, 'grad_norm': 2.925752878189087, 'learning_rate': 3.2617449664429535e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26412/75000 [24:04<47:49, 16.93it/s]

{'loss': 0.3377, 'grad_norm': 2.4948208332061768, 'learning_rate': 3.261073825503356e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26422/75000 [24:05<44:05, 18.36it/s]

{'loss': 0.2852, 'grad_norm': 3.2456586360931396, 'learning_rate': 3.2604026845637585e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26433/75000 [24:06<48:17, 16.76it/s]

{'loss': 0.305, 'grad_norm': 7.186266899108887, 'learning_rate': 3.259731543624161e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26442/75000 [24:06<45:25, 17.81it/s]

{'loss': 0.2602, 'grad_norm': 4.4604692459106445, 'learning_rate': 3.259060402684564e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26454/75000 [24:07<41:53, 19.31it/s]

{'loss': 0.3808, 'grad_norm': 27.03129005432129, 'learning_rate': 3.258389261744967e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26462/75000 [24:07<45:03, 17.95it/s]

{'loss': 0.3796, 'grad_norm': 1.8581998348236084, 'learning_rate': 3.257718120805369e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26472/75000 [24:08<42:28, 19.04it/s]

{'loss': 0.2813, 'grad_norm': 2.4820666313171387, 'learning_rate': 3.257046979865772e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26482/75000 [24:08<43:18, 18.67it/s]

{'loss': 0.3714, 'grad_norm': 2.191394090652466, 'learning_rate': 3.256375838926174e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26493/75000 [24:09<43:56, 18.40it/s]

{'loss': 0.3171, 'grad_norm': 3.954768180847168, 'learning_rate': 3.255704697986577e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26500/75000 [24:09<43:43, 18.48it/s]

{'loss': 0.1921, 'grad_norm': 0.8498392701148987, 'learning_rate': 3.25503355704698e-05, 'epoch': 1.06}


                                                       
 35%|███▌      | 26512/75000 [24:10<51:37, 15.65it/s]

{'loss': 0.2378, 'grad_norm': 0.4957899749279022, 'learning_rate': 3.254362416107383e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26521/75000 [24:11<48:51, 16.54it/s]

{'loss': 0.4104, 'grad_norm': 1.8779499530792236, 'learning_rate': 3.253691275167786e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26533/75000 [24:12<43:28, 18.58it/s]

{'loss': 0.2565, 'grad_norm': 1.7877053022384644, 'learning_rate': 3.253020134228188e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26541/75000 [24:12<47:03, 17.17it/s]

{'loss': 0.3767, 'grad_norm': 18.884681701660156, 'learning_rate': 3.252348993288591e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26551/75000 [24:13<42:59, 18.78it/s]

{'loss': 0.3408, 'grad_norm': 0.8316171765327454, 'learning_rate': 3.251677852348993e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26562/75000 [24:13<42:48, 18.86it/s]

{'loss': 0.4009, 'grad_norm': 2.266009569168091, 'learning_rate': 3.2510067114093964e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26572/75000 [24:14<44:24, 18.17it/s]

{'loss': 0.3612, 'grad_norm': 0.7736634612083435, 'learning_rate': 3.250335570469799e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26582/75000 [24:14<42:38, 18.93it/s]

{'loss': 0.377, 'grad_norm': 2.327441930770874, 'learning_rate': 3.2496644295302015e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26592/75000 [24:15<43:59, 18.34it/s]

{'loss': 0.2364, 'grad_norm': 1.714216709136963, 'learning_rate': 3.248993288590604e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26604/75000 [24:15<42:32, 18.96it/s]

{'loss': 0.3181, 'grad_norm': 27.814138412475586, 'learning_rate': 3.2483221476510065e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26612/75000 [24:16<45:21, 17.78it/s]

{'loss': 0.4024, 'grad_norm': 7.988437652587891, 'learning_rate': 3.2476510067114094e-05, 'epoch': 1.06}


                                                     
 35%|███▌      | 26623/75000 [24:16<42:56, 18.78it/s]

{'loss': 0.4481, 'grad_norm': 2.116116762161255, 'learning_rate': 3.246979865771812e-05, 'epoch': 1.06}


                                                     
 36%|███▌      | 26633/75000 [24:17<43:10, 18.67it/s]

{'loss': 0.443, 'grad_norm': 2.5197410583496094, 'learning_rate': 3.246308724832215e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26644/75000 [24:18<40:26, 19.93it/s]

{'loss': 0.3296, 'grad_norm': 9.057193756103516, 'learning_rate': 3.245637583892618e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26652/75000 [24:18<42:35, 18.92it/s]

{'loss': 0.3287, 'grad_norm': 15.677165031433105, 'learning_rate': 3.24496644295302e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26662/75000 [24:19<41:02, 19.63it/s]

{'loss': 0.2764, 'grad_norm': 1.1815930604934692, 'learning_rate': 3.244295302013423e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26671/75000 [24:19<41:39, 19.34it/s]

{'loss': 0.3342, 'grad_norm': 3.4522812366485596, 'learning_rate': 3.243624161073826e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26684/75000 [24:20<40:26, 19.91it/s]

{'loss': 0.2335, 'grad_norm': 0.8812718391418457, 'learning_rate': 3.242953020134229e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26692/75000 [24:20<43:51, 18.36it/s]

{'loss': 0.2276, 'grad_norm': 5.58888578414917, 'learning_rate': 3.242281879194631e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26702/75000 [24:21<42:16, 19.04it/s]

{'loss': 0.4495, 'grad_norm': 4.297992706298828, 'learning_rate': 3.241610738255034e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26714/75000 [24:21<40:25, 19.91it/s]

{'loss': 0.4518, 'grad_norm': 2.471961259841919, 'learning_rate': 3.2409395973154366e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26722/75000 [24:22<44:22, 18.13it/s]

{'loss': 0.2382, 'grad_norm': 3.5018715858459473, 'learning_rate': 3.240268456375839e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26732/75000 [24:22<42:25, 18.96it/s]

{'loss': 0.273, 'grad_norm': 2.4011971950531006, 'learning_rate': 3.2395973154362416e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26744/75000 [24:23<39:58, 20.12it/s]

{'loss': 0.3887, 'grad_norm': 4.926825523376465, 'learning_rate': 3.2389261744966444e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26752/75000 [24:23<42:06, 19.09it/s]

{'loss': 0.3062, 'grad_norm': 1.5133886337280273, 'learning_rate': 3.238255033557047e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26764/75000 [24:24<39:34, 20.32it/s]

{'loss': 0.396, 'grad_norm': 5.6492791175842285, 'learning_rate': 3.2375838926174495e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26774/75000 [24:24<40:38, 19.78it/s]

{'loss': 0.4166, 'grad_norm': 3.4124255180358887, 'learning_rate': 3.236912751677852e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26783/75000 [24:25<42:24, 18.95it/s]

{'loss': 0.2987, 'grad_norm': 2.008350372314453, 'learning_rate': 3.236241610738255e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26791/75000 [24:25<41:29, 19.37it/s]

{'loss': 0.3305, 'grad_norm': 2.745884895324707, 'learning_rate': 3.235570469798658e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26801/75000 [24:26<42:30, 18.90it/s]

{'loss': 0.3743, 'grad_norm': 8.160768508911133, 'learning_rate': 3.234899328859061e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26814/75000 [24:26<40:14, 19.95it/s]

{'loss': 0.3346, 'grad_norm': 0.6075690984725952, 'learning_rate': 3.234228187919463e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26822/75000 [24:27<43:24, 18.50it/s]

{'loss': 0.3858, 'grad_norm': 5.264593124389648, 'learning_rate': 3.233557046979866e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26834/75000 [24:27<40:18, 19.91it/s]

{'loss': 0.3741, 'grad_norm': 8.202106475830078, 'learning_rate': 3.232885906040269e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26842/75000 [24:28<43:44, 18.35it/s]

{'loss': 0.2645, 'grad_norm': 3.2236502170562744, 'learning_rate': 3.232214765100671e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26852/75000 [24:28<41:20, 19.41it/s]

{'loss': 0.4026, 'grad_norm': 5.726068019866943, 'learning_rate': 3.231543624161074e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26864/75000 [24:29<42:47, 18.75it/s]

{'loss': 0.3102, 'grad_norm': 1.7430996894836426, 'learning_rate': 3.230872483221477e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26873/75000 [24:29<40:44, 19.69it/s]

{'loss': 0.333, 'grad_norm': 1.7340350151062012, 'learning_rate': 3.2302013422818795e-05, 'epoch': 1.07}


                                                     
 36%|███▌      | 26882/75000 [24:30<44:53, 17.86it/s]

{'loss': 0.4296, 'grad_norm': 1.7564235925674438, 'learning_rate': 3.229530201342282e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26894/75000 [24:31<41:06, 19.50it/s]

{'loss': 0.3474, 'grad_norm': 8.481005668640137, 'learning_rate': 3.2288590604026846e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26904/75000 [24:31<41:57, 19.11it/s]

{'loss': 0.3768, 'grad_norm': 5.984654426574707, 'learning_rate': 3.2281879194630874e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26913/75000 [24:32<40:54, 19.59it/s]

{'loss': 0.3008, 'grad_norm': 1.8073596954345703, 'learning_rate': 3.22751677852349e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26922/75000 [24:32<44:50, 17.87it/s]

{'loss': 0.2927, 'grad_norm': 3.5850417613983154, 'learning_rate': 3.226845637583893e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26932/75000 [24:33<41:22, 19.37it/s]

{'loss': 0.301, 'grad_norm': 0.8948880434036255, 'learning_rate': 3.226174496644295e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26944/75000 [24:33<42:41, 18.76it/s]

{'loss': 0.3302, 'grad_norm': 5.399186134338379, 'learning_rate': 3.225503355704698e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26952/75000 [24:34<42:18, 18.93it/s]

{'loss': 0.2633, 'grad_norm': 7.229637622833252, 'learning_rate': 3.2248322147651e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26964/75000 [24:34<40:29, 19.77it/s]

{'loss': 0.4419, 'grad_norm': 0.9513896107673645, 'learning_rate': 3.224161073825503e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26974/75000 [24:35<40:46, 19.63it/s]

{'loss': 0.3231, 'grad_norm': 6.211371898651123, 'learning_rate': 3.223489932885907e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26982/75000 [24:35<41:24, 19.33it/s]

{'loss': 0.4221, 'grad_norm': 7.197553634643555, 'learning_rate': 3.222818791946309e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 26994/75000 [24:36<40:27, 19.78it/s]

{'loss': 0.2923, 'grad_norm': 1.252224326133728, 'learning_rate': 3.222147651006712e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27000/75000 [24:36<40:44, 19.64it/s]

{'loss': 0.32, 'grad_norm': 3.5443520545959473, 'learning_rate': 3.221476510067114e-05, 'epoch': 1.08}


                                                       
 36%|███▌      | 27011/75000 [24:37<51:46, 15.45it/s]

{'loss': 0.411, 'grad_norm': 4.916579723358154, 'learning_rate': 3.220805369127517e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27022/75000 [24:38<41:59, 19.04it/s]

{'loss': 0.2838, 'grad_norm': 12.033246994018555, 'learning_rate': 3.2201342281879196e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27034/75000 [24:38<41:44, 19.15it/s]

{'loss': 0.258, 'grad_norm': 3.3114123344421387, 'learning_rate': 3.2194630872483225e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27044/75000 [24:39<39:34, 20.20it/s]

{'loss': 0.3213, 'grad_norm': 16.692684173583984, 'learning_rate': 3.2187919463087253e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27054/75000 [24:39<40:44, 19.61it/s]

{'loss': 0.1591, 'grad_norm': 1.1671687364578247, 'learning_rate': 3.2181208053691275e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27063/75000 [24:40<40:05, 19.92it/s]

{'loss': 0.3241, 'grad_norm': 3.299769163131714, 'learning_rate': 3.2174496644295304e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27074/75000 [24:40<41:40, 19.17it/s]

{'loss': 0.2951, 'grad_norm': 6.877472400665283, 'learning_rate': 3.2167785234899326e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27082/75000 [24:41<39:59, 19.97it/s]

{'loss': 0.23, 'grad_norm': 2.6488559246063232, 'learning_rate': 3.2161073825503354e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27092/75000 [24:41<42:03, 18.98it/s]

{'loss': 0.262, 'grad_norm': 3.0529873371124268, 'learning_rate': 3.215436241610739e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27102/75000 [24:42<40:09, 19.88it/s]

{'loss': 0.3729, 'grad_norm': 10.797210693359375, 'learning_rate': 3.214765100671141e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27112/75000 [24:42<44:12, 18.06it/s]

{'loss': 0.4549, 'grad_norm': 0.49451714754104614, 'learning_rate': 3.214093959731544e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27124/75000 [24:43<40:03, 19.92it/s]

{'loss': 0.3007, 'grad_norm': 6.336402893066406, 'learning_rate': 3.213422818791946e-05, 'epoch': 1.08}


                                                     
 36%|███▌      | 27134/75000 [24:44<41:24, 19.26it/s]

{'loss': 0.3561, 'grad_norm': 7.482120990753174, 'learning_rate': 3.212751677852349e-05, 'epoch': 1.09}


                                                     
 36%|███▌      | 27144/75000 [24:44<41:06, 19.40it/s]

{'loss': 0.4336, 'grad_norm': 12.650391578674316, 'learning_rate': 3.212080536912752e-05, 'epoch': 1.09}


                                                     
 36%|███▌      | 27152/75000 [24:45<41:28, 19.23it/s]

{'loss': 0.3276, 'grad_norm': 1.1712861061096191, 'learning_rate': 3.211409395973155e-05, 'epoch': 1.09}


                                                     
 36%|███▌      | 27163/75000 [24:45<39:27, 20.20it/s]

{'loss': 0.3102, 'grad_norm': 5.173747539520264, 'learning_rate': 3.2107382550335576e-05, 'epoch': 1.09}


                                                     
 36%|███▌      | 27173/75000 [24:46<41:21, 19.28it/s]

{'loss': 0.265, 'grad_norm': 8.954228401184082, 'learning_rate': 3.21006711409396e-05, 'epoch': 1.09}


                                                     
 36%|███▌      | 27183/75000 [24:46<40:06, 19.87it/s]

{'loss': 0.3322, 'grad_norm': 6.272851467132568, 'learning_rate': 3.2093959731543626e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27194/75000 [24:47<39:56, 19.95it/s]

{'loss': 0.2122, 'grad_norm': 9.558237075805664, 'learning_rate': 3.208724832214765e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27201/75000 [24:47<40:17, 19.77it/s]

{'loss': 0.4104, 'grad_norm': 0.577146589756012, 'learning_rate': 3.208053691275168e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27214/75000 [24:48<40:55, 19.46it/s]

{'loss': 0.4009, 'grad_norm': 1.3290036916732788, 'learning_rate': 3.207382550335571e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27224/75000 [24:48<40:07, 19.84it/s]

{'loss': 0.3167, 'grad_norm': 3.587543487548828, 'learning_rate': 3.2067114093959733e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27234/75000 [24:49<42:14, 18.85it/s]

{'loss': 0.2816, 'grad_norm': 6.812072277069092, 'learning_rate': 3.206040268456376e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27242/75000 [24:49<41:22, 19.24it/s]

{'loss': 0.2568, 'grad_norm': 2.1916074752807617, 'learning_rate': 3.2053691275167784e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27254/75000 [24:50<40:53, 19.46it/s]

{'loss': 0.2153, 'grad_norm': 5.442018985748291, 'learning_rate': 3.204697986577181e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27263/75000 [24:50<41:13, 19.30it/s]

{'loss': 0.344, 'grad_norm': 4.842950344085693, 'learning_rate': 3.204026845637584e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27272/75000 [24:51<40:16, 19.75it/s]

{'loss': 0.3234, 'grad_norm': 5.400562763214111, 'learning_rate': 3.203355704697987e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27282/75000 [24:51<41:12, 19.30it/s]

{'loss': 0.2757, 'grad_norm': 0.7513014674186707, 'learning_rate': 3.20268456375839e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27294/75000 [24:52<39:56, 19.90it/s]

{'loss': 0.3323, 'grad_norm': 2.8315539360046387, 'learning_rate': 3.202013422818792e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27303/75000 [24:52<42:39, 18.63it/s]

{'loss': 0.1902, 'grad_norm': 2.6872522830963135, 'learning_rate': 3.201342281879195e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27314/75000 [24:53<39:56, 19.90it/s]

{'loss': 0.3145, 'grad_norm': 10.522391319274902, 'learning_rate': 3.200671140939597e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27323/75000 [24:53<42:45, 18.59it/s]

{'loss': 0.2844, 'grad_norm': 5.364891052246094, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27334/75000 [24:54<39:26, 20.14it/s]

{'loss': 0.2997, 'grad_norm': 1.3484914302825928, 'learning_rate': 3.199328859060403e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27342/75000 [24:54<43:50, 18.12it/s]

{'loss': 0.2668, 'grad_norm': 1.422424077987671, 'learning_rate': 3.1986577181208056e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27352/75000 [24:55<39:43, 19.99it/s]

{'loss': 0.2588, 'grad_norm': 2.8442652225494385, 'learning_rate': 3.1979865771812084e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27362/75000 [24:55<44:51, 17.70it/s]

{'loss': 0.2957, 'grad_norm': 1.7818467617034912, 'learning_rate': 3.1973154362416106e-05, 'epoch': 1.09}


                                                     
 36%|███▋      | 27374/75000 [24:56<40:41, 19.51it/s]

{'loss': 0.2468, 'grad_norm': 2.261625051498413, 'learning_rate': 3.1966442953020135e-05, 'epoch': 1.09}


                                                     
 37%|███▋      | 27383/75000 [24:57<42:32, 18.65it/s]

{'loss': 0.35, 'grad_norm': 3.715630054473877, 'learning_rate': 3.195973154362416e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27392/75000 [24:57<43:27, 18.26it/s]

{'loss': 0.3852, 'grad_norm': 6.647233963012695, 'learning_rate': 3.195302013422819e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27403/75000 [24:58<40:46, 19.46it/s]

{'loss': 0.3521, 'grad_norm': 2.572591543197632, 'learning_rate': 3.194630872483222e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27412/75000 [24:58<43:17, 18.32it/s]

{'loss': 0.3512, 'grad_norm': 4.382244110107422, 'learning_rate': 3.193959731543624e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27424/75000 [24:59<39:16, 20.19it/s]

{'loss': 0.2028, 'grad_norm': 2.302780866622925, 'learning_rate': 3.193288590604027e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27432/75000 [24:59<42:02, 18.86it/s]

{'loss': 0.4111, 'grad_norm': 1.7846654653549194, 'learning_rate': 3.192617449664429e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27444/75000 [25:00<40:30, 19.57it/s]

{'loss': 0.2928, 'grad_norm': 1.2137314081192017, 'learning_rate': 3.191946308724833e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27453/75000 [25:00<40:09, 19.73it/s]

{'loss': 0.4341, 'grad_norm': 7.391910552978516, 'learning_rate': 3.191275167785235e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27463/75000 [25:01<41:59, 18.87it/s]

{'loss': 0.1974, 'grad_norm': 3.2871294021606445, 'learning_rate': 3.190604026845638e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27472/75000 [25:01<43:12, 18.33it/s]

{'loss': 0.4642, 'grad_norm': 3.943519115447998, 'learning_rate': 3.1899328859060406e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27484/75000 [25:02<40:48, 19.40it/s]

{'loss': 0.2832, 'grad_norm': 2.130568504333496, 'learning_rate': 3.189261744966443e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27492/75000 [25:02<41:01, 19.30it/s]

{'loss': 0.3062, 'grad_norm': 4.184732913970947, 'learning_rate': 3.188590604026846e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27500/75000 [25:03<42:42, 18.54it/s]

{'loss': 0.2567, 'grad_norm': 3.7103543281555176, 'learning_rate': 3.1879194630872485e-05, 'epoch': 1.1}


                                                       
 37%|███▋      | 27513/75000 [25:08<2:17:47,  5.74it/s]

{'loss': 0.3018, 'grad_norm': 8.74960994720459, 'learning_rate': 3.1872483221476514e-05, 'epoch': 1.1}


                                                       
 37%|███▋      | 27524/75000 [25:09<1:03:48, 12.40it/s]

{'loss': 0.3904, 'grad_norm': 4.80872917175293, 'learning_rate': 3.1865771812080536e-05, 'epoch': 1.1}


                                                       
 37%|███▋      | 27534/75000 [25:09<45:06, 17.54it/s]

{'loss': 0.3794, 'grad_norm': 0.9547415375709534, 'learning_rate': 3.1859060402684564e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27543/75000 [25:10<42:46, 18.49it/s]

{'loss': 0.3055, 'grad_norm': 2.0177481174468994, 'learning_rate': 3.185234899328859e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27551/75000 [25:10<42:06, 18.78it/s]

{'loss': 0.2746, 'grad_norm': 1.380432367324829, 'learning_rate': 3.184563758389262e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27562/75000 [25:11<39:44, 19.90it/s]

{'loss': 0.3272, 'grad_norm': 1.8116703033447266, 'learning_rate': 3.183892617449665e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27572/75000 [25:11<42:31, 18.59it/s]

{'loss': 0.4484, 'grad_norm': 4.55833101272583, 'learning_rate': 3.183221476510067e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27584/75000 [25:12<39:09, 20.18it/s]

{'loss': 0.2971, 'grad_norm': 3.070216178894043, 'learning_rate': 3.18255033557047e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27594/75000 [25:12<40:56, 19.30it/s]

{'loss': 0.3475, 'grad_norm': 8.623174667358398, 'learning_rate': 3.181879194630872e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27601/75000 [25:13<40:34, 19.47it/s]

{'loss': 0.3854, 'grad_norm': 5.5478835105896, 'learning_rate': 3.181208053691275e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27614/75000 [25:14<40:36, 19.45it/s]

{'loss': 0.3221, 'grad_norm': 3.519920825958252, 'learning_rate': 3.180536912751678e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27623/75000 [25:14<41:10, 19.18it/s]

{'loss': 0.4205, 'grad_norm': 3.4612176418304443, 'learning_rate': 3.179865771812081e-05, 'epoch': 1.1}


                                                     
 37%|███▋      | 27631/75000 [25:14<41:24, 19.07it/s]

{'loss': 0.289, 'grad_norm': 7.6039628982543945, 'learning_rate': 3.1791946308724836e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27642/75000 [25:15<39:44, 19.86it/s]

{'loss': 0.3517, 'grad_norm': 4.776432037353516, 'learning_rate': 3.178523489932886e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27651/75000 [25:15<40:27, 19.50it/s]

{'loss': 0.4215, 'grad_norm': 3.5100464820861816, 'learning_rate': 3.1778523489932886e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27664/75000 [25:16<39:42, 19.87it/s]

{'loss': 0.4023, 'grad_norm': 6.312459945678711, 'learning_rate': 3.1771812080536915e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27674/75000 [25:17<40:40, 19.39it/s]

{'loss': 0.3479, 'grad_norm': 2.3850765228271484, 'learning_rate': 3.1765100671140943e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27682/75000 [25:17<41:15, 19.12it/s]

{'loss': 0.3368, 'grad_norm': 2.387810230255127, 'learning_rate': 3.175838926174497e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27691/75000 [25:18<44:10, 17.85it/s]

{'loss': 0.2498, 'grad_norm': 5.533154487609863, 'learning_rate': 3.1751677852348994e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27703/75000 [25:18<42:49, 18.41it/s]

{'loss': 0.2168, 'grad_norm': 2.131758451461792, 'learning_rate': 3.174496644295302e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27714/75000 [25:19<39:16, 20.06it/s]

{'loss': 0.2125, 'grad_norm': 1.0980297327041626, 'learning_rate': 3.1738255033557044e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27722/75000 [25:19<40:57, 19.24it/s]

{'loss': 0.2848, 'grad_norm': 3.626094102859497, 'learning_rate': 3.173154362416107e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27733/75000 [25:20<38:57, 20.23it/s]

{'loss': 0.3897, 'grad_norm': 3.859171152114868, 'learning_rate': 3.17248322147651e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27742/75000 [25:20<38:51, 20.27it/s]

{'loss': 0.2763, 'grad_norm': 3.681715726852417, 'learning_rate': 3.171812080536913e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27752/75000 [25:21<41:05, 19.16it/s]

{'loss': 0.2815, 'grad_norm': 1.6825631856918335, 'learning_rate': 3.171140939597316e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27762/75000 [25:21<41:39, 18.90it/s]

{'loss': 0.3387, 'grad_norm': 10.643500328063965, 'learning_rate': 3.170469798657718e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27774/75000 [25:22<39:11, 20.09it/s]

{'loss': 0.4081, 'grad_norm': 14.170654296875, 'learning_rate': 3.169798657718121e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27782/75000 [25:22<43:46, 17.98it/s]

{'loss': 0.2044, 'grad_norm': 2.0883073806762695, 'learning_rate': 3.169127516778523e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27794/75000 [25:23<39:32, 19.90it/s]

{'loss': 0.4034, 'grad_norm': 11.071995735168457, 'learning_rate': 3.1684563758389266e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27801/75000 [25:23<43:44, 17.98it/s]

{'loss': 0.3857, 'grad_norm': 3.354703903198242, 'learning_rate': 3.1677852348993294e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27813/75000 [25:24<38:53, 20.22it/s]

{'loss': 0.2639, 'grad_norm': 3.0019137859344482, 'learning_rate': 3.1671140939597316e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27822/75000 [25:24<39:25, 19.95it/s]

{'loss': 0.4551, 'grad_norm': 4.055088996887207, 'learning_rate': 3.1664429530201345e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27834/75000 [25:25<39:12, 20.05it/s]

{'loss': 0.33, 'grad_norm': 2.7685136795043945, 'learning_rate': 3.1657718120805366e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27844/75000 [25:25<40:20, 19.48it/s]

{'loss': 0.2478, 'grad_norm': 4.862738609313965, 'learning_rate': 3.1651006711409395e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27854/75000 [25:26<38:51, 20.22it/s]

{'loss': 0.2858, 'grad_norm': 2.9060239791870117, 'learning_rate': 3.164429530201343e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27862/75000 [25:26<39:37, 19.82it/s]

{'loss': 0.433, 'grad_norm': 3.4039690494537354, 'learning_rate': 3.163758389261745e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27874/75000 [25:27<38:47, 20.25it/s]

{'loss': 0.2363, 'grad_norm': 2.554755449295044, 'learning_rate': 3.163087248322148e-05, 'epoch': 1.11}


                                                     
 37%|███▋      | 27884/75000 [25:28<39:11, 20.04it/s]

{'loss': 0.4253, 'grad_norm': 1.5841798782348633, 'learning_rate': 3.16241610738255e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27894/75000 [25:28<39:37, 19.82it/s]

{'loss': 0.2102, 'grad_norm': 2.2746403217315674, 'learning_rate': 3.161744966442953e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27902/75000 [25:28<44:11, 17.76it/s]

{'loss': 0.2649, 'grad_norm': 2.205493211746216, 'learning_rate': 3.161073825503356e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27914/75000 [25:29<40:24, 19.42it/s]

{'loss': 0.3332, 'grad_norm': 0.7224622964859009, 'learning_rate': 3.160402684563759e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27923/75000 [25:30<42:24, 18.50it/s]

{'loss': 0.2165, 'grad_norm': 5.860464572906494, 'learning_rate': 3.1597315436241617e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27933/75000 [25:30<39:49, 19.70it/s]

{'loss': 0.2817, 'grad_norm': 1.6968506574630737, 'learning_rate': 3.159060402684564e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27941/75000 [25:31<40:24, 19.41it/s]

{'loss': 0.4628, 'grad_norm': 12.050171852111816, 'learning_rate': 3.158389261744967e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27954/75000 [25:31<39:00, 20.10it/s]

{'loss': 0.2977, 'grad_norm': 4.380146503448486, 'learning_rate': 3.157718120805369e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27962/75000 [25:32<38:46, 20.22it/s]

{'loss': 0.4263, 'grad_norm': 3.4934918880462646, 'learning_rate': 3.157046979865772e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27974/75000 [25:32<41:05, 19.07it/s]

{'loss': 0.4118, 'grad_norm': 15.338119506835938, 'learning_rate': 3.1563758389261746e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27984/75000 [25:33<39:09, 20.01it/s]

{'loss': 0.2329, 'grad_norm': 5.266002655029297, 'learning_rate': 3.1557046979865774e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 27994/75000 [25:33<40:24, 19.39it/s]

{'loss': 0.2551, 'grad_norm': 3.7924931049346924, 'learning_rate': 3.15503355704698e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28000/75000 [25:34<41:10, 19.03it/s]

{'loss': 0.2764, 'grad_norm': 6.560017108917236, 'learning_rate': 3.1543624161073825e-05, 'epoch': 1.12}


                                                       
 37%|███▋      | 28012/75000 [25:35<51:16, 15.27it/s]

{'loss': 0.3604, 'grad_norm': 5.014359951019287, 'learning_rate': 3.153691275167785e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28023/75000 [25:35<42:39, 18.36it/s]

{'loss': 0.2888, 'grad_norm': 4.133832931518555, 'learning_rate': 3.153020134228188e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28034/75000 [25:36<39:26, 19.85it/s]

{'loss': 0.3175, 'grad_norm': 4.657602787017822, 'learning_rate': 3.152348993288591e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28042/75000 [25:36<42:29, 18.42it/s]

{'loss': 0.3257, 'grad_norm': 31.562911987304688, 'learning_rate': 3.151677852348994e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28052/75000 [25:37<40:46, 19.19it/s]

{'loss': 0.2284, 'grad_norm': 8.868228912353516, 'learning_rate': 3.151006711409396e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28064/75000 [25:37<38:47, 20.17it/s]

{'loss': 0.2459, 'grad_norm': 3.5723512172698975, 'learning_rate': 3.150335570469799e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28073/75000 [25:38<40:46, 19.18it/s]

{'loss': 0.4408, 'grad_norm': 12.814240455627441, 'learning_rate': 3.149664429530201e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28084/75000 [25:38<38:22, 20.37it/s]

{'loss': 0.3042, 'grad_norm': 0.9369357228279114, 'learning_rate': 3.148993288590604e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28092/75000 [25:39<41:42, 18.75it/s]

{'loss': 0.3152, 'grad_norm': 1.7319563627243042, 'learning_rate': 3.148322147651007e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28104/75000 [25:39<39:07, 19.98it/s]

{'loss': 0.2953, 'grad_norm': 4.101898670196533, 'learning_rate': 3.1476510067114096e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28112/75000 [25:40<41:55, 18.64it/s]

{'loss': 0.3252, 'grad_norm': 6.737221717834473, 'learning_rate': 3.1469798657718125e-05, 'epoch': 1.12}


                                                     
 37%|███▋      | 28122/75000 [25:40<40:08, 19.46it/s]

{'loss': 0.3607, 'grad_norm': 3.1172337532043457, 'learning_rate': 3.146308724832215e-05, 'epoch': 1.12}


                                                     
 38%|███▊      | 28131/75000 [25:41<42:36, 18.33it/s]

{'loss': 0.2504, 'grad_norm': 2.612638473510742, 'learning_rate': 3.1456375838926175e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28144/75000 [25:42<38:28, 20.30it/s]

{'loss': 0.3606, 'grad_norm': 0.8841832876205444, 'learning_rate': 3.1449664429530204e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28152/75000 [25:42<42:56, 18.18it/s]

{'loss': 0.2977, 'grad_norm': 4.347661018371582, 'learning_rate': 3.144295302013423e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28163/75000 [25:43<40:08, 19.45it/s]

{'loss': 0.3964, 'grad_norm': 3.5331342220306396, 'learning_rate': 3.1436241610738254e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28174/75000 [25:43<40:37, 19.21it/s]

{'loss': 0.2614, 'grad_norm': 3.7997536659240723, 'learning_rate': 3.142953020134228e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28184/75000 [25:44<39:14, 19.89it/s]

{'loss': 0.1974, 'grad_norm': 1.522065281867981, 'learning_rate': 3.142281879194631e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28194/75000 [25:44<40:09, 19.43it/s]

{'loss': 0.3846, 'grad_norm': 3.9959936141967773, 'learning_rate': 3.141610738255033e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28202/75000 [25:45<39:17, 19.85it/s]

{'loss': 0.3648, 'grad_norm': 5.196652412414551, 'learning_rate': 3.140939597315437e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28213/75000 [25:45<40:13, 19.38it/s]

{'loss': 0.3909, 'grad_norm': 1.2221518754959106, 'learning_rate': 3.140268456375839e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28224/75000 [25:46<39:23, 19.79it/s]

{'loss': 0.4102, 'grad_norm': 7.07626485824585, 'learning_rate': 3.139597315436242e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28231/75000 [25:46<40:58, 19.02it/s]

{'loss': 0.3483, 'grad_norm': 1.2320640087127686, 'learning_rate': 3.138926174496645e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28242/75000 [25:47<38:48, 20.08it/s]

{'loss': 0.2927, 'grad_norm': 3.432386636734009, 'learning_rate': 3.138255033557047e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28254/75000 [25:47<39:09, 19.89it/s]

{'loss': 0.4982, 'grad_norm': 11.335081100463867, 'learning_rate': 3.13758389261745e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28264/75000 [25:48<39:20, 19.80it/s]

{'loss': 0.3196, 'grad_norm': 8.104589462280273, 'learning_rate': 3.1369127516778526e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28272/75000 [25:48<42:51, 18.17it/s]

{'loss': 0.3683, 'grad_norm': 8.607032775878906, 'learning_rate': 3.1362416107382555e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28282/75000 [25:49<39:49, 19.55it/s]

{'loss': 0.4097, 'grad_norm': 14.167085647583008, 'learning_rate': 3.1355704697986576e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28294/75000 [25:49<39:48, 19.56it/s]

{'loss': 0.2926, 'grad_norm': 4.845784664154053, 'learning_rate': 3.1348993288590605e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28302/75000 [25:50<40:24, 19.26it/s]

{'loss': 0.3224, 'grad_norm': 6.022872447967529, 'learning_rate': 3.1342281879194634e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28311/75000 [25:50<42:54, 18.14it/s]

{'loss': 0.3322, 'grad_norm': 6.581836700439453, 'learning_rate': 3.1335570469798655e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28323/75000 [25:51<38:36, 20.15it/s]

{'loss': 0.3511, 'grad_norm': 3.4761788845062256, 'learning_rate': 3.132885906040269e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28334/75000 [25:51<38:59, 19.95it/s]

{'loss': 0.2259, 'grad_norm': 2.273084878921509, 'learning_rate': 3.132214765100671e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28344/75000 [25:52<40:04, 19.41it/s]

{'loss': 0.2618, 'grad_norm': 2.6962411403656006, 'learning_rate': 3.131543624161074e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28352/75000 [25:52<42:33, 18.27it/s]

{'loss': 0.318, 'grad_norm': 4.738894939422607, 'learning_rate': 3.130872483221476e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28362/75000 [25:53<39:40, 19.59it/s]

{'loss': 0.3863, 'grad_norm': 4.046221733093262, 'learning_rate': 3.130201342281879e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28372/75000 [25:53<41:15, 18.84it/s]

{'loss': 0.3197, 'grad_norm': 4.929291248321533, 'learning_rate': 3.129530201342282e-05, 'epoch': 1.13}


                                                     
 38%|███▊      | 28383/75000 [25:54<38:48, 20.02it/s]

{'loss': 0.3335, 'grad_norm': 5.247217655181885, 'learning_rate': 3.128859060402685e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28394/75000 [25:55<39:52, 19.48it/s]

{'loss': 0.3725, 'grad_norm': 2.443004846572876, 'learning_rate': 3.128187919463088e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28402/75000 [25:55<41:33, 18.69it/s]

{'loss': 0.3448, 'grad_norm': 2.964521646499634, 'learning_rate': 3.12751677852349e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28414/75000 [25:56<38:05, 20.39it/s]

{'loss': 0.3438, 'grad_norm': 1.0548039674758911, 'learning_rate': 3.126845637583893e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28424/75000 [25:56<39:22, 19.72it/s]

{'loss': 0.3486, 'grad_norm': 6.400212287902832, 'learning_rate': 3.126174496644295e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28434/75000 [25:57<38:42, 20.05it/s]

{'loss': 0.5258, 'grad_norm': 3.0334150791168213, 'learning_rate': 3.1255033557046984e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28442/75000 [25:57<41:03, 18.90it/s]

{'loss': 0.2168, 'grad_norm': 6.198696136474609, 'learning_rate': 3.124832214765101e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28451/75000 [25:58<43:40, 17.77it/s]

{'loss': 0.2753, 'grad_norm': 2.3296427726745605, 'learning_rate': 3.1241610738255035e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28464/75000 [25:58<39:10, 19.80it/s]

{'loss': 0.3252, 'grad_norm': 3.398672580718994, 'learning_rate': 3.123489932885906e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28472/75000 [25:59<42:39, 18.18it/s]

{'loss': 0.2958, 'grad_norm': 1.8012937307357788, 'learning_rate': 3.1228187919463085e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28482/75000 [25:59<39:39, 19.55it/s]

{'loss': 0.3424, 'grad_norm': 7.510166168212891, 'learning_rate': 3.1221476510067114e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28494/75000 [26:00<39:50, 19.45it/s]

{'loss': 0.3963, 'grad_norm': 2.533738374710083, 'learning_rate': 3.121476510067114e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28500/75000 [26:00<39:45, 19.49it/s]

{'loss': 0.2171, 'grad_norm': 5.619240760803223, 'learning_rate': 3.120805369127517e-05, 'epoch': 1.14}


                                                       
 38%|███▊      | 28513/75000 [26:01<47:37, 16.27it/s]

{'loss': 0.3251, 'grad_norm': 0.8580735921859741, 'learning_rate': 3.12013422818792e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28521/75000 [26:02<41:47, 18.54it/s]

{'loss': 0.2848, 'grad_norm': 2.863279342651367, 'learning_rate': 3.119463087248322e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28534/75000 [26:02<39:06, 19.80it/s]

{'loss': 0.2651, 'grad_norm': 5.1073994636535645, 'learning_rate': 3.118791946308725e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28542/75000 [26:03<38:25, 20.15it/s]

{'loss': 0.3378, 'grad_norm': 2.0413761138916016, 'learning_rate': 3.118120805369127e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28553/75000 [26:03<40:32, 19.09it/s]

{'loss': 0.2843, 'grad_norm': 5.714344501495361, 'learning_rate': 3.1174496644295307e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28562/75000 [26:04<41:43, 18.55it/s]

{'loss': 0.3293, 'grad_norm': 7.702937126159668, 'learning_rate': 3.1167785234899335e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28573/75000 [26:04<39:19, 19.68it/s]

{'loss': 0.2692, 'grad_norm': 1.430056095123291, 'learning_rate': 3.116107382550336e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28583/75000 [26:05<42:32, 18.19it/s]

{'loss': 0.3173, 'grad_norm': 2.2977421283721924, 'learning_rate': 3.1154362416107385e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28593/75000 [26:05<40:21, 19.17it/s]

{'loss': 0.3261, 'grad_norm': 0.48642483353614807, 'learning_rate': 3.114765100671141e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28604/75000 [26:06<39:13, 19.71it/s]

{'loss': 0.2496, 'grad_norm': 1.4846398830413818, 'learning_rate': 3.1140939597315436e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28612/75000 [26:06<39:56, 19.36it/s]

{'loss': 0.3964, 'grad_norm': 0.4068595767021179, 'learning_rate': 3.1134228187919464e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28624/75000 [26:07<40:43, 18.98it/s]

{'loss': 0.3438, 'grad_norm': 9.4768648147583, 'learning_rate': 3.112751677852349e-05, 'epoch': 1.14}


                                                     
 38%|███▊      | 28631/75000 [26:07<40:39, 19.01it/s]

{'loss': 0.2355, 'grad_norm': 0.9429438710212708, 'learning_rate': 3.112080536912752e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28644/75000 [26:08<39:36, 19.51it/s]

{'loss': 0.2928, 'grad_norm': 1.4662818908691406, 'learning_rate': 3.111409395973154e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28654/75000 [26:09<38:16, 20.19it/s]

{'loss': 0.2743, 'grad_norm': 3.2541983127593994, 'learning_rate': 3.110738255033557e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28664/75000 [26:09<39:49, 19.39it/s]

{'loss': 0.2305, 'grad_norm': 2.3858938217163086, 'learning_rate': 3.1100671140939593e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28674/75000 [26:10<38:48, 19.89it/s]

{'loss': 0.4841, 'grad_norm': 1.227756381034851, 'learning_rate': 3.109395973154363e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28682/75000 [26:10<43:02, 17.94it/s]

{'loss': 0.3115, 'grad_norm': 0.3997136652469635, 'learning_rate': 3.108724832214766e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28693/75000 [26:11<39:14, 19.66it/s]

{'loss': 0.3708, 'grad_norm': 2.3287477493286133, 'learning_rate': 3.108053691275168e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28704/75000 [26:11<41:21, 18.66it/s]

{'loss': 0.284, 'grad_norm': 9.40017318725586, 'learning_rate': 3.107382550335571e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28714/75000 [26:12<38:48, 19.88it/s]

{'loss': 0.2698, 'grad_norm': 3.004011869430542, 'learning_rate': 3.106711409395973e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28722/75000 [26:12<41:04, 18.78it/s]

{'loss': 0.452, 'grad_norm': 4.463342189788818, 'learning_rate': 3.106040268456376e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28734/75000 [26:13<38:05, 20.24it/s]

{'loss': 0.2526, 'grad_norm': 3.080548048019409, 'learning_rate': 3.1053691275167787e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28742/75000 [26:13<41:48, 18.44it/s]

{'loss': 0.2092, 'grad_norm': 1.940026879310608, 'learning_rate': 3.1046979865771815e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28752/75000 [26:14<39:26, 19.54it/s]

{'loss': 0.3848, 'grad_norm': 1.139176845550537, 'learning_rate': 3.1040268456375844e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28762/75000 [26:14<43:00, 17.92it/s]

{'loss': 0.3223, 'grad_norm': 3.4756312370300293, 'learning_rate': 3.1033557046979865e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28773/75000 [26:15<39:59, 19.27it/s]

{'loss': 0.3989, 'grad_norm': 4.050422191619873, 'learning_rate': 3.1026845637583894e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28783/75000 [26:15<39:28, 19.51it/s]

{'loss': 0.3845, 'grad_norm': 5.289234161376953, 'learning_rate': 3.102013422818792e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28794/75000 [26:16<39:44, 19.37it/s]

{'loss': 0.1613, 'grad_norm': 1.1017637252807617, 'learning_rate': 3.101342281879195e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28802/75000 [26:16<42:38, 18.05it/s]

{'loss': 0.4362, 'grad_norm': 4.647956371307373, 'learning_rate': 3.100671140939597e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28813/75000 [26:17<38:56, 19.77it/s]

{'loss': 0.1789, 'grad_norm': 18.30286979675293, 'learning_rate': 3.1e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28822/75000 [26:17<42:32, 18.09it/s]

{'loss': 0.3217, 'grad_norm': 0.8881708383560181, 'learning_rate': 3.099328859060403e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28834/75000 [26:18<38:25, 20.02it/s]

{'loss': 0.4603, 'grad_norm': 7.054793834686279, 'learning_rate': 3.098657718120805e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28843/75000 [26:19<41:09, 18.69it/s]

{'loss': 0.3862, 'grad_norm': 9.3751802444458, 'learning_rate': 3.097986577181208e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28853/75000 [26:19<40:20, 19.07it/s]

{'loss': 0.3401, 'grad_norm': 5.084059715270996, 'learning_rate': 3.097315436241611e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28862/75000 [26:20<39:24, 19.51it/s]

{'loss': 0.2588, 'grad_norm': 3.302907943725586, 'learning_rate': 3.096644295302014e-05, 'epoch': 1.15}


                                                     
 38%|███▊      | 28874/75000 [26:20<37:48, 20.33it/s]

{'loss': 0.371, 'grad_norm': 13.440927505493164, 'learning_rate': 3.0959731543624166e-05, 'epoch': 1.15}


                                                     
 39%|███▊      | 28882/75000 [26:21<38:56, 19.74it/s]

{'loss': 0.3239, 'grad_norm': 6.2361741065979, 'learning_rate': 3.095302013422819e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28894/75000 [26:21<39:32, 19.44it/s]

{'loss': 0.2475, 'grad_norm': 3.4590535163879395, 'learning_rate': 3.0946308724832216e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28902/75000 [26:22<44:02, 17.44it/s]

{'loss': 0.2994, 'grad_norm': 2.99021053314209, 'learning_rate': 3.0939597315436245e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28913/75000 [26:22<39:59, 19.21it/s]

{'loss': 0.2768, 'grad_norm': 3.4491817951202393, 'learning_rate': 3.093288590604027e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28923/75000 [26:23<40:33, 18.93it/s]

{'loss': 0.305, 'grad_norm': 1.7540405988693237, 'learning_rate': 3.0926174496644295e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28932/75000 [26:23<39:04, 19.65it/s]

{'loss': 0.4465, 'grad_norm': 2.5377562046051025, 'learning_rate': 3.0919463087248324e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28942/75000 [26:24<39:49, 19.27it/s]

{'loss': 0.2177, 'grad_norm': 2.1314468383789062, 'learning_rate': 3.091275167785235e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28954/75000 [26:24<40:26, 18.98it/s]

{'loss': 0.2478, 'grad_norm': 1.447227120399475, 'learning_rate': 3.0906040268456374e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28964/75000 [26:25<38:16, 20.05it/s]

{'loss': 0.3569, 'grad_norm': 1.4951573610305786, 'learning_rate': 3.08993288590604e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28974/75000 [26:25<42:29, 18.05it/s]

{'loss': 0.2455, 'grad_norm': 1.334689974784851, 'learning_rate': 3.089261744966443e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28984/75000 [26:26<39:14, 19.54it/s]

{'loss': 0.2631, 'grad_norm': 2.7146008014678955, 'learning_rate': 3.088590604026846e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 28992/75000 [26:26<40:53, 18.75it/s]

{'loss': 0.3685, 'grad_norm': 3.143404722213745, 'learning_rate': 3.087919463087248e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 29000/75000 [26:27<39:34, 19.38it/s]

{'loss': 0.4336, 'grad_norm': 3.3503644466400146, 'learning_rate': 3.087248322147651e-05, 'epoch': 1.16}


                                                       
 39%|███▊      | 29012/75000 [26:28<53:22, 14.36it/s]

{'loss': 0.3894, 'grad_norm': 4.250366687774658, 'learning_rate': 3.086577181208054e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 29022/75000 [26:29<42:03, 18.22it/s]

{'loss': 0.4044, 'grad_norm': 21.058719635009766, 'learning_rate': 3.085906040268457e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 29034/75000 [26:29<39:26, 19.43it/s]

{'loss': 0.3584, 'grad_norm': 3.946563243865967, 'learning_rate': 3.0852348993288596e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 29042/75000 [26:30<43:08, 17.75it/s]

{'loss': 0.3025, 'grad_norm': 5.372851848602295, 'learning_rate': 3.084563758389262e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 29052/75000 [26:30<40:09, 19.07it/s]

{'loss': 0.3642, 'grad_norm': 3.6925318241119385, 'learning_rate': 3.0838926174496646e-05, 'epoch': 1.16}


                                                     
 39%|███▊      | 29061/75000 [26:31<41:18, 18.53it/s]

{'loss': 0.3025, 'grad_norm': 1.714558720588684, 'learning_rate': 3.083221476510067e-05, 'epoch': 1.16}


                                                     
 39%|███▉      | 29072/75000 [26:31<38:20, 19.97it/s]

{'loss': 0.2541, 'grad_norm': 3.870711326599121, 'learning_rate': 3.0825503355704696e-05, 'epoch': 1.16}


                                                     
 39%|███▉      | 29082/75000 [26:32<39:49, 19.22it/s]

{'loss': 0.3865, 'grad_norm': 6.477397918701172, 'learning_rate': 3.081879194630873e-05, 'epoch': 1.16}


                                                     
 39%|███▉      | 29093/75000 [26:32<38:18, 19.97it/s]

{'loss': 0.3206, 'grad_norm': 5.227150917053223, 'learning_rate': 3.081208053691275e-05, 'epoch': 1.16}


                                                     
 39%|███▉      | 29103/75000 [26:33<39:40, 19.28it/s]

{'loss': 0.3931, 'grad_norm': 2.0257763862609863, 'learning_rate': 3.080536912751678e-05, 'epoch': 1.16}


                                                     
 39%|███▉      | 29114/75000 [26:33<37:45, 20.25it/s]

{'loss': 0.2864, 'grad_norm': 2.3710505962371826, 'learning_rate': 3.0798657718120804e-05, 'epoch': 1.16}


                                                     
 39%|███▉      | 29122/75000 [26:34<40:19, 18.96it/s]

{'loss': 0.2041, 'grad_norm': 1.6047874689102173, 'learning_rate': 3.079194630872483e-05, 'epoch': 1.16}


                                                     
 39%|███▉      | 29132/75000 [26:34<38:34, 19.82it/s]

{'loss': 0.3198, 'grad_norm': 1.7589091062545776, 'learning_rate': 3.078523489932886e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29142/75000 [26:35<40:39, 18.80it/s]

{'loss': 0.4392, 'grad_norm': 3.199535369873047, 'learning_rate': 3.077852348993289e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29152/75000 [26:35<39:00, 19.59it/s]

{'loss': 0.2629, 'grad_norm': 1.9128955602645874, 'learning_rate': 3.077181208053692e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29162/75000 [26:36<40:19, 18.95it/s]

{'loss': 0.2705, 'grad_norm': 2.896615982055664, 'learning_rate': 3.076510067114094e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29174/75000 [26:36<37:58, 20.11it/s]

{'loss': 0.3503, 'grad_norm': 2.8396992683410645, 'learning_rate': 3.075838926174497e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29184/75000 [26:37<40:41, 18.76it/s]

{'loss': 0.2742, 'grad_norm': 3.715747117996216, 'learning_rate': 3.075167785234899e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29192/75000 [26:37<43:18, 17.63it/s]

{'loss': 0.3188, 'grad_norm': 1.4732321500778198, 'learning_rate': 3.074496644295302e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29204/75000 [26:38<38:00, 20.08it/s]

{'loss': 0.437, 'grad_norm': 1.213146448135376, 'learning_rate': 3.0738255033557054e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29214/75000 [26:39<39:22, 19.38it/s]

{'loss': 0.3007, 'grad_norm': 4.430393218994141, 'learning_rate': 3.0731543624161075e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29223/75000 [26:39<38:29, 19.83it/s]

{'loss': 0.2842, 'grad_norm': 7.926265716552734, 'learning_rate': 3.0724832214765104e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29232/75000 [26:40<41:34, 18.35it/s]

{'loss': 0.3894, 'grad_norm': 2.9856491088867188, 'learning_rate': 3.0718120805369126e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29242/75000 [26:40<38:33, 19.78it/s]

{'loss': 0.2318, 'grad_norm': 2.162374973297119, 'learning_rate': 3.0711409395973154e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29252/75000 [26:41<40:34, 18.79it/s]

{'loss': 0.2993, 'grad_norm': 2.476590394973755, 'learning_rate': 3.070469798657718e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29264/75000 [26:41<38:27, 19.82it/s]

{'loss': 0.3352, 'grad_norm': 10.395211219787598, 'learning_rate': 3.069798657718121e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29271/75000 [26:42<41:11, 18.50it/s]

{'loss': 0.3455, 'grad_norm': 5.126255989074707, 'learning_rate': 3.069127516778524e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29284/75000 [26:42<37:59, 20.05it/s]

{'loss': 0.3187, 'grad_norm': 9.458148002624512, 'learning_rate': 3.068456375838926e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29290/75000 [26:43<37:47, 20.16it/s]

{'loss': 0.233, 'grad_norm': 0.3771759569644928, 'learning_rate': 3.067785234899329e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29302/75000 [26:43<38:39, 19.70it/s]

{'loss': 0.4184, 'grad_norm': 2.577404737472534, 'learning_rate': 3.067114093959731e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29312/75000 [26:44<41:44, 18.24it/s]

{'loss': 0.2969, 'grad_norm': 2.0357542037963867, 'learning_rate': 3.066442953020135e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29322/75000 [26:44<39:16, 19.38it/s]

{'loss': 0.2337, 'grad_norm': 3.3991448879241943, 'learning_rate': 3.0657718120805376e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29332/75000 [26:45<38:54, 19.56it/s]

{'loss': 0.281, 'grad_norm': 2.5117263793945312, 'learning_rate': 3.06510067114094e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29343/75000 [26:45<38:20, 19.85it/s]

{'loss': 0.3363, 'grad_norm': 2.8439462184906006, 'learning_rate': 3.0644295302013426e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29354/75000 [26:46<38:58, 19.52it/s]

{'loss': 0.2603, 'grad_norm': 2.3689327239990234, 'learning_rate': 3.063758389261745e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29362/75000 [26:46<41:27, 18.34it/s]

{'loss': 0.2166, 'grad_norm': 0.39741823077201843, 'learning_rate': 3.0630872483221477e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29372/75000 [26:47<38:45, 19.62it/s]

{'loss': 0.2766, 'grad_norm': 0.3306984603404999, 'learning_rate': 3.0624161073825505e-05, 'epoch': 1.17}


                                                     
 39%|███▉      | 29381/75000 [26:47<41:20, 18.39it/s]

{'loss': 0.2423, 'grad_norm': 1.5528278350830078, 'learning_rate': 3.0617449664429534e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29394/75000 [26:48<38:15, 19.87it/s]

{'loss': 0.2222, 'grad_norm': 1.416147232055664, 'learning_rate': 3.061073825503356e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29402/75000 [26:48<42:32, 17.86it/s]

{'loss': 0.305, 'grad_norm': 2.5941123962402344, 'learning_rate': 3.0604026845637584e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29412/75000 [26:49<38:37, 19.67it/s]

{'loss': 0.2772, 'grad_norm': 1.838507056236267, 'learning_rate': 3.059731543624161e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29421/75000 [26:49<42:02, 18.07it/s]

{'loss': 0.325, 'grad_norm': 4.4947099685668945, 'learning_rate': 3.0590604026845634e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29433/75000 [26:50<37:50, 20.07it/s]

{'loss': 0.3512, 'grad_norm': 0.40053707361221313, 'learning_rate': 3.058389261744967e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29443/75000 [26:50<39:15, 19.34it/s]

{'loss': 0.2582, 'grad_norm': 3.2851524353027344, 'learning_rate': 3.057718120805369e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29452/75000 [26:51<38:23, 19.78it/s]

{'loss': 0.3392, 'grad_norm': 5.194872856140137, 'learning_rate': 3.057046979865772e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29461/75000 [26:51<43:37, 17.40it/s]

{'loss': 0.3858, 'grad_norm': 1.3645589351654053, 'learning_rate': 3.056375838926175e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29474/75000 [26:52<37:48, 20.07it/s]

{'loss': 0.2398, 'grad_norm': 4.156017780303955, 'learning_rate': 3.055704697986577e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29483/75000 [26:53<37:52, 20.03it/s]

{'loss': 0.3018, 'grad_norm': 1.8111110925674438, 'learning_rate': 3.05503355704698e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29494/75000 [26:53<39:24, 19.25it/s]

{'loss': 0.2174, 'grad_norm': 2.1371240615844727, 'learning_rate': 3.054362416107383e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29500/75000 [26:53<40:24, 18.77it/s]

{'loss': 0.3023, 'grad_norm': 5.207543849945068, 'learning_rate': 3.0536912751677856e-05, 'epoch': 1.18}


                                                       
 39%|███▉      | 29512/75000 [26:55<48:17, 15.70it/s]

{'loss': 0.3225, 'grad_norm': 1.5396790504455566, 'learning_rate': 3.0530201342281884e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29521/75000 [26:55<44:56, 16.87it/s]

{'loss': 0.3033, 'grad_norm': 6.743098735809326, 'learning_rate': 3.0523489932885906e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29534/75000 [26:56<38:10, 19.85it/s]

{'loss': 0.2712, 'grad_norm': 8.916142463684082, 'learning_rate': 3.0516778523489935e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29544/75000 [26:56<39:06, 19.37it/s]

{'loss': 0.3285, 'grad_norm': 3.676295518875122, 'learning_rate': 3.051006711409396e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29551/75000 [26:57<39:51, 19.00it/s]

{'loss': 0.2442, 'grad_norm': 15.317179679870605, 'learning_rate': 3.050335570469799e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29564/75000 [26:57<37:45, 20.06it/s]

{'loss': 0.3467, 'grad_norm': 0.8486364483833313, 'learning_rate': 3.0496644295302014e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29573/75000 [26:58<39:23, 19.22it/s]

{'loss': 0.2642, 'grad_norm': 2.770589828491211, 'learning_rate': 3.0489932885906042e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29582/75000 [26:58<38:16, 19.78it/s]

{'loss': 0.2756, 'grad_norm': 4.108402252197266, 'learning_rate': 3.048322147651007e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29591/75000 [26:59<40:15, 18.80it/s]

{'loss': 0.3257, 'grad_norm': 4.130959987640381, 'learning_rate': 3.0476510067114096e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29602/75000 [26:59<37:43, 20.06it/s]

{'loss': 0.3287, 'grad_norm': 1.6051514148712158, 'learning_rate': 3.0469798657718124e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29614/75000 [27:00<36:42, 20.61it/s]

{'loss': 0.4329, 'grad_norm': 1.1344307661056519, 'learning_rate': 3.0463087248322146e-05, 'epoch': 1.18}


                                                     
 39%|███▉      | 29624/75000 [27:00<39:10, 19.30it/s]

{'loss': 0.3187, 'grad_norm': 3.1517138481140137, 'learning_rate': 3.0456375838926178e-05, 'epoch': 1.18}


                                                     
 40%|███▉      | 29632/75000 [27:01<39:00, 19.38it/s]

{'loss': 0.3589, 'grad_norm': 4.588340759277344, 'learning_rate': 3.04496644295302e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29643/75000 [27:01<41:27, 18.23it/s]

{'loss': 0.2465, 'grad_norm': 4.376880645751953, 'learning_rate': 3.044295302013423e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29652/75000 [27:02<39:32, 19.11it/s]

{'loss': 0.2891, 'grad_norm': 1.5530102252960205, 'learning_rate': 3.0436241610738257e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29661/75000 [27:02<41:10, 18.35it/s]

{'loss': 0.3152, 'grad_norm': 14.511268615722656, 'learning_rate': 3.0429530201342282e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29674/75000 [27:03<37:46, 20.00it/s]

{'loss': 0.3901, 'grad_norm': 2.98480486869812, 'learning_rate': 3.042281879194631e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29682/75000 [27:03<37:25, 20.18it/s]

{'loss': 0.3087, 'grad_norm': 2.0983402729034424, 'learning_rate': 3.0416107382550336e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29693/75000 [27:04<38:46, 19.47it/s]

{'loss': 0.236, 'grad_norm': 5.353349208831787, 'learning_rate': 3.0409395973154364e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29704/75000 [27:04<39:08, 19.29it/s]

{'loss': 0.3627, 'grad_norm': 3.799811601638794, 'learning_rate': 3.0402684563758393e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29714/75000 [27:05<37:44, 20.00it/s]

{'loss': 0.3558, 'grad_norm': 1.0881551504135132, 'learning_rate': 3.0395973154362418e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29724/75000 [27:06<39:53, 18.91it/s]

{'loss': 0.3984, 'grad_norm': 2.089953660964966, 'learning_rate': 3.0389261744966447e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29734/75000 [27:06<37:37, 20.05it/s]

{'loss': 0.2636, 'grad_norm': 1.689462423324585, 'learning_rate': 3.038255033557047e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29742/75000 [27:06<40:27, 18.65it/s]

{'loss': 0.311, 'grad_norm': 4.507571697235107, 'learning_rate': 3.03758389261745e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29754/75000 [27:07<37:41, 20.01it/s]

{'loss': 0.313, 'grad_norm': 6.170954704284668, 'learning_rate': 3.0369127516778522e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29762/75000 [27:07<39:50, 18.93it/s]

{'loss': 0.3253, 'grad_norm': 2.503704071044922, 'learning_rate': 3.036241610738255e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29772/75000 [27:08<38:20, 19.66it/s]

{'loss': 0.2779, 'grad_norm': 3.4722275733947754, 'learning_rate': 3.0355704697986583e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29782/75000 [27:09<41:25, 18.20it/s]

{'loss': 0.3616, 'grad_norm': 2.232858657836914, 'learning_rate': 3.0348993288590604e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29794/75000 [27:09<37:47, 19.94it/s]

{'loss': 0.2875, 'grad_norm': 3.8212642669677734, 'learning_rate': 3.0342281879194633e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29802/75000 [27:10<40:34, 18.57it/s]

{'loss': 0.2468, 'grad_norm': 3.919682502746582, 'learning_rate': 3.0335570469798658e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29814/75000 [27:10<37:26, 20.11it/s]

{'loss': 0.3214, 'grad_norm': 2.0199079513549805, 'learning_rate': 3.0328859060402687e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29824/75000 [27:11<40:56, 18.39it/s]

{'loss': 0.3807, 'grad_norm': 6.449443340301514, 'learning_rate': 3.0322147651006712e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29832/75000 [27:11<38:57, 19.32it/s]

{'loss': 0.3183, 'grad_norm': 2.29111647605896, 'learning_rate': 3.031543624161074e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29844/75000 [27:12<38:43, 19.43it/s]

{'loss': 0.3911, 'grad_norm': 1.3742680549621582, 'learning_rate': 3.030872483221477e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29854/75000 [27:12<37:49, 19.89it/s]

{'loss': 0.3213, 'grad_norm': 3.4928395748138428, 'learning_rate': 3.0302013422818794e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29863/75000 [27:13<40:23, 18.63it/s]

{'loss': 0.3067, 'grad_norm': 7.5854997634887695, 'learning_rate': 3.0295302013422823e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29872/75000 [27:13<37:42, 19.95it/s]

{'loss': 0.3225, 'grad_norm': 1.9146143198013306, 'learning_rate': 3.0288590604026844e-05, 'epoch': 1.19}


                                                     
 40%|███▉      | 29883/75000 [27:14<40:26, 18.59it/s]

{'loss': 0.3578, 'grad_norm': 5.5595622062683105, 'learning_rate': 3.0281879194630873e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29894/75000 [27:14<37:50, 19.87it/s]

{'loss': 0.3816, 'grad_norm': 3.2178537845611572, 'learning_rate': 3.0275167785234898e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29904/75000 [27:15<38:22, 19.59it/s]

{'loss': 0.2954, 'grad_norm': 5.758545398712158, 'learning_rate': 3.0268456375838927e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29914/75000 [27:15<37:32, 20.02it/s]

{'loss': 0.3066, 'grad_norm': 1.3352936506271362, 'learning_rate': 3.0261744966442955e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29923/75000 [27:16<39:27, 19.04it/s]

{'loss': 0.3081, 'grad_norm': 0.9761168956756592, 'learning_rate': 3.025503355704698e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29934/75000 [27:16<37:10, 20.20it/s]

{'loss': 0.2602, 'grad_norm': 2.319035768508911, 'learning_rate': 3.024832214765101e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29942/75000 [27:17<40:50, 18.39it/s]

{'loss': 0.4318, 'grad_norm': 7.354806423187256, 'learning_rate': 3.0241610738255034e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29952/75000 [27:17<38:24, 19.55it/s]

{'loss': 0.3361, 'grad_norm': 2.7360191345214844, 'learning_rate': 3.0234899328859063e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29961/75000 [27:18<39:54, 18.81it/s]

{'loss': 0.4535, 'grad_norm': 2.5842599868774414, 'learning_rate': 3.022818791946309e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29974/75000 [27:18<37:02, 20.26it/s]

{'loss': 0.2911, 'grad_norm': 1.3157250881195068, 'learning_rate': 3.0221476510067116e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29983/75000 [27:19<36:58, 20.29it/s]

{'loss': 0.343, 'grad_norm': 1.7499959468841553, 'learning_rate': 3.0214765100671145e-05, 'epoch': 1.2}


                                                     
 40%|███▉      | 29992/75000 [27:19<39:26, 19.02it/s]

{'loss': 0.281, 'grad_norm': 1.7973438501358032, 'learning_rate': 3.0208053691275167e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30000/75000 [27:20<37:36, 19.94it/s]

{'loss': 0.4022, 'grad_norm': 8.442505836486816, 'learning_rate': 3.02013422818792e-05, 'epoch': 1.2}


                                                       
 40%|████      | 30014/75000 [27:21<43:07, 17.39it/s]

{'loss': 0.3182, 'grad_norm': 3.484694242477417, 'learning_rate': 3.019463087248322e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30022/75000 [27:21<43:38, 17.18it/s]

{'loss': 0.358, 'grad_norm': 2.447118043899536, 'learning_rate': 3.018791946308725e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30031/75000 [27:22<43:02, 17.41it/s]

{'loss': 0.3075, 'grad_norm': 0.8790843486785889, 'learning_rate': 3.0181208053691277e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30042/75000 [27:23<41:38, 17.99it/s]

{'loss': 0.397, 'grad_norm': 6.72434663772583, 'learning_rate': 3.0174496644295303e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30053/75000 [27:23<39:15, 19.08it/s]

{'loss': 0.3202, 'grad_norm': 6.851955890655518, 'learning_rate': 3.016778523489933e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30064/75000 [27:24<38:07, 19.65it/s]

{'loss': 0.3355, 'grad_norm': 2.6828293800354004, 'learning_rate': 3.0161073825503356e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30072/75000 [27:24<40:26, 18.52it/s]

{'loss': 0.2736, 'grad_norm': 13.612137794494629, 'learning_rate': 3.0154362416107385e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30082/75000 [27:25<44:37, 16.77it/s]

{'loss': 0.4053, 'grad_norm': 2.786442756652832, 'learning_rate': 3.0147651006711407e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30094/75000 [27:25<38:34, 19.40it/s]

{'loss': 0.3123, 'grad_norm': 1.227704644203186, 'learning_rate': 3.014093959731544e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30104/75000 [27:26<39:15, 19.06it/s]

{'loss': 0.2881, 'grad_norm': 1.1279315948486328, 'learning_rate': 3.0134228187919467e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30113/75000 [27:26<39:15, 19.06it/s]

{'loss': 0.2973, 'grad_norm': 3.1910440921783447, 'learning_rate': 3.012751677852349e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30122/75000 [27:27<39:43, 18.83it/s]

{'loss': 0.3139, 'grad_norm': 2.4887871742248535, 'learning_rate': 3.012080536912752e-05, 'epoch': 1.2}


                                                     
 40%|████      | 30131/75000 [27:27<38:29, 19.43it/s]

{'loss': 0.2323, 'grad_norm': 0.8863775134086609, 'learning_rate': 3.0114093959731543e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30144/75000 [27:28<37:42, 19.83it/s]

{'loss': 0.3155, 'grad_norm': 4.230108737945557, 'learning_rate': 3.010738255033557e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30154/75000 [27:28<37:29, 19.94it/s]

{'loss': 0.4021, 'grad_norm': 0.9613957405090332, 'learning_rate': 3.0100671140939603e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30162/75000 [27:29<39:36, 18.87it/s]

{'loss': 0.2796, 'grad_norm': 10.170888900756836, 'learning_rate': 3.0093959731543625e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30173/75000 [27:29<42:14, 17.69it/s]

{'loss': 0.3672, 'grad_norm': 1.8771133422851562, 'learning_rate': 3.0087248322147653e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30182/75000 [27:30<38:11, 19.55it/s]

{'loss': 0.4215, 'grad_norm': 2.034036874771118, 'learning_rate': 3.008053691275168e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30192/75000 [27:30<39:55, 18.71it/s]

{'loss': 0.3264, 'grad_norm': 1.2926864624023438, 'learning_rate': 3.0073825503355707e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30204/75000 [27:31<36:31, 20.44it/s]

{'loss': 0.3286, 'grad_norm': 5.622072219848633, 'learning_rate': 3.0067114093959732e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30214/75000 [27:32<38:42, 19.28it/s]

{'loss': 0.2765, 'grad_norm': 3.1730589866638184, 'learning_rate': 3.006040268456376e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30222/75000 [27:32<41:32, 17.97it/s]

{'loss': 0.334, 'grad_norm': 3.106797456741333, 'learning_rate': 3.005369127516779e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30233/75000 [27:33<38:01, 19.62it/s]

{'loss': 0.4038, 'grad_norm': 8.780405044555664, 'learning_rate': 3.004697986577181e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30241/75000 [27:33<40:55, 18.22it/s]

{'loss': 0.1989, 'grad_norm': 2.1719746589660645, 'learning_rate': 3.0040268456375843e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30252/75000 [27:34<39:12, 19.02it/s]

{'loss': 0.4373, 'grad_norm': 7.024611473083496, 'learning_rate': 3.0033557046979865e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30261/75000 [27:34<43:06, 17.30it/s]

{'loss': 0.2666, 'grad_norm': 6.611057758331299, 'learning_rate': 3.0026845637583893e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30273/75000 [27:35<37:30, 19.88it/s]

{'loss': 0.3478, 'grad_norm': 3.012763500213623, 'learning_rate': 3.002013422818792e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30282/75000 [27:35<41:43, 17.86it/s]

{'loss': 0.257, 'grad_norm': 4.0946784019470215, 'learning_rate': 3.0013422818791947e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30294/75000 [27:36<37:43, 19.75it/s]

{'loss': 0.438, 'grad_norm': 2.9532310962677, 'learning_rate': 3.0006711409395976e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30302/75000 [27:36<43:07, 17.27it/s]

{'loss': 0.2644, 'grad_norm': 2.963705539703369, 'learning_rate': 3e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30312/75000 [27:37<39:13, 18.99it/s]

{'loss': 0.2834, 'grad_norm': 2.7604684829711914, 'learning_rate': 2.999328859060403e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30323/75000 [27:37<38:40, 19.26it/s]

{'loss': 0.2897, 'grad_norm': 6.388890743255615, 'learning_rate': 2.9986577181208054e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30334/75000 [27:38<37:23, 19.90it/s]

{'loss': 0.1858, 'grad_norm': 1.7059766054153442, 'learning_rate': 2.9979865771812083e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30341/75000 [27:38<42:13, 17.63it/s]

{'loss': 0.2766, 'grad_norm': 3.9135146141052246, 'learning_rate': 2.997315436241611e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30353/75000 [27:39<37:25, 19.88it/s]

{'loss': 0.2634, 'grad_norm': 5.468012809753418, 'learning_rate': 2.9966442953020137e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30364/75000 [27:40<38:56, 19.10it/s]

{'loss': 0.3468, 'grad_norm': 8.866276741027832, 'learning_rate': 2.9959731543624165e-05, 'epoch': 1.21}


                                                     
 40%|████      | 30374/75000 [27:40<37:20, 19.92it/s]

{'loss': 0.2754, 'grad_norm': 2.3762636184692383, 'learning_rate': 2.9953020134228187e-05, 'epoch': 1.21}


                                                     
 41%|████      | 30383/75000 [27:40<40:14, 18.48it/s]

{'loss': 0.3221, 'grad_norm': 5.409226417541504, 'learning_rate': 2.9946308724832216e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30394/75000 [27:41<37:02, 20.07it/s]

{'loss': 0.3789, 'grad_norm': 3.010824680328369, 'learning_rate': 2.993959731543624e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30402/75000 [27:41<39:33, 18.79it/s]

{'loss': 0.3386, 'grad_norm': 9.449509620666504, 'learning_rate': 2.993288590604027e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30414/75000 [27:42<37:12, 19.97it/s]

{'loss': 0.2381, 'grad_norm': 3.0194284915924072, 'learning_rate': 2.9926174496644298e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30424/75000 [27:43<38:37, 19.23it/s]

{'loss': 0.3929, 'grad_norm': 3.5589065551757812, 'learning_rate': 2.9919463087248323e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30432/75000 [27:43<39:53, 18.62it/s]

{'loss': 0.307, 'grad_norm': 1.0434517860412598, 'learning_rate': 2.991275167785235e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30443/75000 [27:44<40:08, 18.50it/s]

{'loss': 0.3374, 'grad_norm': 6.002469539642334, 'learning_rate': 2.9906040268456377e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30452/75000 [27:44<42:06, 17.63it/s]

{'loss': 0.3756, 'grad_norm': 4.965097904205322, 'learning_rate': 2.9899328859060405e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30462/75000 [27:45<37:19, 19.89it/s]

{'loss': 0.3179, 'grad_norm': 3.15645694732666, 'learning_rate': 2.9892617449664427e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30472/75000 [27:45<39:27, 18.81it/s]

{'loss': 0.2932, 'grad_norm': 4.689742088317871, 'learning_rate': 2.988590604026846e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30483/75000 [27:46<36:46, 20.17it/s]

{'loss': 0.2535, 'grad_norm': 9.791631698608398, 'learning_rate': 2.9879194630872488e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30491/75000 [27:46<40:29, 18.32it/s]

{'loss': 0.3392, 'grad_norm': 4.96960973739624, 'learning_rate': 2.987248322147651e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30500/75000 [27:47<37:49, 19.60it/s]

{'loss': 0.3285, 'grad_norm': 5.391050815582275, 'learning_rate': 2.986577181208054e-05, 'epoch': 1.22}


                                                       
 41%|████      | 30513/75000 [27:48<43:58, 16.86it/s]

{'loss': 0.3617, 'grad_norm': 1.2049131393432617, 'learning_rate': 2.9859060402684563e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30522/75000 [27:48<39:58, 18.54it/s]

{'loss': 0.3869, 'grad_norm': 1.376352310180664, 'learning_rate': 2.985234899328859e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30534/75000 [27:49<37:52, 19.56it/s]

{'loss': 0.2694, 'grad_norm': 9.758244514465332, 'learning_rate': 2.9845637583892617e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30542/75000 [27:49<37:31, 19.75it/s]

{'loss': 0.3582, 'grad_norm': 1.5799100399017334, 'learning_rate': 2.9838926174496645e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30552/75000 [27:50<40:29, 18.30it/s]

{'loss': 0.2978, 'grad_norm': 1.8297258615493774, 'learning_rate': 2.9832214765100674e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30564/75000 [27:50<38:05, 19.44it/s]

{'loss': 0.2847, 'grad_norm': 4.925170421600342, 'learning_rate': 2.98255033557047e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30574/75000 [27:51<40:00, 18.51it/s]

{'loss': 0.2594, 'grad_norm': 11.202073097229004, 'learning_rate': 2.9818791946308727e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30584/75000 [27:51<37:19, 19.83it/s]

{'loss': 0.229, 'grad_norm': 4.5083394050598145, 'learning_rate': 2.981208053691275e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30592/75000 [27:52<39:43, 18.63it/s]

{'loss': 0.3116, 'grad_norm': 7.221006393432617, 'learning_rate': 2.980536912751678e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30604/75000 [27:52<37:12, 19.89it/s]

{'loss': 0.5244, 'grad_norm': 6.396159648895264, 'learning_rate': 2.979865771812081e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30611/75000 [27:53<41:57, 17.64it/s]

{'loss': 0.3667, 'grad_norm': 1.9352465867996216, 'learning_rate': 2.979194630872483e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30624/75000 [27:54<36:56, 20.02it/s]

{'loss': 0.4479, 'grad_norm': 1.5860496759414673, 'learning_rate': 2.9785234899328863e-05, 'epoch': 1.22}


                                                     
 41%|████      | 30632/75000 [27:54<39:42, 18.62it/s]

{'loss': 0.3834, 'grad_norm': 1.8509432077407837, 'learning_rate': 2.9778523489932885e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30644/75000 [27:55<37:38, 19.64it/s]

{'loss': 0.4405, 'grad_norm': 3.238436460494995, 'learning_rate': 2.9771812080536914e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30652/75000 [27:55<39:59, 18.49it/s]

{'loss': 0.2907, 'grad_norm': 3.5733420848846436, 'learning_rate': 2.976510067114094e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30663/75000 [27:56<40:25, 18.28it/s]

{'loss': 0.2096, 'grad_norm': 7.810286521911621, 'learning_rate': 2.9758389261744967e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30672/75000 [27:56<38:32, 19.17it/s]

{'loss': 0.3005, 'grad_norm': 5.7146897315979, 'learning_rate': 2.9751677852348996e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30682/75000 [27:57<39:13, 18.83it/s]

{'loss': 0.5276, 'grad_norm': 3.4894139766693115, 'learning_rate': 2.974496644295302e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30691/75000 [27:57<38:58, 18.95it/s]

{'loss': 0.2247, 'grad_norm': 2.1543264389038086, 'learning_rate': 2.973825503355705e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30703/75000 [27:58<38:21, 19.25it/s]

{'loss': 0.2782, 'grad_norm': 7.577592372894287, 'learning_rate': 2.9731543624161075e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30713/75000 [27:58<37:19, 19.78it/s]

{'loss': 0.2247, 'grad_norm': 1.4959840774536133, 'learning_rate': 2.9724832214765103e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30722/75000 [27:59<38:19, 19.26it/s]

{'loss': 0.26, 'grad_norm': 7.2530059814453125, 'learning_rate': 2.9718120805369125e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30731/75000 [27:59<38:45, 19.03it/s]

{'loss': 0.2662, 'grad_norm': 1.9173310995101929, 'learning_rate': 2.9711409395973154e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30742/75000 [28:00<36:18, 20.31it/s]

{'loss': 0.436, 'grad_norm': 0.885915219783783, 'learning_rate': 2.9704697986577186e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30751/75000 [28:00<38:33, 19.13it/s]

{'loss': 0.3429, 'grad_norm': 1.4778410196304321, 'learning_rate': 2.9697986577181207e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30764/75000 [28:01<36:29, 20.20it/s]

{'loss': 0.3287, 'grad_norm': 1.0645571947097778, 'learning_rate': 2.9691275167785236e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30773/75000 [28:01<36:29, 20.20it/s]

{'loss': 0.2397, 'grad_norm': 0.6690480709075928, 'learning_rate': 2.968456375838926e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30784/75000 [28:02<38:59, 18.90it/s]

{'loss': 0.3949, 'grad_norm': 7.555950164794922, 'learning_rate': 2.967785234899329e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30793/75000 [28:02<38:01, 19.37it/s]

{'loss': 0.3496, 'grad_norm': 2.120839834213257, 'learning_rate': 2.9671140939597318e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30802/75000 [28:03<40:19, 18.26it/s]

{'loss': 0.272, 'grad_norm': 2.26240873336792, 'learning_rate': 2.9664429530201343e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30812/75000 [28:03<38:06, 19.33it/s]

{'loss': 0.5191, 'grad_norm': 2.03398060798645, 'learning_rate': 2.9657718120805372e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30824/75000 [28:04<39:28, 18.65it/s]

{'loss': 0.2899, 'grad_norm': 1.547564148902893, 'learning_rate': 2.9651006711409397e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30834/75000 [28:04<37:36, 19.57it/s]

{'loss': 0.2714, 'grad_norm': 2.2491347789764404, 'learning_rate': 2.9644295302013426e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30842/75000 [28:05<41:48, 17.60it/s]

{'loss': 0.2795, 'grad_norm': 2.8885340690612793, 'learning_rate': 2.9637583892617447e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30854/75000 [28:06<37:54, 19.41it/s]

{'loss': 0.3091, 'grad_norm': 4.529993534088135, 'learning_rate': 2.963087248322148e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30862/75000 [28:06<41:57, 17.53it/s]

{'loss': 0.3035, 'grad_norm': 1.7273039817810059, 'learning_rate': 2.9624161073825508e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30872/75000 [28:06<38:11, 19.26it/s]

{'loss': 0.3651, 'grad_norm': 2.570697784423828, 'learning_rate': 2.961744966442953e-05, 'epoch': 1.23}


                                                     
 41%|████      | 30884/75000 [28:07<37:41, 19.51it/s]

{'loss': 0.3008, 'grad_norm': 1.7386343479156494, 'learning_rate': 2.9610738255033558e-05, 'epoch': 1.24}


                                                     
 41%|████      | 30892/75000 [28:08<40:22, 18.21it/s]

{'loss': 0.4039, 'grad_norm': 0.928203821182251, 'learning_rate': 2.9604026845637583e-05, 'epoch': 1.24}


                                                     
 41%|████      | 30903/75000 [28:08<43:01, 17.08it/s]

{'loss': 0.377, 'grad_norm': 6.5656938552856445, 'learning_rate': 2.9597315436241612e-05, 'epoch': 1.24}


                                                     
 41%|████      | 30913/75000 [28:09<46:48, 15.70it/s]

{'loss': 0.2323, 'grad_norm': 3.7943358421325684, 'learning_rate': 2.9590604026845637e-05, 'epoch': 1.24}


                                                     
 41%|████      | 30923/75000 [28:10<49:20, 14.89it/s]

{'loss': 0.3584, 'grad_norm': 8.642278671264648, 'learning_rate': 2.9583892617449666e-05, 'epoch': 1.24}


                                                     
 41%|████      | 30933/75000 [28:10<48:20, 15.19it/s]

{'loss': 0.3271, 'grad_norm': 9.339898109436035, 'learning_rate': 2.9577181208053694e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 30943/75000 [28:11<47:57, 15.31it/s]

{'loss': 0.3474, 'grad_norm': 2.0194742679595947, 'learning_rate': 2.957046979865772e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 30953/75000 [28:12<48:02, 15.28it/s]

{'loss': 0.3418, 'grad_norm': 1.5148015022277832, 'learning_rate': 2.9563758389261748e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 30963/75000 [28:12<47:27, 15.47it/s]

{'loss': 0.3592, 'grad_norm': 4.173379898071289, 'learning_rate': 2.955704697986577e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 30971/75000 [28:13<49:27, 14.84it/s]

{'loss': 0.3674, 'grad_norm': 8.440661430358887, 'learning_rate': 2.95503355704698e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 30983/75000 [28:13<45:12, 16.23it/s]

{'loss': 0.4074, 'grad_norm': 3.6887896060943604, 'learning_rate': 2.954362416107383e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 30993/75000 [28:14<45:15, 16.21it/s]

{'loss': 0.3539, 'grad_norm': 5.805161952972412, 'learning_rate': 2.9536912751677852e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31000/75000 [28:14<45:04, 16.27it/s]

{'loss': 0.3157, 'grad_norm': 4.184220790863037, 'learning_rate': 2.9530201342281884e-05, 'epoch': 1.24}


                                                       
 41%|████▏     | 31013/75000 [28:16<55:32, 13.20it/s]  

{'loss': 0.3545, 'grad_norm': 3.8473427295684814, 'learning_rate': 2.9523489932885906e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31023/75000 [28:17<48:50, 15.01it/s]

{'loss': 0.2975, 'grad_norm': 4.297184467315674, 'learning_rate': 2.9516778523489934e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31033/75000 [28:17<45:45, 16.02it/s]

{'loss': 0.5335, 'grad_norm': 10.972101211547852, 'learning_rate': 2.951006711409396e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31041/75000 [28:18<47:37, 15.38it/s]

{'loss': 0.3777, 'grad_norm': 0.8455917835235596, 'learning_rate': 2.9503355704697988e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31052/75000 [28:19<44:42, 16.38it/s]

{'loss': 0.3122, 'grad_norm': 1.2750461101531982, 'learning_rate': 2.9496644295302016e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31062/75000 [28:19<51:23, 14.25it/s]

{'loss': 0.3565, 'grad_norm': 5.106100082397461, 'learning_rate': 2.948993288590604e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31072/75000 [28:20<46:03, 15.89it/s]

{'loss': 0.3062, 'grad_norm': 3.5724453926086426, 'learning_rate': 2.948322147651007e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31082/75000 [28:20<47:38, 15.36it/s]

{'loss': 0.2887, 'grad_norm': 2.158400535583496, 'learning_rate': 2.9476510067114095e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31092/75000 [28:21<46:42, 15.67it/s]

{'loss': 0.3905, 'grad_norm': 0.9637542366981506, 'learning_rate': 2.9469798657718124e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31102/75000 [28:22<47:59, 15.25it/s]

{'loss': 0.393, 'grad_norm': 3.145676374435425, 'learning_rate': 2.9463087248322146e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31114/75000 [28:22<41:07, 17.78it/s]

{'loss': 0.2401, 'grad_norm': 2.347838878631592, 'learning_rate': 2.9456375838926174e-05, 'epoch': 1.24}


                                                     
 41%|████▏     | 31122/75000 [28:23<52:27, 13.94it/s]

{'loss': 0.2958, 'grad_norm': 2.7287518978118896, 'learning_rate': 2.9449664429530206e-05, 'epoch': 1.24}


                                                     
 42%|████▏     | 31132/75000 [28:24<48:54, 14.95it/s]

{'loss': 0.2883, 'grad_norm': 4.3454670906066895, 'learning_rate': 2.9442953020134228e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31142/75000 [28:24<49:19, 14.82it/s]

{'loss': 0.2373, 'grad_norm': 2.764606237411499, 'learning_rate': 2.9436241610738256e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31152/75000 [28:25<46:45, 15.63it/s]

{'loss': 0.3445, 'grad_norm': 5.695135116577148, 'learning_rate': 2.942953020134228e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31162/75000 [28:26<54:17, 13.46it/s]

{'loss': 0.3064, 'grad_norm': 1.3080787658691406, 'learning_rate': 2.942281879194631e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31172/75000 [28:26<43:47, 16.68it/s]

{'loss': 0.277, 'grad_norm': 6.264266014099121, 'learning_rate': 2.941610738255034e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31182/75000 [28:27<43:49, 16.67it/s]

{'loss': 0.3532, 'grad_norm': 2.0746991634368896, 'learning_rate': 2.9409395973154364e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31192/75000 [28:27<47:27, 15.38it/s]

{'loss': 0.3518, 'grad_norm': 5.967815399169922, 'learning_rate': 2.9402684563758392e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31202/75000 [28:28<48:39, 15.00it/s]

{'loss': 0.3071, 'grad_norm': 2.2039871215820312, 'learning_rate': 2.9395973154362418e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31212/75000 [28:29<44:24, 16.44it/s]

{'loss': 0.2087, 'grad_norm': 1.5385488271713257, 'learning_rate': 2.9389261744966446e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31223/75000 [28:29<43:48, 16.66it/s]

{'loss': 0.3911, 'grad_norm': 2.181077718734741, 'learning_rate': 2.9382550335570468e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31233/75000 [28:30<41:19, 17.65it/s]

{'loss': 0.4077, 'grad_norm': 4.993393898010254, 'learning_rate': 2.93758389261745e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31243/75000 [28:30<42:55, 16.99it/s]

{'loss': 0.4198, 'grad_norm': 2.4309275150299072, 'learning_rate': 2.936912751677853e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31251/75000 [28:31<43:38, 16.71it/s]

{'loss': 0.4434, 'grad_norm': 3.362537384033203, 'learning_rate': 2.936241610738255e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31262/75000 [28:32<42:07, 17.31it/s]

{'loss': 0.369, 'grad_norm': 7.589253902435303, 'learning_rate': 2.935570469798658e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31272/75000 [28:32<46:47, 15.57it/s]

{'loss': 0.3414, 'grad_norm': 1.8069672584533691, 'learning_rate': 2.9348993288590604e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31284/75000 [28:33<41:30, 17.55it/s]

{'loss': 0.3863, 'grad_norm': 5.391196250915527, 'learning_rate': 2.9342281879194632e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31292/75000 [28:33<50:47, 14.34it/s]

{'loss': 0.2568, 'grad_norm': 6.224164009094238, 'learning_rate': 2.9335570469798658e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31302/75000 [28:34<45:31, 16.00it/s]

{'loss': 0.1831, 'grad_norm': 1.4424108266830444, 'learning_rate': 2.9328859060402686e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31312/75000 [28:35<46:15, 15.74it/s]

{'loss': 0.2674, 'grad_norm': 1.8489218950271606, 'learning_rate': 2.9322147651006715e-05, 'epoch': 1.25}


                                                       
 42%|████▏     | 31322/75000 [28:36<1:01:25, 11.85it/s]

{'loss': 0.1744, 'grad_norm': 1.6370795965194702, 'learning_rate': 2.931543624161074e-05, 'epoch': 1.25}


                                                       
 42%|████▏     | 31332/75000 [28:36<52:04, 13.98it/s]

{'loss': 0.5203, 'grad_norm': 27.072492599487305, 'learning_rate': 2.930872483221477e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31343/75000 [28:37<40:41, 17.88it/s]

{'loss': 0.3639, 'grad_norm': 1.8947324752807617, 'learning_rate': 2.930201342281879e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31353/75000 [28:37<44:13, 16.45it/s]

{'loss': 0.2845, 'grad_norm': 18.778135299682617, 'learning_rate': 2.9295302013422822e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31363/75000 [28:38<39:31, 18.40it/s]

{'loss': 0.2835, 'grad_norm': 6.8305864334106445, 'learning_rate': 2.9288590604026844e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31373/75000 [28:39<44:55, 16.19it/s]

{'loss': 0.2705, 'grad_norm': 1.770815372467041, 'learning_rate': 2.9281879194630872e-05, 'epoch': 1.25}


                                                     
 42%|████▏     | 31383/75000 [28:39<40:20, 18.02it/s]

{'loss': 0.4105, 'grad_norm': 2.9329583644866943, 'learning_rate': 2.9275167785234904e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31391/75000 [28:40<43:53, 16.56it/s]

{'loss': 0.2547, 'grad_norm': 3.6613521575927734, 'learning_rate': 2.9268456375838926e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31402/75000 [28:40<41:17, 17.60it/s]

{'loss': 0.2404, 'grad_norm': 1.7265980243682861, 'learning_rate': 2.9261744966442955e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31413/75000 [28:41<39:04, 18.59it/s]

{'loss': 0.2997, 'grad_norm': 1.7151204347610474, 'learning_rate': 2.925503355704698e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31421/75000 [28:41<42:15, 17.19it/s]

{'loss': 0.3013, 'grad_norm': 5.17667293548584, 'learning_rate': 2.9248322147651008e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31434/75000 [28:42<38:28, 18.87it/s]

{'loss': 0.3876, 'grad_norm': 5.584602355957031, 'learning_rate': 2.9241610738255037e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31442/75000 [28:42<43:30, 16.68it/s]

{'loss': 0.3874, 'grad_norm': 4.0988640785217285, 'learning_rate': 2.9234899328859062e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31452/75000 [28:43<38:47, 18.71it/s]

{'loss': 0.2378, 'grad_norm': 7.782484531402588, 'learning_rate': 2.922818791946309e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31461/75000 [28:44<43:48, 16.56it/s]

{'loss': 0.3017, 'grad_norm': 1.0165126323699951, 'learning_rate': 2.9221476510067112e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31472/75000 [28:44<39:29, 18.37it/s]

{'loss': 0.2694, 'grad_norm': 1.99391770362854, 'learning_rate': 2.9214765100671144e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31481/75000 [28:45<40:39, 17.84it/s]

{'loss': 0.2912, 'grad_norm': 3.4722633361816406, 'learning_rate': 2.9208053691275166e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31492/75000 [28:45<39:05, 18.55it/s]

{'loss': 0.2435, 'grad_norm': 5.155481815338135, 'learning_rate': 2.9201342281879195e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31500/75000 [28:46<41:11, 17.60it/s]

{'loss': 0.2663, 'grad_norm': 14.282879829406738, 'learning_rate': 2.9194630872483227e-05, 'epoch': 1.26}


                                                       
 42%|████▏     | 31514/75000 [28:47<47:13, 15.35it/s]

{'loss': 0.223, 'grad_norm': 2.0774433612823486, 'learning_rate': 2.9187919463087248e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31522/75000 [28:48<43:33, 16.64it/s]

{'loss': 0.3637, 'grad_norm': 8.192489624023438, 'learning_rate': 2.9181208053691277e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31533/75000 [28:48<38:55, 18.61it/s]

{'loss': 0.2674, 'grad_norm': 2.2745015621185303, 'learning_rate': 2.9174496644295302e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31542/75000 [28:49<40:27, 17.90it/s]

{'loss': 0.2748, 'grad_norm': 6.118502140045166, 'learning_rate': 2.916778523489933e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31554/75000 [28:49<41:37, 17.40it/s]

{'loss': 0.2689, 'grad_norm': 4.487334728240967, 'learning_rate': 2.9161073825503356e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31562/75000 [28:50<46:03, 15.72it/s]

{'loss': 0.3099, 'grad_norm': 3.521854877471924, 'learning_rate': 2.9154362416107384e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31572/75000 [28:50<43:11, 16.76it/s]

{'loss': 0.2188, 'grad_norm': 1.6425615549087524, 'learning_rate': 2.9147651006711413e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31583/75000 [28:51<42:56, 16.85it/s]

{'loss': 0.34, 'grad_norm': 4.523402214050293, 'learning_rate': 2.9140939597315438e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31593/75000 [28:52<39:53, 18.14it/s]

{'loss': 0.3322, 'grad_norm': 3.1174476146698, 'learning_rate': 2.9134228187919466e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31603/75000 [28:52<42:39, 16.96it/s]

{'loss': 0.2636, 'grad_norm': 2.6411752700805664, 'learning_rate': 2.9127516778523488e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31613/75000 [28:53<39:54, 18.12it/s]

{'loss': 0.4636, 'grad_norm': 8.77892780303955, 'learning_rate': 2.9120805369127517e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31623/75000 [28:53<41:51, 17.27it/s]

{'loss': 0.3157, 'grad_norm': 2.883669376373291, 'learning_rate': 2.911409395973155e-05, 'epoch': 1.26}


                                                     
 42%|████▏     | 31634/75000 [28:54<40:20, 17.91it/s]

{'loss': 0.2664, 'grad_norm': 2.8494417667388916, 'learning_rate': 2.910738255033557e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31642/75000 [28:54<40:17, 17.93it/s]

{'loss': 0.3921, 'grad_norm': 6.149010181427002, 'learning_rate': 2.91006711409396e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31652/75000 [28:55<41:36, 17.36it/s]

{'loss': 0.3483, 'grad_norm': 4.127004623413086, 'learning_rate': 2.9093959731543624e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31664/75000 [28:56<37:42, 19.15it/s]

{'loss': 0.3256, 'grad_norm': 2.9474480152130127, 'learning_rate': 2.9087248322147653e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31672/75000 [28:56<38:21, 18.82it/s]

{'loss': 0.2645, 'grad_norm': 4.299503803253174, 'learning_rate': 2.9080536912751678e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31683/75000 [28:57<40:54, 17.65it/s]

{'loss': 0.1365, 'grad_norm': 2.1182291507720947, 'learning_rate': 2.9073825503355706e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31691/75000 [28:57<39:23, 18.32it/s]

{'loss': 0.2486, 'grad_norm': 6.780360221862793, 'learning_rate': 2.9067114093959735e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31703/75000 [28:58<38:14, 18.87it/s]

{'loss': 0.3881, 'grad_norm': 2.5379812717437744, 'learning_rate': 2.906040268456376e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31712/75000 [28:58<41:07, 17.54it/s]

{'loss': 0.4433, 'grad_norm': 3.9433090686798096, 'learning_rate': 2.905369127516779e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31722/75000 [28:59<44:22, 16.26it/s]

{'loss': 0.3602, 'grad_norm': 6.722745895385742, 'learning_rate': 2.904697986577181e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31734/75000 [28:59<39:02, 18.47it/s]

{'loss': 0.3946, 'grad_norm': 2.1093149185180664, 'learning_rate': 2.9040268456375842e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31742/75000 [29:00<43:09, 16.70it/s]

{'loss': 0.2695, 'grad_norm': 1.9865505695343018, 'learning_rate': 2.9033557046979864e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31752/75000 [29:01<40:24, 17.84it/s]

{'loss': 0.4552, 'grad_norm': 7.363333225250244, 'learning_rate': 2.9026845637583893e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31761/75000 [29:01<44:24, 16.23it/s]

{'loss': 0.3304, 'grad_norm': 5.510141849517822, 'learning_rate': 2.902013422818792e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31772/75000 [29:02<40:44, 17.69it/s]

{'loss': 0.271, 'grad_norm': 2.5517866611480713, 'learning_rate': 2.9013422818791946e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31783/75000 [29:02<40:05, 17.97it/s]

{'loss': 0.3878, 'grad_norm': 4.266599655151367, 'learning_rate': 2.9006711409395975e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31793/75000 [29:03<40:06, 17.96it/s]

{'loss': 0.3365, 'grad_norm': 2.3979978561401367, 'learning_rate': 2.9e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31801/75000 [29:03<44:14, 16.27it/s]

{'loss': 0.2824, 'grad_norm': 1.2525181770324707, 'learning_rate': 2.899328859060403e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31813/75000 [29:04<38:38, 18.62it/s]

{'loss': 0.4487, 'grad_norm': 2.0999765396118164, 'learning_rate': 2.8986577181208057e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31822/75000 [29:04<41:57, 17.15it/s]

{'loss': 0.3887, 'grad_norm': 5.067384243011475, 'learning_rate': 2.8979865771812082e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31832/75000 [29:05<40:05, 17.95it/s]

{'loss': 0.289, 'grad_norm': 1.9739629030227661, 'learning_rate': 2.897315436241611e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31843/75000 [29:06<42:59, 16.73it/s]

{'loss': 0.3268, 'grad_norm': 2.78773832321167, 'learning_rate': 2.8966442953020133e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31854/75000 [29:06<39:03, 18.41it/s]

{'loss': 0.2528, 'grad_norm': 4.428083896636963, 'learning_rate': 2.8959731543624165e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31862/75000 [29:07<39:32, 18.18it/s]

{'loss': 0.3945, 'grad_norm': 4.459503650665283, 'learning_rate': 2.8953020134228186e-05, 'epoch': 1.27}


                                                     
 42%|████▏     | 31872/75000 [29:07<39:47, 18.07it/s]

{'loss': 0.3332, 'grad_norm': 6.333920955657959, 'learning_rate': 2.8946308724832215e-05, 'epoch': 1.27}


                                                     
 43%|████▎     | 31881/75000 [29:08<44:44, 16.06it/s]

{'loss': 0.3965, 'grad_norm': 1.54082190990448, 'learning_rate': 2.8939597315436247e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31892/75000 [29:08<40:43, 17.64it/s]

{'loss': 0.2316, 'grad_norm': 7.53475284576416, 'learning_rate': 2.893288590604027e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31903/75000 [29:09<38:14, 18.79it/s]

{'loss': 0.465, 'grad_norm': 4.277886867523193, 'learning_rate': 2.8926174496644297e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31913/75000 [29:10<41:11, 17.44it/s]

{'loss': 0.4065, 'grad_norm': 2.238162040710449, 'learning_rate': 2.8919463087248322e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31923/75000 [29:10<41:09, 17.44it/s]

{'loss': 0.2474, 'grad_norm': 2.976720094680786, 'learning_rate': 2.891275167785235e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31931/75000 [29:11<40:19, 17.80it/s]

{'loss': 0.2888, 'grad_norm': 6.248574733734131, 'learning_rate': 2.8906040268456376e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31943/75000 [29:11<39:09, 18.32it/s]

{'loss': 0.2624, 'grad_norm': 5.641622543334961, 'learning_rate': 2.8899328859060405e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31954/75000 [29:12<37:17, 19.24it/s]

{'loss': 0.3487, 'grad_norm': 7.173954486846924, 'learning_rate': 2.8892617449664433e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31962/75000 [29:12<42:51, 16.74it/s]

{'loss': 0.3095, 'grad_norm': 1.9499081373214722, 'learning_rate': 2.888590604026846e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31973/75000 [29:13<39:56, 17.96it/s]

{'loss': 0.3066, 'grad_norm': 10.549330711364746, 'learning_rate': 2.8879194630872487e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31982/75000 [29:13<43:35, 16.45it/s]

{'loss': 0.3001, 'grad_norm': 0.9291818141937256, 'learning_rate': 2.887248322147651e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 31994/75000 [29:14<37:36, 19.06it/s]

{'loss': 0.3446, 'grad_norm': 0.29510048031806946, 'learning_rate': 2.8865771812080537e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32000/75000 [29:14<42:12, 16.98it/s]

{'loss': 0.3021, 'grad_norm': 2.1440887451171875, 'learning_rate': 2.885906040268457e-05, 'epoch': 1.28}


                                                       
 43%|████▎     | 32014/75000 [29:16<45:38, 15.69it/s]  

{'loss': 0.3304, 'grad_norm': 6.7052130699157715, 'learning_rate': 2.885234899328859e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32022/75000 [29:16<44:43, 16.02it/s]

{'loss': 0.2684, 'grad_norm': 4.709874153137207, 'learning_rate': 2.884563758389262e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32032/75000 [29:17<40:38, 17.62it/s]

{'loss': 0.3577, 'grad_norm': 9.234235763549805, 'learning_rate': 2.8838926174496645e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32043/75000 [29:17<41:25, 17.29it/s]

{'loss': 0.3804, 'grad_norm': 1.6913467645645142, 'learning_rate': 2.8832214765100673e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32051/75000 [29:18<39:43, 18.02it/s]

{'loss': 0.2863, 'grad_norm': 2.14046049118042, 'learning_rate': 2.88255033557047e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32062/75000 [29:18<40:04, 17.86it/s]

{'loss': 0.4298, 'grad_norm': 10.042007446289062, 'learning_rate': 2.8818791946308727e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32074/75000 [29:19<37:26, 19.11it/s]

{'loss': 0.3202, 'grad_norm': 10.654509544372559, 'learning_rate': 2.8812080536912755e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32082/75000 [29:19<42:15, 16.92it/s]

{'loss': 0.2373, 'grad_norm': 3.1038665771484375, 'learning_rate': 2.880536912751678e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32092/75000 [29:20<42:36, 16.79it/s]

{'loss': 0.314, 'grad_norm': 7.575386047363281, 'learning_rate': 2.879865771812081e-05, 'epoch': 1.28}


 43%|████▎     | 32102/75000 [29:21<45:41, 15.64it/s]

{'loss': 0.2395, 'grad_norm': 4.678761005401611, 'learning_rate': 2.879194630872483e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32112/75000 [29:21<41:39, 17.16it/s]

{'loss': 0.2725, 'grad_norm': 1.2909053564071655, 'learning_rate': 2.8785234899328863e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32122/75000 [29:22<38:30, 18.56it/s]

{'loss': 0.2777, 'grad_norm': 0.9764412641525269, 'learning_rate': 2.8778523489932885e-05, 'epoch': 1.28}


                                                     
 43%|████▎     | 32133/75000 [29:22<43:20, 16.49it/s]

{'loss': 0.2352, 'grad_norm': 16.00426483154297, 'learning_rate': 2.8771812080536913e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32143/75000 [29:23<39:07, 18.25it/s]

{'loss': 0.2601, 'grad_norm': 3.2337806224823, 'learning_rate': 2.8765100671140942e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32153/75000 [29:24<42:22, 16.85it/s]

{'loss': 0.3575, 'grad_norm': 4.8031721115112305, 'learning_rate': 2.8758389261744967e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32163/75000 [29:24<41:33, 17.18it/s]

{'loss': 0.3784, 'grad_norm': 1.3947428464889526, 'learning_rate': 2.8751677852348995e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32171/75000 [29:25<41:09, 17.35it/s]

{'loss': 0.351, 'grad_norm': 5.18644905090332, 'learning_rate': 2.874496644295302e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32182/75000 [29:25<44:05, 16.19it/s]

{'loss': 0.2182, 'grad_norm': 7.493490695953369, 'learning_rate': 2.873825503355705e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32193/75000 [29:26<40:51, 17.46it/s]

{'loss': 0.3314, 'grad_norm': 1.36112380027771, 'learning_rate': 2.873154362416107e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32204/75000 [29:26<38:09, 18.70it/s]

{'loss': 0.4191, 'grad_norm': 6.495180606842041, 'learning_rate': 2.8724832214765103e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32214/75000 [29:27<37:57, 18.78it/s]

{'loss': 0.1906, 'grad_norm': 4.6435346603393555, 'learning_rate': 2.871812080536913e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32222/75000 [29:27<38:47, 18.38it/s]

{'loss': 0.3434, 'grad_norm': 3.003347873687744, 'learning_rate': 2.8711409395973153e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32234/75000 [29:28<36:55, 19.31it/s]

{'loss': 0.3675, 'grad_norm': 0.8537401556968689, 'learning_rate': 2.8704697986577185e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32242/75000 [29:29<38:27, 18.53it/s]

{'loss': 0.2861, 'grad_norm': 7.792202949523926, 'learning_rate': 2.8697986577181207e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32253/75000 [29:29<39:01, 18.26it/s]

{'loss': 0.2968, 'grad_norm': 5.149033069610596, 'learning_rate': 2.8691275167785235e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32262/75000 [29:30<38:42, 18.40it/s]

{'loss': 0.3313, 'grad_norm': 16.86158561706543, 'learning_rate': 2.8684563758389267e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32273/75000 [29:30<40:15, 17.69it/s]

{'loss': 0.3377, 'grad_norm': 4.791340351104736, 'learning_rate': 2.867785234899329e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32284/75000 [29:31<37:04, 19.20it/s]

{'loss': 0.18, 'grad_norm': 2.2368052005767822, 'learning_rate': 2.8671140939597318e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32292/75000 [29:31<40:40, 17.50it/s]

{'loss': 0.322, 'grad_norm': 4.260706901550293, 'learning_rate': 2.8664429530201343e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32304/75000 [29:32<36:39, 19.41it/s]

{'loss': 0.3857, 'grad_norm': 0.48869290947914124, 'learning_rate': 2.865771812080537e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32312/75000 [29:32<39:37, 17.96it/s]

{'loss': 0.1905, 'grad_norm': 4.383959770202637, 'learning_rate': 2.8651006711409397e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32322/75000 [29:33<39:42, 17.92it/s]

{'loss': 0.3356, 'grad_norm': 3.4231150150299072, 'learning_rate': 2.8644295302013425e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32333/75000 [29:34<41:55, 16.96it/s]

{'loss': 0.2975, 'grad_norm': 5.185593605041504, 'learning_rate': 2.8637583892617454e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32341/75000 [29:34<40:25, 17.59it/s]

{'loss': 0.2663, 'grad_norm': 0.9986387491226196, 'learning_rate': 2.8630872483221475e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32353/75000 [29:35<39:06, 18.18it/s]

{'loss': 0.3117, 'grad_norm': 5.839745044708252, 'learning_rate': 2.8624161073825507e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32364/75000 [29:35<37:38, 18.88it/s]

{'loss': 0.3651, 'grad_norm': 0.8861058354377747, 'learning_rate': 2.861744966442953e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32374/75000 [29:36<40:01, 17.75it/s]

{'loss': 0.4278, 'grad_norm': 3.5357673168182373, 'learning_rate': 2.8610738255033558e-05, 'epoch': 1.29}


                                                     
 43%|████▎     | 32382/75000 [29:36<37:52, 18.76it/s]

{'loss': 0.3629, 'grad_norm': 3.0728232860565186, 'learning_rate': 2.8604026845637583e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32393/75000 [29:37<38:42, 18.34it/s]

{'loss': 0.2831, 'grad_norm': 0.9739137291908264, 'learning_rate': 2.859731543624161e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32403/75000 [29:37<38:53, 18.25it/s]

{'loss': 0.3924, 'grad_norm': 5.847493648529053, 'learning_rate': 2.859060402684564e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32412/75000 [29:38<40:34, 17.50it/s]

{'loss': 0.2724, 'grad_norm': 4.2896928787231445, 'learning_rate': 2.8583892617449665e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32424/75000 [29:38<37:17, 19.02it/s]

{'loss': 0.2619, 'grad_norm': 2.6766512393951416, 'learning_rate': 2.8577181208053694e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32432/75000 [29:39<39:37, 17.91it/s]

{'loss': 0.3017, 'grad_norm': 5.739348888397217, 'learning_rate': 2.857046979865772e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32442/75000 [29:39<39:03, 18.16it/s]

{'loss': 0.3075, 'grad_norm': 6.216970443725586, 'learning_rate': 2.8563758389261747e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32451/75000 [29:40<42:22, 16.74it/s]

{'loss': 0.3823, 'grad_norm': 1.8030180931091309, 'learning_rate': 2.8557046979865776e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32462/75000 [29:41<38:29, 18.42it/s]

{'loss': 0.4111, 'grad_norm': 6.891742706298828, 'learning_rate': 2.85503355704698e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32471/75000 [29:41<40:34, 17.47it/s]

{'loss': 0.2942, 'grad_norm': 1.9460886716842651, 'learning_rate': 2.854362416107383e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32484/75000 [29:42<36:06, 19.63it/s]

{'loss': 0.183, 'grad_norm': 2.350517988204956, 'learning_rate': 2.853691275167785e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32492/75000 [29:42<39:46, 17.81it/s]

{'loss': 0.3191, 'grad_norm': 2.654611825942993, 'learning_rate': 2.853020134228188e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32500/75000 [29:43<38:46, 18.27it/s]

{'loss': 0.2492, 'grad_norm': 1.2184418439865112, 'learning_rate': 2.8523489932885905e-05, 'epoch': 1.3}


                                                       
 43%|████▎     | 32512/75000 [29:44<49:35, 14.28it/s]

{'loss': 0.342, 'grad_norm': 2.9727931022644043, 'learning_rate': 2.8516778523489934e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32521/75000 [29:44<45:46, 15.47it/s]

{'loss': 0.3929, 'grad_norm': 1.0210516452789307, 'learning_rate': 2.8510067114093962e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32534/75000 [29:45<37:07, 19.06it/s]

{'loss': 0.3802, 'grad_norm': 3.328429698944092, 'learning_rate': 2.8503355704697987e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32542/75000 [29:45<37:10, 19.04it/s]

{'loss': 0.3896, 'grad_norm': 2.983778238296509, 'learning_rate': 2.8496644295302016e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32554/75000 [29:46<37:03, 19.09it/s]

{'loss': 0.4123, 'grad_norm': 1.5079509019851685, 'learning_rate': 2.848993288590604e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32564/75000 [29:47<36:37, 19.31it/s]

{'loss': 0.3607, 'grad_norm': 4.058648586273193, 'learning_rate': 2.848322147651007e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32574/75000 [29:47<38:49, 18.21it/s]

{'loss': 0.3304, 'grad_norm': 1.1721277236938477, 'learning_rate': 2.847651006711409e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32582/75000 [29:48<37:18, 18.95it/s]

{'loss': 0.2239, 'grad_norm': 1.3567323684692383, 'learning_rate': 2.8469798657718123e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32593/75000 [29:48<36:54, 19.15it/s]

{'loss': 0.2842, 'grad_norm': 4.9157562255859375, 'learning_rate': 2.8463087248322152e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32602/75000 [29:49<38:54, 18.16it/s]

{'loss': 0.4025, 'grad_norm': 3.676915168762207, 'learning_rate': 2.8456375838926174e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32613/75000 [29:49<39:37, 17.83it/s]

{'loss': 0.3258, 'grad_norm': 7.2740478515625, 'learning_rate': 2.8449664429530206e-05, 'epoch': 1.3}


                                                     
 43%|████▎     | 32624/75000 [29:50<37:04, 19.05it/s]

{'loss': 0.319, 'grad_norm': 5.9626007080078125, 'learning_rate': 2.8442953020134227e-05, 'epoch': 1.3}


                                                     
 44%|████▎     | 32632/75000 [29:50<37:52, 18.65it/s]

{'loss': 0.3165, 'grad_norm': 2.2681148052215576, 'learning_rate': 2.8436241610738256e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32643/75000 [29:51<36:16, 19.46it/s]

{'loss': 0.311, 'grad_norm': 9.564929962158203, 'learning_rate': 2.8429530201342284e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32652/75000 [29:51<37:22, 18.88it/s]

{'loss': 0.3984, 'grad_norm': 2.0089900493621826, 'learning_rate': 2.842281879194631e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32664/75000 [29:52<35:44, 19.74it/s]

{'loss': 0.3773, 'grad_norm': 6.45206880569458, 'learning_rate': 2.8416107382550338e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32672/75000 [29:52<37:09, 18.99it/s]

{'loss': 0.4451, 'grad_norm': 1.5807926654815674, 'learning_rate': 2.8409395973154363e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32684/75000 [29:53<36:53, 19.12it/s]

{'loss': 0.3449, 'grad_norm': 2.587510824203491, 'learning_rate': 2.8402684563758392e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32692/75000 [29:54<42:04, 16.76it/s]

{'loss': 0.2928, 'grad_norm': 0.5758687257766724, 'learning_rate': 2.8395973154362414e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32702/75000 [29:54<39:01, 18.06it/s]

{'loss': 0.3416, 'grad_norm': 20.856342315673828, 'learning_rate': 2.8389261744966445e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32713/75000 [29:55<40:11, 17.54it/s]

{'loss': 0.4399, 'grad_norm': 1.0834474563598633, 'learning_rate': 2.8382550335570474e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32723/75000 [29:55<39:26, 17.86it/s]

{'loss': 0.3303, 'grad_norm': 4.75677490234375, 'learning_rate': 2.8375838926174496e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32732/75000 [29:56<40:56, 17.21it/s]

{'loss': 0.2874, 'grad_norm': 2.361725091934204, 'learning_rate': 2.8369127516778528e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32741/75000 [29:56<38:27, 18.32it/s]

{'loss': 0.3764, 'grad_norm': 1.4541497230529785, 'learning_rate': 2.836241610738255e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32753/75000 [29:57<36:07, 19.49it/s]

{'loss': 0.2708, 'grad_norm': 3.6651861667633057, 'learning_rate': 2.8355704697986578e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32764/75000 [29:57<37:08, 18.96it/s]

{'loss': 0.3479, 'grad_norm': 3.95703387260437, 'learning_rate': 2.8348993288590603e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32773/75000 [29:58<39:20, 17.89it/s]

{'loss': 0.3276, 'grad_norm': 2.368079423904419, 'learning_rate': 2.8342281879194632e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32781/75000 [29:58<38:03, 18.49it/s]

{'loss': 0.2828, 'grad_norm': 2.387019157409668, 'learning_rate': 2.833557046979866e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32793/75000 [29:59<38:41, 18.18it/s]

{'loss': 0.363, 'grad_norm': 7.870241641998291, 'learning_rate': 2.8328859060402685e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32804/75000 [30:00<36:15, 19.40it/s]

{'loss': 0.2915, 'grad_norm': 2.0712368488311768, 'learning_rate': 2.8322147651006714e-05, 'epoch': 1.31}


                                                     
 44%|████▎     | 32812/75000 [30:00<36:28, 19.28it/s]

{'loss': 0.3073, 'grad_norm': 2.4606292247772217, 'learning_rate': 2.831543624161074e-05, 'epoch': 1.31}


                                                     
 44%|████▍     | 32822/75000 [30:01<45:30, 15.45it/s]

{'loss': 0.3319, 'grad_norm': 15.391436576843262, 'learning_rate': 2.8308724832214768e-05, 'epoch': 1.31}


                                                     
 44%|████▍     | 32832/75000 [30:01<49:44, 14.13it/s]

{'loss': 0.2327, 'grad_norm': 5.714325904846191, 'learning_rate': 2.830201342281879e-05, 'epoch': 1.31}


                                                     
 44%|████▍     | 32844/75000 [30:02<38:13, 18.38it/s]

{'loss': 0.4407, 'grad_norm': 4.007750511169434, 'learning_rate': 2.8295302013422818e-05, 'epoch': 1.31}


                                                     
 44%|████▍     | 32852/75000 [30:03<41:48, 16.81it/s]

{'loss': 0.4215, 'grad_norm': 2.1727981567382812, 'learning_rate': 2.828859060402685e-05, 'epoch': 1.31}


                                                     
 44%|████▍     | 32863/75000 [30:03<37:00, 18.97it/s]

{'loss': 0.3056, 'grad_norm': 3.508225679397583, 'learning_rate': 2.8281879194630872e-05, 'epoch': 1.31}


                                                     
 44%|████▍     | 32874/75000 [30:04<38:13, 18.37it/s]

{'loss': 0.3489, 'grad_norm': 1.2819299697875977, 'learning_rate': 2.82751677852349e-05, 'epoch': 1.31}


                                                     
 44%|████▍     | 32884/75000 [30:04<35:57, 19.52it/s]

{'loss': 0.3206, 'grad_norm': 4.436465740203857, 'learning_rate': 2.8268456375838925e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32892/75000 [30:05<36:39, 19.15it/s]

{'loss': 0.3952, 'grad_norm': 5.341742038726807, 'learning_rate': 2.8261744966442954e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32901/75000 [30:05<40:48, 17.19it/s]

{'loss': 0.3013, 'grad_norm': 2.355135917663574, 'learning_rate': 2.8255033557046983e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32914/75000 [30:06<35:02, 20.02it/s]

{'loss': 0.2898, 'grad_norm': 6.37097692489624, 'learning_rate': 2.8248322147651008e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32922/75000 [30:06<39:35, 17.72it/s]

{'loss': 0.2645, 'grad_norm': 3.202070951461792, 'learning_rate': 2.8241610738255036e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32931/75000 [30:07<37:20, 18.78it/s]

{'loss': 0.3523, 'grad_norm': 0.5102696418762207, 'learning_rate': 2.823489932885906e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32943/75000 [30:07<36:03, 19.44it/s]

{'loss': 0.2936, 'grad_norm': 10.162911415100098, 'learning_rate': 2.822818791946309e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32952/75000 [30:08<36:49, 19.03it/s]

{'loss': 0.2662, 'grad_norm': 2.025660514831543, 'learning_rate': 2.8221476510067112e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32961/75000 [30:08<39:19, 17.81it/s]

{'loss': 0.3689, 'grad_norm': 4.103672027587891, 'learning_rate': 2.8214765100671144e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32974/75000 [30:09<35:01, 20.00it/s]

{'loss': 0.2133, 'grad_norm': 0.7671580910682678, 'learning_rate': 2.8208053691275172e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32982/75000 [30:09<41:18, 16.95it/s]

{'loss': 0.293, 'grad_norm': 9.066413879394531, 'learning_rate': 2.8201342281879194e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 32992/75000 [30:10<37:00, 18.92it/s]

{'loss': 0.4203, 'grad_norm': 1.331413745880127, 'learning_rate': 2.8194630872483223e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33000/75000 [30:10<37:56, 18.45it/s]

{'loss': 0.2582, 'grad_norm': 1.9709258079528809, 'learning_rate': 2.8187919463087248e-05, 'epoch': 1.32}


                                                       
 44%|████▍     | 33014/75000 [30:14<1:19:49,  8.77it/s]

{'loss': 0.2672, 'grad_norm': 2.1108763217926025, 'learning_rate': 2.8181208053691276e-05, 'epoch': 1.32}


                                                       
 44%|████▍     | 33024/75000 [30:15<45:09, 15.49it/s]  

{'loss': 0.4121, 'grad_norm': 4.665566444396973, 'learning_rate': 2.81744966442953e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33032/75000 [30:15<42:53, 16.31it/s]

{'loss': 0.279, 'grad_norm': 14.577886581420898, 'learning_rate': 2.816778523489933e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33042/75000 [30:16<39:22, 17.76it/s]

{'loss': 0.3896, 'grad_norm': 12.678011894226074, 'learning_rate': 2.816107382550336e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33051/75000 [30:16<42:16, 16.54it/s]

{'loss': 0.2686, 'grad_norm': 5.9798903465271, 'learning_rate': 2.8154362416107384e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33061/75000 [30:17<37:51, 18.46it/s]

{'loss': 0.3883, 'grad_norm': 2.104186534881592, 'learning_rate': 2.8147651006711412e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33072/75000 [30:18<38:58, 17.93it/s]

{'loss': 0.401, 'grad_norm': 1.3239319324493408, 'learning_rate': 2.8140939597315434e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33082/75000 [30:18<38:30, 18.14it/s]

{'loss': 0.3796, 'grad_norm': 3.640653133392334, 'learning_rate': 2.8134228187919466e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33093/75000 [30:19<40:57, 17.06it/s]

{'loss': 0.3194, 'grad_norm': 4.580521106719971, 'learning_rate': 2.8127516778523494e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33102/75000 [30:19<43:27, 16.07it/s]

{'loss': 0.2155, 'grad_norm': 1.4034583568572998, 'learning_rate': 2.8120805369127516e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33114/75000 [30:20<38:24, 18.17it/s]

{'loss': 0.3084, 'grad_norm': 1.397959589958191, 'learning_rate': 2.8114093959731548e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33122/75000 [30:20<38:30, 18.12it/s]

{'loss': 0.2721, 'grad_norm': 4.012115001678467, 'learning_rate': 2.810738255033557e-05, 'epoch': 1.32}


                                                     
 44%|████▍     | 33132/75000 [30:21<39:15, 17.78it/s]

{'loss': 0.3849, 'grad_norm': 3.9855823516845703, 'learning_rate': 2.81006711409396e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33143/75000 [30:22<43:17, 16.12it/s]

{'loss': 0.4323, 'grad_norm': 6.616916656494141, 'learning_rate': 2.8093959731543624e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33153/75000 [30:22<39:47, 17.53it/s]

{'loss': 0.3662, 'grad_norm': 5.144591331481934, 'learning_rate': 2.8087248322147652e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33163/75000 [30:23<40:35, 17.18it/s]

{'loss': 0.2187, 'grad_norm': 2.2335784435272217, 'learning_rate': 2.808053691275168e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33173/75000 [30:23<41:42, 16.71it/s]

{'loss': 0.3202, 'grad_norm': 7.502784252166748, 'learning_rate': 2.8073825503355706e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33183/75000 [30:24<41:08, 16.94it/s]

{'loss': 0.3529, 'grad_norm': 1.046647071838379, 'learning_rate': 2.8067114093959734e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33191/75000 [30:25<41:02, 16.98it/s]

{'loss': 0.3764, 'grad_norm': 3.5166847705841064, 'learning_rate': 2.806040268456376e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33202/75000 [30:25<41:42, 16.71it/s]

{'loss': 0.3562, 'grad_norm': 2.835336923599243, 'learning_rate': 2.8053691275167788e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33212/75000 [30:26<39:06, 17.81it/s]

{'loss': 0.3768, 'grad_norm': 7.685027122497559, 'learning_rate': 2.804697986577181e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33222/75000 [30:26<40:53, 17.03it/s]

{'loss': 0.3527, 'grad_norm': 9.401700019836426, 'learning_rate': 2.804026845637584e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33234/75000 [30:27<36:16, 19.19it/s]

{'loss': 0.3197, 'grad_norm': 0.7881755828857422, 'learning_rate': 2.803355704697987e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33242/75000 [30:27<38:34, 18.04it/s]

{'loss': 0.3269, 'grad_norm': 1.7623001337051392, 'learning_rate': 2.8026845637583892e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33251/75000 [30:28<36:20, 19.15it/s]

{'loss': 0.4208, 'grad_norm': 0.9464197754859924, 'learning_rate': 2.802013422818792e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33263/75000 [30:28<36:09, 19.23it/s]

{'loss': 0.4122, 'grad_norm': 6.806085109710693, 'learning_rate': 2.8013422818791946e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33274/75000 [30:29<34:54, 19.92it/s]

{'loss': 0.4091, 'grad_norm': 1.8993207216262817, 'learning_rate': 2.8006711409395974e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33282/75000 [30:30<39:05, 17.78it/s]

{'loss': 0.3358, 'grad_norm': 3.8679568767547607, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33294/75000 [30:30<35:03, 19.83it/s]

{'loss': 0.279, 'grad_norm': 9.538178443908691, 'learning_rate': 2.7993288590604028e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33302/75000 [30:31<40:00, 17.37it/s]

{'loss': 0.3548, 'grad_norm': 2.193089485168457, 'learning_rate': 2.7986577181208057e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33314/75000 [30:31<34:53, 19.91it/s]

{'loss': 0.2715, 'grad_norm': 0.6086708903312683, 'learning_rate': 2.7979865771812082e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33322/75000 [30:32<36:33, 19.00it/s]

{'loss': 0.3278, 'grad_norm': 0.4581020772457123, 'learning_rate': 2.797315436241611e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33334/75000 [30:32<35:23, 19.62it/s]

{'loss': 0.36, 'grad_norm': 2.8589589595794678, 'learning_rate': 2.7966442953020132e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33342/75000 [30:33<39:10, 17.72it/s]

{'loss': 0.2633, 'grad_norm': 3.315441131591797, 'learning_rate': 2.7959731543624164e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33353/75000 [30:33<36:34, 18.98it/s]

{'loss': 0.3757, 'grad_norm': 1.9845008850097656, 'learning_rate': 2.7953020134228193e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33362/75000 [30:34<39:23, 17.62it/s]

{'loss': 0.3131, 'grad_norm': 2.7419471740722656, 'learning_rate': 2.7946308724832214e-05, 'epoch': 1.33}


                                                     
 44%|████▍     | 33372/75000 [30:34<35:58, 19.29it/s]

{'loss': 0.3185, 'grad_norm': 2.50437593460083, 'learning_rate': 2.7939597315436243e-05, 'epoch': 1.33}


                                                     
 45%|████▍     | 33384/75000 [30:35<35:25, 19.57it/s]

{'loss': 0.3389, 'grad_norm': 4.611126899719238, 'learning_rate': 2.7932885906040268e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33392/75000 [30:35<35:43, 19.41it/s]

{'loss': 0.2719, 'grad_norm': 5.288498401641846, 'learning_rate': 2.7926174496644297e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33404/75000 [30:36<36:16, 19.11it/s]

{'loss': 0.3637, 'grad_norm': 1.90945303440094, 'learning_rate': 2.7919463087248322e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33412/75000 [30:36<35:35, 19.47it/s]

{'loss': 0.1751, 'grad_norm': 7.004969120025635, 'learning_rate': 2.791275167785235e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33424/75000 [30:37<36:23, 19.04it/s]

{'loss': 0.3138, 'grad_norm': 4.842416286468506, 'learning_rate': 2.790604026845638e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33432/75000 [30:37<36:23, 19.04it/s]

{'loss': 0.324, 'grad_norm': 8.459339141845703, 'learning_rate': 2.7899328859060404e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33442/75000 [30:38<38:13, 18.12it/s]

{'loss': 0.4527, 'grad_norm': 4.740626335144043, 'learning_rate': 2.7892617449664433e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33452/75000 [30:38<36:01, 19.23it/s]

{'loss': 0.3299, 'grad_norm': 2.5656094551086426, 'learning_rate': 2.7885906040268454e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33461/75000 [30:39<41:52, 16.53it/s]

{'loss': 0.2447, 'grad_norm': 2.1203391551971436, 'learning_rate': 2.7879194630872486e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33474/75000 [30:40<36:19, 19.05it/s]

{'loss': 0.2473, 'grad_norm': 5.790843963623047, 'learning_rate': 2.7872483221476515e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33481/75000 [30:40<38:59, 17.74it/s]

{'loss': 0.4068, 'grad_norm': 4.495306968688965, 'learning_rate': 2.7865771812080537e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33494/75000 [30:41<34:18, 20.16it/s]

{'loss': 0.3111, 'grad_norm': 5.0838727951049805, 'learning_rate': 2.785906040268457e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33500/75000 [30:41<35:26, 19.52it/s]

{'loss': 0.3479, 'grad_norm': 4.429536819458008, 'learning_rate': 2.785234899328859e-05, 'epoch': 1.34}


                                                       
 45%|████▍     | 33514/75000 [30:42<44:01, 15.71it/s]

{'loss': 0.3937, 'grad_norm': 1.8816783428192139, 'learning_rate': 2.784563758389262e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33524/75000 [30:43<38:42, 17.86it/s]

{'loss': 0.2793, 'grad_norm': 3.433773994445801, 'learning_rate': 2.7838926174496644e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33532/75000 [30:43<39:09, 17.65it/s]

{'loss': 0.4537, 'grad_norm': 3.0378973484039307, 'learning_rate': 2.7832214765100673e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33543/75000 [30:44<38:06, 18.13it/s]

{'loss': 0.3617, 'grad_norm': 4.98148250579834, 'learning_rate': 2.78255033557047e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33552/75000 [30:44<37:57, 18.20it/s]

{'loss': 0.3924, 'grad_norm': 2.041044235229492, 'learning_rate': 2.7818791946308726e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33561/75000 [30:45<37:14, 18.55it/s]

{'loss': 0.3401, 'grad_norm': 1.0999468564987183, 'learning_rate': 2.7812080536912755e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33573/75000 [30:45<37:01, 18.65it/s]

{'loss': 0.2793, 'grad_norm': 1.8996782302856445, 'learning_rate': 2.7805369127516777e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33582/75000 [30:46<36:03, 19.14it/s]

{'loss': 0.4937, 'grad_norm': 2.037975311279297, 'learning_rate': 2.779865771812081e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33591/75000 [30:46<41:05, 16.80it/s]

{'loss': 0.2813, 'grad_norm': 6.35126256942749, 'learning_rate': 2.779194630872483e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33604/75000 [30:47<36:28, 18.91it/s]

{'loss': 0.2818, 'grad_norm': 3.130483388900757, 'learning_rate': 2.778523489932886e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33612/75000 [30:48<41:50, 16.49it/s]

{'loss': 0.2636, 'grad_norm': 7.852138042449951, 'learning_rate': 2.777852348993289e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33622/75000 [30:48<37:16, 18.50it/s]

{'loss': 0.3795, 'grad_norm': 6.222657680511475, 'learning_rate': 2.7771812080536913e-05, 'epoch': 1.34}


                                                     
 45%|████▍     | 33633/75000 [30:49<38:38, 17.84it/s]

{'loss': 0.2988, 'grad_norm': 2.2431113719940186, 'learning_rate': 2.776510067114094e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33643/75000 [30:49<37:42, 18.28it/s]

{'loss': 0.1737, 'grad_norm': 1.1890525817871094, 'learning_rate': 2.7758389261744966e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33653/75000 [30:50<39:03, 17.64it/s]

{'loss': 0.4143, 'grad_norm': 5.6074323654174805, 'learning_rate': 2.7751677852348995e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33661/75000 [30:50<38:10, 18.05it/s]

{'loss': 0.2414, 'grad_norm': 5.383924961090088, 'learning_rate': 2.774496644295302e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33671/75000 [30:51<40:24, 17.05it/s]

{'loss': 0.3134, 'grad_norm': 2.428192138671875, 'learning_rate': 2.773825503355705e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33682/75000 [30:52<37:01, 18.60it/s]

{'loss': 0.2151, 'grad_norm': 11.956007957458496, 'learning_rate': 2.7731543624161077e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33694/75000 [30:52<36:47, 18.71it/s]

{'loss': 0.3095, 'grad_norm': 0.8500199913978577, 'learning_rate': 2.7724832214765102e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33702/75000 [30:53<40:16, 17.09it/s]

{'loss': 0.3658, 'grad_norm': 4.999112129211426, 'learning_rate': 2.771812080536913e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33712/75000 [30:53<38:35, 17.83it/s]

{'loss': 0.2484, 'grad_norm': 1.2038408517837524, 'learning_rate': 2.7711409395973153e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33723/75000 [30:54<43:56, 15.65it/s]

{'loss': 0.1965, 'grad_norm': 2.0787031650543213, 'learning_rate': 2.770469798657718e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33733/75000 [30:55<43:25, 15.84it/s]

{'loss': 0.4026, 'grad_norm': 6.515158176422119, 'learning_rate': 2.7697986577181213e-05, 'epoch': 1.35}


                                                     
 45%|████▍     | 33743/75000 [30:55<39:48, 17.27it/s]

{'loss': 0.3214, 'grad_norm': 5.25160026550293, 'learning_rate': 2.7691275167785235e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33753/75000 [30:56<43:46, 15.71it/s]

{'loss': 0.4562, 'grad_norm': 3.630960702896118, 'learning_rate': 2.7684563758389263e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33763/75000 [30:56<41:43, 16.47it/s]

{'loss': 0.2761, 'grad_norm': 6.572750091552734, 'learning_rate': 2.767785234899329e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33773/75000 [30:57<38:38, 17.78it/s]

{'loss': 0.4045, 'grad_norm': 3.2919561862945557, 'learning_rate': 2.7671140939597317e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33782/75000 [30:57<39:13, 17.51it/s]

{'loss': 0.3449, 'grad_norm': 4.171911239624023, 'learning_rate': 2.7664429530201342e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33792/75000 [30:58<36:07, 19.01it/s]

{'loss': 0.4499, 'grad_norm': 15.122180938720703, 'learning_rate': 2.765771812080537e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33803/75000 [30:58<34:15, 20.04it/s]

{'loss': 0.3526, 'grad_norm': 1.0725336074829102, 'learning_rate': 2.76510067114094e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33811/75000 [30:59<37:34, 18.27it/s]

{'loss': 0.3887, 'grad_norm': 3.2964589595794678, 'learning_rate': 2.7644295302013424e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33824/75000 [30:59<33:42, 20.36it/s]

{'loss': 0.3743, 'grad_norm': 3.1311779022216797, 'learning_rate': 2.7637583892617453e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33834/75000 [31:00<33:59, 20.18it/s]

{'loss': 0.2807, 'grad_norm': 0.8100993037223816, 'learning_rate': 2.7630872483221475e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33842/75000 [31:00<35:32, 19.30it/s]

{'loss': 0.195, 'grad_norm': 2.8069093227386475, 'learning_rate': 2.7624161073825507e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33851/75000 [31:01<38:04, 18.02it/s]

{'loss': 0.292, 'grad_norm': 5.861173629760742, 'learning_rate': 2.761744966442953e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33862/75000 [31:02<37:54, 18.09it/s]

{'loss': 0.2848, 'grad_norm': 2.15460467338562, 'learning_rate': 2.7610738255033557e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33871/75000 [31:02<35:09, 19.49it/s]

{'loss': 0.2788, 'grad_norm': 1.1644845008850098, 'learning_rate': 2.7604026845637586e-05, 'epoch': 1.35}


                                                     
 45%|████▌     | 33884/75000 [31:03<34:51, 19.65it/s]

{'loss': 0.3744, 'grad_norm': 2.9594979286193848, 'learning_rate': 2.759731543624161e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33892/75000 [31:03<37:52, 18.09it/s]

{'loss': 0.3848, 'grad_norm': 1.5800323486328125, 'learning_rate': 2.759060402684564e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33904/75000 [31:04<35:28, 19.31it/s]

{'loss': 0.3558, 'grad_norm': 9.296274185180664, 'learning_rate': 2.7583892617449664e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33914/75000 [31:04<36:00, 19.01it/s]

{'loss': 0.2912, 'grad_norm': 4.027433395385742, 'learning_rate': 2.7577181208053693e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33923/75000 [31:05<38:44, 17.67it/s]

{'loss': 0.3171, 'grad_norm': 5.8906121253967285, 'learning_rate': 2.757046979865772e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33933/75000 [31:05<35:59, 19.02it/s]

{'loss': 0.485, 'grad_norm': 3.0766780376434326, 'learning_rate': 2.7563758389261747e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33942/75000 [31:06<37:19, 18.33it/s]

{'loss': 0.4662, 'grad_norm': 5.826996326446533, 'learning_rate': 2.7557046979865775e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33951/75000 [31:06<40:11, 17.02it/s]

{'loss': 0.3694, 'grad_norm': 8.854107856750488, 'learning_rate': 2.7550335570469797e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33964/75000 [31:07<35:46, 19.12it/s]

{'loss': 0.2994, 'grad_norm': 9.514543533325195, 'learning_rate': 2.754362416107383e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33973/75000 [31:07<37:55, 18.03it/s]

{'loss': 0.3341, 'grad_norm': 2.02128005027771, 'learning_rate': 2.753691275167785e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33983/75000 [31:08<35:07, 19.47it/s]

{'loss': 0.3717, 'grad_norm': 2.0492260456085205, 'learning_rate': 2.753020134228188e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 33993/75000 [31:08<36:10, 18.89it/s]

{'loss': 0.2982, 'grad_norm': 4.164521217346191, 'learning_rate': 2.752348993288591e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34000/75000 [31:09<35:18, 19.35it/s]

{'loss': 0.3483, 'grad_norm': 4.619356632232666, 'learning_rate': 2.7516778523489933e-05, 'epoch': 1.36}


                                                       
 45%|████▌     | 34013/75000 [31:10<44:25, 15.38it/s]

{'loss': 0.2779, 'grad_norm': 1.3624759912490845, 'learning_rate': 2.751006711409396e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34024/75000 [31:11<35:15, 19.36it/s]

{'loss': 0.3193, 'grad_norm': 5.377444744110107, 'learning_rate': 2.7503355704697987e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34033/75000 [31:11<33:59, 20.09it/s]

{'loss': 0.4117, 'grad_norm': 2.688690662384033, 'learning_rate': 2.7496644295302015e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34042/75000 [31:11<34:04, 20.03it/s]

{'loss': 0.2942, 'grad_norm': 3.041558265686035, 'learning_rate': 2.748993288590604e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34054/75000 [31:12<34:08, 19.98it/s]

{'loss': 0.433, 'grad_norm': 2.6697144508361816, 'learning_rate': 2.748322147651007e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34063/75000 [31:12<33:55, 20.11it/s]

{'loss': 0.2768, 'grad_norm': 4.0228729248046875, 'learning_rate': 2.7476510067114098e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34073/75000 [31:13<36:44, 18.56it/s]

{'loss': 0.3475, 'grad_norm': 2.961091995239258, 'learning_rate': 2.7469798657718123e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34084/75000 [31:14<35:01, 19.47it/s]

{'loss': 0.2632, 'grad_norm': 5.713856220245361, 'learning_rate': 2.746308724832215e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34091/75000 [31:14<36:09, 18.86it/s]

{'loss': 0.2893, 'grad_norm': 9.922513961791992, 'learning_rate': 2.7456375838926173e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34104/75000 [31:15<34:46, 19.60it/s]

{'loss': 0.3127, 'grad_norm': 3.309851884841919, 'learning_rate': 2.74496644295302e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34111/75000 [31:15<36:30, 18.67it/s]

{'loss': 0.3018, 'grad_norm': 0.515251100063324, 'learning_rate': 2.7442953020134233e-05, 'epoch': 1.36}


                                                     
 45%|████▌     | 34123/75000 [31:16<33:41, 20.23it/s]

{'loss': 0.2714, 'grad_norm': 0.3548775613307953, 'learning_rate': 2.7436241610738255e-05, 'epoch': 1.36}


                                                     
 46%|████▌     | 34131/75000 [31:16<35:48, 19.02it/s]

{'loss': 0.3539, 'grad_norm': 0.46092864871025085, 'learning_rate': 2.7429530201342284e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34144/75000 [31:17<34:10, 19.93it/s]

{'loss': 0.2968, 'grad_norm': 1.3664723634719849, 'learning_rate': 2.742281879194631e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34154/75000 [31:17<33:19, 20.43it/s]

{'loss': 0.3435, 'grad_norm': 0.46533307433128357, 'learning_rate': 2.7416107382550337e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34161/75000 [31:18<35:05, 19.39it/s]

{'loss': 0.3604, 'grad_norm': 1.0929784774780273, 'learning_rate': 2.7409395973154363e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34173/75000 [31:18<32:26, 20.97it/s]

{'loss': 0.3559, 'grad_norm': 2.053018569946289, 'learning_rate': 2.740268456375839e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34182/75000 [31:19<33:03, 20.57it/s]

{'loss': 0.2767, 'grad_norm': 4.3298869132995605, 'learning_rate': 2.739597315436242e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34192/75000 [31:19<35:14, 19.30it/s]

{'loss': 0.2958, 'grad_norm': 2.7091593742370605, 'learning_rate': 2.7389261744966445e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34204/75000 [31:20<35:55, 18.93it/s]

{'loss': 0.3083, 'grad_norm': 2.029566526412964, 'learning_rate': 2.7382550335570473e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34214/75000 [31:20<33:20, 20.39it/s]

{'loss': 0.2898, 'grad_norm': 4.195732116699219, 'learning_rate': 2.7375838926174495e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34222/75000 [31:21<35:02, 19.39it/s]

{'loss': 0.461, 'grad_norm': 3.116380453109741, 'learning_rate': 2.7369127516778527e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34234/75000 [31:21<33:06, 20.52it/s]

{'loss': 0.3653, 'grad_norm': 1.6463931798934937, 'learning_rate': 2.736241610738255e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34242/75000 [31:22<35:14, 19.28it/s]

{'loss': 0.2797, 'grad_norm': 1.7084366083145142, 'learning_rate': 2.7355704697986577e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34253/75000 [31:22<35:47, 18.97it/s]

{'loss': 0.3713, 'grad_norm': 4.279004096984863, 'learning_rate': 2.7348993288590606e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34264/75000 [31:23<32:56, 20.61it/s]

{'loss': 0.2628, 'grad_norm': 6.469057083129883, 'learning_rate': 2.734228187919463e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34272/75000 [31:23<38:05, 17.82it/s]

{'loss': 0.3541, 'grad_norm': 7.475208759307861, 'learning_rate': 2.733557046979866e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34284/75000 [31:24<34:34, 19.63it/s]

{'loss': 0.3354, 'grad_norm': 9.065986633300781, 'learning_rate': 2.7328859060402685e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34293/75000 [31:24<33:53, 20.02it/s]

{'loss': 0.3286, 'grad_norm': 6.753148555755615, 'learning_rate': 2.7322147651006713e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34302/75000 [31:25<36:18, 18.68it/s]

{'loss': 0.3211, 'grad_norm': 3.4484615325927734, 'learning_rate': 2.7315436241610742e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34314/75000 [31:25<34:12, 19.82it/s]

{'loss': 0.4065, 'grad_norm': 8.731127738952637, 'learning_rate': 2.7308724832214767e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34321/75000 [31:26<39:29, 17.17it/s]

{'loss': 0.3157, 'grad_norm': 7.476868152618408, 'learning_rate': 2.7302013422818796e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34332/75000 [31:26<33:50, 20.02it/s]

{'loss': 0.2988, 'grad_norm': 3.0337307453155518, 'learning_rate': 2.7295302013422817e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34341/75000 [31:27<32:52, 20.62it/s]

{'loss': 0.3528, 'grad_norm': 4.153311252593994, 'learning_rate': 2.728859060402685e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34353/75000 [31:27<34:29, 19.64it/s]

{'loss': 0.3023, 'grad_norm': 1.9176560640335083, 'learning_rate': 2.728187919463087e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34364/75000 [31:28<35:49, 18.90it/s]

{'loss': 0.3423, 'grad_norm': 1.1723322868347168, 'learning_rate': 2.72751677852349e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34373/75000 [31:28<34:56, 19.38it/s]

{'loss': 0.2878, 'grad_norm': 7.2509050369262695, 'learning_rate': 2.726845637583893e-05, 'epoch': 1.37}


                                                     
 46%|████▌     | 34384/75000 [31:29<34:08, 19.83it/s]

{'loss': 0.3253, 'grad_norm': 5.745332717895508, 'learning_rate': 2.7261744966442953e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34392/75000 [31:29<36:04, 18.76it/s]

{'loss': 0.4738, 'grad_norm': 4.199003219604492, 'learning_rate': 2.7255033557046982e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34404/75000 [31:30<32:49, 20.61it/s]

{'loss': 0.2918, 'grad_norm': 4.298118591308594, 'learning_rate': 2.7248322147651007e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34413/75000 [31:31<34:06, 19.83it/s]

{'loss': 0.2772, 'grad_norm': 1.924222469329834, 'learning_rate': 2.7241610738255036e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34423/75000 [31:31<35:00, 19.31it/s]

{'loss': 0.4065, 'grad_norm': 10.010454177856445, 'learning_rate': 2.723489932885906e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34433/75000 [31:32<34:44, 19.46it/s]

{'loss': 0.3035, 'grad_norm': 7.544505596160889, 'learning_rate': 2.722818791946309e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34442/75000 [31:32<33:28, 20.20it/s]

{'loss': 0.2747, 'grad_norm': 1.5490312576293945, 'learning_rate': 2.7221476510067118e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34451/75000 [31:32<36:00, 18.77it/s]

{'loss': 0.3442, 'grad_norm': 5.601906776428223, 'learning_rate': 2.721476510067114e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34463/75000 [31:33<33:19, 20.28it/s]

{'loss': 0.4468, 'grad_norm': 2.3904082775115967, 'learning_rate': 2.720805369127517e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34472/75000 [31:33<33:42, 20.04it/s]

{'loss': 0.2918, 'grad_norm': 1.6269886493682861, 'learning_rate': 2.7201342281879193e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34484/75000 [31:34<33:51, 19.95it/s]

{'loss': 0.2089, 'grad_norm': 2.4163658618927, 'learning_rate': 2.7194630872483222e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34492/75000 [31:35<37:59, 17.77it/s]

{'loss': 0.3485, 'grad_norm': 1.84622323513031, 'learning_rate': 2.7187919463087247e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34500/75000 [31:35<33:50, 19.94it/s]

{'loss': 0.3814, 'grad_norm': 2.6714017391204834, 'learning_rate': 2.7181208053691276e-05, 'epoch': 1.38}


                                                       
 46%|████▌     | 34514/75000 [31:36<45:03, 14.97it/s]  

{'loss': 0.3192, 'grad_norm': 3.3352203369140625, 'learning_rate': 2.7174496644295304e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34524/75000 [31:37<35:30, 19.00it/s]

{'loss': 0.4943, 'grad_norm': 4.441106796264648, 'learning_rate': 2.716778523489933e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34532/75000 [31:37<39:18, 17.16it/s]

{'loss': 0.2914, 'grad_norm': 0.920286238193512, 'learning_rate': 2.7161073825503358e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34542/75000 [31:38<44:36, 15.11it/s]

{'loss': 0.2975, 'grad_norm': 4.941334247589111, 'learning_rate': 2.7154362416107383e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34552/75000 [31:39<38:37, 17.45it/s]

{'loss': 0.2502, 'grad_norm': 1.1265742778778076, 'learning_rate': 2.714765100671141e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34562/75000 [31:39<37:57, 17.75it/s]

{'loss': 0.3499, 'grad_norm': 4.250311374664307, 'learning_rate': 2.714093959731544e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34574/75000 [31:40<32:29, 20.74it/s]

{'loss': 0.3482, 'grad_norm': 1.3424988985061646, 'learning_rate': 2.7134228187919465e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34582/75000 [31:40<34:50, 19.33it/s]

{'loss': 0.2873, 'grad_norm': 1.08860445022583, 'learning_rate': 2.7127516778523494e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34591/75000 [31:41<35:29, 18.97it/s]

{'loss': 0.3584, 'grad_norm': 4.394890308380127, 'learning_rate': 2.7120805369127516e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34604/75000 [31:41<32:49, 20.51it/s]

{'loss': 0.4263, 'grad_norm': 3.888448476791382, 'learning_rate': 2.7114093959731544e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34613/75000 [31:42<33:19, 20.20it/s]

{'loss': 0.4034, 'grad_norm': 4.324461936950684, 'learning_rate': 2.710738255033557e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34622/75000 [31:42<34:58, 19.24it/s]

{'loss': 0.2819, 'grad_norm': 10.683197021484375, 'learning_rate': 2.7100671140939598e-05, 'epoch': 1.38}


                                                     
 46%|████▌     | 34634/75000 [31:43<32:37, 20.62it/s]

{'loss': 0.3045, 'grad_norm': 5.0513834953308105, 'learning_rate': 2.7093959731543626e-05, 'epoch': 1.39}


                                                     
 46%|████▌     | 34642/75000 [31:43<35:56, 18.72it/s]

{'loss': 0.299, 'grad_norm': 0.8477831482887268, 'learning_rate': 2.708724832214765e-05, 'epoch': 1.39}


                                                     
 46%|████▌     | 34652/75000 [31:44<34:13, 19.65it/s]

{'loss': 0.3203, 'grad_norm': 2.171074390411377, 'learning_rate': 2.708053691275168e-05, 'epoch': 1.39}


                                                     
 46%|████▌     | 34661/75000 [31:44<33:23, 20.13it/s]

{'loss': 0.3027, 'grad_norm': 1.783873200416565, 'learning_rate': 2.7073825503355705e-05, 'epoch': 1.39}


                                                     
 46%|████▌     | 34674/75000 [31:45<35:33, 18.90it/s]

{'loss': 0.2351, 'grad_norm': 3.6774485111236572, 'learning_rate': 2.7067114093959734e-05, 'epoch': 1.39}


                                                     
 46%|████▌     | 34682/75000 [31:45<36:19, 18.50it/s]

{'loss': 0.3466, 'grad_norm': 0.7947053909301758, 'learning_rate': 2.7060402684563756e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34692/75000 [31:46<34:16, 19.60it/s]

{'loss': 0.2607, 'grad_norm': 5.797604560852051, 'learning_rate': 2.7053691275167788e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34701/75000 [31:46<35:50, 18.74it/s]

{'loss': 0.2879, 'grad_norm': 0.5839911103248596, 'learning_rate': 2.7046979865771816e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34712/75000 [31:47<32:35, 20.60it/s]

{'loss': 0.2599, 'grad_norm': 1.2765311002731323, 'learning_rate': 2.7040268456375838e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34721/75000 [31:47<34:46, 19.30it/s]

{'loss': 0.521, 'grad_norm': 3.0989155769348145, 'learning_rate': 2.703355704697987e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34733/75000 [31:48<32:24, 20.71it/s]

{'loss': 0.4105, 'grad_norm': 6.828554153442383, 'learning_rate': 2.702684563758389e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34742/75000 [31:48<37:14, 18.02it/s]

{'loss': 0.3835, 'grad_norm': 5.11500358581543, 'learning_rate': 2.702013422818792e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34753/75000 [31:49<33:28, 20.04it/s]

{'loss': 0.308, 'grad_norm': 1.7361398935317993, 'learning_rate': 2.701342281879195e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34762/75000 [31:49<35:36, 18.84it/s]

{'loss': 0.2417, 'grad_norm': 2.7431740760803223, 'learning_rate': 2.7006711409395974e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34773/75000 [31:50<32:45, 20.47it/s]

{'loss': 0.3358, 'grad_norm': 2.330122232437134, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34784/75000 [31:50<34:39, 19.34it/s]

{'loss': 0.3313, 'grad_norm': 0.8342278003692627, 'learning_rate': 2.6993288590604028e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34794/75000 [31:51<32:33, 20.58it/s]

{'loss': 0.2019, 'grad_norm': 1.0281689167022705, 'learning_rate': 2.6986577181208056e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34802/75000 [31:51<34:17, 19.54it/s]

{'loss': 0.4029, 'grad_norm': 4.7405619621276855, 'learning_rate': 2.6979865771812078e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34814/75000 [31:52<34:05, 19.64it/s]

{'loss': 0.3578, 'grad_norm': 3.6772170066833496, 'learning_rate': 2.697315436241611e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34822/75000 [31:52<34:04, 19.65it/s]

{'loss': 0.34, 'grad_norm': 2.8363444805145264, 'learning_rate': 2.696644295302014e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34834/75000 [31:53<34:17, 19.52it/s]

{'loss': 0.1744, 'grad_norm': 0.6802802085876465, 'learning_rate': 2.695973154362416e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34843/75000 [31:54<35:01, 19.11it/s]

{'loss': 0.2944, 'grad_norm': 3.0563149452209473, 'learning_rate': 2.6953020134228192e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34852/75000 [31:54<33:02, 20.25it/s]

{'loss': 0.3825, 'grad_norm': 3.404711961746216, 'learning_rate': 2.6946308724832214e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34864/75000 [31:55<32:31, 20.57it/s]

{'loss': 0.1982, 'grad_norm': 2.166372299194336, 'learning_rate': 2.6939597315436242e-05, 'epoch': 1.39}


                                                     
 46%|████▋     | 34872/75000 [31:55<34:02, 19.65it/s]

{'loss': 0.3551, 'grad_norm': 4.608476638793945, 'learning_rate': 2.6932885906040268e-05, 'epoch': 1.39}


                                                     
 47%|████▋     | 34884/75000 [31:56<31:43, 21.08it/s]

{'loss': 0.4386, 'grad_norm': 4.091508865356445, 'learning_rate': 2.6926174496644296e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34893/75000 [31:56<35:21, 18.90it/s]

{'loss': 0.2789, 'grad_norm': 1.1112499237060547, 'learning_rate': 2.6919463087248325e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34904/75000 [31:57<32:44, 20.41it/s]

{'loss': 0.2933, 'grad_norm': 4.092860698699951, 'learning_rate': 2.691275167785235e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34913/75000 [31:57<34:48, 19.20it/s]

{'loss': 0.2979, 'grad_norm': 3.0270705223083496, 'learning_rate': 2.690604026845638e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34922/75000 [31:57<33:31, 19.92it/s]

{'loss': 0.3068, 'grad_norm': 13.580212593078613, 'learning_rate': 2.6899328859060403e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34931/75000 [31:58<35:43, 18.69it/s]

{'loss': 0.4236, 'grad_norm': 4.389362335205078, 'learning_rate': 2.6892617449664432e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34944/75000 [31:59<32:59, 20.24it/s]

{'loss': 0.246, 'grad_norm': 0.6665158271789551, 'learning_rate': 2.688590604026846e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34952/75000 [31:59<36:44, 18.17it/s]

{'loss': 0.3194, 'grad_norm': 1.284834861755371, 'learning_rate': 2.6879194630872482e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34962/75000 [32:00<33:32, 19.89it/s]

{'loss': 0.2125, 'grad_norm': 6.105597019195557, 'learning_rate': 2.6872483221476514e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34974/75000 [32:00<33:58, 19.64it/s]

{'loss': 0.44, 'grad_norm': 3.69828462600708, 'learning_rate': 2.6865771812080536e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34984/75000 [32:01<32:34, 20.47it/s]

{'loss': 0.4732, 'grad_norm': 2.829805374145508, 'learning_rate': 2.6859060402684565e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 34993/75000 [32:01<33:45, 19.75it/s]

{'loss': 0.2998, 'grad_norm': 1.6721432209014893, 'learning_rate': 2.685234899328859e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35000/75000 [32:01<32:28, 20.53it/s]

{'loss': 0.4117, 'grad_norm': 5.498814582824707, 'learning_rate': 2.6845637583892618e-05, 'epoch': 1.4}


                                                       
 47%|████▋     | 35014/75000 [32:03<41:53, 15.91it/s]

{'loss': 0.1866, 'grad_norm': 1.4482096433639526, 'learning_rate': 2.6838926174496647e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35024/75000 [32:03<35:06, 18.98it/s]

{'loss': 0.3252, 'grad_norm': 7.448091506958008, 'learning_rate': 2.6832214765100672e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35034/75000 [32:04<35:21, 18.84it/s]

{'loss': 0.4143, 'grad_norm': 0.7579849362373352, 'learning_rate': 2.68255033557047e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35043/75000 [32:04<34:35, 19.25it/s]

{'loss': 0.2635, 'grad_norm': 0.8437680006027222, 'learning_rate': 2.6818791946308726e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35054/75000 [32:05<32:47, 20.30it/s]

{'loss': 0.3894, 'grad_norm': 1.3729976415634155, 'learning_rate': 2.6812080536912754e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35062/75000 [32:05<34:03, 19.55it/s]

{'loss': 0.2769, 'grad_norm': 3.141281843185425, 'learning_rate': 2.6805369127516776e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35073/75000 [32:06<33:11, 20.05it/s]

{'loss': 0.3455, 'grad_norm': 3.897054672241211, 'learning_rate': 2.6798657718120808e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35082/75000 [32:06<32:36, 20.40it/s]

{'loss': 0.2058, 'grad_norm': 3.1368417739868164, 'learning_rate': 2.6791946308724837e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35094/75000 [32:07<36:10, 18.38it/s]

{'loss': 0.2129, 'grad_norm': 2.6494288444519043, 'learning_rate': 2.6785234899328858e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35103/75000 [32:07<33:51, 19.64it/s]

{'loss': 0.3621, 'grad_norm': 26.216402053833008, 'learning_rate': 2.6778523489932887e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35114/75000 [32:08<33:15, 19.99it/s]

{'loss': 0.3898, 'grad_norm': 13.471724510192871, 'learning_rate': 2.6771812080536912e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35123/75000 [32:08<32:19, 20.56it/s]

{'loss': 0.3694, 'grad_norm': 7.319949626922607, 'learning_rate': 2.676510067114094e-05, 'epoch': 1.4}


                                                     
 47%|████▋     | 35132/75000 [32:09<34:03, 19.51it/s]

{'loss': 0.2184, 'grad_norm': 3.4773004055023193, 'learning_rate': 2.6758389261744966e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35144/75000 [32:09<32:52, 20.20it/s]

{'loss': 0.3723, 'grad_norm': 1.1470900774002075, 'learning_rate': 2.6751677852348994e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35153/75000 [32:10<32:20, 20.54it/s]

{'loss': 0.3795, 'grad_norm': 2.2249162197113037, 'learning_rate': 2.6744966442953023e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35162/75000 [32:10<32:56, 20.15it/s]

{'loss': 0.4008, 'grad_norm': 2.7871782779693604, 'learning_rate': 2.6738255033557048e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35174/75000 [32:11<32:23, 20.49it/s]

{'loss': 0.2637, 'grad_norm': 8.168807029724121, 'learning_rate': 2.6731543624161076e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35183/75000 [32:11<34:17, 19.35it/s]

{'loss': 0.4404, 'grad_norm': 3.704871416091919, 'learning_rate': 2.6724832214765098e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35194/75000 [32:12<32:43, 20.27it/s]

{'loss': 0.265, 'grad_norm': 2.0111029148101807, 'learning_rate': 2.671812080536913e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35203/75000 [32:12<33:29, 19.80it/s]

{'loss': 0.3262, 'grad_norm': 1.416900873184204, 'learning_rate': 2.671140939597316e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35212/75000 [32:13<32:29, 20.41it/s]

{'loss': 0.2841, 'grad_norm': 3.1386218070983887, 'learning_rate': 2.670469798657718e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35224/75000 [32:13<33:33, 19.76it/s]

{'loss': 0.3938, 'grad_norm': 0.9953005313873291, 'learning_rate': 2.6697986577181212e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35233/75000 [32:14<32:39, 20.29it/s]

{'loss': 0.2708, 'grad_norm': 1.4199694395065308, 'learning_rate': 2.6691275167785234e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35242/75000 [32:14<33:39, 19.69it/s]

{'loss': 0.2371, 'grad_norm': 10.884744644165039, 'learning_rate': 2.6684563758389263e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35254/75000 [32:15<31:29, 21.03it/s]

{'loss': 0.324, 'grad_norm': 1.2177472114562988, 'learning_rate': 2.6677852348993288e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35263/75000 [32:15<33:20, 19.86it/s]

{'loss': 0.3653, 'grad_norm': 7.442813873291016, 'learning_rate': 2.6671140939597316e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35272/75000 [32:16<32:00, 20.68it/s]

{'loss': 0.3518, 'grad_norm': 5.24213171005249, 'learning_rate': 2.6664429530201345e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35282/75000 [32:16<33:47, 19.59it/s]

{'loss': 0.318, 'grad_norm': 3.5173966884613037, 'learning_rate': 2.665771812080537e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35294/75000 [32:17<34:15, 19.31it/s]

{'loss': 0.4064, 'grad_norm': 4.112942218780518, 'learning_rate': 2.66510067114094e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35303/75000 [32:17<32:52, 20.12it/s]

{'loss': 0.4317, 'grad_norm': 1.1800644397735596, 'learning_rate': 2.6644295302013424e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35314/75000 [32:18<33:49, 19.55it/s]

{'loss': 0.2884, 'grad_norm': 5.93342924118042, 'learning_rate': 2.6637583892617452e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35323/75000 [32:18<32:05, 20.61it/s]

{'loss': 0.2813, 'grad_norm': 1.1324584484100342, 'learning_rate': 2.6630872483221474e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35334/75000 [32:19<34:08, 19.36it/s]

{'loss': 0.3143, 'grad_norm': 2.857654094696045, 'learning_rate': 2.6624161073825503e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35340/75000 [32:19<33:05, 19.97it/s]

{'loss': 0.2917, 'grad_norm': 5.879761219024658, 'learning_rate': 2.6617449664429535e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35352/75000 [32:20<33:17, 19.85it/s]

{'loss': 0.365, 'grad_norm': 8.85374927520752, 'learning_rate': 2.6610738255033556e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35364/75000 [32:20<31:55, 20.69it/s]

{'loss': 0.2942, 'grad_norm': 18.892995834350586, 'learning_rate': 2.6604026845637585e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35373/75000 [32:21<33:23, 19.78it/s]

{'loss': 0.4292, 'grad_norm': 3.523698329925537, 'learning_rate': 2.659731543624161e-05, 'epoch': 1.41}


                                                     
 47%|████▋     | 35382/75000 [32:21<33:31, 19.70it/s]

{'loss': 0.273, 'grad_norm': 5.397258281707764, 'learning_rate': 2.659060402684564e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35394/75000 [32:22<31:40, 20.84it/s]

{'loss': 0.304, 'grad_norm': 1.6612030267715454, 'learning_rate': 2.6583892617449667e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35403/75000 [32:22<32:35, 20.25it/s]

{'loss': 0.2193, 'grad_norm': 0.7258453965187073, 'learning_rate': 2.6577181208053692e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35412/75000 [32:23<31:53, 20.69it/s]

{'loss': 0.2523, 'grad_norm': 2.9682891368865967, 'learning_rate': 2.657046979865772e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35424/75000 [32:23<30:25, 21.68it/s]

{'loss': 0.3053, 'grad_norm': 2.229445457458496, 'learning_rate': 2.6563758389261746e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35433/75000 [32:24<33:17, 19.81it/s]

{'loss': 0.4604, 'grad_norm': 1.486855149269104, 'learning_rate': 2.6557046979865775e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35442/75000 [32:24<32:25, 20.34it/s]

{'loss': 0.3396, 'grad_norm': 2.323125123977661, 'learning_rate': 2.6550335570469796e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35453/75000 [32:25<33:07, 19.90it/s]

{'loss': 0.3924, 'grad_norm': 2.0808918476104736, 'learning_rate': 2.654362416107383e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35462/75000 [32:25<34:23, 19.16it/s]

{'loss': 0.3094, 'grad_norm': 4.9822587966918945, 'learning_rate': 2.6536912751677857e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35474/75000 [32:26<31:50, 20.69it/s]

{'loss': 0.4357, 'grad_norm': 3.476297616958618, 'learning_rate': 2.653020134228188e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35483/75000 [32:26<31:38, 20.82it/s]

{'loss': 0.2565, 'grad_norm': 1.5613481998443604, 'learning_rate': 2.6523489932885907e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35494/75000 [32:27<32:48, 20.07it/s]

{'loss': 0.2991, 'grad_norm': 5.549536228179932, 'learning_rate': 2.6516778523489932e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35500/75000 [32:27<34:17, 19.20it/s]

{'loss': 0.3451, 'grad_norm': 2.0749411582946777, 'learning_rate': 2.651006711409396e-05, 'epoch': 1.42}


                                                       
 47%|████▋     | 35512/75000 [32:28<41:07, 16.00it/s]

{'loss': 0.3121, 'grad_norm': 4.715133190155029, 'learning_rate': 2.6503355704697986e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35524/75000 [32:29<36:52, 17.84it/s]

{'loss': 0.3191, 'grad_norm': 5.327773571014404, 'learning_rate': 2.6496644295302015e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35534/75000 [32:29<36:08, 18.20it/s]

{'loss': 0.4271, 'grad_norm': 3.2868382930755615, 'learning_rate': 2.6489932885906043e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35543/75000 [32:30<33:09, 19.83it/s]

{'loss': 0.3484, 'grad_norm': 3.822028875350952, 'learning_rate': 2.648322147651007e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35552/75000 [32:30<36:23, 18.07it/s]

{'loss': 0.3037, 'grad_norm': 7.530312538146973, 'learning_rate': 2.6476510067114097e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35564/75000 [32:31<31:58, 20.55it/s]

{'loss': 0.2936, 'grad_norm': 2.4672603607177734, 'learning_rate': 2.646979865771812e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35572/75000 [32:31<35:26, 18.54it/s]

{'loss': 0.3604, 'grad_norm': 3.532597541809082, 'learning_rate': 2.646308724832215e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35584/75000 [32:32<31:46, 20.68it/s]

{'loss': 0.4035, 'grad_norm': 5.269784450531006, 'learning_rate': 2.645637583892618e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35594/75000 [32:32<33:05, 19.85it/s]

{'loss': 0.234, 'grad_norm': 2.2063465118408203, 'learning_rate': 2.64496644295302e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35603/75000 [32:33<32:12, 20.38it/s]

{'loss': 0.3986, 'grad_norm': 3.0318338871002197, 'learning_rate': 2.6442953020134233e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35613/75000 [32:34<38:13, 17.18it/s]

{'loss': 0.2778, 'grad_norm': 2.451141834259033, 'learning_rate': 2.6436241610738255e-05, 'epoch': 1.42}


                                                     
 47%|████▋     | 35621/75000 [32:34<44:27, 14.76it/s]

{'loss': 0.3747, 'grad_norm': 1.8035134077072144, 'learning_rate': 2.6429530201342283e-05, 'epoch': 1.42}


                                                     
 48%|████▊     | 35633/75000 [32:35<35:09, 18.66it/s]

{'loss': 0.3589, 'grad_norm': 1.1665924787521362, 'learning_rate': 2.642281879194631e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35643/75000 [32:35<36:49, 17.82it/s]

{'loss': 0.2847, 'grad_norm': 6.808316230773926, 'learning_rate': 2.6416107382550337e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35654/75000 [32:36<32:23, 20.25it/s]

{'loss': 0.393, 'grad_norm': 2.9793612957000732, 'learning_rate': 2.6409395973154365e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35664/75000 [32:36<33:06, 19.80it/s]

{'loss': 0.3478, 'grad_norm': 8.558490753173828, 'learning_rate': 2.640268456375839e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35673/75000 [32:37<31:39, 20.71it/s]

{'loss': 0.3076, 'grad_norm': 1.6985037326812744, 'learning_rate': 2.639597315436242e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35684/75000 [32:37<33:56, 19.31it/s]

{'loss': 0.253, 'grad_norm': 2.457566261291504, 'learning_rate': 2.638926174496644e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35694/75000 [32:38<35:01, 18.70it/s]

{'loss': 0.2503, 'grad_norm': 5.063234329223633, 'learning_rate': 2.6382550335570473e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35702/75000 [32:38<33:38, 19.47it/s]

{'loss': 0.3592, 'grad_norm': 0.6962149143218994, 'learning_rate': 2.6375838926174495e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35714/75000 [32:39<32:53, 19.91it/s]

{'loss': 0.2549, 'grad_norm': 2.55047869682312, 'learning_rate': 2.6369127516778523e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35723/75000 [32:39<31:56, 20.50it/s]

{'loss': 0.4092, 'grad_norm': 5.202576637268066, 'learning_rate': 2.6362416107382555e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35732/75000 [32:40<34:37, 18.90it/s]

{'loss': 0.3199, 'grad_norm': 7.050812721252441, 'learning_rate': 2.6355704697986577e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35744/75000 [32:40<32:12, 20.31it/s]

{'loss': 0.3169, 'grad_norm': 4.594087600708008, 'learning_rate': 2.6348993288590605e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35752/75000 [32:41<34:31, 18.95it/s]

{'loss': 0.1509, 'grad_norm': 6.063401222229004, 'learning_rate': 2.634228187919463e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35764/75000 [32:41<31:51, 20.52it/s]

{'loss': 0.4335, 'grad_norm': 16.358280181884766, 'learning_rate': 2.633557046979866e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35773/75000 [32:42<31:29, 20.76it/s]

{'loss': 0.3581, 'grad_norm': 10.875226020812988, 'learning_rate': 2.6328859060402688e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35784/75000 [32:42<32:23, 20.18it/s]

{'loss': 0.295, 'grad_norm': 6.414060592651367, 'learning_rate': 2.6322147651006713e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35792/75000 [32:43<34:42, 18.83it/s]

{'loss': 0.2892, 'grad_norm': 1.487005352973938, 'learning_rate': 2.631543624161074e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35803/75000 [32:43<32:10, 20.30it/s]

{'loss': 0.3751, 'grad_norm': 1.509551763534546, 'learning_rate': 2.6308724832214767e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35812/75000 [32:44<32:37, 20.02it/s]

{'loss': 0.2764, 'grad_norm': 7.874363899230957, 'learning_rate': 2.6302013422818795e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35824/75000 [32:44<31:55, 20.45it/s]

{'loss': 0.3981, 'grad_norm': 7.803229808807373, 'learning_rate': 2.6295302013422817e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35833/75000 [32:45<32:18, 20.21it/s]

{'loss': 0.3231, 'grad_norm': 6.008051872253418, 'learning_rate': 2.6288590604026845e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35844/75000 [32:45<33:06, 19.71it/s]

{'loss': 0.2877, 'grad_norm': 1.8273650407791138, 'learning_rate': 2.6281879194630877e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35852/75000 [32:46<42:21, 15.40it/s]

{'loss': 0.2932, 'grad_norm': 30.060302734375, 'learning_rate': 2.62751677852349e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35864/75000 [32:47<33:14, 19.63it/s]

{'loss': 0.3611, 'grad_norm': 1.2356479167938232, 'learning_rate': 2.6268456375838928e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35874/75000 [32:47<33:25, 19.51it/s]

{'loss': 0.1582, 'grad_norm': 4.7022480964660645, 'learning_rate': 2.6261744966442953e-05, 'epoch': 1.43}


                                                     
 48%|████▊     | 35881/75000 [32:47<37:29, 17.39it/s]

{'loss': 0.3367, 'grad_norm': 8.874733924865723, 'learning_rate': 2.625503355704698e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35892/75000 [32:48<34:48, 18.72it/s]

{'loss': 0.1488, 'grad_norm': 2.5854039192199707, 'learning_rate': 2.6248322147651007e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35903/75000 [32:49<38:29, 16.93it/s]

{'loss': 0.3219, 'grad_norm': 1.8730567693710327, 'learning_rate': 2.6241610738255035e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35911/75000 [32:49<41:31, 15.69it/s]

{'loss': 0.4096, 'grad_norm': 3.9199280738830566, 'learning_rate': 2.6234899328859064e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35922/75000 [32:50<33:44, 19.30it/s]

{'loss': 0.2926, 'grad_norm': 8.336563110351562, 'learning_rate': 2.622818791946309e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35931/75000 [32:50<40:40, 16.01it/s]

{'loss': 0.2766, 'grad_norm': 1.2552694082260132, 'learning_rate': 2.6221476510067117e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35943/75000 [32:51<39:50, 16.34it/s]

{'loss': 0.3792, 'grad_norm': 2.4375433921813965, 'learning_rate': 2.621476510067114e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35953/75000 [32:52<38:27, 16.92it/s]

{'loss': 0.3387, 'grad_norm': 6.53215217590332, 'learning_rate': 2.620805369127517e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35963/75000 [32:52<35:44, 18.21it/s]

{'loss': 0.3421, 'grad_norm': 1.2337161302566528, 'learning_rate': 2.6201342281879193e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35971/75000 [32:53<40:05, 16.23it/s]

{'loss': 0.3749, 'grad_norm': 5.036453723907471, 'learning_rate': 2.619463087248322e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35982/75000 [32:53<34:47, 18.69it/s]

{'loss': 0.459, 'grad_norm': 1.5816580057144165, 'learning_rate': 2.618791946308725e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 35994/75000 [32:54<33:18, 19.51it/s]

{'loss': 0.2558, 'grad_norm': 1.8335689306259155, 'learning_rate': 2.6181208053691275e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36000/75000 [32:54<37:45, 17.22it/s]

{'loss': 0.3046, 'grad_norm': 2.4207663536071777, 'learning_rate': 2.6174496644295304e-05, 'epoch': 1.44}


                                                       
 48%|████▊     | 36012/75000 [32:56<48:08, 13.50it/s]

{'loss': 0.4123, 'grad_norm': 1.5696653127670288, 'learning_rate': 2.616778523489933e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36022/75000 [32:56<41:30, 15.65it/s]

{'loss': 0.3462, 'grad_norm': 12.468356132507324, 'learning_rate': 2.6161073825503357e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36032/75000 [32:57<38:10, 17.01it/s]

{'loss': 0.1646, 'grad_norm': 5.998205184936523, 'learning_rate': 2.6154362416107386e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36042/75000 [32:57<36:37, 17.73it/s]

{'loss': 0.3161, 'grad_norm': 22.02073860168457, 'learning_rate': 2.614765100671141e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36052/75000 [32:58<40:12, 16.14it/s]

{'loss': 0.2177, 'grad_norm': 1.081240177154541, 'learning_rate': 2.614093959731544e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36062/75000 [32:59<35:37, 18.22it/s]

{'loss': 0.3812, 'grad_norm': 2.9028210639953613, 'learning_rate': 2.613422818791946e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36072/75000 [32:59<34:17, 18.92it/s]

{'loss': 0.3791, 'grad_norm': 0.4467138648033142, 'learning_rate': 2.6127516778523493e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36083/75000 [33:00<36:28, 17.79it/s]

{'loss': 0.3198, 'grad_norm': 5.138331890106201, 'learning_rate': 2.6120805369127515e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36093/75000 [33:00<36:51, 17.59it/s]

{'loss': 0.3504, 'grad_norm': 3.599107265472412, 'learning_rate': 2.6114093959731544e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36103/75000 [33:01<38:23, 16.89it/s]

{'loss': 0.2264, 'grad_norm': 1.010940432548523, 'learning_rate': 2.6107382550335576e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36113/75000 [33:02<46:05, 14.06it/s]

{'loss': 0.3782, 'grad_norm': 4.617188453674316, 'learning_rate': 2.6100671140939597e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36123/75000 [33:02<37:07, 17.46it/s]

{'loss': 0.3548, 'grad_norm': 3.892296075820923, 'learning_rate': 2.6093959731543626e-05, 'epoch': 1.44}


                                                     
 48%|████▊     | 36133/75000 [33:03<40:30, 15.99it/s]

{'loss': 0.3135, 'grad_norm': 5.769146919250488, 'learning_rate': 2.608724832214765e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36143/75000 [33:03<35:45, 18.11it/s]

{'loss': 0.357, 'grad_norm': 6.543367385864258, 'learning_rate': 2.608053691275168e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36153/75000 [33:04<37:49, 17.12it/s]

{'loss': 0.3416, 'grad_norm': 2.6460659503936768, 'learning_rate': 2.6073825503355705e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36163/75000 [33:04<36:58, 17.50it/s]

{'loss': 0.2904, 'grad_norm': 3.6481130123138428, 'learning_rate': 2.6067114093959733e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36171/75000 [33:06<1:26:57,  7.44it/s]

{'loss': 0.3738, 'grad_norm': 4.677258491516113, 'learning_rate': 2.6060402684563762e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36181/75000 [33:07<1:23:04,  7.79it/s]

{'loss': 0.3655, 'grad_norm': 1.5857231616973877, 'learning_rate': 2.6053691275167787e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36190/75000 [33:09<2:28:58,  4.34it/s]

{'loss': 0.3183, 'grad_norm': 5.017391681671143, 'learning_rate': 2.6046979865771816e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36201/75000 [33:11<1:35:28,  6.77it/s]

{'loss': 0.2894, 'grad_norm': 1.99527907371521, 'learning_rate': 2.6040268456375837e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36212/75000 [33:12<57:13, 11.30it/s]  

{'loss': 0.3135, 'grad_norm': 3.073819398880005, 'learning_rate': 2.6033557046979866e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36222/75000 [33:13<58:50, 10.98it/s]

{'loss': 0.2778, 'grad_norm': 2.823256492614746, 'learning_rate': 2.6026845637583898e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36232/75000 [33:13<52:06, 12.40it/s]

{'loss': 0.2886, 'grad_norm': 5.023430347442627, 'learning_rate': 2.602013422818792e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36240/75000 [33:14<1:14:39,  8.65it/s]

{'loss': 0.3176, 'grad_norm': 2.396052837371826, 'learning_rate': 2.6013422818791948e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36252/75000 [33:15<54:19, 11.89it/s]

{'loss': 0.425, 'grad_norm': 4.02241849899292, 'learning_rate': 2.6006711409395973e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36262/75000 [33:16<55:07, 11.71it/s]

{'loss': 0.3506, 'grad_norm': 2.474315643310547, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36272/75000 [33:17<51:01, 12.65it/s]

{'loss': 0.3192, 'grad_norm': 2.7453036308288574, 'learning_rate': 2.5993288590604027e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36282/75000 [33:17<41:45, 15.45it/s]

{'loss': 0.2602, 'grad_norm': 7.100747108459473, 'learning_rate': 2.5986577181208055e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36292/75000 [33:18<39:58, 16.14it/s]

{'loss': 0.2936, 'grad_norm': 2.5290844440460205, 'learning_rate': 2.5979865771812084e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36302/75000 [33:19<43:54, 14.69it/s]

{'loss': 0.2764, 'grad_norm': 4.482321262359619, 'learning_rate': 2.597315436241611e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36314/75000 [33:20<39:54, 16.15it/s]

{'loss': 0.2711, 'grad_norm': 1.425976037979126, 'learning_rate': 2.5966442953020138e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36322/75000 [33:20<40:39, 15.85it/s]

{'loss': 0.3173, 'grad_norm': 0.611723005771637, 'learning_rate': 2.595973154362416e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36332/75000 [33:21<44:25, 14.51it/s]

{'loss': 0.3937, 'grad_norm': 6.654653549194336, 'learning_rate': 2.595302013422819e-05, 'epoch': 1.45}


                                                     
 48%|████▊     | 36340/75000 [33:21<47:17, 13.62it/s]

{'loss': 0.335, 'grad_norm': 1.459051489830017, 'learning_rate': 2.5946308724832213e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36350/75000 [33:22<57:12, 11.26it/s]

{'loss': 0.2889, 'grad_norm': 0.49867385625839233, 'learning_rate': 2.5939597315436242e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36362/75000 [33:23<57:42, 11.16it/s]

{'loss': 0.2504, 'grad_norm': 3.1787824630737305, 'learning_rate': 2.593288590604027e-05, 'epoch': 1.45}


                                                       
 48%|████▊     | 36371/75000 [33:24<1:18:43,  8.18it/s]

{'loss': 0.3928, 'grad_norm': 3.0182442665100098, 'learning_rate': 2.5926174496644295e-05, 'epoch': 1.45}


                                                       
 49%|████▊     | 36382/75000 [33:26<1:05:34,  9.82it/s]

{'loss': 0.3158, 'grad_norm': 2.296653985977173, 'learning_rate': 2.5919463087248324e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36392/75000 [33:26<55:19, 11.63it/s]

{'loss': 0.2928, 'grad_norm': 3.9428582191467285, 'learning_rate': 2.591275167785235e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36402/75000 [33:27<1:04:50,  9.92it/s]

{'loss': 0.3875, 'grad_norm': 7.600114345550537, 'learning_rate': 2.5906040268456378e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36412/75000 [33:28<54:26, 11.81it/s]

{'loss': 0.3477, 'grad_norm': 3.928093433380127, 'learning_rate': 2.5899328859060406e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36422/75000 [33:29<1:06:13,  9.71it/s]

{'loss': 0.3403, 'grad_norm': 2.2631161212921143, 'learning_rate': 2.589261744966443e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36432/75000 [33:30<58:24, 11.01it/s]  

{'loss': 0.3168, 'grad_norm': 8.581445693969727, 'learning_rate': 2.588590604026846e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36441/75000 [33:31<1:14:42,  8.60it/s]

{'loss': 0.3677, 'grad_norm': 4.474700450897217, 'learning_rate': 2.5879194630872482e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36451/75000 [33:32<1:19:31,  8.08it/s]

{'loss': 0.3207, 'grad_norm': 1.4450292587280273, 'learning_rate': 2.5872483221476514e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36461/75000 [33:33<1:09:50,  9.20it/s]

{'loss': 0.325, 'grad_norm': 0.8712777495384216, 'learning_rate': 2.5865771812080535e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36471/75000 [33:35<1:23:20,  7.70it/s]

{'loss': 0.2878, 'grad_norm': 6.73792028427124, 'learning_rate': 2.5859060402684564e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36482/75000 [33:36<1:02:33, 10.26it/s]

{'loss': 0.3021, 'grad_norm': 1.3250031471252441, 'learning_rate': 2.5852348993288596e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36492/75000 [33:37<44:03, 14.57it/s]

{'loss': 0.3205, 'grad_norm': 3.0265884399414062, 'learning_rate': 2.5845637583892618e-05, 'epoch': 1.46}


                                                     
 49%|████▊     | 36500/75000 [33:37<37:47, 16.98it/s]

{'loss': 0.3579, 'grad_norm': 1.9833794832229614, 'learning_rate': 2.5838926174496646e-05, 'epoch': 1.46}


                                                       
 49%|████▊     | 36513/75000 [33:38<45:02, 14.24it/s]

{'loss': 0.1917, 'grad_norm': 6.4495954513549805, 'learning_rate': 2.583221476510067e-05, 'epoch': 1.46}


                                                     
 49%|████▊     | 36522/75000 [33:39<43:37, 14.70it/s]

{'loss': 0.2351, 'grad_norm': 4.125999450683594, 'learning_rate': 2.58255033557047e-05, 'epoch': 1.46}


                                                     
 49%|████▊     | 36533/75000 [33:40<37:03, 17.30it/s]

{'loss': 0.2949, 'grad_norm': 3.2564737796783447, 'learning_rate': 2.5818791946308725e-05, 'epoch': 1.46}


                                                     
 49%|████▊     | 36543/75000 [33:40<46:35, 13.76it/s]

{'loss': 0.3308, 'grad_norm': 2.1488070487976074, 'learning_rate': 2.5812080536912754e-05, 'epoch': 1.46}


                                                     
 49%|████▊     | 36551/75000 [33:41<40:52, 15.68it/s]

{'loss': 0.2994, 'grad_norm': 10.210996627807617, 'learning_rate': 2.5805369127516782e-05, 'epoch': 1.46}


                                                     
 49%|████▊     | 36562/75000 [33:41<34:13, 18.72it/s]

{'loss': 0.1946, 'grad_norm': 0.5584065318107605, 'learning_rate': 2.5798657718120804e-05, 'epoch': 1.46}


                                                     
 49%|████▉     | 36571/75000 [33:42<36:10, 17.71it/s]

{'loss': 0.4056, 'grad_norm': 2.7462403774261475, 'learning_rate': 2.5791946308724836e-05, 'epoch': 1.46}


                                                     
 49%|████▉     | 36583/75000 [33:43<36:28, 17.56it/s]

{'loss': 0.3956, 'grad_norm': 7.57742977142334, 'learning_rate': 2.5785234899328858e-05, 'epoch': 1.46}


                                                     
 49%|████▉     | 36593/75000 [33:43<38:01, 16.84it/s]

{'loss': 0.2696, 'grad_norm': 1.6730133295059204, 'learning_rate': 2.5778523489932886e-05, 'epoch': 1.46}


                                                     
 49%|████▉     | 36604/75000 [33:44<36:05, 17.73it/s]

{'loss': 0.4225, 'grad_norm': 5.378413677215576, 'learning_rate': 2.5771812080536918e-05, 'epoch': 1.46}


                                                     
 49%|████▉     | 36612/75000 [33:44<39:57, 16.01it/s]

{'loss': 0.2467, 'grad_norm': 2.219972610473633, 'learning_rate': 2.576510067114094e-05, 'epoch': 1.46}


                                                     
 49%|████▉     | 36623/75000 [33:45<34:16, 18.66it/s]

{'loss': 0.2486, 'grad_norm': 13.57772159576416, 'learning_rate': 2.575838926174497e-05, 'epoch': 1.46}


                                                     
 49%|████▉     | 36632/75000 [33:46<39:02, 16.38it/s]

{'loss': 0.3291, 'grad_norm': 4.741344451904297, 'learning_rate': 2.5751677852348994e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36644/75000 [33:46<37:18, 17.14it/s]

{'loss': 0.286, 'grad_norm': 4.739255428314209, 'learning_rate': 2.5744966442953022e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36652/75000 [33:47<38:22, 16.66it/s]

{'loss': 0.3558, 'grad_norm': 3.034193992614746, 'learning_rate': 2.5738255033557047e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36662/75000 [33:47<38:07, 16.76it/s]

{'loss': 0.2345, 'grad_norm': 3.0100739002227783, 'learning_rate': 2.5731543624161076e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36672/75000 [33:48<40:15, 15.87it/s]

{'loss': 0.2944, 'grad_norm': 5.1536946296691895, 'learning_rate': 2.5724832214765104e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36684/75000 [33:49<36:08, 17.67it/s]

{'loss': 0.2012, 'grad_norm': 1.2749279737472534, 'learning_rate': 2.571812080536913e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36692/75000 [33:49<42:09, 15.14it/s]

{'loss': 0.2949, 'grad_norm': 4.2465291023254395, 'learning_rate': 2.5711409395973158e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36702/75000 [33:50<45:59, 13.88it/s]

{'loss': 0.4018, 'grad_norm': 1.9204761981964111, 'learning_rate': 2.570469798657718e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36712/75000 [33:50<39:02, 16.35it/s]

{'loss': 0.4322, 'grad_norm': 3.344350814819336, 'learning_rate': 2.569798657718121e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36724/75000 [33:51<33:48, 18.87it/s]

{'loss': 0.2901, 'grad_norm': 1.2322574853897095, 'learning_rate': 2.5691275167785234e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36732/75000 [33:52<40:13, 15.86it/s]

{'loss': 0.3977, 'grad_norm': 4.617724418640137, 'learning_rate': 2.5684563758389262e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36742/75000 [33:52<34:35, 18.43it/s]

{'loss': 0.3617, 'grad_norm': 7.503907680511475, 'learning_rate': 2.567785234899329e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36753/75000 [33:53<36:11, 17.62it/s]

{'loss': 0.3867, 'grad_norm': 4.277900218963623, 'learning_rate': 2.5671140939597316e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36762/75000 [33:53<36:43, 17.35it/s]

{'loss': 0.3169, 'grad_norm': 1.1433323621749878, 'learning_rate': 2.5664429530201344e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36772/75000 [33:54<41:46, 15.25it/s]

{'loss': 0.2812, 'grad_norm': 3.761525869369507, 'learning_rate': 2.565771812080537e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36784/75000 [33:55<33:36, 18.96it/s]

{'loss': 0.2294, 'grad_norm': 2.410425901412964, 'learning_rate': 2.5651006711409398e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36793/75000 [33:55<34:28, 18.47it/s]

{'loss': 0.2242, 'grad_norm': 4.44043493270874, 'learning_rate': 2.564429530201342e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36801/75000 [33:55<34:28, 18.47it/s]

{'loss': 0.243, 'grad_norm': 3.854306936264038, 'learning_rate': 2.5637583892617452e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36813/75000 [33:56<33:47, 18.83it/s]

{'loss': 0.3028, 'grad_norm': 2.129711866378784, 'learning_rate': 2.563087248322148e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36822/75000 [33:57<34:50, 18.27it/s]

{'loss': 0.2798, 'grad_norm': 1.6278730630874634, 'learning_rate': 2.5624161073825502e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36832/75000 [33:57<37:05, 17.15it/s]

{'loss': 0.342, 'grad_norm': 1.4020471572875977, 'learning_rate': 2.5617449664429534e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36844/75000 [33:58<32:42, 19.44it/s]

{'loss': 0.3552, 'grad_norm': 3.9333250522613525, 'learning_rate': 2.5610738255033556e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36852/75000 [33:58<35:30, 17.91it/s]

{'loss': 0.271, 'grad_norm': 1.2022571563720703, 'learning_rate': 2.5604026845637584e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36863/75000 [33:59<36:59, 17.18it/s]

{'loss': 0.3204, 'grad_norm': 10.182414054870605, 'learning_rate': 2.5597315436241613e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36871/75000 [33:59<34:21, 18.50it/s]

{'loss': 0.3293, 'grad_norm': 4.310723304748535, 'learning_rate': 2.5590604026845638e-05, 'epoch': 1.47}


                                                     
 49%|████▉     | 36884/75000 [34:00<32:24, 19.60it/s]

{'loss': 0.3811, 'grad_norm': 1.8784037828445435, 'learning_rate': 2.5583892617449667e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36892/75000 [34:00<32:44, 19.40it/s]

{'loss': 0.4512, 'grad_norm': 7.123277187347412, 'learning_rate': 2.5577181208053692e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36904/75000 [34:01<31:37, 20.07it/s]

{'loss': 0.2793, 'grad_norm': 5.103937149047852, 'learning_rate': 2.557046979865772e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36913/75000 [34:01<34:44, 18.27it/s]

{'loss': 0.2426, 'grad_norm': 3.107203483581543, 'learning_rate': 2.5563758389261742e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36922/75000 [34:02<35:07, 18.07it/s]

{'loss': 0.2679, 'grad_norm': 14.46493911743164, 'learning_rate': 2.5557046979865774e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36931/75000 [34:02<35:13, 18.01it/s]

{'loss': 0.3404, 'grad_norm': 1.2680257558822632, 'learning_rate': 2.5550335570469803e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36942/75000 [34:03<34:58, 18.14it/s]

{'loss': 0.2911, 'grad_norm': 5.509321212768555, 'learning_rate': 2.5543624161073824e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36954/75000 [34:04<33:27, 18.96it/s]

{'loss': 0.3855, 'grad_norm': 7.851807594299316, 'learning_rate': 2.5536912751677856e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36964/75000 [34:04<34:27, 18.40it/s]

{'loss': 0.2444, 'grad_norm': 2.653097152709961, 'learning_rate': 2.5530201342281878e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36971/75000 [34:05<40:50, 15.52it/s]

{'loss': 0.4884, 'grad_norm': 14.511648178100586, 'learning_rate': 2.5523489932885907e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36981/75000 [34:05<35:10, 18.01it/s]

{'loss': 0.2797, 'grad_norm': 5.794525146484375, 'learning_rate': 2.5516778523489932e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 36991/75000 [34:06<35:46, 17.71it/s]

{'loss': 0.2353, 'grad_norm': 2.964027166366577, 'learning_rate': 2.551006711409396e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37000/75000 [34:06<34:01, 18.61it/s]

{'loss': 0.345, 'grad_norm': 2.085411548614502, 'learning_rate': 2.550335570469799e-05, 'epoch': 1.48}


                                                       
 49%|████▉     | 37012/75000 [34:07<41:12, 15.37it/s]

{'loss': 0.2917, 'grad_norm': 3.0238490104675293, 'learning_rate': 2.5496644295302014e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37024/75000 [34:08<33:20, 18.99it/s]

{'loss': 0.3048, 'grad_norm': 7.733341217041016, 'learning_rate': 2.5489932885906043e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37032/75000 [34:09<37:45, 16.76it/s]

{'loss': 0.2531, 'grad_norm': 5.398425579071045, 'learning_rate': 2.5483221476510068e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37044/75000 [34:09<33:12, 19.05it/s]

{'loss': 0.2581, 'grad_norm': 5.46936559677124, 'learning_rate': 2.5476510067114096e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37052/75000 [34:10<38:39, 16.36it/s]

{'loss': 0.3796, 'grad_norm': 13.837635040283203, 'learning_rate': 2.5469798657718125e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37062/75000 [34:10<33:04, 19.12it/s]

{'loss': 0.2128, 'grad_norm': 6.080517768859863, 'learning_rate': 2.5463087248322147e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37073/75000 [34:11<36:08, 17.49it/s]

{'loss': 0.3233, 'grad_norm': 1.3212437629699707, 'learning_rate': 2.545637583892618e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37082/75000 [34:11<32:50, 19.24it/s]

{'loss': 0.216, 'grad_norm': 2.3125312328338623, 'learning_rate': 2.54496644295302e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37093/75000 [34:12<31:41, 19.93it/s]

{'loss': 0.2968, 'grad_norm': 3.12813401222229, 'learning_rate': 2.544295302013423e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37102/75000 [34:12<33:02, 19.12it/s]

{'loss': 0.3365, 'grad_norm': 1.941576361656189, 'learning_rate': 2.5436241610738254e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37113/75000 [34:13<32:25, 19.47it/s]

{'loss': 0.2045, 'grad_norm': 10.761436462402344, 'learning_rate': 2.5429530201342283e-05, 'epoch': 1.48}


                                                     
 49%|████▉     | 37122/75000 [34:13<33:50, 18.65it/s]

{'loss': 0.2788, 'grad_norm': 9.561237335205078, 'learning_rate': 2.542281879194631e-05, 'epoch': 1.48}


                                                     
 50%|████▉     | 37132/75000 [34:14<35:02, 18.01it/s]

{'loss': 0.2236, 'grad_norm': 7.742311954498291, 'learning_rate': 2.5416107382550336e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37141/75000 [34:14<33:45, 18.69it/s]

{'loss': 0.3549, 'grad_norm': 5.885827541351318, 'learning_rate': 2.5409395973154365e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37154/75000 [34:15<33:02, 19.09it/s]

{'loss': 0.2633, 'grad_norm': 5.134705543518066, 'learning_rate': 2.540268456375839e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37162/75000 [34:15<33:00, 19.10it/s]

{'loss': 0.3981, 'grad_norm': 1.5763335227966309, 'learning_rate': 2.539597315436242e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37172/75000 [34:16<34:41, 18.18it/s]

{'loss': 0.3354, 'grad_norm': 2.707664728164673, 'learning_rate': 2.538926174496644e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37182/75000 [34:16<34:11, 18.44it/s]

{'loss': 0.2438, 'grad_norm': 2.5711169242858887, 'learning_rate': 2.5382550335570472e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37191/75000 [34:17<37:49, 16.66it/s]

{'loss': 0.2529, 'grad_norm': 2.1879565715789795, 'learning_rate': 2.53758389261745e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37202/75000 [34:18<35:55, 17.54it/s]

{'loss': 0.2826, 'grad_norm': 0.9813806414604187, 'learning_rate': 2.5369127516778523e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37212/75000 [34:18<37:29, 16.80it/s]

{'loss': 0.3469, 'grad_norm': 5.776256561279297, 'learning_rate': 2.536241610738255e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37221/75000 [34:19<36:24, 17.29it/s]

{'loss': 0.3426, 'grad_norm': 4.029328346252441, 'learning_rate': 2.5355704697986576e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37232/75000 [34:19<34:28, 18.26it/s]

{'loss': 0.3379, 'grad_norm': 4.592977523803711, 'learning_rate': 2.5348993288590605e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37242/75000 [34:20<45:31, 13.82it/s]

{'loss': 0.2376, 'grad_norm': 1.2196400165557861, 'learning_rate': 2.5342281879194633e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37252/75000 [34:21<38:21, 16.40it/s]

{'loss': 0.315, 'grad_norm': 11.509480476379395, 'learning_rate': 2.533557046979866e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37263/75000 [34:21<37:31, 16.76it/s]

{'loss': 0.3582, 'grad_norm': 5.031569480895996, 'learning_rate': 2.5328859060402687e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37274/75000 [34:22<33:27, 18.79it/s]

{'loss': 0.3168, 'grad_norm': 9.880334854125977, 'learning_rate': 2.5322147651006712e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37282/75000 [34:22<40:17, 15.60it/s]

{'loss': 0.3885, 'grad_norm': 3.673773765563965, 'learning_rate': 2.531543624161074e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37291/75000 [34:23<40:27, 15.53it/s]

{'loss': 0.3329, 'grad_norm': 4.344600677490234, 'learning_rate': 2.5308724832214763e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37302/75000 [34:24<38:13, 16.44it/s]

{'loss': 0.3003, 'grad_norm': 1.9104114770889282, 'learning_rate': 2.5302013422818795e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37313/75000 [34:24<36:14, 17.33it/s]

{'loss': 0.28, 'grad_norm': 13.9645414352417, 'learning_rate': 2.5295302013422823e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37323/75000 [34:25<35:32, 17.67it/s]

{'loss': 0.2328, 'grad_norm': 1.7537200450897217, 'learning_rate': 2.5288590604026845e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37333/75000 [34:25<37:31, 16.73it/s]

{'loss': 0.3872, 'grad_norm': 3.3876631259918213, 'learning_rate': 2.5281879194630877e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37343/75000 [34:26<39:06, 16.05it/s]

{'loss': 0.3572, 'grad_norm': 8.124608993530273, 'learning_rate': 2.52751677852349e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37351/75000 [34:27<40:28, 15.51it/s]

{'loss': 0.3035, 'grad_norm': 2.8401103019714355, 'learning_rate': 2.5268456375838927e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37363/75000 [34:27<33:39, 18.63it/s]

{'loss': 0.289, 'grad_norm': 3.8636014461517334, 'learning_rate': 2.5261744966442952e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37373/75000 [34:28<39:04, 16.05it/s]

{'loss': 0.2522, 'grad_norm': 5.539126873016357, 'learning_rate': 2.525503355704698e-05, 'epoch': 1.49}


                                                     
 50%|████▉     | 37383/75000 [34:28<38:07, 16.44it/s]

{'loss': 0.2907, 'grad_norm': 1.442788004875183, 'learning_rate': 2.524832214765101e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37393/75000 [34:29<38:21, 16.34it/s]

{'loss': 0.2061, 'grad_norm': 3.258531093597412, 'learning_rate': 2.5241610738255034e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37403/75000 [34:30<37:10, 16.85it/s]

{'loss': 0.3821, 'grad_norm': 1.6823949813842773, 'learning_rate': 2.5234899328859063e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37413/75000 [34:30<43:52, 14.28it/s]

{'loss': 0.3275, 'grad_norm': 8.633167266845703, 'learning_rate': 2.5228187919463088e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37423/75000 [34:31<35:33, 17.62it/s]

{'loss': 0.363, 'grad_norm': 5.270170211791992, 'learning_rate': 2.5221476510067117e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37433/75000 [34:31<33:48, 18.52it/s]

{'loss': 0.2614, 'grad_norm': 2.6677639484405518, 'learning_rate': 2.521476510067114e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37443/75000 [34:32<37:21, 16.76it/s]

{'loss': 0.207, 'grad_norm': 2.825120449066162, 'learning_rate': 2.5208053691275167e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37452/75000 [34:33<39:33, 15.82it/s]

{'loss': 0.41, 'grad_norm': 10.70810604095459, 'learning_rate': 2.52013422818792e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37462/75000 [34:33<41:08, 15.21it/s]

{'loss': 0.1452, 'grad_norm': 1.267802119255066, 'learning_rate': 2.519463087248322e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37472/75000 [34:34<47:01, 13.30it/s]

{'loss': 0.4013, 'grad_norm': 1.1134918928146362, 'learning_rate': 2.518791946308725e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37482/75000 [34:35<40:21, 15.50it/s]

{'loss': 0.3757, 'grad_norm': 9.44440746307373, 'learning_rate': 2.5181208053691274e-05, 'epoch': 1.5}


                                                     
 50%|████▉     | 37492/75000 [34:35<49:55, 12.52it/s]

{'loss': 0.3423, 'grad_norm': 4.648185729980469, 'learning_rate': 2.5174496644295303e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37500/75000 [34:36<40:04, 15.59it/s]

{'loss': 0.2519, 'grad_norm': 2.8562707901000977, 'learning_rate': 2.516778523489933e-05, 'epoch': 1.5}


                                                       
 50%|█████     | 37514/75000 [34:37<39:27, 15.83it/s]

{'loss': 0.3144, 'grad_norm': 3.7594656944274902, 'learning_rate': 2.5161073825503357e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37522/75000 [34:38<36:28, 17.12it/s]

{'loss': 0.2472, 'grad_norm': 0.6165814995765686, 'learning_rate': 2.5154362416107385e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37532/75000 [34:38<41:10, 15.17it/s]

{'loss': 0.2802, 'grad_norm': 4.464639663696289, 'learning_rate': 2.514765100671141e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37542/75000 [34:39<38:08, 16.37it/s]

{'loss': 0.3252, 'grad_norm': 4.401517868041992, 'learning_rate': 2.514093959731544e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37552/75000 [34:40<48:02, 12.99it/s]

{'loss': 0.3096, 'grad_norm': 2.187351942062378, 'learning_rate': 2.513422818791946e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37562/75000 [34:40<37:14, 16.76it/s]

{'loss': 0.2538, 'grad_norm': 3.2660794258117676, 'learning_rate': 2.5127516778523493e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37573/75000 [34:41<37:36, 16.59it/s]

{'loss': 0.2832, 'grad_norm': 9.840913772583008, 'learning_rate': 2.512080536912752e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37582/75000 [34:42<48:23, 12.89it/s]

{'loss': 0.3576, 'grad_norm': 3.4011411666870117, 'learning_rate': 2.5114093959731543e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37592/75000 [34:42<39:16, 15.88it/s]

{'loss': 0.1683, 'grad_norm': 1.9031304121017456, 'learning_rate': 2.510738255033557e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37601/75000 [34:43<40:46, 15.29it/s]

{'loss': 0.2547, 'grad_norm': 10.381648063659668, 'learning_rate': 2.5100671140939597e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37611/75000 [34:44<48:59, 12.72it/s]

{'loss': 0.3931, 'grad_norm': 0.6962114572525024, 'learning_rate': 2.5093959731543625e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37623/75000 [34:44<45:06, 13.81it/s]

{'loss': 0.2633, 'grad_norm': 7.1988067626953125, 'learning_rate': 2.508724832214765e-05, 'epoch': 1.5}


                                                     
 50%|█████     | 37634/75000 [34:45<34:13, 18.19it/s]

{'loss': 0.2593, 'grad_norm': 1.170143723487854, 'learning_rate': 2.508053691275168e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37643/75000 [34:46<38:46, 16.06it/s]

{'loss': 0.2984, 'grad_norm': 3.1077089309692383, 'learning_rate': 2.5073825503355708e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37651/75000 [34:46<36:38, 16.99it/s]

{'loss': 0.2551, 'grad_norm': 2.094022750854492, 'learning_rate': 2.5067114093959733e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37662/75000 [34:47<35:34, 17.49it/s]

{'loss': 0.3602, 'grad_norm': 2.0271031856536865, 'learning_rate': 2.506040268456376e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37673/75000 [34:47<32:49, 18.95it/s]

{'loss': 0.3308, 'grad_norm': 3.1688921451568604, 'learning_rate': 2.5053691275167783e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37683/75000 [34:48<35:25, 17.56it/s]

{'loss': 0.366, 'grad_norm': 1.9885342121124268, 'learning_rate': 2.5046979865771815e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37692/75000 [34:48<38:34, 16.12it/s]

{'loss': 0.2717, 'grad_norm': 4.469405174255371, 'learning_rate': 2.5040268456375843e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37702/75000 [34:49<47:18, 13.14it/s]

{'loss': 0.3325, 'grad_norm': 3.11755108833313, 'learning_rate': 2.5033557046979865e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37712/75000 [34:50<39:47, 15.62it/s]

{'loss': 0.3457, 'grad_norm': 4.935671806335449, 'learning_rate': 2.5026845637583897e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37721/75000 [34:50<40:34, 15.31it/s]

{'loss': 0.3455, 'grad_norm': 3.487029790878296, 'learning_rate': 2.502013422818792e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37733/75000 [34:51<40:22, 15.38it/s]

{'loss': 0.3771, 'grad_norm': 0.35372185707092285, 'learning_rate': 2.5013422818791947e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37741/75000 [34:52<41:25, 14.99it/s]

{'loss': 0.4522, 'grad_norm': 5.146564960479736, 'learning_rate': 2.5006711409395973e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37753/75000 [34:52<35:53, 17.30it/s]

{'loss': 0.3291, 'grad_norm': 3.099611520767212, 'learning_rate': 2.5e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37761/75000 [34:53<37:25, 16.58it/s]

{'loss': 0.3784, 'grad_norm': 3.6934170722961426, 'learning_rate': 2.4993288590604026e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37774/75000 [34:54<34:50, 17.81it/s]

{'loss': 0.2514, 'grad_norm': 2.3038833141326904, 'learning_rate': 2.4986577181208055e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37780/75000 [34:54<37:56, 16.35it/s]

{'loss': 0.3063, 'grad_norm': 5.479503631591797, 'learning_rate': 2.497986577181208e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37792/75000 [34:55<39:46, 15.59it/s]

{'loss': 0.2931, 'grad_norm': 6.749579429626465, 'learning_rate': 2.497315436241611e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37802/75000 [34:56<46:36, 13.30it/s]

{'loss': 0.2671, 'grad_norm': 5.421533584594727, 'learning_rate': 2.4966442953020137e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37810/75000 [34:56<53:50, 11.51it/s]

{'loss': 0.3981, 'grad_norm': 1.8824467658996582, 'learning_rate': 2.4959731543624162e-05, 'epoch': 1.51}


                                                       
 50%|█████     | 37822/75000 [34:58<1:00:03, 10.32it/s]

{'loss': 0.4691, 'grad_norm': 5.944516658782959, 'learning_rate': 2.4953020134228187e-05, 'epoch': 1.51}


                                                       
 50%|█████     | 37832/75000 [34:59<58:29, 10.59it/s]  

{'loss': 0.2953, 'grad_norm': 4.069357872009277, 'learning_rate': 2.4946308724832216e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37842/75000 [34:59<49:27, 12.52it/s]

{'loss': 0.364, 'grad_norm': 2.897386074066162, 'learning_rate': 2.493959731543624e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37850/75000 [35:00<51:24, 12.04it/s]

{'loss': 0.3222, 'grad_norm': 1.4383625984191895, 'learning_rate': 2.493288590604027e-05, 'epoch': 1.51}


                                                       
 50%|█████     | 37862/75000 [35:01<59:50, 10.34it/s]  

{'loss': 0.2254, 'grad_norm': 4.033144474029541, 'learning_rate': 2.4926174496644298e-05, 'epoch': 1.51}


                                                     
 50%|█████     | 37872/75000 [35:02<53:28, 11.57it/s]

{'loss': 0.2749, 'grad_norm': 2.050262451171875, 'learning_rate': 2.4919463087248323e-05, 'epoch': 1.51}


                                                       
 51%|█████     | 37882/75000 [35:03<1:01:03, 10.13it/s]

{'loss': 0.3445, 'grad_norm': 0.8283234238624573, 'learning_rate': 2.491275167785235e-05, 'epoch': 1.52}


                                                       
 51%|█████     | 37892/75000 [35:04<52:23, 11.81it/s]

{'loss': 0.3339, 'grad_norm': 5.653825759887695, 'learning_rate': 2.4906040268456377e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37902/75000 [35:05<52:45, 11.72it/s]

{'loss': 0.3127, 'grad_norm': 4.2021002769470215, 'learning_rate': 2.4899328859060402e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37912/75000 [35:05<48:32, 12.73it/s]

{'loss': 0.3263, 'grad_norm': 1.7411912679672241, 'learning_rate': 2.489261744966443e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37922/75000 [35:06<41:08, 15.02it/s]

{'loss': 0.3217, 'grad_norm': 4.483088970184326, 'learning_rate': 2.488590604026846e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37931/75000 [35:07<41:16, 14.97it/s]

{'loss': 0.2945, 'grad_norm': 10.992633819580078, 'learning_rate': 2.4879194630872485e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37943/75000 [35:07<34:49, 17.73it/s]

{'loss': 0.2222, 'grad_norm': 2.4166862964630127, 'learning_rate': 2.487248322147651e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37951/75000 [35:08<40:50, 15.12it/s]

{'loss': 0.4028, 'grad_norm': 6.927272796630859, 'learning_rate': 2.4865771812080538e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37963/75000 [35:09<42:17, 14.59it/s]

{'loss': 0.3478, 'grad_norm': 2.8953514099121094, 'learning_rate': 2.4859060402684563e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37971/75000 [35:09<39:36, 15.58it/s]

{'loss': 0.3161, 'grad_norm': 2.713125467300415, 'learning_rate': 2.4852348993288592e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37983/75000 [35:10<41:07, 15.00it/s]

{'loss': 0.4119, 'grad_norm': 1.4310154914855957, 'learning_rate': 2.484563758389262e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 37993/75000 [35:11<42:56, 14.36it/s]

{'loss': 0.3125, 'grad_norm': 5.250538349151611, 'learning_rate': 2.4838926174496646e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 38000/75000 [35:11<42:20, 14.56it/s]

{'loss': 0.2691, 'grad_norm': 3.555352210998535, 'learning_rate': 2.4832214765100674e-05, 'epoch': 1.52}


                                                       
 51%|█████     | 38013/75000 [35:13<46:30, 13.25it/s]  

{'loss': 0.3332, 'grad_norm': 10.112086296081543, 'learning_rate': 2.48255033557047e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 38021/75000 [35:14<48:01, 12.83it/s]

{'loss': 0.2969, 'grad_norm': 4.706387996673584, 'learning_rate': 2.4818791946308725e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 38033/75000 [35:14<41:01, 15.02it/s]

{'loss': 0.3678, 'grad_norm': 7.596806526184082, 'learning_rate': 2.4812080536912753e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 38043/75000 [35:15<41:07, 14.98it/s]

{'loss': 0.3987, 'grad_norm': 3.5993916988372803, 'learning_rate': 2.480536912751678e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 38051/75000 [35:16<46:39, 13.20it/s]

{'loss': 0.3509, 'grad_norm': 3.919240713119507, 'learning_rate': 2.4798657718120807e-05, 'epoch': 1.52}


                                                       
 51%|█████     | 38061/75000 [35:17<55:49, 11.03it/s]  

{'loss': 0.2249, 'grad_norm': 2.2510781288146973, 'learning_rate': 2.4791946308724835e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 38071/75000 [35:18<1:02:44,  9.81it/s]

{'loss': 0.3228, 'grad_norm': 1.8606184720993042, 'learning_rate': 2.478523489932886e-05, 'epoch': 1.52}


                                                       
 51%|█████     | 38081/75000 [35:19<56:54, 10.81it/s]

{'loss': 0.2829, 'grad_norm': 2.9257028102874756, 'learning_rate': 2.4778523489932886e-05, 'epoch': 1.52}


                                                       
 51%|█████     | 38091/75000 [35:20<1:04:57,  9.47it/s]

{'loss': 0.341, 'grad_norm': 7.247681140899658, 'learning_rate': 2.4771812080536914e-05, 'epoch': 1.52}


                                                       
 51%|█████     | 38103/75000 [35:20<44:16, 13.89it/s]

{'loss': 0.242, 'grad_norm': 0.9037677049636841, 'learning_rate': 2.476510067114094e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 38111/75000 [35:21<56:27, 10.89it/s]

{'loss': 0.3707, 'grad_norm': 11.130114555358887, 'learning_rate': 2.4758389261744968e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 38123/75000 [35:22<42:26, 14.48it/s]

{'loss': 0.3801, 'grad_norm': 6.761263847351074, 'learning_rate': 2.4751677852348996e-05, 'epoch': 1.52}


                                                     
 51%|█████     | 38131/75000 [35:23<43:30, 14.12it/s]

{'loss': 0.4169, 'grad_norm': 1.4343316555023193, 'learning_rate': 2.474496644295302e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38141/75000 [35:23<45:48, 13.41it/s]

{'loss': 0.1794, 'grad_norm': 2.1467394828796387, 'learning_rate': 2.4738255033557047e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38151/75000 [35:24<46:17, 13.27it/s]

{'loss': 0.2911, 'grad_norm': 2.4574313163757324, 'learning_rate': 2.4731543624161075e-05, 'epoch': 1.53}


                                                       
 51%|█████     | 38163/75000 [35:25<48:49, 12.57it/s]  

{'loss': 0.1602, 'grad_norm': 5.418725490570068, 'learning_rate': 2.47248322147651e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38173/75000 [35:26<42:36, 14.41it/s]

{'loss': 0.2758, 'grad_norm': 11.114237785339355, 'learning_rate': 2.471812080536913e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38181/75000 [35:26<39:49, 15.41it/s]

{'loss': 0.2526, 'grad_norm': 0.4556686580181122, 'learning_rate': 2.4711409395973158e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38191/75000 [35:27<40:42, 15.07it/s]

{'loss': 0.3186, 'grad_norm': 2.722921133041382, 'learning_rate': 2.4704697986577183e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38203/75000 [35:28<38:42, 15.84it/s]

{'loss': 0.5209, 'grad_norm': 1.8862828016281128, 'learning_rate': 2.4697986577181208e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38213/75000 [35:28<38:47, 15.81it/s]

{'loss': 0.42, 'grad_norm': 4.917978763580322, 'learning_rate': 2.4691275167785236e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38221/75000 [35:29<38:51, 15.78it/s]

{'loss': 0.2569, 'grad_norm': 5.066311836242676, 'learning_rate': 2.468456375838926e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38231/75000 [35:30<46:43, 13.12it/s]

{'loss': 0.3071, 'grad_norm': 2.691821575164795, 'learning_rate': 2.467785234899329e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38243/75000 [35:30<38:37, 15.86it/s]

{'loss': 0.2738, 'grad_norm': 1.0750842094421387, 'learning_rate': 2.467114093959732e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38253/75000 [35:31<39:15, 15.60it/s]

{'loss': 0.2701, 'grad_norm': 7.979554176330566, 'learning_rate': 2.4664429530201344e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38261/75000 [35:32<44:06, 13.88it/s]

{'loss': 0.2647, 'grad_norm': 3.776013135910034, 'learning_rate': 2.465771812080537e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38271/75000 [35:32<35:19, 17.33it/s]

{'loss': 0.3159, 'grad_norm': 7.812839031219482, 'learning_rate': 2.4651006711409398e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38282/75000 [35:33<35:38, 17.17it/s]

{'loss': 0.3074, 'grad_norm': 6.139391899108887, 'learning_rate': 2.4644295302013423e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38293/75000 [35:34<36:32, 16.74it/s]

{'loss': 0.228, 'grad_norm': 1.9841011762619019, 'learning_rate': 2.463758389261745e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38301/75000 [35:34<48:08, 12.70it/s]

{'loss': 0.2777, 'grad_norm': 1.3310825824737549, 'learning_rate': 2.463087248322148e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38311/75000 [35:35<43:04, 14.20it/s]

{'loss': 0.3057, 'grad_norm': 2.561652183532715, 'learning_rate': 2.4624161073825505e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38322/75000 [35:36<47:08, 12.97it/s]

{'loss': 0.2562, 'grad_norm': 2.362546682357788, 'learning_rate': 2.461744966442953e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38332/75000 [35:36<45:22, 13.47it/s]

{'loss': 0.1621, 'grad_norm': 4.103967666625977, 'learning_rate': 2.461073825503356e-05, 'epoch': 1.53}


                                                       
 51%|█████     | 38342/75000 [35:37<1:00:23, 10.12it/s]

{'loss': 0.2498, 'grad_norm': 2.8246443271636963, 'learning_rate': 2.4604026845637584e-05, 'epoch': 1.53}


                                                       
 51%|█████     | 38352/75000 [35:38<41:54, 14.58it/s]

{'loss': 0.3438, 'grad_norm': 3.3701722621917725, 'learning_rate': 2.4597315436241612e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38362/75000 [35:39<44:02, 13.86it/s]

{'loss': 0.3443, 'grad_norm': 2.6685051918029785, 'learning_rate': 2.459060402684564e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38372/75000 [35:40<45:40, 13.37it/s]

{'loss': 0.437, 'grad_norm': 3.4156551361083984, 'learning_rate': 2.4583892617449666e-05, 'epoch': 1.53}


                                                     
 51%|█████     | 38382/75000 [35:40<56:06, 10.88it/s]

{'loss': 0.3451, 'grad_norm': 10.16238021850586, 'learning_rate': 2.457718120805369e-05, 'epoch': 1.54}


                                                     
 51%|█████     | 38392/75000 [35:41<46:02, 13.25it/s]

{'loss': 0.3327, 'grad_norm': 1.7475804090499878, 'learning_rate': 2.457046979865772e-05, 'epoch': 1.54}


                                                     
 51%|█████     | 38402/75000 [35:42<58:45, 10.38it/s]

{'loss': 0.4321, 'grad_norm': 2.5205483436584473, 'learning_rate': 2.4563758389261745e-05, 'epoch': 1.54}


                                                     
 51%|█████     | 38412/75000 [35:43<50:58, 11.96it/s]

{'loss': 0.357, 'grad_norm': 5.156518936157227, 'learning_rate': 2.4557046979865773e-05, 'epoch': 1.54}


                                                     
 51%|█████     | 38422/75000 [35:44<43:06, 14.14it/s]

{'loss': 0.2935, 'grad_norm': 2.2165756225585938, 'learning_rate': 2.45503355704698e-05, 'epoch': 1.54}


                                                     
 51%|█████     | 38432/75000 [35:44<39:19, 15.50it/s]

{'loss': 0.2147, 'grad_norm': 3.2613234519958496, 'learning_rate': 2.4543624161073827e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38442/75000 [35:45<44:44, 13.62it/s]

{'loss': 0.2448, 'grad_norm': 13.020790100097656, 'learning_rate': 2.4536912751677856e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38453/75000 [35:46<36:28, 16.70it/s]

{'loss': 0.2789, 'grad_norm': 3.751974582672119, 'learning_rate': 2.453020134228188e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38461/75000 [35:46<34:57, 17.42it/s]

{'loss': 0.2785, 'grad_norm': 6.1144890785217285, 'learning_rate': 2.4523489932885906e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38472/75000 [35:47<37:51, 16.08it/s]

{'loss': 0.2507, 'grad_norm': 1.20431649684906, 'learning_rate': 2.4516778523489935e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38482/75000 [35:47<39:50, 15.28it/s]

{'loss': 0.3259, 'grad_norm': 2.741528272628784, 'learning_rate': 2.451006711409396e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38493/75000 [35:48<37:53, 16.06it/s]

{'loss': 0.3975, 'grad_norm': 4.606588840484619, 'learning_rate': 2.450335570469799e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38500/75000 [35:48<36:26, 16.69it/s]

{'loss': 0.2241, 'grad_norm': 4.513584136962891, 'learning_rate': 2.4496644295302017e-05, 'epoch': 1.54}


                                                       
 51%|█████▏    | 38513/75000 [35:52<1:06:39,  9.12it/s]

{'loss': 0.2838, 'grad_norm': 3.2691738605499268, 'learning_rate': 2.4489932885906042e-05, 'epoch': 1.54}


                                                       
 51%|█████▏    | 38523/75000 [35:53<42:50, 14.19it/s]

{'loss': 0.293, 'grad_norm': 5.912513256072998, 'learning_rate': 2.4483221476510067e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38533/75000 [35:53<35:20, 17.20it/s]

{'loss': 0.3471, 'grad_norm': 1.554806113243103, 'learning_rate': 2.4476510067114096e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38541/75000 [35:54<41:32, 14.63it/s]

{'loss': 0.2168, 'grad_norm': 1.1623996496200562, 'learning_rate': 2.446979865771812e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38552/75000 [35:54<36:38, 16.58it/s]

{'loss': 0.4327, 'grad_norm': 4.278636932373047, 'learning_rate': 2.446308724832215e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38562/75000 [35:55<35:32, 17.09it/s]

{'loss': 0.3171, 'grad_norm': 0.9066050052642822, 'learning_rate': 2.4456375838926178e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38572/75000 [35:55<40:29, 14.99it/s]

{'loss': 0.356, 'grad_norm': 2.934694290161133, 'learning_rate': 2.4449664429530203e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38582/75000 [35:56<41:01, 14.80it/s]

{'loss': 0.2294, 'grad_norm': 2.621581792831421, 'learning_rate': 2.444295302013423e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38592/75000 [35:57<38:26, 15.78it/s]

{'loss': 0.3841, 'grad_norm': 0.8023931980133057, 'learning_rate': 2.4436241610738257e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38602/75000 [35:57<38:44, 15.66it/s]

{'loss': 0.2936, 'grad_norm': 2.4933135509490967, 'learning_rate': 2.4429530201342282e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38612/75000 [35:58<41:07, 14.75it/s]

{'loss': 0.2259, 'grad_norm': 0.9878295660018921, 'learning_rate': 2.4422818791946307e-05, 'epoch': 1.54}


                                                     
 51%|█████▏    | 38622/75000 [35:59<41:11, 14.72it/s]

{'loss': 0.3063, 'grad_norm': 4.790496349334717, 'learning_rate': 2.441610738255034e-05, 'epoch': 1.54}


                                                     
 52%|█████▏    | 38632/75000 [35:59<35:06, 17.26it/s]

{'loss': 0.3771, 'grad_norm': 2.105473279953003, 'learning_rate': 2.4409395973154364e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38642/75000 [36:00<43:33, 13.91it/s]

{'loss': 0.2629, 'grad_norm': 2.072108268737793, 'learning_rate': 2.440268456375839e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38652/75000 [36:01<45:40, 13.26it/s]

{'loss': 0.2871, 'grad_norm': 0.6433446407318115, 'learning_rate': 2.4395973154362418e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38662/75000 [36:01<44:07, 13.73it/s]

{'loss': 0.4004, 'grad_norm': 5.5724663734436035, 'learning_rate': 2.4389261744966443e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38670/75000 [36:02<47:00, 12.88it/s]

{'loss': 0.2226, 'grad_norm': 1.9117772579193115, 'learning_rate': 2.4382550335570468e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38682/75000 [36:03<42:50, 14.13it/s]

{'loss': 0.354, 'grad_norm': 3.635385751724243, 'learning_rate': 2.43758389261745e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38692/75000 [36:04<35:42, 16.95it/s]

{'loss': 0.3512, 'grad_norm': 11.559086799621582, 'learning_rate': 2.4369127516778525e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38701/75000 [36:04<37:40, 16.06it/s]

{'loss': 0.3166, 'grad_norm': 4.402303695678711, 'learning_rate': 2.436241610738255e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38712/75000 [36:05<31:28, 19.22it/s]

{'loss': 0.3723, 'grad_norm': 1.5939130783081055, 'learning_rate': 2.435570469798658e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38722/75000 [36:05<35:12, 17.17it/s]

{'loss': 0.2618, 'grad_norm': 4.513974666595459, 'learning_rate': 2.4348993288590604e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38732/75000 [36:06<36:41, 16.47it/s]

{'loss': 0.3484, 'grad_norm': 2.4857301712036133, 'learning_rate': 2.434228187919463e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38744/75000 [36:07<31:48, 19.00it/s]

{'loss': 0.2296, 'grad_norm': 3.5690808296203613, 'learning_rate': 2.4335570469798658e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38752/75000 [36:07<38:25, 15.72it/s]

{'loss': 0.3725, 'grad_norm': 2.790306806564331, 'learning_rate': 2.4328859060402687e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38762/75000 [36:08<38:25, 15.72it/s]

{'loss': 0.348, 'grad_norm': 1.4330309629440308, 'learning_rate': 2.432214765100671e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38772/75000 [36:08<38:34, 15.65it/s]

{'loss': 0.2758, 'grad_norm': 3.340222120285034, 'learning_rate': 2.431543624161074e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38782/75000 [36:09<43:50, 13.77it/s]

{'loss': 0.3767, 'grad_norm': 2.1983513832092285, 'learning_rate': 2.4308724832214765e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38792/75000 [36:10<53:41, 11.24it/s]

{'loss': 0.357, 'grad_norm': 2.4043586254119873, 'learning_rate': 2.4302013422818794e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38802/75000 [36:11<43:57, 13.72it/s]

{'loss': 0.506, 'grad_norm': 4.262126922607422, 'learning_rate': 2.429530201342282e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38812/75000 [36:11<41:41, 14.47it/s]

{'loss': 0.3478, 'grad_norm': 4.26782751083374, 'learning_rate': 2.4288590604026848e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38821/75000 [36:12<36:37, 16.47it/s]

{'loss': 0.2563, 'grad_norm': 1.0428177118301392, 'learning_rate': 2.4281879194630873e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38833/75000 [36:12<34:25, 17.51it/s]

{'loss': 0.256, 'grad_norm': 4.081128120422363, 'learning_rate': 2.42751677852349e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38843/75000 [36:13<35:56, 16.77it/s]

{'loss': 0.2486, 'grad_norm': 0.3455442190170288, 'learning_rate': 2.4268456375838926e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38854/75000 [36:14<31:20, 19.22it/s]

{'loss': 0.2781, 'grad_norm': 6.0025105476379395, 'learning_rate': 2.4261744966442955e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38861/75000 [36:14<37:17, 16.15it/s]

{'loss': 0.3048, 'grad_norm': 6.862298965454102, 'learning_rate': 2.425503355704698e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38872/75000 [36:15<39:10, 15.37it/s]

{'loss': 0.2467, 'grad_norm': 14.705248832702637, 'learning_rate': 2.424832214765101e-05, 'epoch': 1.55}


                                                     
 52%|█████▏    | 38882/75000 [36:15<37:38, 15.99it/s]

{'loss': 0.3191, 'grad_norm': 7.7006378173828125, 'learning_rate': 2.4241610738255034e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38894/75000 [36:16<31:21, 19.19it/s]

{'loss': 0.263, 'grad_norm': 1.451738953590393, 'learning_rate': 2.4234899328859062e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38903/75000 [36:16<33:33, 17.93it/s]

{'loss': 0.2019, 'grad_norm': 2.4708564281463623, 'learning_rate': 2.4228187919463088e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38911/75000 [36:17<44:18, 13.57it/s]

{'loss': 0.2248, 'grad_norm': 5.668336868286133, 'learning_rate': 2.4221476510067116e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38924/75000 [36:18<32:02, 18.77it/s]

{'loss': 0.2655, 'grad_norm': 15.33555793762207, 'learning_rate': 2.421476510067114e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38932/75000 [36:18<36:16, 16.57it/s]

{'loss': 0.3138, 'grad_norm': 3.2246780395507812, 'learning_rate': 2.4208053691275166e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38942/75000 [36:19<37:01, 16.23it/s]

{'loss': 0.2774, 'grad_norm': 2.744529962539673, 'learning_rate': 2.42013422818792e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38954/75000 [36:19<30:53, 19.45it/s]

{'loss': 0.4416, 'grad_norm': 4.343755722045898, 'learning_rate': 2.4194630872483224e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38964/75000 [36:20<30:25, 19.74it/s]

{'loss': 0.2426, 'grad_norm': 1.7376259565353394, 'learning_rate': 2.418791946308725e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38972/75000 [36:20<33:59, 17.67it/s]

{'loss': 0.2289, 'grad_norm': 1.6824322938919067, 'learning_rate': 2.4181208053691277e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38982/75000 [36:21<33:38, 17.85it/s]

{'loss': 0.3083, 'grad_norm': 2.340754270553589, 'learning_rate': 2.4174496644295302e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 38992/75000 [36:21<30:57, 19.39it/s]

{'loss': 0.3976, 'grad_norm': 0.9789800643920898, 'learning_rate': 2.4167785234899328e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39000/75000 [36:22<32:12, 18.62it/s]

{'loss': 0.3714, 'grad_norm': 6.752807140350342, 'learning_rate': 2.416107382550336e-05, 'epoch': 1.56}


                                                       
 52%|█████▏    | 39014/75000 [36:23<36:17, 16.52it/s]

{'loss': 0.2805, 'grad_norm': 3.440286874771118, 'learning_rate': 2.4154362416107385e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39022/75000 [36:24<32:17, 18.57it/s]

{'loss': 0.361, 'grad_norm': 1.499148964881897, 'learning_rate': 2.414765100671141e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39032/75000 [36:24<33:05, 18.11it/s]

{'loss': 0.3749, 'grad_norm': 4.026405334472656, 'learning_rate': 2.414093959731544e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39044/75000 [36:25<30:13, 19.83it/s]

{'loss': 0.1897, 'grad_norm': 4.104970455169678, 'learning_rate': 2.4134228187919464e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39051/75000 [36:25<33:14, 18.03it/s]

{'loss': 0.3086, 'grad_norm': 0.8833694458007812, 'learning_rate': 2.412751677852349e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39064/75000 [36:26<29:49, 20.08it/s]

{'loss': 0.2694, 'grad_norm': 0.6991683840751648, 'learning_rate': 2.412080536912752e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39072/75000 [36:26<32:01, 18.69it/s]

{'loss': 0.3701, 'grad_norm': 4.828401565551758, 'learning_rate': 2.4114093959731546e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39082/75000 [36:27<30:54, 19.37it/s]

{'loss': 0.3461, 'grad_norm': 1.1609371900558472, 'learning_rate': 2.410738255033557e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39092/75000 [36:27<34:35, 17.30it/s]

{'loss': 0.2396, 'grad_norm': 1.8812435865402222, 'learning_rate': 2.41006711409396e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39102/75000 [36:28<32:09, 18.60it/s]

{'loss': 0.3389, 'grad_norm': 0.7098276019096375, 'learning_rate': 2.4093959731543625e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39112/75000 [36:28<33:12, 18.01it/s]

{'loss': 0.2728, 'grad_norm': 0.9781144261360168, 'learning_rate': 2.408724832214765e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39121/75000 [36:29<32:32, 18.37it/s]

{'loss': 0.3953, 'grad_norm': 4.674906253814697, 'learning_rate': 2.408053691275168e-05, 'epoch': 1.56}


                                                     
 52%|█████▏    | 39134/75000 [36:29<29:38, 20.16it/s]

{'loss': 0.3127, 'grad_norm': 10.031585693359375, 'learning_rate': 2.4073825503355707e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39142/75000 [36:30<32:12, 18.56it/s]

{'loss': 0.4517, 'grad_norm': 6.8133931159973145, 'learning_rate': 2.4067114093959732e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39152/75000 [36:30<30:36, 19.52it/s]

{'loss': 0.2797, 'grad_norm': 1.2862437963485718, 'learning_rate': 2.406040268456376e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39162/75000 [36:31<33:06, 18.04it/s]

{'loss': 0.4583, 'grad_norm': 1.6761651039123535, 'learning_rate': 2.4053691275167786e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39174/75000 [36:31<29:56, 19.95it/s]

{'loss': 0.3319, 'grad_norm': 6.8696064949035645, 'learning_rate': 2.404697986577181e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39183/75000 [36:32<31:17, 19.07it/s]

{'loss': 0.291, 'grad_norm': 3.843153715133667, 'learning_rate': 2.404026845637584e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39194/75000 [36:33<30:48, 19.37it/s]

{'loss': 0.3424, 'grad_norm': 6.723512649536133, 'learning_rate': 2.4033557046979868e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39202/75000 [36:33<33:03, 18.05it/s]

{'loss': 0.3589, 'grad_norm': 8.387491226196289, 'learning_rate': 2.4026845637583893e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39212/75000 [36:33<31:00, 19.23it/s]

{'loss': 0.2947, 'grad_norm': 4.142980575561523, 'learning_rate': 2.4020134228187922e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39223/75000 [36:34<32:07, 18.56it/s]

{'loss': 0.3721, 'grad_norm': 1.2892259359359741, 'learning_rate': 2.4013422818791947e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39234/75000 [36:35<29:48, 20.00it/s]

{'loss': 0.2842, 'grad_norm': 2.4131453037261963, 'learning_rate': 2.4006711409395975e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39241/75000 [36:35<32:32, 18.31it/s]

{'loss': 0.4023, 'grad_norm': 3.2208685874938965, 'learning_rate': 2.4e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39253/75000 [36:36<29:37, 20.12it/s]

{'loss': 0.3314, 'grad_norm': 0.6840798258781433, 'learning_rate': 2.3993288590604026e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39263/75000 [36:36<33:51, 17.59it/s]

{'loss': 0.2744, 'grad_norm': 1.5022655725479126, 'learning_rate': 2.3986577181208054e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39274/75000 [36:37<30:14, 19.69it/s]

{'loss': 0.3952, 'grad_norm': 4.380686283111572, 'learning_rate': 2.3979865771812083e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39284/75000 [36:37<29:49, 19.96it/s]

{'loss': 0.4026, 'grad_norm': 3.2787086963653564, 'learning_rate': 2.3973154362416108e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39294/75000 [36:38<30:18, 19.64it/s]

{'loss': 0.3243, 'grad_norm': 7.527707099914551, 'learning_rate': 2.3966442953020137e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39303/75000 [36:38<31:56, 18.63it/s]

{'loss': 0.2002, 'grad_norm': 1.6037986278533936, 'learning_rate': 2.3959731543624162e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39312/75000 [36:39<30:49, 19.29it/s]

{'loss': 0.1566, 'grad_norm': 2.511140823364258, 'learning_rate': 2.3953020134228187e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39321/75000 [36:39<33:02, 18.00it/s]

{'loss': 0.3244, 'grad_norm': 11.426026344299316, 'learning_rate': 2.3946308724832215e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39332/75000 [36:40<30:36, 19.43it/s]

{'loss': 0.4796, 'grad_norm': 22.139741897583008, 'learning_rate': 2.3939597315436244e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39343/75000 [36:40<32:38, 18.20it/s]

{'loss': 0.3054, 'grad_norm': 5.544715404510498, 'learning_rate': 2.393288590604027e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39354/75000 [36:41<30:16, 19.62it/s]

{'loss': 0.33, 'grad_norm': 2.5742146968841553, 'learning_rate': 2.3926174496644298e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39363/75000 [36:41<32:08, 18.48it/s]

{'loss': 0.4018, 'grad_norm': 2.6673898696899414, 'learning_rate': 2.3919463087248323e-05, 'epoch': 1.57}


                                                     
 52%|█████▏    | 39372/75000 [36:42<34:16, 17.32it/s]

{'loss': 0.3945, 'grad_norm': 1.8927472829818726, 'learning_rate': 2.3912751677852348e-05, 'epoch': 1.57}


                                                     
 53%|█████▎    | 39382/75000 [36:42<32:47, 18.11it/s]

{'loss': 0.2632, 'grad_norm': 1.0403430461883545, 'learning_rate': 2.390604026845638e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39392/75000 [36:43<32:16, 18.39it/s]

{'loss': 0.4172, 'grad_norm': 5.724148750305176, 'learning_rate': 2.3899328859060405e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39402/75000 [36:44<31:28, 18.85it/s]

{'loss': 0.3087, 'grad_norm': 2.863986015319824, 'learning_rate': 2.389261744966443e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39412/75000 [36:44<32:59, 17.98it/s]

{'loss': 0.4024, 'grad_norm': 1.420245885848999, 'learning_rate': 2.388590604026846e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39422/75000 [36:45<30:43, 19.30it/s]

{'loss': 0.215, 'grad_norm': 1.85945463180542, 'learning_rate': 2.3879194630872484e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39431/75000 [36:45<35:51, 16.53it/s]

{'loss': 0.2892, 'grad_norm': 1.8377506732940674, 'learning_rate': 2.387248322147651e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39443/75000 [36:46<31:12, 18.99it/s]

{'loss': 0.4607, 'grad_norm': 4.2893452644348145, 'learning_rate': 2.3865771812080538e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39454/75000 [36:46<29:56, 19.78it/s]

{'loss': 0.3929, 'grad_norm': 1.0076404809951782, 'learning_rate': 2.3859060402684566e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39462/75000 [36:47<33:46, 17.54it/s]

{'loss': 0.3483, 'grad_norm': 1.62165367603302, 'learning_rate': 2.385234899328859e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39472/75000 [36:47<31:10, 18.99it/s]

{'loss': 0.2903, 'grad_norm': 1.7334452867507935, 'learning_rate': 2.384563758389262e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39481/75000 [36:48<32:28, 18.23it/s]

{'loss': 0.1753, 'grad_norm': 1.217280626296997, 'learning_rate': 2.3838926174496645e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39493/75000 [36:48<29:33, 20.02it/s]

{'loss': 0.368, 'grad_norm': 5.036666393280029, 'learning_rate': 2.383221476510067e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39500/75000 [36:49<34:05, 17.36it/s]

{'loss': 0.2064, 'grad_norm': 0.9495090842247009, 'learning_rate': 2.38255033557047e-05, 'epoch': 1.58}


                                                       
 53%|█████▎    | 39512/75000 [36:50<36:46, 16.08it/s]

{'loss': 0.3883, 'grad_norm': 3.4056060314178467, 'learning_rate': 2.3818791946308727e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39524/75000 [36:51<30:58, 19.09it/s]

{'loss': 0.3639, 'grad_norm': 5.923857688903809, 'learning_rate': 2.3812080536912752e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39532/75000 [36:51<32:26, 18.22it/s]

{'loss': 0.4011, 'grad_norm': 5.172817707061768, 'learning_rate': 2.380536912751678e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39541/75000 [36:51<32:00, 18.46it/s]

{'loss': 0.2632, 'grad_norm': 8.74596118927002, 'learning_rate': 2.3798657718120806e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39554/75000 [36:52<30:42, 19.24it/s]

{'loss': 0.3569, 'grad_norm': 9.546626091003418, 'learning_rate': 2.379194630872483e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39562/75000 [36:53<31:45, 18.60it/s]

{'loss': 0.2964, 'grad_norm': 7.001761436462402, 'learning_rate': 2.378523489932886e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39571/75000 [36:53<32:48, 18.00it/s]

{'loss': 0.3048, 'grad_norm': 2.8940088748931885, 'learning_rate': 2.3778523489932885e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39583/75000 [36:54<30:15, 19.51it/s]

{'loss': 0.2181, 'grad_norm': 1.6306463479995728, 'learning_rate': 2.3771812080536914e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39594/75000 [36:54<29:18, 20.13it/s]

{'loss': 0.3944, 'grad_norm': 5.469385623931885, 'learning_rate': 2.3765100671140942e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39603/75000 [36:55<30:49, 19.13it/s]

{'loss': 0.3063, 'grad_norm': 1.467720627784729, 'learning_rate': 2.3758389261744967e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39614/75000 [36:55<29:36, 19.92it/s]

{'loss': 0.3567, 'grad_norm': 4.387852668762207, 'learning_rate': 2.3751677852348992e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39621/75000 [36:56<33:02, 17.84it/s]

{'loss': 0.4263, 'grad_norm': 1.6162735223770142, 'learning_rate': 2.374496644295302e-05, 'epoch': 1.58}


                                                     
 53%|█████▎    | 39633/75000 [36:56<30:00, 19.65it/s]

{'loss': 0.4138, 'grad_norm': 1.421435832977295, 'learning_rate': 2.3738255033557046e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39641/75000 [36:57<30:10, 19.53it/s]

{'loss': 0.4602, 'grad_norm': 3.9070706367492676, 'learning_rate': 2.3731543624161075e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39654/75000 [36:57<30:18, 19.44it/s]

{'loss': 0.3171, 'grad_norm': 1.8701105117797852, 'learning_rate': 2.3724832214765103e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39664/75000 [36:58<29:51, 19.73it/s]

{'loss': 0.3337, 'grad_norm': 9.866394996643066, 'learning_rate': 2.371812080536913e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39672/75000 [36:58<32:00, 18.39it/s]

{'loss': 0.2317, 'grad_norm': 1.3010120391845703, 'learning_rate': 2.3711409395973157e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39684/75000 [36:59<29:13, 20.14it/s]

{'loss': 0.2814, 'grad_norm': 3.3272194862365723, 'learning_rate': 2.3704697986577182e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39693/75000 [36:59<29:44, 19.78it/s]

{'loss': 0.3487, 'grad_norm': 1.5225841999053955, 'learning_rate': 2.3697986577181207e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39702/75000 [37:00<30:12, 19.48it/s]

{'loss': 0.3669, 'grad_norm': 3.6249005794525146, 'learning_rate': 2.3691275167785236e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39711/75000 [37:00<31:36, 18.61it/s]

{'loss': 0.3416, 'grad_norm': 2.2788031101226807, 'learning_rate': 2.3684563758389264e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39723/75000 [37:01<30:39, 19.18it/s]

{'loss': 0.2418, 'grad_norm': 0.48363614082336426, 'learning_rate': 2.367785234899329e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39734/75000 [37:01<29:35, 19.86it/s]

{'loss': 0.3849, 'grad_norm': 2.6822996139526367, 'learning_rate': 2.3671140939597318e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39743/75000 [37:02<31:50, 18.46it/s]

{'loss': 0.3038, 'grad_norm': 3.097964286804199, 'learning_rate': 2.3664429530201343e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39752/75000 [37:02<31:06, 18.89it/s]

{'loss': 0.2526, 'grad_norm': 2.7319765090942383, 'learning_rate': 2.365771812080537e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39762/75000 [37:03<29:54, 19.63it/s]

{'loss': 0.4329, 'grad_norm': 4.130671501159668, 'learning_rate': 2.3651006711409397e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39772/75000 [37:04<33:05, 17.74it/s]

{'loss': 0.1677, 'grad_norm': 1.5301142930984497, 'learning_rate': 2.3644295302013426e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39784/75000 [37:04<32:13, 18.21it/s]

{'loss': 0.3955, 'grad_norm': 2.820538282394409, 'learning_rate': 2.363758389261745e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39794/75000 [37:05<29:42, 19.75it/s]

{'loss': 0.2931, 'grad_norm': 2.5471901893615723, 'learning_rate': 2.363087248322148e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39802/75000 [37:05<32:54, 17.83it/s]

{'loss': 0.2968, 'grad_norm': 2.153696060180664, 'learning_rate': 2.3624161073825504e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39812/75000 [37:06<31:23, 18.68it/s]

{'loss': 0.1893, 'grad_norm': 0.5601394176483154, 'learning_rate': 2.361744966442953e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39822/75000 [37:06<29:26, 19.91it/s]

{'loss': 0.3704, 'grad_norm': 7.781490325927734, 'learning_rate': 2.3610738255033558e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39834/75000 [37:07<29:13, 20.06it/s]

{'loss': 0.3615, 'grad_norm': 2.5039141178131104, 'learning_rate': 2.3604026845637587e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39844/75000 [37:07<30:16, 19.35it/s]

{'loss': 0.255, 'grad_norm': 1.433269739151001, 'learning_rate': 2.3597315436241612e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39853/75000 [37:08<30:39, 19.11it/s]

{'loss': 0.401, 'grad_norm': 5.171721458435059, 'learning_rate': 2.359060402684564e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39864/75000 [37:08<29:29, 19.86it/s]

{'loss': 0.3576, 'grad_norm': 0.9697616100311279, 'learning_rate': 2.3583892617449665e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39872/75000 [37:09<34:02, 17.20it/s]

{'loss': 0.2628, 'grad_norm': 2.9346699714660645, 'learning_rate': 2.357718120805369e-05, 'epoch': 1.59}


                                                     
 53%|█████▎    | 39882/75000 [37:09<30:29, 19.20it/s]

{'loss': 0.2355, 'grad_norm': 2.790440320968628, 'learning_rate': 2.357046979865772e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39894/75000 [37:10<30:15, 19.34it/s]

{'loss': 0.2755, 'grad_norm': 2.844496726989746, 'learning_rate': 2.3563758389261744e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39904/75000 [37:10<30:00, 19.49it/s]

{'loss': 0.3191, 'grad_norm': 5.962985515594482, 'learning_rate': 2.3557046979865773e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39914/75000 [37:11<29:28, 19.84it/s]

{'loss': 0.272, 'grad_norm': 1.455639362335205, 'learning_rate': 2.35503355704698e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39922/75000 [37:11<37:08, 15.74it/s]

{'loss': 0.3075, 'grad_norm': 4.888547420501709, 'learning_rate': 2.3543624161073827e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39932/75000 [37:12<35:14, 16.59it/s]

{'loss': 0.3607, 'grad_norm': 4.209622383117676, 'learning_rate': 2.3536912751677852e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39942/75000 [37:13<36:48, 15.87it/s]

{'loss': 0.3801, 'grad_norm': 4.567533016204834, 'learning_rate': 2.353020134228188e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39952/75000 [37:13<33:09, 17.62it/s]

{'loss': 0.297, 'grad_norm': 5.473546504974365, 'learning_rate': 2.3523489932885905e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39963/75000 [37:14<34:36, 16.87it/s]

{'loss': 0.289, 'grad_norm': 3.4902641773223877, 'learning_rate': 2.3516778523489934e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39972/75000 [37:14<31:59, 18.25it/s]

{'loss': 0.3601, 'grad_norm': 2.289719581604004, 'learning_rate': 2.3510067114093963e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39982/75000 [37:15<32:41, 17.85it/s]

{'loss': 0.3596, 'grad_norm': 9.361519813537598, 'learning_rate': 2.3503355704697988e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 39992/75000 [37:15<30:44, 18.98it/s]

{'loss': 0.334, 'grad_norm': 2.5902247428894043, 'learning_rate': 2.3496644295302013e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40000/75000 [37:16<31:04, 18.77it/s]

{'loss': 0.4379, 'grad_norm': 7.780309200286865, 'learning_rate': 2.348993288590604e-05, 'epoch': 1.6}


                                                       
 53%|█████▎    | 40013/75000 [37:17<42:47, 13.63it/s]

{'loss': 0.3155, 'grad_norm': 2.4059336185455322, 'learning_rate': 2.3483221476510067e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40022/75000 [37:18<35:53, 16.24it/s]

{'loss': 0.3968, 'grad_norm': 3.6304147243499756, 'learning_rate': 2.3476510067114095e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40034/75000 [37:19<31:02, 18.77it/s]

{'loss': 0.2621, 'grad_norm': 3.9052951335906982, 'learning_rate': 2.3469798657718124e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40044/75000 [37:19<31:11, 18.68it/s]

{'loss': 0.2842, 'grad_norm': 4.176718711853027, 'learning_rate': 2.346308724832215e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40052/75000 [37:20<30:42, 18.97it/s]

{'loss': 0.2693, 'grad_norm': 1.3653839826583862, 'learning_rate': 2.3456375838926174e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40064/75000 [37:20<29:19, 19.85it/s]

{'loss': 0.3601, 'grad_norm': 6.285655975341797, 'learning_rate': 2.3449664429530203e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40072/75000 [37:21<30:19, 19.20it/s]

{'loss': 0.2022, 'grad_norm': 2.8023998737335205, 'learning_rate': 2.3442953020134228e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40083/75000 [37:21<31:18, 18.58it/s]

{'loss': 0.4709, 'grad_norm': 1.1801096200942993, 'learning_rate': 2.3436241610738256e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40094/75000 [37:22<30:05, 19.33it/s]

{'loss': 0.3242, 'grad_norm': 3.5774595737457275, 'learning_rate': 2.3429530201342285e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40104/75000 [37:22<33:00, 17.62it/s]

{'loss': 0.2106, 'grad_norm': 3.3514957427978516, 'learning_rate': 2.342281879194631e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40111/75000 [37:23<31:34, 18.42it/s]

{'loss': 0.2895, 'grad_norm': 2.945990562438965, 'learning_rate': 2.341610738255034e-05, 'epoch': 1.6}


                                                     
 53%|█████▎    | 40122/75000 [37:23<31:05, 18.70it/s]

{'loss': 0.3905, 'grad_norm': 8.56373119354248, 'learning_rate': 2.3409395973154364e-05, 'epoch': 1.6}


                                                     
 54%|█████▎    | 40131/75000 [37:24<33:47, 17.20it/s]

{'loss': 0.3621, 'grad_norm': 5.928419589996338, 'learning_rate': 2.340268456375839e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40142/75000 [37:24<31:49, 18.26it/s]

{'loss': 0.268, 'grad_norm': 9.695630073547363, 'learning_rate': 2.3395973154362417e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40153/75000 [37:25<34:32, 16.81it/s]

{'loss': 0.3082, 'grad_norm': 4.203575611114502, 'learning_rate': 2.3389261744966446e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40163/75000 [37:26<33:04, 17.56it/s]

{'loss': 0.291, 'grad_norm': 1.6572604179382324, 'learning_rate': 2.338255033557047e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40173/75000 [37:26<34:12, 16.96it/s]

{'loss': 0.3575, 'grad_norm': 3.3799421787261963, 'learning_rate': 2.33758389261745e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40183/75000 [37:27<31:11, 18.61it/s]

{'loss': 0.465, 'grad_norm': 1.9265468120574951, 'learning_rate': 2.3369127516778525e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40194/75000 [37:27<29:25, 19.71it/s]

{'loss': 0.2788, 'grad_norm': 0.87041175365448, 'learning_rate': 2.336241610738255e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40203/75000 [37:28<30:20, 19.11it/s]

{'loss': 0.3263, 'grad_norm': 0.9565212726593018, 'learning_rate': 2.335570469798658e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40213/75000 [37:28<31:03, 18.67it/s]

{'loss': 0.364, 'grad_norm': 1.4955253601074219, 'learning_rate': 2.3348993288590607e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40222/75000 [37:29<31:30, 18.40it/s]

{'loss': 0.3602, 'grad_norm': 3.183720111846924, 'learning_rate': 2.3342281879194632e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40234/75000 [37:29<28:36, 20.25it/s]

{'loss': 0.3582, 'grad_norm': 2.1576733589172363, 'learning_rate': 2.333557046979866e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40243/75000 [37:30<28:51, 20.08it/s]

{'loss': 0.2914, 'grad_norm': 2.436927318572998, 'learning_rate': 2.3328859060402686e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40252/75000 [37:30<28:47, 20.11it/s]

{'loss': 0.2976, 'grad_norm': 1.8221733570098877, 'learning_rate': 2.332214765100671e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40261/75000 [37:31<29:11, 19.83it/s]

{'loss': 0.286, 'grad_norm': 4.795285701751709, 'learning_rate': 2.331543624161074e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40273/75000 [37:31<27:33, 21.01it/s]

{'loss': 0.2945, 'grad_norm': 4.398418426513672, 'learning_rate': 2.3308724832214765e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40282/75000 [37:32<28:45, 20.12it/s]

{'loss': 0.3049, 'grad_norm': 4.788794040679932, 'learning_rate': 2.3302013422818793e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40294/75000 [37:32<27:49, 20.79it/s]

{'loss': 0.2845, 'grad_norm': 2.04366397857666, 'learning_rate': 2.3295302013422822e-05, 'epoch': 1.61}


                                                     
 54%|█████▎    | 40303/75000 [37:33<29:47, 19.41it/s]

{'loss': 0.2593, 'grad_norm': 1.171069622039795, 'learning_rate': 2.3288590604026847e-05, 'epoch': 1.61}


                                                     
 54%|█████▍    | 40313/75000 [37:33<29:10, 19.81it/s]

{'loss': 0.2868, 'grad_norm': 2.5584003925323486, 'learning_rate': 2.3281879194630872e-05, 'epoch': 1.61}


                                                     
 54%|█████▍    | 40322/75000 [37:34<29:27, 19.62it/s]

{'loss': 0.3135, 'grad_norm': 3.903719425201416, 'learning_rate': 2.32751677852349e-05, 'epoch': 1.61}


                                                     
 54%|█████▍    | 40334/75000 [37:34<28:12, 20.48it/s]

{'loss': 0.3594, 'grad_norm': 4.058134078979492, 'learning_rate': 2.3268456375838926e-05, 'epoch': 1.61}


                                                     
 54%|█████▍    | 40343/75000 [37:35<28:58, 19.93it/s]

{'loss': 0.2805, 'grad_norm': 11.100954055786133, 'learning_rate': 2.3261744966442954e-05, 'epoch': 1.61}


                                                     
 54%|█████▍    | 40352/75000 [37:35<28:38, 20.16it/s]

{'loss': 0.404, 'grad_norm': 5.442265033721924, 'learning_rate': 2.3255033557046983e-05, 'epoch': 1.61}


                                                     
 54%|█████▍    | 40361/75000 [37:36<29:21, 19.67it/s]

{'loss': 0.2487, 'grad_norm': 7.7715163230896, 'learning_rate': 2.3248322147651008e-05, 'epoch': 1.61}


                                                     
 54%|█████▍    | 40372/75000 [37:36<27:58, 20.63it/s]

{'loss': 0.326, 'grad_norm': 2.0961053371429443, 'learning_rate': 2.3241610738255033e-05, 'epoch': 1.61}


                                                     
 54%|█████▍    | 40384/75000 [37:37<27:09, 21.25it/s]

{'loss': 0.3414, 'grad_norm': 2.412078857421875, 'learning_rate': 2.3234899328859062e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40393/75000 [37:37<28:15, 20.41it/s]

{'loss': 0.3274, 'grad_norm': 2.3518335819244385, 'learning_rate': 2.3228187919463087e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40404/75000 [37:38<28:44, 20.07it/s]

{'loss': 0.3138, 'grad_norm': 2.1589949131011963, 'learning_rate': 2.3221476510067116e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40413/75000 [37:38<30:52, 18.67it/s]

{'loss': 0.4689, 'grad_norm': 2.9002606868743896, 'learning_rate': 2.3214765100671144e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40424/75000 [37:39<28:04, 20.52it/s]

{'loss': 0.369, 'grad_norm': 5.224120140075684, 'learning_rate': 2.320805369127517e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40433/75000 [37:39<28:12, 20.43it/s]

{'loss': 0.3065, 'grad_norm': 4.0248565673828125, 'learning_rate': 2.3201342281879194e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40442/75000 [37:40<29:01, 19.84it/s]

{'loss': 0.3293, 'grad_norm': 4.91010046005249, 'learning_rate': 2.3194630872483223e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40452/75000 [37:40<29:20, 19.62it/s]

{'loss': 0.266, 'grad_norm': 1.4968035221099854, 'learning_rate': 2.3187919463087248e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40464/75000 [37:41<27:40, 20.80it/s]

{'loss': 0.2456, 'grad_norm': 6.967615604400635, 'learning_rate': 2.3181208053691277e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40472/75000 [37:41<29:38, 19.41it/s]

{'loss': 0.3384, 'grad_norm': 3.0914015769958496, 'learning_rate': 2.3174496644295305e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40482/75000 [37:42<29:45, 19.33it/s]

{'loss': 0.367, 'grad_norm': 10.502968788146973, 'learning_rate': 2.316778523489933e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40494/75000 [37:42<28:54, 19.89it/s]

{'loss': 0.3076, 'grad_norm': 0.9084407091140747, 'learning_rate': 2.3161073825503356e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40500/75000 [37:43<28:49, 19.95it/s]

{'loss': 0.1523, 'grad_norm': 0.8498949408531189, 'learning_rate': 2.3154362416107384e-05, 'epoch': 1.62}


                                                       
 54%|█████▍    | 40512/75000 [37:44<37:28, 15.34it/s]

{'loss': 0.5, 'grad_norm': 2.507415533065796, 'learning_rate': 2.314765100671141e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40522/75000 [37:44<31:11, 18.42it/s]

{'loss': 0.4376, 'grad_norm': 2.456120014190674, 'learning_rate': 2.3140939597315438e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40532/75000 [37:45<29:39, 19.36it/s]

{'loss': 0.2496, 'grad_norm': 3.1014344692230225, 'learning_rate': 2.3134228187919466e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40544/75000 [37:45<27:51, 20.61it/s]

{'loss': 0.3642, 'grad_norm': 4.024404048919678, 'learning_rate': 2.312751677852349e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40553/75000 [37:46<29:15, 19.62it/s]

{'loss': 0.2987, 'grad_norm': 2.9328696727752686, 'learning_rate': 2.312080536912752e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40563/75000 [37:46<28:37, 20.05it/s]

{'loss': 0.3651, 'grad_norm': 6.0762434005737305, 'learning_rate': 2.3114093959731545e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40572/75000 [37:47<29:52, 19.21it/s]

{'loss': 0.288, 'grad_norm': 4.084914684295654, 'learning_rate': 2.310738255033557e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40583/75000 [37:47<28:10, 20.36it/s]

{'loss': 0.3129, 'grad_norm': 1.3037306070327759, 'learning_rate': 2.31006711409396e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40592/75000 [37:48<28:22, 20.22it/s]

{'loss': 0.2591, 'grad_norm': 6.919495582580566, 'learning_rate': 2.3093959731543624e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40604/75000 [37:48<27:21, 20.95it/s]

{'loss': 0.2935, 'grad_norm': 3.8266732692718506, 'learning_rate': 2.3087248322147653e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40613/75000 [37:49<28:35, 20.05it/s]

{'loss': 0.3511, 'grad_norm': 3.245819568634033, 'learning_rate': 2.308053691275168e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40622/75000 [37:49<27:39, 20.72it/s]

{'loss': 0.27, 'grad_norm': 1.400971531867981, 'learning_rate': 2.3073825503355706e-05, 'epoch': 1.62}


                                                     
 54%|█████▍    | 40634/75000 [37:50<28:13, 20.30it/s]

{'loss': 0.3629, 'grad_norm': 2.048685312271118, 'learning_rate': 2.306711409395973e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40643/75000 [37:50<28:18, 20.23it/s]

{'loss': 0.1866, 'grad_norm': 4.387421131134033, 'learning_rate': 2.306040268456376e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40651/75000 [37:51<29:58, 19.10it/s]

{'loss': 0.4656, 'grad_norm': 2.8255703449249268, 'learning_rate': 2.3053691275167785e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40662/75000 [37:51<27:37, 20.72it/s]

{'loss': 0.3249, 'grad_norm': 5.014060974121094, 'learning_rate': 2.3046979865771814e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40673/75000 [37:52<29:55, 19.12it/s]

{'loss': 0.3137, 'grad_norm': 3.757772922515869, 'learning_rate': 2.3040268456375842e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40682/75000 [37:52<28:49, 19.84it/s]

{'loss': 0.2342, 'grad_norm': 1.8405221700668335, 'learning_rate': 2.3033557046979867e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40694/75000 [37:53<28:45, 19.88it/s]

{'loss': 0.2415, 'grad_norm': 2.229910373687744, 'learning_rate': 2.3026845637583893e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40703/75000 [37:53<27:56, 20.46it/s]

{'loss': 0.406, 'grad_norm': 5.11330509185791, 'learning_rate': 2.302013422818792e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40713/75000 [37:54<30:22, 18.82it/s]

{'loss': 0.4853, 'grad_norm': 1.3148245811462402, 'learning_rate': 2.3013422818791946e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40721/75000 [37:54<29:45, 19.20it/s]

{'loss': 0.3175, 'grad_norm': 2.0928988456726074, 'learning_rate': 2.300671140939597e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40733/75000 [37:55<26:57, 21.19it/s]

{'loss': 0.2542, 'grad_norm': 0.9590249061584473, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40742/75000 [37:55<28:27, 20.07it/s]

{'loss': 0.365, 'grad_norm': 2.7760488986968994, 'learning_rate': 2.299328859060403e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40754/75000 [37:56<28:28, 20.04it/s]

{'loss': 0.3015, 'grad_norm': 5.083582401275635, 'learning_rate': 2.2986577181208054e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40764/75000 [37:56<28:13, 20.22it/s]

{'loss': 0.3173, 'grad_norm': 1.8891435861587524, 'learning_rate': 2.2979865771812082e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40773/75000 [37:57<27:37, 20.65it/s]

{'loss': 0.3717, 'grad_norm': 1.8718311786651611, 'learning_rate': 2.2973154362416107e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40782/75000 [37:57<28:40, 19.88it/s]

{'loss': 0.4098, 'grad_norm': 1.4520702362060547, 'learning_rate': 2.2966442953020133e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40794/75000 [37:58<27:26, 20.78it/s]

{'loss': 0.3215, 'grad_norm': 10.695514678955078, 'learning_rate': 2.2959731543624165e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40803/75000 [37:58<28:13, 20.20it/s]

{'loss': 0.4447, 'grad_norm': 4.717335224151611, 'learning_rate': 2.295302013422819e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40812/75000 [37:59<27:14, 20.91it/s]

{'loss': 0.4773, 'grad_norm': 0.7152092456817627, 'learning_rate': 2.2946308724832215e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40821/75000 [37:59<28:10, 20.22it/s]

{'loss': 0.3719, 'grad_norm': 1.877062439918518, 'learning_rate': 2.2939597315436243e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40833/75000 [38:00<27:17, 20.87it/s]

{'loss': 0.3229, 'grad_norm': 3.737522602081299, 'learning_rate': 2.293288590604027e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40842/75000 [38:00<27:48, 20.47it/s]

{'loss': 0.3792, 'grad_norm': 2.2264089584350586, 'learning_rate': 2.2926174496644294e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40854/75000 [38:01<28:18, 20.11it/s]

{'loss': 0.3076, 'grad_norm': 4.003272533416748, 'learning_rate': 2.2919463087248326e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40863/75000 [38:01<29:44, 19.13it/s]

{'loss': 0.3116, 'grad_norm': 3.412978172302246, 'learning_rate': 2.291275167785235e-05, 'epoch': 1.63}


                                                     
 54%|█████▍    | 40872/75000 [38:02<28:17, 20.11it/s]

{'loss': 0.3453, 'grad_norm': 4.600894927978516, 'learning_rate': 2.2906040268456376e-05, 'epoch': 1.63}


                                                     
 55%|█████▍    | 40881/75000 [38:02<28:54, 19.67it/s]

{'loss': 0.2939, 'grad_norm': 1.6721677780151367, 'learning_rate': 2.2899328859060405e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40893/75000 [38:03<27:37, 20.58it/s]

{'loss': 0.2684, 'grad_norm': 2.5867526531219482, 'learning_rate': 2.289261744966443e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40902/75000 [38:03<27:19, 20.80it/s]

{'loss': 0.3957, 'grad_norm': 12.099955558776855, 'learning_rate': 2.2885906040268458e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40914/75000 [38:04<27:58, 20.31it/s]

{'loss': 0.2708, 'grad_norm': 9.162126541137695, 'learning_rate': 2.2879194630872483e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40923/75000 [38:04<27:49, 20.41it/s]

{'loss': 0.2723, 'grad_norm': 8.812807083129883, 'learning_rate': 2.2872483221476512e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40932/75000 [38:05<28:22, 20.02it/s]

{'loss': 0.2508, 'grad_norm': 2.6663742065429688, 'learning_rate': 2.2865771812080537e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40944/75000 [38:05<27:37, 20.55it/s]

{'loss': 0.3194, 'grad_norm': 1.8165173530578613, 'learning_rate': 2.2859060402684566e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40952/75000 [38:06<29:27, 19.27it/s]

{'loss': 0.3409, 'grad_norm': 3.2520089149475098, 'learning_rate': 2.285234899328859e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40964/75000 [38:06<27:56, 20.30it/s]

{'loss': 0.2508, 'grad_norm': 4.24273681640625, 'learning_rate': 2.284563758389262e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40974/75000 [38:07<28:53, 19.63it/s]

{'loss': 0.3133, 'grad_norm': 4.77993106842041, 'learning_rate': 2.2838926174496644e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40983/75000 [38:07<27:25, 20.67it/s]

{'loss': 0.2538, 'grad_norm': 3.3616743087768555, 'learning_rate': 2.2832214765100673e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 40992/75000 [38:08<27:58, 20.26it/s]

{'loss': 0.2727, 'grad_norm': 2.2621641159057617, 'learning_rate': 2.2825503355704698e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41000/75000 [38:08<26:42, 21.22it/s]

{'loss': 0.308, 'grad_norm': 10.485332489013672, 'learning_rate': 2.2818791946308727e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41012/75000 [38:09<34:31, 16.41it/s]

{'loss': 0.2989, 'grad_norm': 4.010373115539551, 'learning_rate': 2.2812080536912752e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41022/75000 [38:10<31:21, 18.06it/s]

{'loss': 0.354, 'grad_norm': 0.7299585342407227, 'learning_rate': 2.280536912751678e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41033/75000 [38:10<28:23, 19.94it/s]

{'loss': 0.23, 'grad_norm': 6.583865165710449, 'learning_rate': 2.2798657718120806e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41042/75000 [38:11<29:15, 19.34it/s]

{'loss': 0.2525, 'grad_norm': 2.6695163249969482, 'learning_rate': 2.279194630872483e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41054/75000 [38:11<27:36, 20.49it/s]

{'loss': 0.2606, 'grad_norm': 2.9927337169647217, 'learning_rate': 2.2785234899328863e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41062/75000 [38:11<29:36, 19.11it/s]

{'loss': 0.2989, 'grad_norm': 3.170896530151367, 'learning_rate': 2.2778523489932888e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41074/75000 [38:12<27:34, 20.51it/s]

{'loss': 0.2842, 'grad_norm': 1.2361704111099243, 'learning_rate': 2.2771812080536913e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41082/75000 [38:12<29:48, 18.96it/s]

{'loss': 0.3218, 'grad_norm': 2.7609317302703857, 'learning_rate': 2.276510067114094e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41093/75000 [38:13<27:46, 20.35it/s]

{'loss': 0.2942, 'grad_norm': 14.377630233764648, 'learning_rate': 2.2758389261744967e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41102/75000 [38:13<28:51, 19.58it/s]

{'loss': 0.331, 'grad_norm': 2.979269027709961, 'learning_rate': 2.2751677852348992e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41114/75000 [38:14<27:24, 20.60it/s]

{'loss': 0.2894, 'grad_norm': 2.0951027870178223, 'learning_rate': 2.2744966442953024e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41123/75000 [38:15<29:06, 19.40it/s]

{'loss': 0.3203, 'grad_norm': 8.650238990783691, 'learning_rate': 2.273825503355705e-05, 'epoch': 1.64}


                                                     
 55%|█████▍    | 41132/75000 [38:15<27:55, 20.21it/s]

{'loss': 0.2567, 'grad_norm': 3.931964874267578, 'learning_rate': 2.2731543624161074e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41142/75000 [38:16<29:40, 19.01it/s]

{'loss': 0.3348, 'grad_norm': 4.518136978149414, 'learning_rate': 2.2724832214765103e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41152/75000 [38:16<28:55, 19.51it/s]

{'loss': 0.2626, 'grad_norm': 3.4115958213806152, 'learning_rate': 2.2718120805369128e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41162/75000 [38:17<30:10, 18.69it/s]

{'loss': 0.3193, 'grad_norm': 5.315500736236572, 'learning_rate': 2.2711409395973153e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41174/75000 [38:17<27:42, 20.35it/s]

{'loss': 0.3801, 'grad_norm': 1.2318848371505737, 'learning_rate': 2.2704697986577185e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41183/75000 [38:18<28:35, 19.71it/s]

{'loss': 0.3336, 'grad_norm': 1.521847128868103, 'learning_rate': 2.269798657718121e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41192/75000 [38:18<28:03, 20.09it/s]

{'loss': 0.3369, 'grad_norm': 8.015551567077637, 'learning_rate': 2.2691275167785235e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41204/75000 [38:19<27:44, 20.30it/s]

{'loss': 0.2824, 'grad_norm': 1.6958668231964111, 'learning_rate': 2.2684563758389264e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41213/75000 [38:19<27:48, 20.25it/s]

{'loss': 0.3419, 'grad_norm': 2.2528481483459473, 'learning_rate': 2.267785234899329e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41222/75000 [38:20<28:43, 19.60it/s]

{'loss': 0.3111, 'grad_norm': 4.233823299407959, 'learning_rate': 2.2671140939597314e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41234/75000 [38:20<27:31, 20.45it/s]

{'loss': 0.3613, 'grad_norm': 4.130523204803467, 'learning_rate': 2.2664429530201343e-05, 'epoch': 1.65}


                                                     
 55%|█████▍    | 41242/75000 [38:21<29:06, 19.33it/s]

{'loss': 0.1855, 'grad_norm': 0.3688621520996094, 'learning_rate': 2.265771812080537e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41253/75000 [38:21<27:34, 20.40it/s]

{'loss': 0.2933, 'grad_norm': 1.3383886814117432, 'learning_rate': 2.2651006711409396e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41262/75000 [38:21<28:07, 20.00it/s]

{'loss': 0.2566, 'grad_norm': 3.6903738975524902, 'learning_rate': 2.2644295302013425e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41272/75000 [38:22<29:06, 19.31it/s]

{'loss': 0.5053, 'grad_norm': 1.848801612854004, 'learning_rate': 2.263758389261745e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41282/75000 [38:23<29:25, 19.10it/s]

{'loss': 0.3387, 'grad_norm': 5.113147258758545, 'learning_rate': 2.2630872483221475e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41293/75000 [38:23<30:19, 18.53it/s]

{'loss': 0.3062, 'grad_norm': 13.42497730255127, 'learning_rate': 2.2624161073825504e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41302/75000 [38:24<29:20, 19.14it/s]

{'loss': 0.4182, 'grad_norm': 2.3258302211761475, 'learning_rate': 2.2617449664429532e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41311/75000 [38:24<36:35, 15.34it/s]

{'loss': 0.2598, 'grad_norm': 1.2441959381103516, 'learning_rate': 2.2610738255033557e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41323/75000 [38:25<34:32, 16.25it/s]

{'loss': 0.3515, 'grad_norm': 4.292705059051514, 'learning_rate': 2.2604026845637586e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41331/75000 [38:25<40:49, 13.74it/s]

{'loss': 0.3729, 'grad_norm': 1.774017095565796, 'learning_rate': 2.259731543624161e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41342/75000 [38:26<34:51, 16.09it/s]

{'loss': 0.3318, 'grad_norm': 3.7193281650543213, 'learning_rate': 2.259060402684564e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41352/75000 [38:27<30:27, 18.41it/s]

{'loss': 0.2692, 'grad_norm': 2.373917818069458, 'learning_rate': 2.2583892617449665e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41362/75000 [38:27<29:25, 19.05it/s]

{'loss': 0.2807, 'grad_norm': 2.62847900390625, 'learning_rate': 2.2577181208053693e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41372/75000 [38:28<29:02, 19.29it/s]

{'loss': 0.3495, 'grad_norm': 4.6510210037231445, 'learning_rate': 2.257046979865772e-05, 'epoch': 1.65}


                                                     
 55%|█████▌    | 41383/75000 [38:28<28:27, 19.68it/s]

{'loss': 0.3637, 'grad_norm': 2.0794479846954346, 'learning_rate': 2.2563758389261747e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41392/75000 [38:29<29:36, 18.92it/s]

{'loss': 0.2991, 'grad_norm': 15.382430076599121, 'learning_rate': 2.2557046979865772e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41402/75000 [38:29<29:15, 19.14it/s]

{'loss': 0.4041, 'grad_norm': 5.297412872314453, 'learning_rate': 2.25503355704698e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41412/75000 [38:30<29:08, 19.21it/s]

{'loss': 0.4338, 'grad_norm': 0.8013997077941895, 'learning_rate': 2.2543624161073826e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41424/75000 [38:30<28:23, 19.71it/s]

{'loss': 0.2029, 'grad_norm': 1.8511029481887817, 'learning_rate': 2.253691275167785e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41434/75000 [38:31<28:41, 19.50it/s]

{'loss': 0.1719, 'grad_norm': 2.3469953536987305, 'learning_rate': 2.253020134228188e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41441/75000 [38:31<29:14, 19.13it/s]

{'loss': 0.3259, 'grad_norm': 4.490511894226074, 'learning_rate': 2.2523489932885908e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41454/75000 [38:32<27:41, 20.19it/s]

{'loss': 0.2734, 'grad_norm': 3.381232738494873, 'learning_rate': 2.2516778523489933e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41463/75000 [38:32<29:46, 18.77it/s]

{'loss': 0.3019, 'grad_norm': 2.5545947551727295, 'learning_rate': 2.2510067114093962e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41472/75000 [38:33<29:37, 18.86it/s]

{'loss': 0.435, 'grad_norm': 5.6175947189331055, 'learning_rate': 2.2503355704697987e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41483/75000 [38:33<29:26, 18.97it/s]

{'loss': 0.3187, 'grad_norm': 3.3146111965179443, 'learning_rate': 2.2496644295302012e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41492/75000 [38:34<30:33, 18.27it/s]

{'loss': 0.2684, 'grad_norm': 4.92764139175415, 'learning_rate': 2.2489932885906044e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41500/75000 [38:34<28:45, 19.42it/s]

{'loss': 0.2048, 'grad_norm': 1.6366546154022217, 'learning_rate': 2.248322147651007e-05, 'epoch': 1.66}


                                                       
 55%|█████▌    | 41513/75000 [38:35<35:19, 15.80it/s]

{'loss': 0.3332, 'grad_norm': 19.010339736938477, 'learning_rate': 2.2476510067114095e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41521/75000 [38:36<31:08, 17.92it/s]

{'loss': 0.4704, 'grad_norm': 8.834417343139648, 'learning_rate': 2.2469798657718123e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41532/75000 [38:36<31:03, 17.96it/s]

{'loss': 0.2401, 'grad_norm': 5.271615505218506, 'learning_rate': 2.2463087248322148e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41542/75000 [38:37<34:19, 16.25it/s]

{'loss': 0.2849, 'grad_norm': 0.465236097574234, 'learning_rate': 2.2456375838926173e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41551/75000 [38:37<30:40, 18.17it/s]

{'loss': 0.3489, 'grad_norm': 7.97816801071167, 'learning_rate': 2.2449664429530202e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41562/75000 [38:38<29:49, 18.68it/s]

{'loss': 0.3307, 'grad_norm': 8.374983787536621, 'learning_rate': 2.244295302013423e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41571/75000 [38:39<33:03, 16.85it/s]

{'loss': 0.311, 'grad_norm': 5.7426018714904785, 'learning_rate': 2.2436241610738256e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41583/75000 [38:39<28:54, 19.26it/s]

{'loss': 0.4062, 'grad_norm': 3.730405807495117, 'learning_rate': 2.2429530201342284e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41592/75000 [38:40<31:50, 17.49it/s]

{'loss': 0.2493, 'grad_norm': 5.1895904541015625, 'learning_rate': 2.242281879194631e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41602/75000 [38:40<29:17, 19.01it/s]

{'loss': 0.2857, 'grad_norm': 0.9345792531967163, 'learning_rate': 2.2416107382550335e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41613/75000 [38:41<30:49, 18.05it/s]

{'loss': 0.3567, 'grad_norm': 5.951272010803223, 'learning_rate': 2.2409395973154363e-05, 'epoch': 1.66}


                                                     
 55%|█████▌    | 41622/75000 [38:41<30:24, 18.29it/s]

{'loss': 0.4424, 'grad_norm': 1.4087060689926147, 'learning_rate': 2.240268456375839e-05, 'epoch': 1.66}


                                                     
 56%|█████▌    | 41632/75000 [38:42<28:54, 19.24it/s]

{'loss': 0.3463, 'grad_norm': 5.109470844268799, 'learning_rate': 2.2395973154362417e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41644/75000 [38:42<28:31, 19.49it/s]

{'loss': 0.2737, 'grad_norm': 2.9743266105651855, 'learning_rate': 2.2389261744966445e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41652/75000 [38:43<28:46, 19.31it/s]

{'loss': 0.2842, 'grad_norm': 8.969398498535156, 'learning_rate': 2.238255033557047e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41661/75000 [38:43<30:53, 17.99it/s]

{'loss': 0.2722, 'grad_norm': 1.3839792013168335, 'learning_rate': 2.2375838926174496e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41674/75000 [38:44<27:36, 20.12it/s]

{'loss': 0.3933, 'grad_norm': 1.7288380861282349, 'learning_rate': 2.2369127516778524e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41684/75000 [38:45<28:00, 19.83it/s]

{'loss': 0.2975, 'grad_norm': 2.362959384918213, 'learning_rate': 2.2362416107382553e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41694/75000 [38:45<27:46, 19.99it/s]

{'loss': 0.4051, 'grad_norm': 3.8020517826080322, 'learning_rate': 2.2355704697986578e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41703/75000 [38:46<28:48, 19.26it/s]

{'loss': 0.3071, 'grad_norm': 1.126526117324829, 'learning_rate': 2.2348993288590606e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41712/75000 [38:46<30:04, 18.44it/s]

{'loss': 0.3082, 'grad_norm': 3.6754822731018066, 'learning_rate': 2.234228187919463e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41722/75000 [38:47<32:40, 16.97it/s]

{'loss': 0.3679, 'grad_norm': 2.86037540435791, 'learning_rate': 2.2335570469798657e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41734/75000 [38:47<29:28, 18.81it/s]

{'loss': 0.3218, 'grad_norm': 9.62951946258545, 'learning_rate': 2.2328859060402685e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41744/75000 [38:48<29:38, 18.70it/s]

{'loss': 0.3768, 'grad_norm': 11.357959747314453, 'learning_rate': 2.232214765100671e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41752/75000 [38:48<33:38, 16.47it/s]

{'loss': 0.3803, 'grad_norm': 18.281599044799805, 'learning_rate': 2.231543624161074e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41762/75000 [38:49<31:05, 17.82it/s]

{'loss': 0.2883, 'grad_norm': 2.4728384017944336, 'learning_rate': 2.2308724832214768e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41772/75000 [38:49<29:26, 18.81it/s]

{'loss': 0.3822, 'grad_norm': 1.3896691799163818, 'learning_rate': 2.2302013422818793e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41782/75000 [38:50<29:11, 18.97it/s]

{'loss': 0.3598, 'grad_norm': 1.4091917276382446, 'learning_rate': 2.229530201342282e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41793/75000 [38:50<27:26, 20.16it/s]

{'loss': 0.3605, 'grad_norm': 1.6715853214263916, 'learning_rate': 2.2288590604026846e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41803/75000 [38:51<31:01, 17.84it/s]

{'loss': 0.2679, 'grad_norm': 3.0224409103393555, 'learning_rate': 2.228187919463087e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41814/75000 [38:51<28:29, 19.41it/s]

{'loss': 0.333, 'grad_norm': 7.105967044830322, 'learning_rate': 2.22751677852349e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41821/75000 [38:52<31:27, 17.57it/s]

{'loss': 0.2058, 'grad_norm': 7.069302558898926, 'learning_rate': 2.226845637583893e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41834/75000 [38:53<27:37, 20.01it/s]

{'loss': 0.4179, 'grad_norm': 1.2924158573150635, 'learning_rate': 2.2261744966442954e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41842/75000 [38:53<32:03, 17.24it/s]

{'loss': 0.322, 'grad_norm': 1.738450527191162, 'learning_rate': 2.2255033557046982e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41854/75000 [38:54<28:32, 19.36it/s]

{'loss': 0.3793, 'grad_norm': 4.373619079589844, 'learning_rate': 2.2248322147651008e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41864/75000 [38:54<27:53, 19.80it/s]

{'loss': 0.2653, 'grad_norm': 2.07389235496521, 'learning_rate': 2.2241610738255033e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41872/75000 [38:55<29:02, 19.02it/s]

{'loss': 0.2575, 'grad_norm': 4.691410541534424, 'learning_rate': 2.223489932885906e-05, 'epoch': 1.67}


                                                     
 56%|█████▌    | 41882/75000 [38:55<28:36, 19.30it/s]

{'loss': 0.2997, 'grad_norm': 4.254809379577637, 'learning_rate': 2.222818791946309e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41894/75000 [38:56<29:20, 18.81it/s]

{'loss': 0.3374, 'grad_norm': 3.945786237716675, 'learning_rate': 2.2221476510067115e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41902/75000 [38:56<29:04, 18.98it/s]

{'loss': 0.2101, 'grad_norm': 2.125741481781006, 'learning_rate': 2.2214765100671144e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41914/75000 [38:57<27:36, 19.97it/s]

{'loss': 0.2261, 'grad_norm': 6.7131428718566895, 'learning_rate': 2.220805369127517e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41923/75000 [38:57<27:32, 20.02it/s]

{'loss': 0.2545, 'grad_norm': 4.22253942489624, 'learning_rate': 2.2201342281879194e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41933/75000 [38:58<29:55, 18.42it/s]

{'loss': 0.4577, 'grad_norm': 4.39225959777832, 'learning_rate': 2.2194630872483222e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41944/75000 [38:58<27:45, 19.85it/s]

{'loss': 0.3572, 'grad_norm': 2.4551074504852295, 'learning_rate': 2.218791946308725e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41952/75000 [38:59<30:38, 17.97it/s]

{'loss': 0.3209, 'grad_norm': 3.3238587379455566, 'learning_rate': 2.2181208053691276e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41964/75000 [38:59<28:04, 19.61it/s]

{'loss': 0.2876, 'grad_norm': 3.2700321674346924, 'learning_rate': 2.2174496644295305e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41973/75000 [39:00<29:28, 18.67it/s]

{'loss': 0.3995, 'grad_norm': 1.9821263551712036, 'learning_rate': 2.216778523489933e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41982/75000 [39:00<30:22, 18.12it/s]

{'loss': 0.3947, 'grad_norm': 1.2239488363265991, 'learning_rate': 2.2161073825503355e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 41994/75000 [39:01<29:07, 18.89it/s]

{'loss': 0.2355, 'grad_norm': 1.3742783069610596, 'learning_rate': 2.2154362416107384e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42000/75000 [39:01<30:26, 18.07it/s]

{'loss': 0.2764, 'grad_norm': 2.160984754562378, 'learning_rate': 2.2147651006711412e-05, 'epoch': 1.68}


                                                       
 56%|█████▌    | 42014/75000 [39:03<35:03, 15.68it/s]

{'loss': 0.596, 'grad_norm': 2.389040946960449, 'learning_rate': 2.2140939597315437e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42024/75000 [39:03<28:47, 19.09it/s]

{'loss': 0.2422, 'grad_norm': 0.6680461168289185, 'learning_rate': 2.2134228187919466e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42031/75000 [39:04<31:27, 17.46it/s]

{'loss': 0.3075, 'grad_norm': 11.76113224029541, 'learning_rate': 2.212751677852349e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42042/75000 [39:04<28:49, 19.06it/s]

{'loss': 0.2664, 'grad_norm': 3.1625354290008545, 'learning_rate': 2.2120805369127516e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42052/75000 [39:05<31:41, 17.33it/s]

{'loss': 0.3551, 'grad_norm': 3.7021610736846924, 'learning_rate': 2.2114093959731545e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42062/75000 [39:05<28:59, 18.93it/s]

{'loss': 0.4015, 'grad_norm': 7.080336093902588, 'learning_rate': 2.210738255033557e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42073/75000 [39:06<30:18, 18.10it/s]

{'loss': 0.2813, 'grad_norm': 1.5719338655471802, 'learning_rate': 2.21006711409396e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42082/75000 [39:06<28:36, 19.17it/s]

{'loss': 0.3218, 'grad_norm': 1.8547736406326294, 'learning_rate': 2.2093959731543627e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42093/75000 [39:07<31:20, 17.50it/s]

{'loss': 0.3365, 'grad_norm': 2.7443530559539795, 'learning_rate': 2.2087248322147652e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42102/75000 [39:07<28:57, 18.94it/s]

{'loss': 0.3377, 'grad_norm': 1.9581047296524048, 'learning_rate': 2.2080536912751677e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42114/75000 [39:08<28:39, 19.13it/s]

{'loss': 0.2192, 'grad_norm': 0.9402910470962524, 'learning_rate': 2.2073825503355706e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42123/75000 [39:08<29:42, 18.45it/s]

{'loss': 0.368, 'grad_norm': 2.3039278984069824, 'learning_rate': 2.206711409395973e-05, 'epoch': 1.68}


                                                     
 56%|█████▌    | 42134/75000 [39:09<28:53, 18.96it/s]

{'loss': 0.3235, 'grad_norm': 4.7170305252075195, 'learning_rate': 2.206040268456376e-05, 'epoch': 1.69}


                                                     
 56%|█████▌    | 42141/75000 [39:09<30:32, 17.93it/s]

{'loss': 0.2193, 'grad_norm': 0.5503975749015808, 'learning_rate': 2.2053691275167788e-05, 'epoch': 1.69}


                                                     
 56%|█████▌    | 42154/75000 [39:10<27:04, 20.22it/s]

{'loss': 0.2721, 'grad_norm': 2.114417552947998, 'learning_rate': 2.2046979865771813e-05, 'epoch': 1.69}


                                                     
 56%|█████▌    | 42164/75000 [39:11<27:38, 19.80it/s]

{'loss': 0.3485, 'grad_norm': 6.4722371101379395, 'learning_rate': 2.204026845637584e-05, 'epoch': 1.69}


                                                     
 56%|█████▌    | 42174/75000 [39:11<28:50, 18.96it/s]

{'loss': 0.3289, 'grad_norm': 0.3583226501941681, 'learning_rate': 2.2033557046979867e-05, 'epoch': 1.69}


                                                     
 56%|█████▌    | 42184/75000 [39:12<28:22, 19.28it/s]

{'loss': 0.2547, 'grad_norm': 3.822817325592041, 'learning_rate': 2.2026845637583892e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42194/75000 [39:12<27:23, 19.97it/s]

{'loss': 0.4044, 'grad_norm': 2.3295443058013916, 'learning_rate': 2.202013422818792e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42202/75000 [39:13<28:22, 19.26it/s]

{'loss': 0.2725, 'grad_norm': 3.865793466567993, 'learning_rate': 2.201342281879195e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42212/75000 [39:13<30:37, 17.84it/s]

{'loss': 0.2642, 'grad_norm': 1.9816559553146362, 'learning_rate': 2.2006711409395974e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42222/75000 [39:14<29:12, 18.70it/s]

{'loss': 0.2529, 'grad_norm': 2.3366291522979736, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42234/75000 [39:14<27:23, 19.94it/s]

{'loss': 0.2971, 'grad_norm': 2.8065898418426514, 'learning_rate': 2.1993288590604028e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42244/75000 [39:15<27:42, 19.70it/s]

{'loss': 0.4232, 'grad_norm': 2.192960739135742, 'learning_rate': 2.1986577181208053e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42252/75000 [39:15<28:22, 19.24it/s]

{'loss': 0.3397, 'grad_norm': 9.063199996948242, 'learning_rate': 2.197986577181208e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42264/75000 [39:16<27:15, 20.02it/s]

{'loss': 0.3011, 'grad_norm': 3.2223455905914307, 'learning_rate': 2.197315436241611e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42272/75000 [39:16<29:40, 18.38it/s]

{'loss': 0.3373, 'grad_norm': 1.1967185735702515, 'learning_rate': 2.1966442953020135e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42283/75000 [39:17<29:25, 18.53it/s]

{'loss': 0.3337, 'grad_norm': 7.578429222106934, 'learning_rate': 2.1959731543624164e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42294/75000 [39:17<29:12, 18.66it/s]

{'loss': 0.3347, 'grad_norm': 4.31182861328125, 'learning_rate': 2.195302013422819e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42302/75000 [39:18<29:12, 18.65it/s]

{'loss': 0.3595, 'grad_norm': 3.564404249191284, 'learning_rate': 2.1946308724832214e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42312/75000 [39:18<30:34, 17.82it/s]

{'loss': 0.2244, 'grad_norm': 1.0198320150375366, 'learning_rate': 2.1939597315436243e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42322/75000 [39:19<28:32, 19.08it/s]

{'loss': 0.2767, 'grad_norm': 5.491386890411377, 'learning_rate': 2.193288590604027e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42334/75000 [39:20<27:25, 19.85it/s]

{'loss': 0.324, 'grad_norm': 2.7920758724212646, 'learning_rate': 2.1926174496644297e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42342/75000 [39:20<31:03, 17.52it/s]

{'loss': 0.3885, 'grad_norm': 7.926698684692383, 'learning_rate': 2.1919463087248325e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42353/75000 [39:21<30:49, 17.65it/s]

{'loss': 0.2853, 'grad_norm': 1.5533305406570435, 'learning_rate': 2.191275167785235e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42363/75000 [39:21<30:11, 18.01it/s]

{'loss': 0.366, 'grad_norm': 1.9833238124847412, 'learning_rate': 2.1906040268456375e-05, 'epoch': 1.69}


                                                     
 56%|█████▋    | 42374/75000 [39:22<27:59, 19.43it/s]

{'loss': 0.2932, 'grad_norm': 0.500490665435791, 'learning_rate': 2.1899328859060404e-05, 'epoch': 1.69}


                                                     
 57%|█████▋    | 42383/75000 [39:22<27:50, 19.53it/s]

{'loss': 0.3685, 'grad_norm': 0.6414589285850525, 'learning_rate': 2.189261744966443e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42392/75000 [39:23<28:21, 19.17it/s]

{'loss': 0.1941, 'grad_norm': 1.6551663875579834, 'learning_rate': 2.1885906040268458e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42404/75000 [39:23<27:32, 19.73it/s]

{'loss': 0.2981, 'grad_norm': 2.705430507659912, 'learning_rate': 2.1879194630872486e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42413/75000 [39:24<28:05, 19.34it/s]

{'loss': 0.3393, 'grad_norm': 1.7705885171890259, 'learning_rate': 2.187248322147651e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42422/75000 [39:24<33:12, 16.35it/s]

{'loss': 0.2867, 'grad_norm': 3.7430922985076904, 'learning_rate': 2.1865771812080536e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42434/75000 [39:25<29:53, 18.15it/s]

{'loss': 0.2429, 'grad_norm': 2.093491315841675, 'learning_rate': 2.1859060402684565e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42442/75000 [39:25<29:07, 18.63it/s]

{'loss': 0.2416, 'grad_norm': 2.3727073669433594, 'learning_rate': 2.185234899328859e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42453/75000 [39:26<29:41, 18.27it/s]

{'loss': 0.2203, 'grad_norm': 9.139382362365723, 'learning_rate': 2.184563758389262e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42464/75000 [39:26<27:35, 19.66it/s]

{'loss': 0.3937, 'grad_norm': 1.3514204025268555, 'learning_rate': 2.1838926174496647e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42472/75000 [39:27<30:16, 17.90it/s]

{'loss': 0.381, 'grad_norm': 3.6877760887145996, 'learning_rate': 2.1832214765100672e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42481/75000 [39:27<33:50, 16.02it/s]

{'loss': 0.2288, 'grad_norm': 1.4786059856414795, 'learning_rate': 2.1825503355704698e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42492/75000 [39:28<32:58, 16.43it/s]

{'loss': 0.3149, 'grad_norm': 0.4460119605064392, 'learning_rate': 2.1818791946308726e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42500/75000 [39:29<30:02, 18.03it/s]

{'loss': 0.3063, 'grad_norm': 1.003868818283081, 'learning_rate': 2.181208053691275e-05, 'epoch': 1.7}


                                                       
 57%|█████▋    | 42514/75000 [39:30<32:52, 16.47it/s]

{'loss': 0.5172, 'grad_norm': 9.474343299865723, 'learning_rate': 2.180536912751678e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42522/75000 [39:30<32:52, 16.46it/s]

{'loss': 0.2907, 'grad_norm': 0.8864202499389648, 'learning_rate': 2.179865771812081e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42534/75000 [39:31<28:05, 19.26it/s]

{'loss': 0.3451, 'grad_norm': 0.8486303091049194, 'learning_rate': 2.1791946308724834e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42543/75000 [39:31<30:02, 18.00it/s]

{'loss': 0.3098, 'grad_norm': 1.480107069015503, 'learning_rate': 2.178523489932886e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42553/75000 [39:32<31:48, 17.00it/s]

{'loss': 0.306, 'grad_norm': 6.614940643310547, 'learning_rate': 2.1778523489932887e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42562/75000 [39:32<30:04, 17.98it/s]

{'loss': 0.2905, 'grad_norm': 2.5648796558380127, 'learning_rate': 2.1771812080536912e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42573/75000 [39:33<30:36, 17.66it/s]

{'loss': 0.2204, 'grad_norm': 2.093996524810791, 'learning_rate': 2.176510067114094e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42581/75000 [39:34<32:18, 16.73it/s]

{'loss': 0.38, 'grad_norm': 1.3838998079299927, 'learning_rate': 2.175838926174497e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42593/75000 [39:34<32:24, 16.67it/s]

{'loss': 0.3629, 'grad_norm': 3.320463180541992, 'learning_rate': 2.1751677852348995e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42604/75000 [39:35<28:13, 19.12it/s]

{'loss': 0.2991, 'grad_norm': 2.68717360496521, 'learning_rate': 2.174496644295302e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42613/75000 [39:35<29:31, 18.28it/s]

{'loss': 0.4065, 'grad_norm': 2.1624462604522705, 'learning_rate': 2.173825503355705e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42622/75000 [39:36<28:31, 18.92it/s]

{'loss': 0.3984, 'grad_norm': 3.23565673828125, 'learning_rate': 2.1731543624161074e-05, 'epoch': 1.7}


                                                     
 57%|█████▋    | 42633/75000 [39:36<29:17, 18.42it/s]

{'loss': 0.2457, 'grad_norm': 6.858389377593994, 'learning_rate': 2.1724832214765102e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42643/75000 [39:37<28:09, 19.16it/s]

{'loss': 0.369, 'grad_norm': 4.1770524978637695, 'learning_rate': 2.171812080536913e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42652/75000 [39:37<27:54, 19.31it/s]

{'loss': 0.4081, 'grad_norm': 8.038656234741211, 'learning_rate': 2.1711409395973156e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42663/75000 [39:38<29:51, 18.05it/s]

{'loss': 0.3545, 'grad_norm': 5.002927303314209, 'learning_rate': 2.1704697986577184e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42674/75000 [39:39<27:26, 19.63it/s]

{'loss': 0.2978, 'grad_norm': 5.030539035797119, 'learning_rate': 2.169798657718121e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42681/75000 [39:39<27:52, 19.33it/s]

{'loss': 0.3705, 'grad_norm': 10.025898933410645, 'learning_rate': 2.1691275167785235e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42694/75000 [39:40<28:35, 18.83it/s]

{'loss': 0.3888, 'grad_norm': 6.991278171539307, 'learning_rate': 2.1684563758389263e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42703/75000 [39:40<28:53, 18.63it/s]

{'loss': 0.2423, 'grad_norm': 2.418829917907715, 'learning_rate': 2.167785234899329e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42714/75000 [39:41<28:07, 19.14it/s]

{'loss': 0.3704, 'grad_norm': 1.6190531253814697, 'learning_rate': 2.1671140939597317e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42724/75000 [39:41<29:09, 18.44it/s]

{'loss': 0.2429, 'grad_norm': 2.326005458831787, 'learning_rate': 2.1664429530201345e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42732/75000 [39:42<29:48, 18.04it/s]

{'loss': 0.3045, 'grad_norm': 1.2291620969772339, 'learning_rate': 2.165771812080537e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42743/75000 [39:42<29:35, 18.17it/s]

{'loss': 0.3764, 'grad_norm': 2.3995718955993652, 'learning_rate': 2.1651006711409396e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42754/75000 [39:43<26:53, 19.99it/s]

{'loss': 0.2983, 'grad_norm': 3.7043893337249756, 'learning_rate': 2.1644295302013424e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42764/75000 [39:44<29:21, 18.30it/s]

{'loss': 0.3301, 'grad_norm': 9.1271390914917, 'learning_rate': 2.163758389261745e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42774/75000 [39:44<29:34, 18.16it/s]

{'loss': 0.4012, 'grad_norm': 3.441305637359619, 'learning_rate': 2.1630872483221478e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42784/75000 [39:45<28:11, 19.05it/s]

{'loss': 0.4193, 'grad_norm': 2.1757636070251465, 'learning_rate': 2.1624161073825507e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42793/75000 [39:45<28:18, 18.96it/s]

{'loss': 0.3143, 'grad_norm': 3.0773119926452637, 'learning_rate': 2.1617449664429532e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42802/75000 [39:46<29:44, 18.05it/s]

{'loss': 0.2713, 'grad_norm': 6.196016788482666, 'learning_rate': 2.1610738255033557e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42811/75000 [39:46<28:24, 18.89it/s]

{'loss': 0.3573, 'grad_norm': 6.048511028289795, 'learning_rate': 2.1604026845637585e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42824/75000 [39:47<26:48, 20.00it/s]

{'loss': 0.3357, 'grad_norm': 1.9775468111038208, 'learning_rate': 2.159731543624161e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42834/75000 [39:47<26:59, 19.86it/s]

{'loss': 0.3402, 'grad_norm': 0.7347884774208069, 'learning_rate': 2.159060402684564e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42842/75000 [39:48<29:32, 18.14it/s]

{'loss': 0.2312, 'grad_norm': 5.70564079284668, 'learning_rate': 2.1583892617449668e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42853/75000 [39:48<30:00, 17.85it/s]

{'loss': 0.2878, 'grad_norm': 7.105238914489746, 'learning_rate': 2.1577181208053693e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42862/75000 [39:49<28:23, 18.87it/s]

{'loss': 0.4044, 'grad_norm': 2.3788576126098633, 'learning_rate': 2.1570469798657718e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42871/75000 [39:49<27:55, 19.17it/s]

{'loss': 0.3597, 'grad_norm': 9.811267852783203, 'learning_rate': 2.1563758389261747e-05, 'epoch': 1.71}


                                                     
 57%|█████▋    | 42884/75000 [39:50<26:16, 20.37it/s]

{'loss': 0.4352, 'grad_norm': 8.195599555969238, 'learning_rate': 2.1557046979865772e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42893/75000 [39:50<28:53, 18.52it/s]

{'loss': 0.2502, 'grad_norm': 2.0965631008148193, 'learning_rate': 2.1550335570469797e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42902/75000 [39:51<28:16, 18.92it/s]

{'loss': 0.3337, 'grad_norm': 7.157819747924805, 'learning_rate': 2.154362416107383e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42914/75000 [39:51<27:01, 19.79it/s]

{'loss': 0.2877, 'grad_norm': 2.0643157958984375, 'learning_rate': 2.1536912751677854e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42923/75000 [39:52<28:56, 18.48it/s]

{'loss': 0.3037, 'grad_norm': 3.7439534664154053, 'learning_rate': 2.153020134228188e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42931/75000 [39:52<30:02, 17.79it/s]

{'loss': 0.2917, 'grad_norm': 4.979989528656006, 'learning_rate': 2.1523489932885908e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42942/75000 [39:53<27:04, 19.74it/s]

{'loss': 0.3358, 'grad_norm': 0.37077391147613525, 'learning_rate': 2.1516778523489933e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42952/75000 [39:53<29:55, 17.85it/s]

{'loss': 0.2711, 'grad_norm': 1.6223578453063965, 'learning_rate': 2.1510067114093958e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42962/75000 [39:54<29:20, 18.20it/s]

{'loss': 0.3287, 'grad_norm': 1.2115368843078613, 'learning_rate': 2.150335570469799e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42974/75000 [39:55<27:23, 19.48it/s]

{'loss': 0.451, 'grad_norm': 3.522127151489258, 'learning_rate': 2.1496644295302015e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42984/75000 [39:55<28:00, 19.05it/s]

{'loss': 0.1714, 'grad_norm': 1.0692565441131592, 'learning_rate': 2.148993288590604e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 42991/75000 [39:56<29:30, 18.08it/s]

{'loss': 0.2758, 'grad_norm': 6.395868301391602, 'learning_rate': 2.148322147651007e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43000/75000 [39:56<27:46, 19.20it/s]

{'loss': 0.3112, 'grad_norm': 2.840907573699951, 'learning_rate': 2.1476510067114094e-05, 'epoch': 1.72}


                                                       
 57%|█████▋    | 43014/75000 [39:57<33:12, 16.06it/s]

{'loss': 0.2471, 'grad_norm': 4.671566963195801, 'learning_rate': 2.1469798657718123e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43021/75000 [39:58<32:48, 16.25it/s]

{'loss': 0.2886, 'grad_norm': 0.8305962085723877, 'learning_rate': 2.1463087248322148e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43032/75000 [39:58<30:59, 17.19it/s]

{'loss': 0.3498, 'grad_norm': 3.0611722469329834, 'learning_rate': 2.1456375838926176e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43043/75000 [39:59<30:06, 17.69it/s]

{'loss': 0.3242, 'grad_norm': 2.6300742626190186, 'learning_rate': 2.14496644295302e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43053/75000 [40:00<28:58, 18.37it/s]

{'loss': 0.3348, 'grad_norm': 1.9354760646820068, 'learning_rate': 2.144295302013423e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43063/75000 [40:00<28:50, 18.45it/s]

{'loss': 0.2355, 'grad_norm': 1.5654865503311157, 'learning_rate': 2.1436241610738255e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43071/75000 [40:00<29:08, 18.26it/s]

{'loss': 0.3007, 'grad_norm': 2.6318130493164062, 'learning_rate': 2.1429530201342284e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43082/75000 [40:01<27:12, 19.55it/s]

{'loss': 0.2966, 'grad_norm': 1.6280008554458618, 'learning_rate': 2.142281879194631e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43092/75000 [40:02<30:42, 17.32it/s]

{'loss': 0.2716, 'grad_norm': 9.053322792053223, 'learning_rate': 2.1416107382550337e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43103/75000 [40:02<29:23, 18.08it/s]

{'loss': 0.2502, 'grad_norm': 4.518111228942871, 'learning_rate': 2.1409395973154362e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43112/75000 [40:03<28:11, 18.86it/s]

{'loss': 0.3899, 'grad_norm': 3.890514612197876, 'learning_rate': 2.140268456375839e-05, 'epoch': 1.72}


                                                     
 57%|█████▋    | 43123/75000 [40:03<28:12, 18.84it/s]

{'loss': 0.3422, 'grad_norm': 1.585144281387329, 'learning_rate': 2.1395973154362416e-05, 'epoch': 1.72}


                                                     
 58%|█████▊    | 43132/75000 [40:04<32:10, 16.51it/s]

{'loss': 0.2856, 'grad_norm': 5.544965744018555, 'learning_rate': 2.1389261744966445e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43142/75000 [40:04<29:48, 17.81it/s]

{'loss': 0.2634, 'grad_norm': 2.5996460914611816, 'learning_rate': 2.138255033557047e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43151/75000 [40:05<28:40, 18.51it/s]

{'loss': 0.2603, 'grad_norm': 6.146337985992432, 'learning_rate': 2.13758389261745e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43164/75000 [40:05<26:54, 19.72it/s]

{'loss': 0.1986, 'grad_norm': 1.28115975856781, 'learning_rate': 2.1369127516778527e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43173/75000 [40:06<28:36, 18.54it/s]

{'loss': 0.2683, 'grad_norm': 1.1721749305725098, 'learning_rate': 2.1362416107382552e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43183/75000 [40:06<29:02, 18.26it/s]

{'loss': 0.3677, 'grad_norm': 11.252843856811523, 'learning_rate': 2.1355704697986577e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43194/75000 [40:07<26:49, 19.76it/s]

{'loss': 0.3255, 'grad_norm': 14.469555854797363, 'learning_rate': 2.1348993288590606e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43202/75000 [40:08<29:14, 18.12it/s]

{'loss': 0.2645, 'grad_norm': 3.6090047359466553, 'learning_rate': 2.134228187919463e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43214/75000 [40:08<27:17, 19.41it/s]

{'loss': 0.4806, 'grad_norm': 4.20340633392334, 'learning_rate': 2.1335570469798656e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43222/75000 [40:09<28:49, 18.38it/s]

{'loss': 0.3148, 'grad_norm': 5.531896591186523, 'learning_rate': 2.1328859060402688e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43232/75000 [40:09<27:52, 19.00it/s]

{'loss': 0.3932, 'grad_norm': 3.9792468547821045, 'learning_rate': 2.1322147651006713e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43244/75000 [40:10<26:58, 19.62it/s]

{'loss': 0.2586, 'grad_norm': 2.4665300846099854, 'learning_rate': 2.131543624161074e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43254/75000 [40:10<26:35, 19.90it/s]

{'loss': 0.3359, 'grad_norm': 5.962559700012207, 'learning_rate': 2.1308724832214767e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43263/75000 [40:11<28:43, 18.41it/s]

{'loss': 0.3142, 'grad_norm': 2.1120359897613525, 'learning_rate': 2.1302013422818792e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43272/75000 [40:11<28:15, 18.71it/s]

{'loss': 0.352, 'grad_norm': 7.508524417877197, 'learning_rate': 2.1295302013422817e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43284/75000 [40:12<26:42, 19.79it/s]

{'loss': 0.3081, 'grad_norm': 9.321850776672363, 'learning_rate': 2.128859060402685e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43293/75000 [40:12<29:41, 17.80it/s]

{'loss': 0.3329, 'grad_norm': 1.7641910314559937, 'learning_rate': 2.1281879194630874e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43302/75000 [40:13<27:26, 19.26it/s]

{'loss': 0.3551, 'grad_norm': 1.1029139757156372, 'learning_rate': 2.12751677852349e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43313/75000 [40:13<27:19, 19.32it/s]

{'loss': 0.3339, 'grad_norm': 7.180912494659424, 'learning_rate': 2.1268456375838928e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43324/75000 [40:14<28:24, 18.59it/s]

{'loss': 0.4297, 'grad_norm': 5.596066474914551, 'learning_rate': 2.1261744966442953e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43333/75000 [40:14<29:06, 18.14it/s]

{'loss': 0.3449, 'grad_norm': 0.6374794840812683, 'learning_rate': 2.125503355704698e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43344/75000 [40:15<26:59, 19.55it/s]

{'loss': 0.3722, 'grad_norm': 2.9312374591827393, 'learning_rate': 2.1248322147651007e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43353/75000 [40:15<29:27, 17.90it/s]

{'loss': 0.3187, 'grad_norm': 1.3790160417556763, 'learning_rate': 2.1241610738255036e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43362/75000 [40:16<31:14, 16.88it/s]

{'loss': 0.2872, 'grad_norm': 1.8029717206954956, 'learning_rate': 2.123489932885906e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43373/75000 [40:17<28:38, 18.40it/s]

{'loss': 0.351, 'grad_norm': 6.489220142364502, 'learning_rate': 2.122818791946309e-05, 'epoch': 1.73}


                                                     
 58%|█████▊    | 43381/75000 [40:17<33:08, 15.90it/s]

{'loss': 0.3689, 'grad_norm': 10.334577560424805, 'learning_rate': 2.1221476510067114e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43392/75000 [40:18<28:28, 18.50it/s]

{'loss': 0.3144, 'grad_norm': 5.651687145233154, 'learning_rate': 2.121476510067114e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43403/75000 [40:18<28:02, 18.78it/s]

{'loss': 0.3594, 'grad_norm': 7.997323036193848, 'learning_rate': 2.1208053691275168e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43412/75000 [40:19<29:38, 17.76it/s]

{'loss': 0.2495, 'grad_norm': 1.5784456729888916, 'learning_rate': 2.1201342281879197e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43421/75000 [40:19<29:14, 18.00it/s]

{'loss': 0.3478, 'grad_norm': 7.052349090576172, 'learning_rate': 2.1194630872483222e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43433/75000 [40:20<28:35, 18.40it/s]

{'loss': 0.3878, 'grad_norm': 5.080456256866455, 'learning_rate': 2.118791946308725e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43442/75000 [40:20<30:51, 17.05it/s]

{'loss': 0.4087, 'grad_norm': 1.8923691511154175, 'learning_rate': 2.1181208053691275e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43452/75000 [40:21<31:27, 16.71it/s]

{'loss': 0.3761, 'grad_norm': 1.9389679431915283, 'learning_rate': 2.1174496644295304e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43462/75000 [40:22<32:07, 16.36it/s]

{'loss': 0.3723, 'grad_norm': 6.896431922912598, 'learning_rate': 2.116778523489933e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43474/75000 [40:22<26:40, 19.70it/s]

{'loss': 0.3125, 'grad_norm': 2.2663893699645996, 'learning_rate': 2.1161073825503358e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43482/75000 [40:23<30:12, 17.39it/s]

{'loss': 0.2534, 'grad_norm': 4.260861873626709, 'learning_rate': 2.1154362416107383e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43492/75000 [40:23<27:30, 19.10it/s]

{'loss': 0.3073, 'grad_norm': 5.121302604675293, 'learning_rate': 2.114765100671141e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43500/75000 [40:24<29:10, 18.00it/s]

{'loss': 0.2665, 'grad_norm': 2.8076608180999756, 'learning_rate': 2.1140939597315437e-05, 'epoch': 1.74}


                                                       
 58%|█████▊    | 43514/75000 [40:25<31:44, 16.54it/s]

{'loss': 0.2787, 'grad_norm': 1.2740596532821655, 'learning_rate': 2.1134228187919465e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43522/75000 [40:25<28:23, 18.48it/s]

{'loss': 0.3005, 'grad_norm': 7.918856143951416, 'learning_rate': 2.112751677852349e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43532/75000 [40:26<30:18, 17.30it/s]

{'loss': 0.2888, 'grad_norm': 2.114973545074463, 'learning_rate': 2.1120805369127515e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43542/75000 [40:26<29:19, 17.88it/s]

{'loss': 0.3862, 'grad_norm': 1.7556047439575195, 'learning_rate': 2.1114093959731544e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43553/75000 [40:27<28:14, 18.56it/s]

{'loss': 0.3729, 'grad_norm': 0.7935939431190491, 'learning_rate': 2.1107382550335573e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43563/75000 [40:28<27:32, 19.03it/s]

{'loss': 0.175, 'grad_norm': 2.9590065479278564, 'learning_rate': 2.1100671140939598e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43572/75000 [40:28<28:13, 18.56it/s]

{'loss': 0.2684, 'grad_norm': 0.40828511118888855, 'learning_rate': 2.1093959731543626e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43582/75000 [40:29<30:12, 17.33it/s]

{'loss': 0.3175, 'grad_norm': 3.8523848056793213, 'learning_rate': 2.108724832214765e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43591/75000 [40:29<27:45, 18.86it/s]

{'loss': 0.3444, 'grad_norm': 1.2666107416152954, 'learning_rate': 2.1080536912751677e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43604/75000 [40:30<27:16, 19.19it/s]

{'loss': 0.2509, 'grad_norm': 8.242454528808594, 'learning_rate': 2.107382550335571e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43612/75000 [40:30<27:21, 19.12it/s]

{'loss': 0.3573, 'grad_norm': 2.6070587635040283, 'learning_rate': 2.1067114093959734e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43622/75000 [40:31<28:35, 18.29it/s]

{'loss': 0.3819, 'grad_norm': 5.253172874450684, 'learning_rate': 2.106040268456376e-05, 'epoch': 1.74}


                                                     
 58%|█████▊    | 43633/75000 [40:31<28:58, 18.04it/s]

{'loss': 0.352, 'grad_norm': 1.8004125356674194, 'learning_rate': 2.1053691275167787e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43642/75000 [40:32<27:55, 18.71it/s]

{'loss': 0.3032, 'grad_norm': 0.908482015132904, 'learning_rate': 2.1046979865771813e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43654/75000 [40:32<26:07, 20.00it/s]

{'loss': 0.2906, 'grad_norm': 8.345653533935547, 'learning_rate': 2.1040268456375838e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43662/75000 [40:33<28:34, 18.28it/s]

{'loss': 0.4422, 'grad_norm': 4.794722080230713, 'learning_rate': 2.1033557046979866e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43673/75000 [40:33<28:05, 18.59it/s]

{'loss': 0.2627, 'grad_norm': 2.3736493587493896, 'learning_rate': 2.1026845637583895e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43684/75000 [40:34<26:52, 19.42it/s]

{'loss': 0.2639, 'grad_norm': 2.9790475368499756, 'learning_rate': 2.102013422818792e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43692/75000 [40:34<28:52, 18.07it/s]

{'loss': 0.366, 'grad_norm': 14.89110279083252, 'learning_rate': 2.101342281879195e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43703/75000 [40:35<29:08, 17.90it/s]

{'loss': 0.3704, 'grad_norm': 1.5146942138671875, 'learning_rate': 2.1006711409395974e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43713/75000 [40:36<28:17, 18.43it/s]

{'loss': 0.4772, 'grad_norm': 4.926642417907715, 'learning_rate': 2.1e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43722/75000 [40:36<29:12, 17.85it/s]

{'loss': 0.2524, 'grad_norm': 1.550390601158142, 'learning_rate': 2.0993288590604027e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43731/75000 [40:37<31:09, 16.73it/s]

{'loss': 0.2632, 'grad_norm': 2.624713897705078, 'learning_rate': 2.0986577181208056e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43744/75000 [40:37<27:05, 19.22it/s]

{'loss': 0.3138, 'grad_norm': 7.743700981140137, 'learning_rate': 2.097986577181208e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43752/75000 [40:38<29:08, 17.87it/s]

{'loss': 0.4686, 'grad_norm': 2.5184803009033203, 'learning_rate': 2.097315436241611e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43763/75000 [40:38<27:31, 18.92it/s]

{'loss': 0.2897, 'grad_norm': 6.578421115875244, 'learning_rate': 2.0966442953020135e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43773/75000 [40:39<30:11, 17.23it/s]

{'loss': 0.3258, 'grad_norm': 12.345627784729004, 'learning_rate': 2.095973154362416e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43783/75000 [40:39<27:46, 18.73it/s]

{'loss': 0.2803, 'grad_norm': 1.7782765626907349, 'learning_rate': 2.095302013422819e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43792/75000 [40:40<27:54, 18.63it/s]

{'loss': 0.3492, 'grad_norm': 7.504842758178711, 'learning_rate': 2.0946308724832217e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43801/75000 [40:40<30:18, 17.16it/s]

{'loss': 0.1705, 'grad_norm': 4.062314510345459, 'learning_rate': 2.0939597315436242e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43814/75000 [40:41<27:43, 18.75it/s]

{'loss': 0.3842, 'grad_norm': 7.106360912322998, 'learning_rate': 2.093288590604027e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43822/75000 [40:42<29:49, 17.42it/s]

{'loss': 0.382, 'grad_norm': 7.152349948883057, 'learning_rate': 2.0926174496644296e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43832/75000 [40:42<30:03, 17.28it/s]

{'loss': 0.268, 'grad_norm': 2.302342176437378, 'learning_rate': 2.091946308724832e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43841/75000 [40:43<29:39, 17.51it/s]

{'loss': 0.3276, 'grad_norm': 3.499194383621216, 'learning_rate': 2.091275167785235e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43851/75000 [40:43<28:18, 18.34it/s]

{'loss': 0.3649, 'grad_norm': 1.7224050760269165, 'learning_rate': 2.0906040268456375e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43864/75000 [40:44<26:43, 19.42it/s]

{'loss': 0.3371, 'grad_norm': 2.292963743209839, 'learning_rate': 2.0899328859060403e-05, 'epoch': 1.75}


                                                     
 58%|█████▊    | 43871/75000 [40:44<27:54, 18.59it/s]

{'loss': 0.3055, 'grad_norm': 6.5495805740356445, 'learning_rate': 2.0892617449664432e-05, 'epoch': 1.75}


                                                     
 59%|█████▊    | 43882/75000 [40:45<31:00, 16.73it/s]

{'loss': 0.3738, 'grad_norm': 2.749016761779785, 'learning_rate': 2.0885906040268457e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43893/75000 [40:45<27:40, 18.73it/s]

{'loss': 0.2336, 'grad_norm': 1.3507996797561646, 'learning_rate': 2.0879194630872486e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43901/75000 [40:46<32:57, 15.73it/s]

{'loss': 0.2795, 'grad_norm': 2.1454951763153076, 'learning_rate': 2.087248322147651e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43912/75000 [40:47<29:45, 17.41it/s]

{'loss': 0.4542, 'grad_norm': 9.851049423217773, 'learning_rate': 2.0865771812080536e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43921/75000 [40:47<30:21, 17.06it/s]

{'loss': 0.4017, 'grad_norm': 6.413106441497803, 'learning_rate': 2.0859060402684564e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43934/75000 [40:48<27:59, 18.50it/s]

{'loss': 0.2963, 'grad_norm': 2.852762222290039, 'learning_rate': 2.0852348993288593e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43942/75000 [40:48<29:27, 17.57it/s]

{'loss': 0.3767, 'grad_norm': 3.394301176071167, 'learning_rate': 2.0845637583892618e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43953/75000 [40:49<28:31, 18.15it/s]

{'loss': 0.2989, 'grad_norm': 2.2359235286712646, 'learning_rate': 2.0838926174496647e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43962/75000 [40:49<31:30, 16.42it/s]

{'loss': 0.2531, 'grad_norm': 4.107883453369141, 'learning_rate': 2.0832214765100672e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43972/75000 [40:50<29:00, 17.83it/s]

{'loss': 0.3204, 'grad_norm': 1.3275823593139648, 'learning_rate': 2.0825503355704697e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43983/75000 [40:51<31:42, 16.30it/s]

{'loss': 0.333, 'grad_norm': 5.903854846954346, 'learning_rate': 2.0818791946308726e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 43993/75000 [40:51<29:14, 17.67it/s]

{'loss': 0.3294, 'grad_norm': 4.724432945251465, 'learning_rate': 2.0812080536912754e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 44000/75000 [40:52<30:22, 17.01it/s]

{'loss': 0.3983, 'grad_norm': 12.40914535522461, 'learning_rate': 2.080536912751678e-05, 'epoch': 1.76}


                                                       
 59%|█████▊    | 44014/75000 [40:55<53:31,  9.65it/s]  

{'loss': 0.3927, 'grad_norm': 6.560214042663574, 'learning_rate': 2.0798657718120808e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 44022/75000 [40:56<36:35, 14.11it/s]

{'loss': 0.4502, 'grad_norm': 6.445200443267822, 'learning_rate': 2.0791946308724833e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 44034/75000 [40:56<28:21, 18.20it/s]

{'loss': 0.3217, 'grad_norm': 24.327882766723633, 'learning_rate': 2.0785234899328858e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 44043/75000 [40:57<31:02, 16.62it/s]

{'loss': 0.3298, 'grad_norm': 4.4563307762146, 'learning_rate': 2.0778523489932887e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 44053/75000 [40:57<29:29, 17.49it/s]

{'loss': 0.3727, 'grad_norm': 7.563257217407227, 'learning_rate': 2.0771812080536915e-05, 'epoch': 1.76}


                                                     
 59%|█████▊    | 44061/75000 [40:58<36:10, 14.25it/s]

{'loss': 0.2754, 'grad_norm': 5.633731365203857, 'learning_rate': 2.076510067114094e-05, 'epoch': 1.76}


                                                     
 59%|█████▉    | 44073/75000 [40:59<30:11, 17.08it/s]

{'loss': 0.4108, 'grad_norm': 9.876867294311523, 'learning_rate': 2.075838926174497e-05, 'epoch': 1.76}


                                                     
 59%|█████▉    | 44082/75000 [40:59<28:52, 17.85it/s]

{'loss': 0.3013, 'grad_norm': 3.5626795291900635, 'learning_rate': 2.0751677852348994e-05, 'epoch': 1.76}


                                                     
 59%|█████▉    | 44092/75000 [41:00<31:00, 16.61it/s]

{'loss': 0.3096, 'grad_norm': 2.2781291007995605, 'learning_rate': 2.074496644295302e-05, 'epoch': 1.76}


                                                     
 59%|█████▉    | 44102/75000 [41:00<34:59, 14.72it/s]

{'loss': 0.2793, 'grad_norm': 4.3649725914001465, 'learning_rate': 2.0738255033557048e-05, 'epoch': 1.76}


                                                     
 59%|█████▉    | 44112/75000 [41:01<42:06, 12.22it/s]

{'loss': 0.3344, 'grad_norm': 1.450208067893982, 'learning_rate': 2.0731543624161076e-05, 'epoch': 1.76}


                                                     
 59%|█████▉    | 44122/75000 [41:02<34:58, 14.71it/s]

{'loss': 0.2765, 'grad_norm': 2.5127973556518555, 'learning_rate': 2.07248322147651e-05, 'epoch': 1.76}


                                                     
 59%|█████▉    | 44132/75000 [41:02<30:40, 16.77it/s]

{'loss': 0.2631, 'grad_norm': 3.9435160160064697, 'learning_rate': 2.071812080536913e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44142/75000 [41:03<33:18, 15.44it/s]

{'loss': 0.2727, 'grad_norm': 2.572523355484009, 'learning_rate': 2.0711409395973155e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44152/75000 [41:04<32:47, 15.68it/s]

{'loss': 0.1813, 'grad_norm': 3.221079111099243, 'learning_rate': 2.070469798657718e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44160/75000 [41:04<34:02, 15.10it/s]

{'loss': 0.4371, 'grad_norm': 6.110288619995117, 'learning_rate': 2.069798657718121e-05, 'epoch': 1.77}


                                                       
 59%|█████▉    | 44172/75000 [41:06<55:04,  9.33it/s]  

{'loss': 0.3956, 'grad_norm': 5.153372764587402, 'learning_rate': 2.0691275167785234e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44182/75000 [41:06<41:24, 12.40it/s]

{'loss': 0.2164, 'grad_norm': 0.9262606501579285, 'learning_rate': 2.0684563758389263e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44192/75000 [41:07<33:26, 15.35it/s]

{'loss': 0.2486, 'grad_norm': 1.9424549341201782, 'learning_rate': 2.067785234899329e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44202/75000 [41:08<35:12, 14.58it/s]

{'loss': 0.228, 'grad_norm': 5.3554606437683105, 'learning_rate': 2.0671140939597316e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44212/75000 [41:08<41:06, 12.48it/s]

{'loss': 0.269, 'grad_norm': 4.0268988609313965, 'learning_rate': 2.066442953020134e-05, 'epoch': 1.77}


                                                       
 59%|█████▉    | 44221/75000 [41:09<49:22, 10.39it/s]

{'loss': 0.3773, 'grad_norm': 7.241708755493164, 'learning_rate': 2.065771812080537e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44233/75000 [41:10<38:22, 13.36it/s]

{'loss': 0.315, 'grad_norm': 2.5886247158050537, 'learning_rate': 2.0651006711409395e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44243/75000 [41:11<32:54, 15.58it/s]

{'loss': 0.2668, 'grad_norm': 7.113068103790283, 'learning_rate': 2.0644295302013424e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44251/75000 [41:12<34:07, 15.02it/s]

{'loss': 0.4111, 'grad_norm': 2.5604021549224854, 'learning_rate': 2.0637583892617452e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44263/75000 [41:13<36:49, 13.91it/s]

{'loss': 0.3585, 'grad_norm': 17.587926864624023, 'learning_rate': 2.0630872483221477e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44272/75000 [41:13<31:28, 16.27it/s]

{'loss': 0.3194, 'grad_norm': 15.0521821975708, 'learning_rate': 2.0624161073825503e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44282/75000 [41:14<51:08, 10.01it/s]

{'loss': 0.281, 'grad_norm': 21.639991760253906, 'learning_rate': 2.061744966442953e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44292/75000 [41:15<42:41, 11.99it/s]

{'loss': 0.2739, 'grad_norm': 2.174071788787842, 'learning_rate': 2.0610738255033556e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44302/75000 [41:16<43:31, 11.76it/s]

{'loss': 0.3463, 'grad_norm': 3.882148265838623, 'learning_rate': 2.0604026845637585e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44312/75000 [41:17<41:44, 12.25it/s]

{'loss': 0.2878, 'grad_norm': 6.361289024353027, 'learning_rate': 2.0597315436241613e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44322/75000 [41:17<36:57, 13.84it/s]

{'loss': 0.2629, 'grad_norm': 15.356752395629883, 'learning_rate': 2.059060402684564e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44332/75000 [41:18<54:10,  9.43it/s]

{'loss': 0.2527, 'grad_norm': 4.424058437347412, 'learning_rate': 2.0583892617449667e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44340/75000 [41:19<39:08, 13.05it/s]

{'loss': 0.2016, 'grad_norm': 0.9712347388267517, 'learning_rate': 2.0577181208053692e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44352/75000 [41:20<34:15, 14.91it/s]

{'loss': 0.1978, 'grad_norm': 1.4915562868118286, 'learning_rate': 2.0570469798657717e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44361/75000 [41:20<32:09, 15.88it/s]

{'loss': 0.2809, 'grad_norm': 0.5878624320030212, 'learning_rate': 2.0563758389261746e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44373/75000 [41:21<34:51, 14.64it/s]

{'loss': 0.3748, 'grad_norm': 1.450204610824585, 'learning_rate': 2.0557046979865775e-05, 'epoch': 1.77}


                                                     
 59%|█████▉    | 44381/75000 [41:22<36:54, 13.83it/s]

{'loss': 0.3531, 'grad_norm': 5.0486650466918945, 'learning_rate': 2.05503355704698e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44391/75000 [41:23<48:17, 10.56it/s]

{'loss': 0.2113, 'grad_norm': 9.313287734985352, 'learning_rate': 2.0543624161073828e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44401/75000 [41:24<40:30, 12.59it/s]

{'loss': 0.3327, 'grad_norm': 1.7223763465881348, 'learning_rate': 2.0536912751677853e-05, 'epoch': 1.78}


                                                       
 59%|█████▉    | 44413/75000 [41:25<43:09, 11.81it/s]

{'loss': 0.3335, 'grad_norm': 12.391210556030273, 'learning_rate': 2.053020134228188e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44423/75000 [41:25<32:17, 15.79it/s]

{'loss': 0.3682, 'grad_norm': 4.452463150024414, 'learning_rate': 2.0523489932885907e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44433/75000 [41:26<33:32, 15.19it/s]

{'loss': 0.389, 'grad_norm': 9.975034713745117, 'learning_rate': 2.0516778523489936e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44440/75000 [41:27<31:50, 16.00it/s]

{'loss': 0.3175, 'grad_norm': 2.073009967803955, 'learning_rate': 2.051006711409396e-05, 'epoch': 1.78}


                                                       
 59%|█████▉    | 44451/75000 [41:28<45:50, 11.10it/s]

{'loss': 0.2172, 'grad_norm': 12.69475269317627, 'learning_rate': 2.050335570469799e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44462/75000 [41:29<36:59, 13.76it/s]

{'loss': 0.3822, 'grad_norm': 9.723572731018066, 'learning_rate': 2.0496644295302015e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44471/75000 [41:29<36:18, 14.01it/s]

{'loss': 0.3774, 'grad_norm': 2.8159961700439453, 'learning_rate': 2.048993288590604e-05, 'epoch': 1.78}


                                                       
 59%|█████▉    | 44482/75000 [41:30<45:59, 11.06it/s]

{'loss': 0.2826, 'grad_norm': 5.614456653594971, 'learning_rate': 2.0483221476510068e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44491/75000 [41:31<38:02, 13.37it/s]

{'loss': 0.2846, 'grad_norm': 1.7945135831832886, 'learning_rate': 2.0476510067114093e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44500/75000 [41:32<39:30, 12.86it/s]

{'loss': 0.3406, 'grad_norm': 2.592620611190796, 'learning_rate': 2.0469798657718122e-05, 'epoch': 1.78}


                                                       
 59%|█████▉    | 44514/75000 [41:34<37:46, 13.45it/s]

{'loss': 0.4285, 'grad_norm': 7.466022491455078, 'learning_rate': 2.046308724832215e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44522/75000 [41:34<37:45, 13.45it/s]

{'loss': 0.4266, 'grad_norm': 1.730146884918213, 'learning_rate': 2.0456375838926176e-05, 'epoch': 1.78}


                                                       
 59%|█████▉    | 44532/75000 [41:36<59:54,  8.48it/s]  

{'loss': 0.3043, 'grad_norm': 4.748472690582275, 'learning_rate': 2.04496644295302e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44542/75000 [41:36<41:02, 12.37it/s]

{'loss': 0.3142, 'grad_norm': 6.025416374206543, 'learning_rate': 2.044295302013423e-05, 'epoch': 1.78}


                                                       
 59%|█████▉    | 44552/75000 [41:37<43:28, 11.67it/s]

{'loss': 0.2603, 'grad_norm': 3.129448175430298, 'learning_rate': 2.0436241610738254e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44562/75000 [41:38<34:39, 14.64it/s]

{'loss': 0.1845, 'grad_norm': 0.29596009850502014, 'learning_rate': 2.0429530201342283e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44570/75000 [41:39<55:39,  9.11it/s]

{'loss': 0.3757, 'grad_norm': 8.728106498718262, 'learning_rate': 2.042281879194631e-05, 'epoch': 1.78}


                                                       
 59%|█████▉    | 44583/75000 [41:41<51:43,  9.80it/s]  

{'loss': 0.2804, 'grad_norm': 3.687009334564209, 'learning_rate': 2.0416107382550337e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44591/75000 [41:41<37:06, 13.66it/s]

{'loss': 0.2987, 'grad_norm': 3.139045238494873, 'learning_rate': 2.0409395973154362e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44603/75000 [41:42<33:58, 14.91it/s]

{'loss': 0.2755, 'grad_norm': 6.052265167236328, 'learning_rate': 2.040268456375839e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44613/75000 [41:43<39:56, 12.68it/s]

{'loss': 0.2818, 'grad_norm': 2.4167051315307617, 'learning_rate': 2.0395973154362416e-05, 'epoch': 1.78}


                                                     
 59%|█████▉    | 44621/75000 [41:43<35:30, 14.26it/s]

{'loss': 0.2469, 'grad_norm': 1.2328803539276123, 'learning_rate': 2.0389261744966444e-05, 'epoch': 1.78}


                                                     
 60%|█████▉    | 44631/75000 [41:44<44:15, 11.43it/s]

{'loss': 0.3438, 'grad_norm': 2.733424663543701, 'learning_rate': 2.0382550335570473e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44642/75000 [41:46<54:47,  9.24it/s]  

{'loss': 0.3806, 'grad_norm': 6.435195446014404, 'learning_rate': 2.0375838926174498e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44652/75000 [41:47<56:20,  8.98it/s]  

{'loss': 0.3287, 'grad_norm': 2.195269823074341, 'learning_rate': 2.0369127516778523e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44660/75000 [41:48<48:18, 10.47it/s]

{'loss': 0.3129, 'grad_norm': 6.979386806488037, 'learning_rate': 2.036241610738255e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44672/75000 [41:49<44:09, 11.45it/s]

{'loss': 0.2754, 'grad_norm': 10.165706634521484, 'learning_rate': 2.0355704697986577e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44682/75000 [41:50<43:43, 11.55it/s]

{'loss': 0.2575, 'grad_norm': 1.9694329500198364, 'learning_rate': 2.0348993288590605e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44690/75000 [41:50<37:58, 13.30it/s]

{'loss': 0.3253, 'grad_norm': 11.249139785766602, 'learning_rate': 2.0342281879194634e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44702/75000 [41:52<48:27, 10.42it/s]

{'loss': 0.3396, 'grad_norm': 3.910814046859741, 'learning_rate': 2.033557046979866e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44710/75000 [41:53<1:05:55,  7.66it/s]

{'loss': 0.3681, 'grad_norm': 1.3089494705200195, 'learning_rate': 2.0328859060402684e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44722/75000 [41:54<46:14, 10.91it/s]

{'loss': 0.344, 'grad_norm': 4.1092848777771, 'learning_rate': 2.0322147651006713e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44732/75000 [41:55<38:27, 13.12it/s]

{'loss': 0.2813, 'grad_norm': 2.767393112182617, 'learning_rate': 2.0315436241610738e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44742/75000 [41:56<41:33, 12.13it/s]

{'loss': 0.3275, 'grad_norm': 3.289090871810913, 'learning_rate': 2.0308724832214766e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44752/75000 [41:57<45:55, 10.98it/s]

{'loss': 0.3492, 'grad_norm': 1.98405921459198, 'learning_rate': 2.0302013422818795e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44760/75000 [41:57<40:27, 12.46it/s]

{'loss': 0.3482, 'grad_norm': 6.188816547393799, 'learning_rate': 2.029530201342282e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44772/75000 [41:59<46:48, 10.76it/s]

{'loss': 0.2659, 'grad_norm': 1.589777946472168, 'learning_rate': 2.028859060402685e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44782/75000 [42:00<42:53, 11.74it/s]

{'loss': 0.2328, 'grad_norm': 1.1331703662872314, 'learning_rate': 2.0281879194630874e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44792/75000 [42:00<36:27, 13.81it/s]

{'loss': 0.3923, 'grad_norm': 6.86716365814209, 'learning_rate': 2.02751677852349e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44802/75000 [42:02<57:33,  8.74it/s]  

{'loss': 0.2161, 'grad_norm': 6.717905044555664, 'learning_rate': 2.0268456375838928e-05, 'epoch': 1.79}


 60%|█████▉    | 44812/75000 [42:03<46:26, 10.83it/s]

{'loss': 0.2636, 'grad_norm': 2.8879127502441406, 'learning_rate': 2.0261744966442953e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44820/75000 [42:04<1:06:27,  7.57it/s]

{'loss': 0.4235, 'grad_norm': 3.3426284790039062, 'learning_rate': 2.025503355704698e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44831/75000 [42:05<44:39, 11.26it/s]

{'loss': 0.299, 'grad_norm': 1.7605711221694946, 'learning_rate': 2.024832214765101e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44842/75000 [42:07<1:10:46,  7.10it/s]

{'loss': 0.2033, 'grad_norm': 2.338547945022583, 'learning_rate': 2.0241610738255035e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44852/75000 [42:07<42:16, 11.88it/s]

{'loss': 0.3639, 'grad_norm': 1.3812474012374878, 'learning_rate': 2.023489932885906e-05, 'epoch': 1.79}


                                                     
 60%|█████▉    | 44862/75000 [42:08<39:03, 12.86it/s]

{'loss': 0.2714, 'grad_norm': 2.504014015197754, 'learning_rate': 2.022818791946309e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44870/75000 [42:09<1:18:19,  6.41it/s]

{'loss': 0.2423, 'grad_norm': 1.1466164588928223, 'learning_rate': 2.0221476510067114e-05, 'epoch': 1.79}


                                                       
 60%|█████▉    | 44882/75000 [42:10<46:35, 10.77it/s]

{'loss': 0.3653, 'grad_norm': 4.4099860191345215, 'learning_rate': 2.0214765100671142e-05, 'epoch': 1.8}


                                                       
 60%|█████▉    | 44891/75000 [42:12<1:11:09,  7.05it/s]

{'loss': 0.2241, 'grad_norm': 2.179281711578369, 'learning_rate': 2.020805369127517e-05, 'epoch': 1.8}


                                                       
 60%|█████▉    | 44901/75000 [42:13<44:56, 11.16it/s]

{'loss': 0.2093, 'grad_norm': 0.7562881708145142, 'learning_rate': 2.0201342281879196e-05, 'epoch': 1.8}


                                                     
 60%|█████▉    | 44911/75000 [42:13<45:07, 11.11it/s]

{'loss': 0.2623, 'grad_norm': 5.432481288909912, 'learning_rate': 2.019463087248322e-05, 'epoch': 1.8}


                                                       
 60%|█████▉    | 44920/75000 [42:15<1:34:12,  5.32it/s]

{'loss': 0.4242, 'grad_norm': 4.634864807128906, 'learning_rate': 2.018791946308725e-05, 'epoch': 1.8}


                                                       
 60%|█████▉    | 44931/75000 [42:16<42:55, 11.68it/s]

{'loss': 0.3623, 'grad_norm': 3.378122568130493, 'learning_rate': 2.0181208053691275e-05, 'epoch': 1.8}


                                                     
 60%|█████▉    | 44941/75000 [42:17<41:25, 12.09it/s]

{'loss': 0.325, 'grad_norm': 3.0998423099517822, 'learning_rate': 2.0174496644295303e-05, 'epoch': 1.8}


                                                     
 60%|█████▉    | 44951/75000 [42:18<46:49, 10.69it/s]

{'loss': 0.1764, 'grad_norm': 4.9118971824646, 'learning_rate': 2.0167785234899332e-05, 'epoch': 1.8}


                                                     
 60%|█████▉    | 44961/75000 [42:18<41:16, 12.13it/s]

{'loss': 0.2494, 'grad_norm': 4.255553722381592, 'learning_rate': 2.0161073825503357e-05, 'epoch': 1.8}


                                                       
 60%|█████▉    | 44971/75000 [42:20<1:18:24,  6.38it/s]

{'loss': 0.5084, 'grad_norm': 1.252299189567566, 'learning_rate': 2.0154362416107382e-05, 'epoch': 1.8}


                                                       
 60%|█████▉    | 44981/75000 [42:21<41:31, 12.05it/s]

{'loss': 0.3131, 'grad_norm': 3.246248960494995, 'learning_rate': 2.014765100671141e-05, 'epoch': 1.8}


                                                     
 60%|█████▉    | 44991/75000 [42:22<37:27, 13.35it/s]

{'loss': 0.2145, 'grad_norm': 2.2828593254089355, 'learning_rate': 2.0140939597315436e-05, 'epoch': 1.8}


                                                     
 60%|██████    | 45000/75000 [42:22<40:12, 12.43it/s]

{'loss': 0.3996, 'grad_norm': 7.762579917907715, 'learning_rate': 2.013422818791946e-05, 'epoch': 1.8}


                                                       
 60%|██████    | 45012/75000 [42:25<53:15,  9.38it/s]  

{'loss': 0.3122, 'grad_norm': 6.120431900024414, 'learning_rate': 2.0127516778523493e-05, 'epoch': 1.8}


                                                     
 60%|██████    | 45020/75000 [42:26<38:53, 12.85it/s]

{'loss': 0.2725, 'grad_norm': 4.485745906829834, 'learning_rate': 2.0120805369127518e-05, 'epoch': 1.8}


                                                     
 60%|██████    | 45030/75000 [42:27<55:32,  8.99it/s]

{'loss': 0.1983, 'grad_norm': 14.964974403381348, 'learning_rate': 2.0114093959731543e-05, 'epoch': 1.8}


                                                       
 60%|██████    | 45042/75000 [42:28<42:41, 11.70it/s]

{'loss': 0.261, 'grad_norm': 0.984978437423706, 'learning_rate': 2.0107382550335572e-05, 'epoch': 1.8}


                                                     
 60%|██████    | 45052/75000 [42:29<38:00, 13.13it/s]

{'loss': 0.2867, 'grad_norm': 11.892090797424316, 'learning_rate': 2.0100671140939597e-05, 'epoch': 1.8}


                                                     
 60%|██████    | 45060/75000 [42:30<40:57, 12.18it/s]

{'loss': 0.2871, 'grad_norm': 1.37696373462677, 'learning_rate': 2.0093959731543622e-05, 'epoch': 1.8}


                                                       
 60%|██████    | 45072/75000 [42:31<47:48, 10.43it/s]

{'loss': 0.2491, 'grad_norm': 3.804203748703003, 'learning_rate': 2.0087248322147654e-05, 'epoch': 1.8}


                                                       
 60%|██████    | 45080/75000 [42:32<1:01:17,  8.14it/s]

{'loss': 0.5346, 'grad_norm': 1.0139498710632324, 'learning_rate': 2.008053691275168e-05, 'epoch': 1.8}


                                                       
 60%|██████    | 45091/75000 [42:33<49:46, 10.02it/s]

{'loss': 0.3939, 'grad_norm': 9.55018138885498, 'learning_rate': 2.0073825503355705e-05, 'epoch': 1.8}


                                                     
 60%|██████    | 45101/75000 [42:34<41:14, 12.08it/s]

{'loss': 0.2434, 'grad_norm': 3.9885854721069336, 'learning_rate': 2.0067114093959733e-05, 'epoch': 1.8}


                                                       
 60%|██████    | 45111/75000 [42:36<1:19:59,  6.23it/s]

{'loss': 0.238, 'grad_norm': 3.2360312938690186, 'learning_rate': 2.0060402684563758e-05, 'epoch': 1.8}


                                                       
 60%|██████    | 45121/75000 [42:36<50:30,  9.86it/s]

{'loss': 0.4089, 'grad_norm': 9.702753067016602, 'learning_rate': 2.0053691275167787e-05, 'epoch': 1.8}


                                                       
 60%|██████    | 45132/75000 [42:38<58:38,  8.49it/s]  

{'loss': 0.3266, 'grad_norm': 2.169532060623169, 'learning_rate': 2.0046979865771815e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45142/75000 [42:39<43:02, 11.56it/s]

{'loss': 0.2448, 'grad_norm': 3.10072660446167, 'learning_rate': 2.004026845637584e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45150/75000 [42:39<36:57, 13.46it/s]

{'loss': 0.3934, 'grad_norm': 3.222914218902588, 'learning_rate': 2.0033557046979866e-05, 'epoch': 1.81}


                                                       
 60%|██████    | 45161/75000 [42:41<52:09,  9.54it/s]

{'loss': 0.2464, 'grad_norm': 2.942248582839966, 'learning_rate': 2.0026845637583894e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45173/75000 [42:42<45:51, 10.84it/s]

{'loss': 0.2439, 'grad_norm': 0.843132734298706, 'learning_rate': 2.002013422818792e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45183/75000 [42:43<36:35, 13.58it/s]

{'loss': 0.2711, 'grad_norm': 2.679427146911621, 'learning_rate': 2.0013422818791948e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45193/75000 [42:43<35:36, 13.95it/s]

{'loss': 0.2269, 'grad_norm': 2.044522285461426, 'learning_rate': 2.0006711409395973e-05, 'epoch': 1.81}


                                                       
 60%|██████    | 45203/75000 [42:45<49:00, 10.13it/s]  

{'loss': 0.2474, 'grad_norm': 11.365151405334473, 'learning_rate': 2e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45211/75000 [42:46<40:17, 12.32it/s]

{'loss': 0.3477, 'grad_norm': 3.9082422256469727, 'learning_rate': 1.9993288590604027e-05, 'epoch': 1.81}


                                                       
 60%|██████    | 45221/75000 [42:47<50:22,  9.85it/s]

{'loss': 0.326, 'grad_norm': 8.060609817504883, 'learning_rate': 1.9986577181208055e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45231/75000 [42:48<49:35, 10.00it/s]

{'loss': 0.2619, 'grad_norm': 7.352433204650879, 'learning_rate': 1.997986577181208e-05, 'epoch': 1.81}


                                                       
 60%|██████    | 45241/75000 [42:49<46:30, 10.66it/s]

{'loss': 0.204, 'grad_norm': 6.2929253578186035, 'learning_rate': 1.997315436241611e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45253/75000 [42:50<42:14, 11.74it/s]

{'loss': 0.3202, 'grad_norm': 1.6481555700302124, 'learning_rate': 1.9966442953020134e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45261/75000 [42:50<37:17, 13.29it/s]

{'loss': 0.3487, 'grad_norm': 1.273207426071167, 'learning_rate': 1.9959731543624163e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45271/75000 [42:51<35:46, 13.85it/s]

{'loss': 0.4062, 'grad_norm': 1.2448376417160034, 'learning_rate': 1.995302013422819e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45280/75000 [42:52<37:58, 13.05it/s]

{'loss': 0.2909, 'grad_norm': 2.3619511127471924, 'learning_rate': 1.9946308724832216e-05, 'epoch': 1.81}


                                                       
 60%|██████    | 45292/75000 [42:53<45:09, 10.97it/s]

{'loss': 0.3435, 'grad_norm': 2.612074613571167, 'learning_rate': 1.993959731543624e-05, 'epoch': 1.81}


                                                       
 60%|██████    | 45301/75000 [42:55<1:06:03,  7.49it/s]

{'loss': 0.2807, 'grad_norm': 1.7843092679977417, 'learning_rate': 1.993288590604027e-05, 'epoch': 1.81}


                                                       
 60%|██████    | 45311/75000 [42:55<39:48, 12.43it/s]

{'loss': 0.2569, 'grad_norm': 0.790701687335968, 'learning_rate': 1.9926174496644295e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45321/75000 [42:56<37:36, 13.15it/s]

{'loss': 0.3751, 'grad_norm': 3.34961199760437, 'learning_rate': 1.991946308724832e-05, 'epoch': 1.81}


                                                       
 60%|██████    | 45331/75000 [42:57<54:19,  9.10it/s]  

{'loss': 0.2631, 'grad_norm': 1.5213255882263184, 'learning_rate': 1.9912751677852352e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45343/75000 [42:58<37:56, 13.03it/s]

{'loss': 0.3812, 'grad_norm': 13.212556838989258, 'learning_rate': 1.9906040268456378e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45351/75000 [42:59<38:13, 12.93it/s]

{'loss': 0.361, 'grad_norm': 5.406718730926514, 'learning_rate': 1.9899328859060403e-05, 'epoch': 1.81}


                                                     
 60%|██████    | 45361/75000 [42:59<36:39, 13.48it/s]

{'loss': 0.4532, 'grad_norm': 4.398451805114746, 'learning_rate': 1.989261744966443e-05, 'epoch': 1.81}


                                                       
 60%|██████    | 45373/75000 [43:01<44:19, 11.14it/s]

{'loss': 0.3051, 'grad_norm': 2.7471001148223877, 'learning_rate': 1.9885906040268456e-05, 'epoch': 1.81}


                                                       
 61%|██████    | 45381/75000 [43:02<1:00:42,  8.13it/s]

{'loss': 0.352, 'grad_norm': 2.2130472660064697, 'learning_rate': 1.987919463087248e-05, 'epoch': 1.82}


                                                       
 61%|██████    | 45391/75000 [43:03<39:38, 12.45it/s]

{'loss': 0.3186, 'grad_norm': 3.4584333896636963, 'learning_rate': 1.9872483221476514e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45400/75000 [43:04<39:09, 12.60it/s]

{'loss': 0.2329, 'grad_norm': 2.13918137550354, 'learning_rate': 1.986577181208054e-05, 'epoch': 1.82}


                                                       
 61%|██████    | 45411/75000 [43:05<43:16, 11.39it/s]

{'loss': 0.3734, 'grad_norm': 2.256866455078125, 'learning_rate': 1.9859060402684564e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45421/75000 [43:06<47:48, 10.31it/s]

{'loss': 0.2693, 'grad_norm': 9.182975769042969, 'learning_rate': 1.9852348993288592e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45431/75000 [43:07<47:35, 10.36it/s]

{'loss': 0.4112, 'grad_norm': 3.53326153755188, 'learning_rate': 1.9845637583892618e-05, 'epoch': 1.82}


                                                       
 61%|██████    | 45441/75000 [43:08<48:15, 10.21it/s]

{'loss': 0.3344, 'grad_norm': 2.0420773029327393, 'learning_rate': 1.9838926174496643e-05, 'epoch': 1.82}


                                                       
 61%|██████    | 45452/75000 [43:10<1:20:01,  6.15it/s]

{'loss': 0.2759, 'grad_norm': 3.5474514961242676, 'learning_rate': 1.9832214765100675e-05, 'epoch': 1.82}


                                                       
 61%|██████    | 45462/75000 [43:10<47:01, 10.47it/s]

{'loss': 0.2063, 'grad_norm': 4.4382429122924805, 'learning_rate': 1.98255033557047e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45472/75000 [43:11<39:00, 12.62it/s]

{'loss': 0.3358, 'grad_norm': 2.9066977500915527, 'learning_rate': 1.9818791946308725e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45482/75000 [43:12<35:30, 13.85it/s]

{'loss': 0.227, 'grad_norm': 1.2290676832199097, 'learning_rate': 1.9812080536912754e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45492/75000 [43:13<38:11, 12.88it/s]

{'loss': 0.2396, 'grad_norm': 1.3619319200515747, 'learning_rate': 1.980536912751678e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45500/75000 [43:13<37:51, 12.99it/s]

{'loss': 0.3646, 'grad_norm': 2.686981201171875, 'learning_rate': 1.9798657718120804e-05, 'epoch': 1.82}


                                                       
 61%|██████    | 45512/75000 [43:15<51:43,  9.50it/s]

{'loss': 0.2266, 'grad_norm': 5.315023899078369, 'learning_rate': 1.9791946308724832e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45522/75000 [43:16<34:36, 14.20it/s]

{'loss': 0.3817, 'grad_norm': 21.309425354003906, 'learning_rate': 1.978523489932886e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45532/75000 [43:17<30:01, 16.36it/s]

{'loss': 0.3728, 'grad_norm': 0.5025246143341064, 'learning_rate': 1.9778523489932886e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45541/75000 [43:17<27:40, 17.75it/s]

{'loss': 0.2301, 'grad_norm': 0.8159006834030151, 'learning_rate': 1.9771812080536915e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45552/75000 [43:18<27:42, 17.71it/s]

{'loss': 0.3259, 'grad_norm': 5.825253963470459, 'learning_rate': 1.976510067114094e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45563/75000 [43:18<26:08, 18.77it/s]

{'loss': 0.3675, 'grad_norm': 7.1523590087890625, 'learning_rate': 1.975838926174497e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45573/75000 [43:19<26:58, 18.18it/s]

{'loss': 0.2826, 'grad_norm': 1.1373693943023682, 'learning_rate': 1.9751677852348994e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45583/75000 [43:19<26:01, 18.84it/s]

{'loss': 0.4423, 'grad_norm': 1.922906517982483, 'learning_rate': 1.9744966442953022e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45591/75000 [43:20<29:24, 16.67it/s]

{'loss': 0.195, 'grad_norm': 7.061865329742432, 'learning_rate': 1.9738255033557047e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45603/75000 [43:21<26:55, 18.20it/s]

{'loss': 0.2996, 'grad_norm': 10.160421371459961, 'learning_rate': 1.9731543624161076e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45613/75000 [43:21<26:35, 18.42it/s]

{'loss': 0.3458, 'grad_norm': 6.467408657073975, 'learning_rate': 1.97248322147651e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45623/75000 [43:22<26:03, 18.79it/s]

{'loss': 0.3497, 'grad_norm': 14.955841064453125, 'learning_rate': 1.971812080536913e-05, 'epoch': 1.82}


                                                     
 61%|██████    | 45634/75000 [43:22<25:35, 19.13it/s]

{'loss': 0.4261, 'grad_norm': 1.2349166870117188, 'learning_rate': 1.9711409395973155e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45642/75000 [43:23<26:53, 18.20it/s]

{'loss': 0.39, 'grad_norm': 2.7147278785705566, 'learning_rate': 1.970469798657718e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45653/75000 [43:23<25:27, 19.22it/s]

{'loss': 0.3947, 'grad_norm': 4.452437400817871, 'learning_rate': 1.969798657718121e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45664/75000 [43:24<26:46, 18.26it/s]

{'loss': 0.2784, 'grad_norm': 3.2579870223999023, 'learning_rate': 1.9691275167785237e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45672/75000 [43:24<29:18, 16.68it/s]

{'loss': 0.333, 'grad_norm': 1.5808751583099365, 'learning_rate': 1.9684563758389262e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45683/75000 [43:25<25:39, 19.04it/s]

{'loss': 0.2683, 'grad_norm': 2.9377946853637695, 'learning_rate': 1.967785234899329e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45692/75000 [43:25<26:59, 18.10it/s]

{'loss': 0.2347, 'grad_norm': 1.7888346910476685, 'learning_rate': 1.9671140939597316e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45702/75000 [43:26<27:05, 18.03it/s]

{'loss': 0.3051, 'grad_norm': 10.942870140075684, 'learning_rate': 1.966442953020134e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45713/75000 [43:27<26:01, 18.75it/s]

{'loss': 0.2867, 'grad_norm': 2.775739908218384, 'learning_rate': 1.9657718120805373e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45724/75000 [43:27<26:56, 18.11it/s]

{'loss': 0.4181, 'grad_norm': 6.352334976196289, 'learning_rate': 1.9651006711409398e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45732/75000 [43:28<27:59, 17.43it/s]

{'loss': 0.3018, 'grad_norm': 4.066493511199951, 'learning_rate': 1.9644295302013423e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45742/75000 [43:28<29:07, 16.74it/s]

{'loss': 0.392, 'grad_norm': 10.345407485961914, 'learning_rate': 1.9637583892617452e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45752/75000 [43:29<27:26, 17.76it/s]

{'loss': 0.4247, 'grad_norm': 1.0650391578674316, 'learning_rate': 1.9630872483221477e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45763/75000 [43:29<25:26, 19.16it/s]

{'loss': 0.312, 'grad_norm': 8.192337989807129, 'learning_rate': 1.9624161073825502e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45773/75000 [43:30<27:05, 17.98it/s]

{'loss': 0.4008, 'grad_norm': 4.049411296844482, 'learning_rate': 1.9617449664429534e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45783/75000 [43:30<27:52, 17.47it/s]

{'loss': 0.3183, 'grad_norm': 3.712196111679077, 'learning_rate': 1.961073825503356e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45791/75000 [43:31<27:56, 17.42it/s]

{'loss': 0.1575, 'grad_norm': 6.1391119956970215, 'learning_rate': 1.9604026845637584e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45803/75000 [43:32<25:12, 19.31it/s]

{'loss': 0.2717, 'grad_norm': 2.3966407775878906, 'learning_rate': 1.9597315436241613e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45813/75000 [43:32<27:27, 17.72it/s]

{'loss': 0.2806, 'grad_norm': 11.626455307006836, 'learning_rate': 1.9590604026845638e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45823/75000 [43:33<27:31, 17.66it/s]

{'loss': 0.4585, 'grad_norm': 10.198625564575195, 'learning_rate': 1.9583892617449663e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45831/75000 [43:33<26:45, 18.16it/s]

{'loss': 0.2192, 'grad_norm': 1.2810298204421997, 'learning_rate': 1.957718120805369e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45843/75000 [43:34<26:47, 18.14it/s]

{'loss': 0.327, 'grad_norm': 5.7821946144104, 'learning_rate': 1.957046979865772e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45853/75000 [43:34<29:16, 16.59it/s]

{'loss': 0.2823, 'grad_norm': 4.7017412185668945, 'learning_rate': 1.9563758389261745e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45863/75000 [43:35<26:13, 18.52it/s]

{'loss': 0.4224, 'grad_norm': 5.161799430847168, 'learning_rate': 1.9557046979865774e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45872/75000 [43:35<25:01, 19.39it/s]

{'loss': 0.2647, 'grad_norm': 1.18687105178833, 'learning_rate': 1.95503355704698e-05, 'epoch': 1.83}


                                                     
 61%|██████    | 45884/75000 [43:36<25:32, 19.00it/s]

{'loss': 0.3915, 'grad_norm': 0.8287584781646729, 'learning_rate': 1.9543624161073824e-05, 'epoch': 1.84}


                                                     
 61%|██████    | 45893/75000 [43:37<25:04, 19.35it/s]

{'loss': 0.3363, 'grad_norm': 6.780590534210205, 'learning_rate': 1.9536912751677853e-05, 'epoch': 1.84}


                                                     
 61%|██████    | 45903/75000 [43:37<25:00, 19.39it/s]

{'loss': 0.3632, 'grad_norm': 9.117094039916992, 'learning_rate': 1.953020134228188e-05, 'epoch': 1.84}


                                                     
 61%|██████    | 45912/75000 [43:37<25:26, 19.05it/s]

{'loss': 0.1989, 'grad_norm': 4.673646450042725, 'learning_rate': 1.9523489932885907e-05, 'epoch': 1.84}


                                                     
 61%|██████    | 45922/75000 [43:38<25:16, 19.18it/s]

{'loss': 0.276, 'grad_norm': 3.7562272548675537, 'learning_rate': 1.9516778523489935e-05, 'epoch': 1.84}


                                                     
 61%|██████    | 45933/75000 [43:39<25:13, 19.21it/s]

{'loss': 0.2903, 'grad_norm': 3.4119420051574707, 'learning_rate': 1.951006711409396e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 45943/75000 [43:39<25:45, 18.80it/s]

{'loss': 0.3701, 'grad_norm': 2.6427252292633057, 'learning_rate': 1.9503355704697985e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 45954/75000 [43:40<25:04, 19.31it/s]

{'loss': 0.2852, 'grad_norm': 4.881860256195068, 'learning_rate': 1.9496644295302014e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 45963/75000 [43:40<24:52, 19.46it/s]

{'loss': 0.2766, 'grad_norm': 4.243859767913818, 'learning_rate': 1.948993288590604e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 45971/75000 [43:41<25:09, 19.23it/s]

{'loss': 0.2933, 'grad_norm': 3.879042148590088, 'learning_rate': 1.9483221476510068e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 45983/75000 [43:41<25:08, 19.23it/s]

{'loss': 0.2513, 'grad_norm': 15.567381858825684, 'learning_rate': 1.9476510067114096e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 45992/75000 [43:42<25:20, 19.07it/s]

{'loss': 0.3716, 'grad_norm': 6.003537178039551, 'learning_rate': 1.946979865771812e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46000/75000 [43:42<25:12, 19.17it/s]

{'loss': 0.4051, 'grad_norm': 3.2385387420654297, 'learning_rate': 1.946308724832215e-05, 'epoch': 1.84}


                                                       
 61%|██████▏   | 46011/75000 [43:43<33:46, 14.30it/s]

{'loss': 0.2719, 'grad_norm': 0.7779818773269653, 'learning_rate': 1.9456375838926175e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46021/75000 [43:44<29:17, 16.48it/s]

{'loss': 0.394, 'grad_norm': 4.8282976150512695, 'learning_rate': 1.94496644295302e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46033/75000 [43:44<25:31, 18.92it/s]

{'loss': 0.2747, 'grad_norm': 3.1486151218414307, 'learning_rate': 1.944295302013423e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46042/75000 [43:45<24:48, 19.45it/s]

{'loss': 0.3436, 'grad_norm': 0.5550602078437805, 'learning_rate': 1.9436241610738257e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46054/75000 [43:45<24:59, 19.30it/s]

{'loss': 0.2816, 'grad_norm': 2.555248975753784, 'learning_rate': 1.9429530201342282e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46063/75000 [43:46<25:27, 18.95it/s]

{'loss': 0.3041, 'grad_norm': 3.504182815551758, 'learning_rate': 1.942281879194631e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46073/75000 [43:46<27:12, 17.72it/s]

{'loss': 0.3834, 'grad_norm': 3.87802791595459, 'learning_rate': 1.9416107382550336e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46083/75000 [43:47<27:46, 17.35it/s]

{'loss': 0.3237, 'grad_norm': 4.261440753936768, 'learning_rate': 1.940939597315436e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46091/75000 [43:48<30:13, 15.94it/s]

{'loss': 0.2761, 'grad_norm': 1.1560553312301636, 'learning_rate': 1.940268456375839e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46101/75000 [43:48<30:45, 15.66it/s]

{'loss': 0.2703, 'grad_norm': 3.573092222213745, 'learning_rate': 1.939597315436242e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46111/75000 [43:49<31:19, 15.37it/s]

{'loss': 0.373, 'grad_norm': 7.1643266677856445, 'learning_rate': 1.9389261744966444e-05, 'epoch': 1.84}


                                                     
 61%|██████▏   | 46123/75000 [43:50<31:01, 15.51it/s]

{'loss': 0.3963, 'grad_norm': 4.222812175750732, 'learning_rate': 1.9382550335570472e-05, 'epoch': 1.84}


                                                     
 62%|██████▏   | 46131/75000 [43:50<29:46, 16.16it/s]

{'loss': 0.3535, 'grad_norm': 4.770960807800293, 'learning_rate': 1.9375838926174497e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46143/75000 [43:51<29:24, 16.35it/s]

{'loss': 0.2695, 'grad_norm': 4.783606052398682, 'learning_rate': 1.9369127516778522e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46153/75000 [43:51<28:45, 16.72it/s]

{'loss': 0.3268, 'grad_norm': 1.9809480905532837, 'learning_rate': 1.936241610738255e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46163/75000 [43:52<28:02, 17.14it/s]

{'loss': 0.211, 'grad_norm': 6.105452537536621, 'learning_rate': 1.935570469798658e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46173/75000 [43:53<27:30, 17.47it/s]

{'loss': 0.3096, 'grad_norm': 4.001155853271484, 'learning_rate': 1.9348993288590605e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46183/75000 [43:53<28:32, 16.83it/s]

{'loss': 0.2273, 'grad_norm': 0.7863665819168091, 'learning_rate': 1.9342281879194633e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46193/75000 [43:54<29:10, 16.46it/s]

{'loss': 0.2267, 'grad_norm': 4.664255619049072, 'learning_rate': 1.933557046979866e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46203/75000 [43:54<28:43, 16.70it/s]

{'loss': 0.2688, 'grad_norm': 0.5095953345298767, 'learning_rate': 1.9328859060402684e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46211/75000 [43:55<29:51, 16.07it/s]

{'loss': 0.2734, 'grad_norm': 1.876813530921936, 'learning_rate': 1.9322147651006712e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46223/75000 [43:56<30:11, 15.89it/s]

{'loss': 0.2847, 'grad_norm': 3.817965030670166, 'learning_rate': 1.931543624161074e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46233/75000 [43:56<28:11, 17.01it/s]

{'loss': 0.2233, 'grad_norm': 8.273554801940918, 'learning_rate': 1.9308724832214766e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46243/75000 [43:57<27:27, 17.45it/s]

{'loss': 0.3316, 'grad_norm': 7.6833343505859375, 'learning_rate': 1.9302013422818794e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46253/75000 [43:57<28:06, 17.04it/s]

{'loss': 0.3011, 'grad_norm': 1.2808959484100342, 'learning_rate': 1.929530201342282e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46263/75000 [43:58<27:21, 17.51it/s]

{'loss': 0.3182, 'grad_norm': 2.0875630378723145, 'learning_rate': 1.9288590604026845e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46273/75000 [43:59<27:03, 17.69it/s]

{'loss': 0.2559, 'grad_norm': 3.8255207538604736, 'learning_rate': 1.9281879194630873e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46281/75000 [43:59<26:28, 18.08it/s]

{'loss': 0.2718, 'grad_norm': 3.07766056060791, 'learning_rate': 1.9275167785234902e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46291/75000 [44:00<25:13, 18.97it/s]

{'loss': 0.2979, 'grad_norm': 4.20646333694458, 'learning_rate': 1.9268456375838927e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46304/75000 [44:00<23:53, 20.01it/s]

{'loss': 0.2729, 'grad_norm': 4.28300666809082, 'learning_rate': 1.9261744966442955e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46312/75000 [44:01<25:09, 19.01it/s]

{'loss': 0.2324, 'grad_norm': 3.7854273319244385, 'learning_rate': 1.925503355704698e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46323/75000 [44:01<24:56, 19.16it/s]

{'loss': 0.2887, 'grad_norm': 1.2332875728607178, 'learning_rate': 1.9248322147651006e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46332/75000 [44:02<26:13, 18.21it/s]

{'loss': 0.3413, 'grad_norm': 10.372382164001465, 'learning_rate': 1.9241610738255034e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46341/75000 [44:02<25:51, 18.47it/s]

{'loss': 0.2211, 'grad_norm': 5.532078742980957, 'learning_rate': 1.923489932885906e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46351/75000 [44:03<25:38, 18.62it/s]

{'loss': 0.3523, 'grad_norm': 3.4001803398132324, 'learning_rate': 1.9228187919463088e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46364/75000 [44:03<24:00, 19.88it/s]

{'loss': 0.2239, 'grad_norm': 3.0692496299743652, 'learning_rate': 1.9221476510067117e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46372/75000 [44:04<25:48, 18.48it/s]

{'loss': 0.2225, 'grad_norm': 3.08455753326416, 'learning_rate': 1.9214765100671142e-05, 'epoch': 1.85}


                                                     
 62%|██████▏   | 46382/75000 [44:04<25:13, 18.91it/s]

{'loss': 0.4081, 'grad_norm': 3.092390537261963, 'learning_rate': 1.9208053691275167e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46391/75000 [44:05<28:19, 16.83it/s]

{'loss': 0.4496, 'grad_norm': 3.133779525756836, 'learning_rate': 1.9201342281879195e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46402/75000 [44:05<26:50, 17.76it/s]

{'loss': 0.2562, 'grad_norm': 1.3510702848434448, 'learning_rate': 1.919463087248322e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46411/75000 [44:06<26:16, 18.13it/s]

{'loss': 0.2643, 'grad_norm': 1.8109300136566162, 'learning_rate': 1.918791946308725e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46424/75000 [44:07<23:53, 19.94it/s]

{'loss': 0.2895, 'grad_norm': 1.9093246459960938, 'learning_rate': 1.9181208053691278e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46433/75000 [44:07<26:02, 18.28it/s]

{'loss': 0.3313, 'grad_norm': 4.672362804412842, 'learning_rate': 1.9174496644295303e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46444/75000 [44:08<24:51, 19.15it/s]

{'loss': 0.2419, 'grad_norm': 5.167087078094482, 'learning_rate': 1.916778523489933e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46452/75000 [44:08<25:22, 18.75it/s]

{'loss': 0.3236, 'grad_norm': 4.668351650238037, 'learning_rate': 1.9161073825503357e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46463/75000 [44:09<28:04, 16.94it/s]

{'loss': 0.3987, 'grad_norm': 2.1931798458099365, 'learning_rate': 1.9154362416107382e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46472/75000 [44:09<25:23, 18.72it/s]

{'loss': 0.4212, 'grad_norm': 5.362159729003906, 'learning_rate': 1.914765100671141e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46483/75000 [44:10<26:34, 17.88it/s]

{'loss': 0.2433, 'grad_norm': 4.623892307281494, 'learning_rate': 1.914093959731544e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46491/75000 [44:10<27:36, 17.21it/s]

{'loss': 0.2272, 'grad_norm': 0.885499119758606, 'learning_rate': 1.9134228187919464e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46500/75000 [44:11<28:03, 16.93it/s]

{'loss': 0.3374, 'grad_norm': 3.809518337249756, 'learning_rate': 1.9127516778523493e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46514/75000 [44:12<29:37, 16.03it/s]

{'loss': 0.4002, 'grad_norm': 2.732182741165161, 'learning_rate': 1.9120805369127518e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46523/75000 [44:12<26:23, 17.98it/s]

{'loss': 0.3924, 'grad_norm': 13.592791557312012, 'learning_rate': 1.9114093959731543e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46534/75000 [44:13<25:16, 18.77it/s]

{'loss': 0.3764, 'grad_norm': 9.223459243774414, 'learning_rate': 1.910738255033557e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46541/75000 [44:14<28:02, 16.91it/s]

{'loss': 0.2326, 'grad_norm': 5.261417865753174, 'learning_rate': 1.91006711409396e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46552/75000 [44:14<25:46, 18.39it/s]

{'loss': 0.2381, 'grad_norm': 5.226888179779053, 'learning_rate': 1.9093959731543625e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46561/75000 [44:15<27:59, 16.93it/s]

{'loss': 0.3148, 'grad_norm': 2.8426003456115723, 'learning_rate': 1.9087248322147654e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46572/75000 [44:15<25:11, 18.81it/s]

{'loss': 0.2782, 'grad_norm': 1.4856375455856323, 'learning_rate': 1.908053691275168e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46583/75000 [44:16<26:23, 17.95it/s]

{'loss': 0.2756, 'grad_norm': 3.296503782272339, 'learning_rate': 1.9073825503355704e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46593/75000 [44:16<26:34, 17.82it/s]

{'loss': 0.3268, 'grad_norm': 4.004017353057861, 'learning_rate': 1.9067114093959733e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46604/75000 [44:17<25:11, 18.78it/s]

{'loss': 0.3546, 'grad_norm': 10.483469009399414, 'learning_rate': 1.906040268456376e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46612/75000 [44:17<26:48, 17.65it/s]

{'loss': 0.3063, 'grad_norm': 1.1893575191497803, 'learning_rate': 1.9053691275167786e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46621/75000 [44:18<28:38, 16.51it/s]

{'loss': 0.1855, 'grad_norm': 0.47207900881767273, 'learning_rate': 1.9046979865771815e-05, 'epoch': 1.86}


                                                     
 62%|██████▏   | 46631/75000 [44:19<29:00, 16.30it/s]

{'loss': 0.2922, 'grad_norm': 4.112438678741455, 'learning_rate': 1.904026845637584e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46644/75000 [44:19<24:01, 19.67it/s]

{'loss': 0.3371, 'grad_norm': 0.9395604133605957, 'learning_rate': 1.9033557046979865e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46653/75000 [44:20<26:24, 17.89it/s]

{'loss': 0.3362, 'grad_norm': 4.545016288757324, 'learning_rate': 1.9026845637583894e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46663/75000 [44:20<27:42, 17.05it/s]

{'loss': 0.3145, 'grad_norm': 2.0121707916259766, 'learning_rate': 1.902013422818792e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46671/75000 [44:21<26:49, 17.60it/s]

{'loss': 0.2828, 'grad_norm': 3.117288827896118, 'learning_rate': 1.9013422818791947e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46682/75000 [44:21<25:12, 18.73it/s]

{'loss': 0.3321, 'grad_norm': 1.4992790222167969, 'learning_rate': 1.9006711409395976e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46693/75000 [44:22<26:16, 17.95it/s]

{'loss': 0.3661, 'grad_norm': 3.6166303157806396, 'learning_rate': 1.9e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46701/75000 [44:22<25:00, 18.86it/s]

{'loss': 0.2697, 'grad_norm': 4.181875228881836, 'learning_rate': 1.8993288590604026e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46714/75000 [44:23<24:57, 18.88it/s]

{'loss': 0.3029, 'grad_norm': 7.5623779296875, 'learning_rate': 1.8986577181208055e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46723/75000 [44:24<26:34, 17.74it/s]

{'loss': 0.1648, 'grad_norm': 1.8009217977523804, 'learning_rate': 1.897986577181208e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46733/75000 [44:24<26:47, 17.59it/s]

{'loss': 0.3779, 'grad_norm': 4.920332908630371, 'learning_rate': 1.897315436241611e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46743/75000 [44:25<26:43, 17.62it/s]

{'loss': 0.4679, 'grad_norm': 3.3762083053588867, 'learning_rate': 1.8966442953020137e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46753/75000 [44:25<27:20, 17.22it/s]

{'loss': 0.3284, 'grad_norm': 3.0942413806915283, 'learning_rate': 1.8959731543624162e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46761/75000 [44:26<26:33, 17.72it/s]

{'loss': 0.2789, 'grad_norm': 2.623518466949463, 'learning_rate': 1.8953020134228187e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46772/75000 [44:26<24:59, 18.83it/s]

{'loss': 0.3429, 'grad_norm': 11.920071601867676, 'learning_rate': 1.8946308724832216e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46783/75000 [44:27<25:49, 18.21it/s]

{'loss': 0.2447, 'grad_norm': 3.959867238998413, 'learning_rate': 1.893959731543624e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46792/75000 [44:27<24:57, 18.84it/s]

{'loss': 0.3355, 'grad_norm': 9.081948280334473, 'learning_rate': 1.893288590604027e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46802/75000 [44:28<26:08, 17.98it/s]

{'loss': 0.3163, 'grad_norm': 9.963850021362305, 'learning_rate': 1.8926174496644298e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46812/75000 [44:29<29:00, 16.19it/s]

{'loss': 0.3208, 'grad_norm': 3.6269454956054688, 'learning_rate': 1.8919463087248323e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46821/75000 [44:29<29:51, 15.73it/s]

{'loss': 0.3055, 'grad_norm': 4.837553024291992, 'learning_rate': 1.891275167785235e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46832/75000 [44:30<25:02, 18.75it/s]

{'loss': 0.2172, 'grad_norm': 2.190300703048706, 'learning_rate': 1.8906040268456377e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46844/75000 [44:30<25:13, 18.60it/s]

{'loss': 0.2441, 'grad_norm': 12.147624015808105, 'learning_rate': 1.8899328859060402e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46852/75000 [44:31<26:54, 17.44it/s]

{'loss': 0.4311, 'grad_norm': 2.730259418487549, 'learning_rate': 1.889261744966443e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46862/75000 [44:31<28:57, 16.19it/s]

{'loss': 0.2758, 'grad_norm': 2.6017513275146484, 'learning_rate': 1.888590604026846e-05, 'epoch': 1.87}


                                                     
 62%|██████▏   | 46871/75000 [44:32<28:46, 16.29it/s]

{'loss': 0.4895, 'grad_norm': 7.020269393920898, 'learning_rate': 1.8879194630872484e-05, 'epoch': 1.87}


                                                     
 63%|██████▎   | 46882/75000 [44:32<24:31, 19.10it/s]

{'loss': 0.32, 'grad_norm': 9.30335521697998, 'learning_rate': 1.8872483221476513e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46894/75000 [44:33<24:41, 18.97it/s]

{'loss': 0.3393, 'grad_norm': 3.3493664264678955, 'learning_rate': 1.8865771812080538e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46904/75000 [44:34<24:39, 19.00it/s]

{'loss': 0.2544, 'grad_norm': 3.6189005374908447, 'learning_rate': 1.8859060402684563e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46912/75000 [44:34<28:05, 16.67it/s]

{'loss': 0.2693, 'grad_norm': 3.363968849182129, 'learning_rate': 1.8852348993288592e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46922/75000 [44:35<25:13, 18.55it/s]

{'loss': 0.2407, 'grad_norm': 8.663670539855957, 'learning_rate': 1.884563758389262e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46934/75000 [44:35<24:14, 19.30it/s]

{'loss': 0.3485, 'grad_norm': 1.6491103172302246, 'learning_rate': 1.8838926174496646e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46942/75000 [44:36<28:29, 16.42it/s]

{'loss': 0.2122, 'grad_norm': 1.800350546836853, 'learning_rate': 1.8832214765100674e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46953/75000 [44:36<26:15, 17.80it/s]

{'loss': 0.3924, 'grad_norm': 1.4057903289794922, 'learning_rate': 1.88255033557047e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46962/75000 [44:37<25:50, 18.08it/s]

{'loss': 0.3719, 'grad_norm': 2.821422815322876, 'learning_rate': 1.8818791946308724e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46971/75000 [44:37<25:17, 18.46it/s]

{'loss': 0.2997, 'grad_norm': 1.6755679845809937, 'learning_rate': 1.8812080536912753e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46984/75000 [44:38<24:16, 19.23it/s]

{'loss': 0.2812, 'grad_norm': 4.924259185791016, 'learning_rate': 1.8805369127516778e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 46992/75000 [44:38<25:05, 18.61it/s]

{'loss': 0.4392, 'grad_norm': 5.403642177581787, 'learning_rate': 1.8798657718120807e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47000/75000 [44:39<24:44, 18.87it/s]

{'loss': 0.3263, 'grad_norm': 3.454463481903076, 'learning_rate': 1.8791946308724835e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47012/75000 [44:40<31:06, 14.99it/s]

{'loss': 0.4099, 'grad_norm': 0.9487890005111694, 'learning_rate': 1.878523489932886e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47023/75000 [44:41<25:48, 18.07it/s]

{'loss': 0.3363, 'grad_norm': 5.40716028213501, 'learning_rate': 1.8778523489932886e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47033/75000 [44:41<26:42, 17.45it/s]

{'loss': 0.283, 'grad_norm': 4.522772312164307, 'learning_rate': 1.8771812080536914e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47043/75000 [44:42<27:08, 17.17it/s]

{'loss': 0.3065, 'grad_norm': 5.585822105407715, 'learning_rate': 1.876510067114094e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47052/75000 [44:42<26:43, 17.43it/s]

{'loss': 0.3024, 'grad_norm': 5.305836200714111, 'learning_rate': 1.8758389261744968e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47064/75000 [44:43<25:24, 18.32it/s]

{'loss': 0.433, 'grad_norm': 3.7609400749206543, 'learning_rate': 1.8751677852348996e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47072/75000 [44:43<25:48, 18.03it/s]

{'loss': 0.3387, 'grad_norm': 5.927376747131348, 'learning_rate': 1.874496644295302e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47082/75000 [44:44<28:30, 16.32it/s]

{'loss': 0.1481, 'grad_norm': 4.186298370361328, 'learning_rate': 1.8738255033557047e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47091/75000 [44:45<27:00, 17.23it/s]

{'loss': 0.2334, 'grad_norm': 1.770542860031128, 'learning_rate': 1.8731543624161075e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47104/75000 [44:45<24:47, 18.75it/s]

{'loss': 0.4426, 'grad_norm': 16.494401931762695, 'learning_rate': 1.87248322147651e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47112/75000 [44:46<25:01, 18.57it/s]

{'loss': 0.3968, 'grad_norm': 10.500732421875, 'learning_rate': 1.8718120805369125e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47123/75000 [44:46<27:38, 16.81it/s]

{'loss': 0.328, 'grad_norm': 1.932904601097107, 'learning_rate': 1.8711409395973157e-05, 'epoch': 1.88}


                                                     
 63%|██████▎   | 47133/75000 [44:47<25:53, 17.94it/s]

{'loss': 0.4317, 'grad_norm': 5.9074578285217285, 'learning_rate': 1.8704697986577183e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47142/75000 [44:47<25:19, 18.33it/s]

{'loss': 0.2435, 'grad_norm': 2.1809403896331787, 'learning_rate': 1.8697986577181208e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47152/75000 [44:48<25:35, 18.14it/s]

{'loss': 0.3111, 'grad_norm': 0.8010615706443787, 'learning_rate': 1.8691275167785236e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47162/75000 [44:48<24:20, 19.06it/s]

{'loss': 0.3018, 'grad_norm': 6.449916362762451, 'learning_rate': 1.868456375838926e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47171/75000 [44:49<27:21, 16.96it/s]

{'loss': 0.3651, 'grad_norm': 4.058602809906006, 'learning_rate': 1.8677852348993287e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47184/75000 [44:50<24:30, 18.92it/s]

{'loss': 0.3921, 'grad_norm': 5.137784481048584, 'learning_rate': 1.867114093959732e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47192/75000 [44:50<25:13, 18.37it/s]

{'loss': 0.2728, 'grad_norm': 13.973347663879395, 'learning_rate': 1.8664429530201344e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47201/75000 [44:51<25:56, 17.86it/s]

{'loss': 0.2066, 'grad_norm': 1.9685814380645752, 'learning_rate': 1.865771812080537e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47212/75000 [44:51<25:14, 18.34it/s]

{'loss': 0.222, 'grad_norm': 1.001822590827942, 'learning_rate': 1.8651006711409397e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47223/75000 [44:52<25:07, 18.43it/s]

{'loss': 0.4593, 'grad_norm': 6.399625301361084, 'learning_rate': 1.8644295302013423e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47234/75000 [44:52<23:45, 19.48it/s]

{'loss': 0.3382, 'grad_norm': 4.441808223724365, 'learning_rate': 1.863758389261745e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47242/75000 [44:53<25:24, 18.21it/s]

{'loss': 0.3424, 'grad_norm': 2.7660233974456787, 'learning_rate': 1.863087248322148e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47251/75000 [44:53<28:21, 16.31it/s]

{'loss': 0.2197, 'grad_norm': 3.884789228439331, 'learning_rate': 1.8624161073825505e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47264/75000 [44:54<23:55, 19.32it/s]

{'loss': 0.2782, 'grad_norm': 2.4768991470336914, 'learning_rate': 1.861744966442953e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47272/75000 [44:54<26:14, 17.61it/s]

{'loss': 0.242, 'grad_norm': 3.03987979888916, 'learning_rate': 1.861073825503356e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47282/75000 [44:55<27:08, 17.02it/s]

{'loss': 0.3108, 'grad_norm': 1.9870015382766724, 'learning_rate': 1.8604026845637584e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47294/75000 [44:55<24:12, 19.07it/s]

{'loss': 0.2106, 'grad_norm': 1.173050045967102, 'learning_rate': 1.8597315436241612e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47302/75000 [44:56<26:35, 17.36it/s]

{'loss': 0.348, 'grad_norm': 9.185275077819824, 'learning_rate': 1.8590604026845637e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47314/75000 [44:57<23:56, 19.28it/s]

{'loss': 0.1955, 'grad_norm': 5.278047561645508, 'learning_rate': 1.8583892617449666e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47323/75000 [44:57<25:57, 17.77it/s]

{'loss': 0.3343, 'grad_norm': 4.13913631439209, 'learning_rate': 1.857718120805369e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47333/75000 [44:58<26:00, 17.73it/s]

{'loss': 0.3166, 'grad_norm': 2.4265174865722656, 'learning_rate': 1.857046979865772e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47342/75000 [44:58<24:54, 18.51it/s]

{'loss': 0.2128, 'grad_norm': 4.239081859588623, 'learning_rate': 1.8563758389261745e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47354/75000 [44:59<24:08, 19.08it/s]

{'loss': 0.3922, 'grad_norm': 2.900341749191284, 'learning_rate': 1.8557046979865773e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47362/75000 [44:59<25:45, 17.88it/s]

{'loss': 0.3384, 'grad_norm': 4.030930519104004, 'learning_rate': 1.85503355704698e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47373/75000 [45:00<26:35, 17.32it/s]

{'loss': 0.3243, 'grad_norm': 5.337426662445068, 'learning_rate': 1.8543624161073827e-05, 'epoch': 1.89}


                                                     
 63%|██████▎   | 47381/75000 [45:00<25:05, 18.35it/s]

{'loss': 0.3663, 'grad_norm': 8.669525146484375, 'learning_rate': 1.8536912751677856e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47392/75000 [45:01<24:45, 18.58it/s]

{'loss': 0.1729, 'grad_norm': 1.6822857856750488, 'learning_rate': 1.853020134228188e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47401/75000 [45:01<25:10, 18.28it/s]

{'loss': 0.3987, 'grad_norm': 1.579994559288025, 'learning_rate': 1.8523489932885906e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47413/75000 [45:02<24:56, 18.43it/s]

{'loss': 0.2477, 'grad_norm': 1.3023698329925537, 'learning_rate': 1.8516778523489934e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47422/75000 [45:02<24:41, 18.62it/s]

{'loss': 0.2631, 'grad_norm': 2.036536693572998, 'learning_rate': 1.851006711409396e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47432/75000 [45:03<27:07, 16.94it/s]

{'loss': 0.4263, 'grad_norm': 3.931187629699707, 'learning_rate': 1.8503355704697988e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47442/75000 [45:04<27:14, 16.86it/s]

{'loss': 0.3253, 'grad_norm': 2.277240514755249, 'learning_rate': 1.8496644295302017e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47453/75000 [45:04<25:31, 17.99it/s]

{'loss': 0.2558, 'grad_norm': 2.7048773765563965, 'learning_rate': 1.8489932885906042e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47463/75000 [45:05<25:11, 18.21it/s]

{'loss': 0.4477, 'grad_norm': 8.535520553588867, 'learning_rate': 1.8483221476510067e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47473/75000 [45:05<25:17, 18.14it/s]

{'loss': 0.3835, 'grad_norm': 3.510918140411377, 'learning_rate': 1.8476510067114096e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47481/75000 [45:06<24:21, 18.83it/s]

{'loss': 0.3049, 'grad_norm': 4.135121822357178, 'learning_rate': 1.846979865771812e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47494/75000 [45:07<25:45, 17.80it/s]

{'loss': 0.2858, 'grad_norm': 3.5972208976745605, 'learning_rate': 1.8463087248322146e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47500/75000 [45:07<25:07, 18.24it/s]

{'loss': 0.3205, 'grad_norm': 4.316514015197754, 'learning_rate': 1.8456375838926178e-05, 'epoch': 1.9}


                                                       
 63%|██████▎   | 47514/75000 [45:08<28:36, 16.01it/s]

{'loss': 0.2238, 'grad_norm': 1.7643376588821411, 'learning_rate': 1.8449664429530203e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47522/75000 [45:09<27:21, 16.74it/s]

{'loss': 0.2595, 'grad_norm': 0.9306591749191284, 'learning_rate': 1.8442953020134228e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47533/75000 [45:09<26:53, 17.02it/s]

{'loss': 0.2853, 'grad_norm': 3.0598342418670654, 'learning_rate': 1.8436241610738257e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47543/75000 [45:10<26:26, 17.31it/s]

{'loss': 0.2398, 'grad_norm': 4.669641017913818, 'learning_rate': 1.8429530201342282e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47552/75000 [45:10<24:43, 18.50it/s]

{'loss': 0.4241, 'grad_norm': 3.6581013202667236, 'learning_rate': 1.8422818791946307e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47563/75000 [45:11<23:56, 19.10it/s]

{'loss': 0.3785, 'grad_norm': 2.6931822299957275, 'learning_rate': 1.841610738255034e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47572/75000 [45:11<25:44, 17.75it/s]

{'loss': 0.3776, 'grad_norm': 2.8266563415527344, 'learning_rate': 1.8409395973154364e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47582/75000 [45:12<24:14, 18.84it/s]

{'loss': 0.3858, 'grad_norm': 1.8164609670639038, 'learning_rate': 1.840268456375839e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47594/75000 [45:12<23:34, 19.37it/s]

{'loss': 0.2933, 'grad_norm': 1.9040119647979736, 'learning_rate': 1.8395973154362418e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47602/75000 [45:13<27:25, 16.65it/s]

{'loss': 0.3461, 'grad_norm': 8.058309555053711, 'learning_rate': 1.8389261744966443e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47611/75000 [45:13<25:22, 17.99it/s]

{'loss': 0.3954, 'grad_norm': 8.920737266540527, 'learning_rate': 1.8382550335570468e-05, 'epoch': 1.9}


                                                     
 63%|██████▎   | 47622/75000 [45:14<24:15, 18.81it/s]

{'loss': 0.2294, 'grad_norm': 2.6338040828704834, 'learning_rate': 1.8375838926174497e-05, 'epoch': 1.9}


                                                     
 64%|██████▎   | 47631/75000 [45:14<24:40, 18.49it/s]

{'loss': 0.3446, 'grad_norm': 4.5821661949157715, 'learning_rate': 1.8369127516778525e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47642/75000 [45:15<26:21, 17.30it/s]

{'loss': 0.2766, 'grad_norm': 4.462385177612305, 'learning_rate': 1.836241610738255e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47653/75000 [45:16<25:42, 17.73it/s]

{'loss': 0.2432, 'grad_norm': 1.2458473443984985, 'learning_rate': 1.835570469798658e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47664/75000 [45:16<23:58, 19.01it/s]

{'loss': 0.3199, 'grad_norm': 7.591531753540039, 'learning_rate': 1.8348993288590604e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47673/75000 [45:17<25:53, 17.59it/s]

{'loss': 0.2413, 'grad_norm': 3.382432222366333, 'learning_rate': 1.8342281879194633e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47683/75000 [45:17<25:26, 17.90it/s]

{'loss': 0.2817, 'grad_norm': 2.406076192855835, 'learning_rate': 1.8335570469798658e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47692/75000 [45:18<27:03, 16.82it/s]

{'loss': 0.3498, 'grad_norm': 1.7732086181640625, 'learning_rate': 1.8328859060402686e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47703/75000 [45:19<26:31, 17.15it/s]

{'loss': 0.231, 'grad_norm': 1.2696895599365234, 'learning_rate': 1.832214765100671e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47713/75000 [45:19<25:22, 17.92it/s]

{'loss': 0.2759, 'grad_norm': 4.1781110763549805, 'learning_rate': 1.831543624161074e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47723/75000 [45:20<25:50, 17.59it/s]

{'loss': 0.2446, 'grad_norm': 1.4724165201187134, 'learning_rate': 1.8308724832214765e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47732/75000 [45:20<26:48, 16.95it/s]

{'loss': 0.2395, 'grad_norm': 5.585309028625488, 'learning_rate': 1.8302013422818794e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47743/75000 [45:21<25:36, 17.74it/s]

{'loss': 0.233, 'grad_norm': 1.220304250717163, 'learning_rate': 1.829530201342282e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47753/75000 [45:21<23:53, 19.00it/s]

{'loss': 0.2405, 'grad_norm': 5.715085983276367, 'learning_rate': 1.8288590604026847e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47762/75000 [45:22<26:24, 17.19it/s]

{'loss': 0.3157, 'grad_norm': 6.1429829597473145, 'learning_rate': 1.8281879194630873e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47772/75000 [45:22<25:01, 18.14it/s]

{'loss': 0.2632, 'grad_norm': 4.545894145965576, 'learning_rate': 1.82751677852349e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47784/75000 [45:23<22:49, 19.87it/s]

{'loss': 0.3434, 'grad_norm': 2.7682788372039795, 'learning_rate': 1.8268456375838926e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47792/75000 [45:23<23:48, 19.05it/s]

{'loss': 0.2357, 'grad_norm': 0.4190114438533783, 'learning_rate': 1.8261744966442955e-05, 'epoch': 1.91}


                                                     
 64%|██████▎   | 47802/75000 [45:24<25:55, 17.48it/s]

{'loss': 0.4206, 'grad_norm': 2.82686710357666, 'learning_rate': 1.825503355704698e-05, 'epoch': 1.91}


                                                     
 64%|██████▍   | 47813/75000 [45:25<24:34, 18.44it/s]

{'loss': 0.381, 'grad_norm': 6.435600280761719, 'learning_rate': 1.8248322147651005e-05, 'epoch': 1.91}


                                                     
 64%|██████▍   | 47823/75000 [45:25<26:27, 17.12it/s]

{'loss': 0.3189, 'grad_norm': 4.458742618560791, 'learning_rate': 1.8241610738255037e-05, 'epoch': 1.91}


                                                     
 64%|██████▍   | 47832/75000 [45:26<26:55, 16.82it/s]

{'loss': 0.3278, 'grad_norm': 4.508513927459717, 'learning_rate': 1.8234899328859062e-05, 'epoch': 1.91}


                                                     
 64%|██████▍   | 47842/75000 [45:26<24:40, 18.34it/s]

{'loss': 0.2865, 'grad_norm': 2.2422046661376953, 'learning_rate': 1.8228187919463087e-05, 'epoch': 1.91}


                                                     
 64%|██████▍   | 47853/75000 [45:27<25:46, 17.55it/s]

{'loss': 0.3604, 'grad_norm': 4.254420280456543, 'learning_rate': 1.8221476510067116e-05, 'epoch': 1.91}


                                                     
 64%|██████▍   | 47863/75000 [45:27<25:33, 17.69it/s]

{'loss': 0.2752, 'grad_norm': 2.398343086242676, 'learning_rate': 1.821476510067114e-05, 'epoch': 1.91}


                                                     
 64%|██████▍   | 47872/75000 [45:28<25:40, 17.61it/s]

{'loss': 0.2648, 'grad_norm': 3.525256872177124, 'learning_rate': 1.8208053691275166e-05, 'epoch': 1.91}


                                                     
 64%|██████▍   | 47882/75000 [45:28<26:12, 17.25it/s]

{'loss': 0.3067, 'grad_norm': 10.400910377502441, 'learning_rate': 1.8201342281879198e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47893/75000 [45:29<24:22, 18.54it/s]

{'loss': 0.4025, 'grad_norm': 2.4289870262145996, 'learning_rate': 1.8194630872483223e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47902/75000 [45:30<25:55, 17.42it/s]

{'loss': 0.3315, 'grad_norm': 4.163233757019043, 'learning_rate': 1.818791946308725e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47913/75000 [45:30<25:23, 17.78it/s]

{'loss': 0.1932, 'grad_norm': 2.3244128227233887, 'learning_rate': 1.8181208053691277e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47922/75000 [45:31<24:10, 18.67it/s]

{'loss': 0.1973, 'grad_norm': 2.3637380599975586, 'learning_rate': 1.8174496644295302e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47933/75000 [45:31<24:14, 18.60it/s]

{'loss': 0.3689, 'grad_norm': 4.6707963943481445, 'learning_rate': 1.8167785234899327e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47942/75000 [45:32<25:05, 17.98it/s]

{'loss': 0.2387, 'grad_norm': 2.366201877593994, 'learning_rate': 1.8161073825503356e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47952/75000 [45:32<26:27, 17.04it/s]

{'loss': 0.4449, 'grad_norm': 5.143810272216797, 'learning_rate': 1.8154362416107385e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47962/75000 [45:33<24:25, 18.45it/s]

{'loss': 0.2695, 'grad_norm': 2.331979751586914, 'learning_rate': 1.814765100671141e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47972/75000 [45:33<24:21, 18.49it/s]

{'loss': 0.3193, 'grad_norm': 4.241630554199219, 'learning_rate': 1.8140939597315438e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47981/75000 [45:34<28:33, 15.77it/s]

{'loss': 0.3277, 'grad_norm': 2.6733319759368896, 'learning_rate': 1.8134228187919463e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 47993/75000 [45:35<26:08, 17.22it/s]

{'loss': 0.2617, 'grad_norm': 11.864340782165527, 'learning_rate': 1.812751677852349e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48000/75000 [45:35<24:58, 18.02it/s]

{'loss': 0.367, 'grad_norm': 1.905399203300476, 'learning_rate': 1.8120805369127517e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48012/75000 [45:36<29:54, 15.04it/s]

{'loss': 0.2923, 'grad_norm': 5.750240325927734, 'learning_rate': 1.8114093959731546e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48023/75000 [45:37<25:20, 17.74it/s]

{'loss': 0.2628, 'grad_norm': 4.956802845001221, 'learning_rate': 1.810738255033557e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48034/75000 [45:37<23:48, 18.88it/s]

{'loss': 0.2813, 'grad_norm': 2.2417731285095215, 'learning_rate': 1.81006711409396e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48043/75000 [45:38<24:11, 18.57it/s]

{'loss': 0.4303, 'grad_norm': 4.933605670928955, 'learning_rate': 1.8093959731543625e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48052/75000 [45:38<25:33, 17.58it/s]

{'loss': 0.2877, 'grad_norm': 2.92574143409729, 'learning_rate': 1.808724832214765e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48062/75000 [45:39<26:30, 16.94it/s]

{'loss': 0.2746, 'grad_norm': 1.6885476112365723, 'learning_rate': 1.8080536912751678e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48074/75000 [45:39<23:25, 19.16it/s]

{'loss': 0.2948, 'grad_norm': 2.397758722305298, 'learning_rate': 1.8073825503355707e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48082/75000 [45:40<24:56, 17.99it/s]

{'loss': 0.292, 'grad_norm': 1.0368021726608276, 'learning_rate': 1.8067114093959732e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48093/75000 [45:41<26:41, 16.81it/s]

{'loss': 0.3726, 'grad_norm': 4.277264595031738, 'learning_rate': 1.806040268456376e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48103/75000 [45:41<25:01, 17.91it/s]

{'loss': 0.3598, 'grad_norm': 2.1343421936035156, 'learning_rate': 1.8053691275167786e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48111/75000 [45:42<27:33, 16.27it/s]

{'loss': 0.3391, 'grad_norm': 1.8075608015060425, 'learning_rate': 1.8046979865771814e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48123/75000 [45:42<23:02, 19.44it/s]

{'loss': 0.2188, 'grad_norm': 5.0648884773254395, 'learning_rate': 1.804026845637584e-05, 'epoch': 1.92}


                                                     
 64%|██████▍   | 48132/75000 [45:43<24:44, 18.10it/s]

{'loss': 0.204, 'grad_norm': 1.40085768699646, 'learning_rate': 1.8033557046979864e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48141/75000 [45:43<26:42, 16.76it/s]

{'loss': 0.3203, 'grad_norm': 3.957021713256836, 'learning_rate': 1.8026845637583893e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48152/75000 [45:44<23:46, 18.82it/s]

{'loss': 0.5034, 'grad_norm': 3.613523006439209, 'learning_rate': 1.802013422818792e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48163/75000 [45:44<23:57, 18.67it/s]

{'loss': 0.211, 'grad_norm': 7.337508201599121, 'learning_rate': 1.8013422818791947e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48172/75000 [45:45<23:38, 18.91it/s]

{'loss': 0.3388, 'grad_norm': 3.7251076698303223, 'learning_rate': 1.8006711409395975e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48183/75000 [45:46<24:34, 18.18it/s]

{'loss': 0.3601, 'grad_norm': 8.506216049194336, 'learning_rate': 1.8e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48192/75000 [45:46<25:03, 17.83it/s]

{'loss': 0.2696, 'grad_norm': 8.435638427734375, 'learning_rate': 1.7993288590604026e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48204/75000 [45:47<23:07, 19.32it/s]

{'loss': 0.2068, 'grad_norm': 5.754323959350586, 'learning_rate': 1.7986577181208054e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48213/75000 [45:47<24:36, 18.14it/s]

{'loss': 0.2932, 'grad_norm': 2.928706645965576, 'learning_rate': 1.7979865771812083e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48222/75000 [45:48<24:42, 18.06it/s]

{'loss': 0.2047, 'grad_norm': 3.701251745223999, 'learning_rate': 1.7973154362416108e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48231/75000 [45:48<23:43, 18.81it/s]

{'loss': 0.371, 'grad_norm': 6.4243974685668945, 'learning_rate': 1.7966442953020136e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48244/75000 [45:49<23:15, 19.18it/s]

{'loss': 0.368, 'grad_norm': 5.1039252281188965, 'learning_rate': 1.795973154362416e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48253/75000 [45:49<25:30, 17.48it/s]

{'loss': 0.3888, 'grad_norm': 7.5082621574401855, 'learning_rate': 1.7953020134228187e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48261/75000 [45:50<23:38, 18.85it/s]

{'loss': 0.391, 'grad_norm': 2.0225958824157715, 'learning_rate': 1.7946308724832215e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48273/75000 [45:50<24:37, 18.09it/s]

{'loss': 0.2839, 'grad_norm': 1.9012380838394165, 'learning_rate': 1.7939597315436244e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48283/75000 [45:51<24:00, 18.55it/s]

{'loss': 0.3575, 'grad_norm': 3.6688125133514404, 'learning_rate': 1.793288590604027e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48292/75000 [45:51<25:29, 17.46it/s]

{'loss': 0.2649, 'grad_norm': 1.2635966539382935, 'learning_rate': 1.7926174496644298e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48304/75000 [45:52<23:18, 19.08it/s]

{'loss': 0.286, 'grad_norm': 2.013625383377075, 'learning_rate': 1.7919463087248323e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48311/75000 [45:52<23:29, 18.94it/s]

{'loss': 0.4082, 'grad_norm': 4.192857265472412, 'learning_rate': 1.7912751677852348e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48324/75000 [45:53<22:49, 19.48it/s]

{'loss': 0.3279, 'grad_norm': 5.30449914932251, 'learning_rate': 1.7906040268456376e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48332/75000 [45:54<23:57, 18.55it/s]

{'loss': 0.3597, 'grad_norm': 6.575654029846191, 'learning_rate': 1.7899328859060405e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48342/75000 [45:54<26:16, 16.91it/s]

{'loss': 0.2584, 'grad_norm': 3.873364210128784, 'learning_rate': 1.789261744966443e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48353/75000 [45:55<23:54, 18.58it/s]

{'loss': 0.473, 'grad_norm': 1.936867356300354, 'learning_rate': 1.788590604026846e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48362/75000 [45:55<23:58, 18.52it/s]

{'loss': 0.2262, 'grad_norm': 0.7977860569953918, 'learning_rate': 1.7879194630872484e-05, 'epoch': 1.93}


                                                     
 64%|██████▍   | 48373/75000 [45:56<25:24, 17.46it/s]

{'loss': 0.3294, 'grad_norm': 7.387853622436523, 'learning_rate': 1.787248322147651e-05, 'epoch': 1.93}


                                                     
 65%|██████▍   | 48382/75000 [45:56<24:10, 18.35it/s]

{'loss': 0.2101, 'grad_norm': 4.1636433601379395, 'learning_rate': 1.7865771812080538e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48392/75000 [45:57<27:12, 16.30it/s]

{'loss': 0.244, 'grad_norm': 2.9280080795288086, 'learning_rate': 1.7859060402684566e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48402/75000 [45:57<23:42, 18.70it/s]

{'loss': 0.369, 'grad_norm': 4.22894287109375, 'learning_rate': 1.785234899328859e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48411/75000 [45:58<27:01, 16.40it/s]

{'loss': 0.3503, 'grad_norm': 3.646470069885254, 'learning_rate': 1.784563758389262e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48422/75000 [45:59<26:18, 16.83it/s]

{'loss': 0.2966, 'grad_norm': 5.180667877197266, 'learning_rate': 1.7838926174496645e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48434/75000 [45:59<23:09, 19.12it/s]

{'loss': 0.2172, 'grad_norm': 1.2642529010772705, 'learning_rate': 1.783221476510067e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48443/75000 [46:00<25:28, 17.38it/s]

{'loss': 0.2737, 'grad_norm': 1.428033709526062, 'learning_rate': 1.78255033557047e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48452/75000 [46:00<26:38, 16.60it/s]

{'loss': 0.2497, 'grad_norm': 1.3933827877044678, 'learning_rate': 1.7818791946308724e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48461/75000 [46:01<24:22, 18.15it/s]

{'loss': 0.3129, 'grad_norm': 5.514493942260742, 'learning_rate': 1.7812080536912752e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48474/75000 [46:01<23:25, 18.87it/s]

{'loss': 0.3954, 'grad_norm': 2.6812217235565186, 'learning_rate': 1.780536912751678e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48482/75000 [46:02<25:44, 17.17it/s]

{'loss': 0.3703, 'grad_norm': 4.741772174835205, 'learning_rate': 1.7798657718120806e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48494/75000 [46:02<22:35, 19.55it/s]

{'loss': 0.2029, 'grad_norm': 3.3594958782196045, 'learning_rate': 1.779194630872483e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48500/75000 [46:03<25:07, 17.58it/s]

{'loss': 0.2403, 'grad_norm': 3.4519124031066895, 'learning_rate': 1.778523489932886e-05, 'epoch': 1.94}


                                                       
 65%|██████▍   | 48511/75000 [46:04<32:52, 13.43it/s]

{'loss': 0.3049, 'grad_norm': 2.3003897666931152, 'learning_rate': 1.7778523489932885e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48523/75000 [46:05<25:07, 17.56it/s]

{'loss': 0.3218, 'grad_norm': 2.0507068634033203, 'learning_rate': 1.7771812080536913e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48531/75000 [46:05<23:44, 18.59it/s]

{'loss': 0.3201, 'grad_norm': 2.0812337398529053, 'learning_rate': 1.7765100671140942e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48544/75000 [46:06<22:50, 19.30it/s]

{'loss': 0.3483, 'grad_norm': 4.505789279937744, 'learning_rate': 1.7758389261744967e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48551/75000 [46:06<25:07, 17.54it/s]

{'loss': 0.3449, 'grad_norm': 2.2067861557006836, 'learning_rate': 1.7751677852348996e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48563/75000 [46:07<23:57, 18.39it/s]

{'loss': 0.3547, 'grad_norm': 2.0075976848602295, 'learning_rate': 1.774496644295302e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48572/75000 [46:07<30:07, 14.62it/s]

{'loss': 0.4058, 'grad_norm': 1.8746752738952637, 'learning_rate': 1.7738255033557046e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48582/75000 [46:08<30:43, 14.33it/s]

{'loss': 0.358, 'grad_norm': 3.8064708709716797, 'learning_rate': 1.7731543624161075e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48592/75000 [46:09<32:14, 13.65it/s]

{'loss': 0.324, 'grad_norm': 2.0594847202301025, 'learning_rate': 1.7724832214765103e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48602/75000 [46:09<27:28, 16.02it/s]

{'loss': 0.3495, 'grad_norm': 10.515174865722656, 'learning_rate': 1.7718120805369128e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48612/75000 [46:10<24:53, 17.67it/s]

{'loss': 0.3089, 'grad_norm': 1.70014226436615, 'learning_rate': 1.7711409395973157e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48622/75000 [46:11<28:52, 15.23it/s]

{'loss': 0.2325, 'grad_norm': 0.9734629988670349, 'learning_rate': 1.7704697986577182e-05, 'epoch': 1.94}


                                                     
 65%|██████▍   | 48632/75000 [46:11<34:33, 12.72it/s]

{'loss': 0.3773, 'grad_norm': 4.893088340759277, 'learning_rate': 1.7697986577181207e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48642/75000 [46:12<32:49, 13.38it/s]

{'loss': 0.242, 'grad_norm': 1.6927729845046997, 'learning_rate': 1.7691275167785236e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48652/75000 [46:13<28:33, 15.38it/s]

{'loss': 0.2112, 'grad_norm': 9.625975608825684, 'learning_rate': 1.7684563758389264e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48662/75000 [46:13<28:43, 15.28it/s]

{'loss': 0.3452, 'grad_norm': 0.7510436773300171, 'learning_rate': 1.767785234899329e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48672/75000 [46:14<24:47, 17.70it/s]

{'loss': 0.2867, 'grad_norm': 3.247636556625366, 'learning_rate': 1.7671140939597318e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48682/75000 [46:14<24:51, 17.64it/s]

{'loss': 0.334, 'grad_norm': 5.428210258483887, 'learning_rate': 1.7664429530201343e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48692/75000 [46:15<29:04, 15.08it/s]

{'loss': 0.3192, 'grad_norm': 3.248166799545288, 'learning_rate': 1.7657718120805368e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48702/75000 [46:16<25:06, 17.46it/s]

{'loss': 0.3328, 'grad_norm': 3.948826551437378, 'learning_rate': 1.7651006711409397e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48712/75000 [46:16<27:06, 16.16it/s]

{'loss': 0.3207, 'grad_norm': 5.105745792388916, 'learning_rate': 1.7644295302013425e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48722/75000 [46:17<28:30, 15.36it/s]

{'loss': 0.258, 'grad_norm': 0.8623248338699341, 'learning_rate': 1.763758389261745e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48732/75000 [46:17<27:04, 16.17it/s]

{'loss': 0.3865, 'grad_norm': 7.762882232666016, 'learning_rate': 1.763087248322148e-05, 'epoch': 1.95}


                                                     
 65%|██████▍   | 48742/75000 [46:18<24:13, 18.07it/s]

{'loss': 0.3913, 'grad_norm': 3.5929033756256104, 'learning_rate': 1.7624161073825504e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48754/75000 [46:19<24:30, 17.85it/s]

{'loss': 0.3809, 'grad_norm': 6.354190349578857, 'learning_rate': 1.761744966442953e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48762/75000 [46:19<26:03, 16.78it/s]

{'loss': 0.2589, 'grad_norm': 2.6203274726867676, 'learning_rate': 1.7610738255033558e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48772/75000 [46:20<28:24, 15.38it/s]

{'loss': 0.3776, 'grad_norm': 2.6467764377593994, 'learning_rate': 1.7604026845637583e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48781/75000 [46:20<24:46, 17.64it/s]

{'loss': 0.3052, 'grad_norm': 3.090414524078369, 'learning_rate': 1.759731543624161e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48792/75000 [46:21<24:33, 17.79it/s]

{'loss': 0.4577, 'grad_norm': 3.912477970123291, 'learning_rate': 1.759060402684564e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48803/75000 [46:21<23:04, 18.92it/s]

{'loss': 0.4602, 'grad_norm': 2.921700954437256, 'learning_rate': 1.7583892617449665e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48811/75000 [46:22<24:12, 18.03it/s]

{'loss': 0.3605, 'grad_norm': 4.1312713623046875, 'learning_rate': 1.757718120805369e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48821/75000 [46:22<23:46, 18.35it/s]

{'loss': 0.2376, 'grad_norm': 1.7582652568817139, 'learning_rate': 1.757046979865772e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48832/75000 [46:23<26:31, 16.44it/s]

{'loss': 0.2897, 'grad_norm': 1.9204540252685547, 'learning_rate': 1.7563758389261744e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48842/75000 [46:24<27:27, 15.88it/s]

{'loss': 0.2295, 'grad_norm': 1.4784435033798218, 'learning_rate': 1.7557046979865773e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48852/75000 [46:24<23:17, 18.71it/s]

{'loss': 0.353, 'grad_norm': 5.074387073516846, 'learning_rate': 1.75503355704698e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48863/75000 [46:25<25:27, 17.11it/s]

{'loss': 0.3947, 'grad_norm': 5.641563415527344, 'learning_rate': 1.7543624161073826e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48871/75000 [46:25<27:45, 15.69it/s]

{'loss': 0.3548, 'grad_norm': 3.8935489654541016, 'learning_rate': 1.753691275167785e-05, 'epoch': 1.95}


                                                     
 65%|██████▌   | 48881/75000 [46:26<23:32, 18.49it/s]

{'loss': 0.3523, 'grad_norm': 2.568244457244873, 'learning_rate': 1.753020134228188e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48892/75000 [46:26<24:04, 18.08it/s]

{'loss': 0.3828, 'grad_norm': 10.662102699279785, 'learning_rate': 1.7523489932885905e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48903/75000 [46:27<25:24, 17.12it/s]

{'loss': 0.2413, 'grad_norm': 1.8952935934066772, 'learning_rate': 1.7516778523489934e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48912/75000 [46:28<27:36, 15.75it/s]

{'loss': 0.2149, 'grad_norm': 2.2545254230499268, 'learning_rate': 1.7510067114093962e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48922/75000 [46:28<26:37, 16.32it/s]

{'loss': 0.3869, 'grad_norm': 3.1352686882019043, 'learning_rate': 1.7503355704697988e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48932/75000 [46:29<23:31, 18.47it/s]

{'loss': 0.3514, 'grad_norm': 3.9589498043060303, 'learning_rate': 1.7496644295302013e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48942/75000 [46:29<25:39, 16.92it/s]

{'loss': 0.3004, 'grad_norm': 2.5395681858062744, 'learning_rate': 1.748993288590604e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48953/75000 [46:30<24:33, 17.67it/s]

{'loss': 0.36, 'grad_norm': 1.8471463918685913, 'learning_rate': 1.7483221476510066e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48964/75000 [46:30<22:27, 19.33it/s]

{'loss': 0.237, 'grad_norm': 4.080195426940918, 'learning_rate': 1.7476510067114095e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48972/75000 [46:31<24:28, 17.72it/s]

{'loss': 0.2879, 'grad_norm': 1.7685490846633911, 'learning_rate': 1.7469798657718124e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48981/75000 [46:31<25:26, 17.04it/s]

{'loss': 0.2673, 'grad_norm': 3.096658706665039, 'learning_rate': 1.746308724832215e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 48994/75000 [46:32<22:00, 19.70it/s]

{'loss': 0.3359, 'grad_norm': 5.120534420013428, 'learning_rate': 1.7456375838926177e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49000/75000 [46:32<24:44, 17.51it/s]

{'loss': 0.312, 'grad_norm': 1.77584707736969, 'learning_rate': 1.7449664429530202e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49014/75000 [46:34<27:16, 15.88it/s]

{'loss': 0.3223, 'grad_norm': 5.953812599182129, 'learning_rate': 1.7442953020134228e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49022/75000 [46:34<24:46, 17.47it/s]

{'loss': 0.3537, 'grad_norm': 0.7675098776817322, 'learning_rate': 1.7436241610738256e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49033/75000 [46:35<24:38, 17.56it/s]

{'loss': 0.3904, 'grad_norm': 2.253608465194702, 'learning_rate': 1.7429530201342285e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49043/75000 [46:35<26:11, 16.52it/s]

{'loss': 0.2508, 'grad_norm': 4.400769233703613, 'learning_rate': 1.742281879194631e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49052/75000 [46:36<26:04, 16.59it/s]

{'loss': 0.3317, 'grad_norm': 4.169053554534912, 'learning_rate': 1.741610738255034e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49063/75000 [46:37<25:59, 16.63it/s]

{'loss': 0.3099, 'grad_norm': 3.545802593231201, 'learning_rate': 1.7409395973154364e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49072/75000 [46:37<23:45, 18.19it/s]

{'loss': 0.2374, 'grad_norm': 13.359356880187988, 'learning_rate': 1.740268456375839e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49081/75000 [46:38<23:46, 18.18it/s]

{'loss': 0.352, 'grad_norm': 4.062107563018799, 'learning_rate': 1.7395973154362417e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49092/75000 [46:38<23:54, 18.06it/s]

{'loss': 0.2411, 'grad_norm': 1.3395940065383911, 'learning_rate': 1.7389261744966442e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49103/75000 [46:39<23:53, 18.07it/s]

{'loss': 0.378, 'grad_norm': 4.568789005279541, 'learning_rate': 1.738255033557047e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49112/75000 [46:39<23:18, 18.51it/s]

{'loss': 0.2351, 'grad_norm': 3.393869400024414, 'learning_rate': 1.73758389261745e-05, 'epoch': 1.96}


                                                     
 65%|██████▌   | 49122/75000 [46:40<25:12, 17.11it/s]

{'loss': 0.3697, 'grad_norm': 1.2307155132293701, 'learning_rate': 1.7369127516778525e-05, 'epoch': 1.96}


                                                     
 66%|██████▌   | 49133/75000 [46:41<25:54, 16.64it/s]

{'loss': 0.3138, 'grad_norm': 5.498104095458984, 'learning_rate': 1.736241610738255e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49143/75000 [46:41<23:26, 18.39it/s]

{'loss': 0.3559, 'grad_norm': 3.454684257507324, 'learning_rate': 1.735570469798658e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49153/75000 [46:42<25:37, 16.81it/s]

{'loss': 0.3118, 'grad_norm': 10.9945650100708, 'learning_rate': 1.7348993288590604e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49163/75000 [46:42<28:25, 15.15it/s]

{'loss': 0.2329, 'grad_norm': 1.7876662015914917, 'learning_rate': 1.7342281879194632e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49171/75000 [46:43<27:12, 15.82it/s]

{'loss': 0.3219, 'grad_norm': 2.838987350463867, 'learning_rate': 1.733557046979866e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49182/75000 [46:43<25:36, 16.80it/s]

{'loss': 0.3801, 'grad_norm': 3.3566482067108154, 'learning_rate': 1.7328859060402686e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49191/75000 [46:44<24:54, 17.27it/s]

{'loss': 0.2798, 'grad_norm': 3.7684881687164307, 'learning_rate': 1.732214765100671e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49203/75000 [46:45<24:15, 17.73it/s]

{'loss': 0.193, 'grad_norm': 1.6993253231048584, 'learning_rate': 1.731543624161074e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49212/75000 [46:45<25:51, 16.62it/s]

{'loss': 0.3572, 'grad_norm': 6.375967502593994, 'learning_rate': 1.7308724832214765e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49224/75000 [46:46<24:28, 17.55it/s]

{'loss': 0.3758, 'grad_norm': 3.522881031036377, 'learning_rate': 1.7302013422818793e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49233/75000 [46:46<23:52, 17.99it/s]

{'loss': 0.2524, 'grad_norm': 1.7090078592300415, 'learning_rate': 1.7295302013422822e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49243/75000 [46:47<24:11, 17.74it/s]

{'loss': 0.2499, 'grad_norm': 3.991605043411255, 'learning_rate': 1.7288590604026847e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49252/75000 [46:47<24:44, 17.35it/s]

{'loss': 0.3318, 'grad_norm': 5.358124732971191, 'learning_rate': 1.7281879194630872e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49262/75000 [46:48<23:39, 18.14it/s]

{'loss': 0.3209, 'grad_norm': 1.3475440740585327, 'learning_rate': 1.72751677852349e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49274/75000 [46:49<25:02, 17.12it/s]

{'loss': 0.3141, 'grad_norm': 4.148561477661133, 'learning_rate': 1.7268456375838926e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49282/75000 [46:49<24:12, 17.71it/s]

{'loss': 0.398, 'grad_norm': 1.9843509197235107, 'learning_rate': 1.726174496644295e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49292/75000 [46:50<23:32, 18.21it/s]

{'loss': 0.3181, 'grad_norm': 5.680272102355957, 'learning_rate': 1.7255033557046983e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49302/75000 [46:50<26:20, 16.26it/s]

{'loss': 0.2957, 'grad_norm': 5.915193557739258, 'learning_rate': 1.7248322147651008e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49312/75000 [46:51<24:45, 17.29it/s]

{'loss': 0.3271, 'grad_norm': 5.533250331878662, 'learning_rate': 1.7241610738255033e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49323/75000 [46:52<24:33, 17.43it/s]

{'loss': 0.2893, 'grad_norm': 5.2629570960998535, 'learning_rate': 1.7234899328859062e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49333/75000 [46:52<25:57, 16.48it/s]

{'loss': 0.2937, 'grad_norm': 5.0441083908081055, 'learning_rate': 1.7228187919463087e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49343/75000 [46:53<24:13, 17.65it/s]

{'loss': 0.3396, 'grad_norm': 3.1379871368408203, 'learning_rate': 1.7221476510067115e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49353/75000 [46:53<27:57, 15.29it/s]

{'loss': 0.2213, 'grad_norm': 2.661086320877075, 'learning_rate': 1.7214765100671144e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49361/75000 [46:54<29:13, 14.62it/s]

{'loss': 0.2827, 'grad_norm': 0.3632069230079651, 'learning_rate': 1.720805369127517e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49371/75000 [46:55<30:03, 14.21it/s]

{'loss': 0.3954, 'grad_norm': 2.115285634994507, 'learning_rate': 1.7201342281879194e-05, 'epoch': 1.97}


                                                     
 66%|██████▌   | 49382/75000 [46:55<24:30, 17.42it/s]

{'loss': 0.304, 'grad_norm': 9.023850440979004, 'learning_rate': 1.7194630872483223e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49392/75000 [46:56<33:52, 12.60it/s]

{'loss': 0.4589, 'grad_norm': 3.2068846225738525, 'learning_rate': 1.7187919463087248e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49403/75000 [46:57<27:01, 15.79it/s]

{'loss': 0.3042, 'grad_norm': 4.14398717880249, 'learning_rate': 1.7181208053691277e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49413/75000 [46:57<24:37, 17.31it/s]

{'loss': 0.3348, 'grad_norm': 6.264852523803711, 'learning_rate': 1.71744966442953e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49421/75000 [46:58<32:03, 13.30it/s]

{'loss': 0.2693, 'grad_norm': 5.626492977142334, 'learning_rate': 1.716778523489933e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49433/75000 [46:59<28:11, 15.11it/s]

{'loss': 0.1945, 'grad_norm': 17.411373138427734, 'learning_rate': 1.7161073825503355e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49443/75000 [46:59<26:11, 16.26it/s]

{'loss': 0.3626, 'grad_norm': 5.918603897094727, 'learning_rate': 1.7154362416107384e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49453/75000 [47:00<24:24, 17.45it/s]

{'loss': 0.2967, 'grad_norm': 1.5651661157608032, 'learning_rate': 1.714765100671141e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49463/75000 [47:01<24:28, 17.38it/s]

{'loss': 0.2889, 'grad_norm': 1.2839734554290771, 'learning_rate': 1.7140939597315438e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49472/75000 [47:01<25:26, 16.72it/s]

{'loss': 0.2547, 'grad_norm': 11.618749618530273, 'learning_rate': 1.7134228187919463e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49482/75000 [47:02<29:05, 14.62it/s]

{'loss': 0.3416, 'grad_norm': 2.176267147064209, 'learning_rate': 1.712751677852349e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49492/75000 [47:02<27:07, 15.67it/s]

{'loss': 0.2795, 'grad_norm': 2.9773733615875244, 'learning_rate': 1.712080536912752e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49500/75000 [47:03<28:29, 14.91it/s]

{'loss': 0.2328, 'grad_norm': 4.513580322265625, 'learning_rate': 1.7114093959731545e-05, 'epoch': 1.98}


                                                       
 66%|██████▌   | 49512/75000 [47:08<1:22:11,  5.17it/s]

{'loss': 0.2621, 'grad_norm': 14.868976593017578, 'learning_rate': 1.710738255033557e-05, 'epoch': 1.98}


                                                       
 66%|██████▌   | 49523/75000 [47:09<36:44, 11.55it/s]

{'loss': 0.2541, 'grad_norm': 2.617314577102661, 'learning_rate': 1.71006711409396e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49533/75000 [47:09<25:48, 16.45it/s]

{'loss': 0.2531, 'grad_norm': 9.425962448120117, 'learning_rate': 1.7093959731543624e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49543/75000 [47:10<30:47, 13.78it/s]

{'loss': 0.3852, 'grad_norm': 4.433588027954102, 'learning_rate': 1.7087248322147652e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49551/75000 [47:11<29:21, 14.45it/s]

{'loss': 0.3184, 'grad_norm': 1.5639867782592773, 'learning_rate': 1.708053691275168e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49561/75000 [47:11<32:31, 13.04it/s]

{'loss': 0.2312, 'grad_norm': 16.54662322998047, 'learning_rate': 1.7073825503355706e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49571/75000 [47:12<36:40, 11.55it/s]

{'loss': 0.2662, 'grad_norm': 3.1874728202819824, 'learning_rate': 1.706711409395973e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49580/75000 [47:13<35:18, 12.00it/s]

{'loss': 0.2633, 'grad_norm': 4.76104211807251, 'learning_rate': 1.706040268456376e-05, 'epoch': 1.98}


                                                       
 66%|██████▌   | 49593/75000 [47:14<32:09, 13.16it/s]

{'loss': 0.3609, 'grad_norm': 1.4728590250015259, 'learning_rate': 1.7053691275167785e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49601/75000 [47:15<31:04, 13.62it/s]

{'loss': 0.2168, 'grad_norm': 1.6316536664962769, 'learning_rate': 1.704697986577181e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49611/75000 [47:16<44:21,  9.54it/s]

{'loss': 0.3491, 'grad_norm': 2.353212594985962, 'learning_rate': 1.7040268456375842e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49621/75000 [47:17<38:47, 10.90it/s]

{'loss': 0.4188, 'grad_norm': 2.3622212409973145, 'learning_rate': 1.7033557046979867e-05, 'epoch': 1.98}


                                                     
 66%|██████▌   | 49633/75000 [47:18<33:32, 12.60it/s]

{'loss': 0.2946, 'grad_norm': 8.226664543151855, 'learning_rate': 1.7026845637583892e-05, 'epoch': 1.99}


                                                     
 66%|██████▌   | 49643/75000 [47:18<25:51, 16.35it/s]

{'loss': 0.2471, 'grad_norm': 1.7957431077957153, 'learning_rate': 1.702013422818792e-05, 'epoch': 1.99}


                                                     
 66%|██████▌   | 49651/75000 [47:19<29:59, 14.09it/s]

{'loss': 0.2047, 'grad_norm': 3.4520456790924072, 'learning_rate': 1.7013422818791946e-05, 'epoch': 1.99}


                                                     
 66%|██████▌   | 49661/75000 [47:20<40:03, 10.54it/s]

{'loss': 0.1784, 'grad_norm': 9.344317436218262, 'learning_rate': 1.700671140939597e-05, 'epoch': 1.99}


                                                     
 66%|██████▌   | 49672/75000 [47:21<36:56, 11.43it/s]

{'loss': 0.2305, 'grad_norm': 2.2431702613830566, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.99}


                                                     
 66%|██████▌   | 49682/75000 [47:22<34:10, 12.35it/s]

{'loss': 0.3299, 'grad_norm': 1.5597118139266968, 'learning_rate': 1.699328859060403e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49690/75000 [47:23<37:42, 11.19it/s]

{'loss': 0.3063, 'grad_norm': 5.386175632476807, 'learning_rate': 1.6986577181208054e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49702/75000 [47:24<34:23, 12.26it/s]

{'loss': 0.2988, 'grad_norm': 1.5184863805770874, 'learning_rate': 1.6979865771812082e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49712/75000 [47:24<30:40, 13.74it/s]

{'loss': 0.2543, 'grad_norm': 4.600366115570068, 'learning_rate': 1.6973154362416107e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49721/75000 [47:25<50:43,  8.31it/s]

{'loss': 0.3247, 'grad_norm': 2.7922027111053467, 'learning_rate': 1.6966442953020132e-05, 'epoch': 1.99}


                                                       
 66%|██████▋   | 49732/75000 [47:27<39:05, 10.77it/s]

{'loss': 0.3203, 'grad_norm': 1.4962574243545532, 'learning_rate': 1.695973154362416e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49742/75000 [47:27<27:35, 15.26it/s]

{'loss': 0.3376, 'grad_norm': 3.1509647369384766, 'learning_rate': 1.695302013422819e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49752/75000 [47:28<26:24, 15.94it/s]

{'loss': 0.2052, 'grad_norm': 1.9164412021636963, 'learning_rate': 1.6946308724832215e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49760/75000 [47:29<28:28, 14.78it/s]

{'loss': 0.2492, 'grad_norm': 1.3699742555618286, 'learning_rate': 1.6939597315436243e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49772/75000 [47:30<43:56,  9.57it/s]

{'loss': 0.3103, 'grad_norm': 3.427626848220825, 'learning_rate': 1.693288590604027e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49782/75000 [47:31<31:12, 13.47it/s]

{'loss': 0.2678, 'grad_norm': 7.485470771789551, 'learning_rate': 1.6926174496644297e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49792/75000 [47:32<42:27,  9.90it/s]

{'loss': 0.2629, 'grad_norm': 3.181623935699463, 'learning_rate': 1.6919463087248322e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49802/75000 [47:32<30:52, 13.60it/s]

{'loss': 0.2483, 'grad_norm': 0.8810925483703613, 'learning_rate': 1.691275167785235e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49812/75000 [47:33<39:06, 10.74it/s]

{'loss': 0.3081, 'grad_norm': 6.998549461364746, 'learning_rate': 1.6906040268456376e-05, 'epoch': 1.99}


                                                       
 66%|██████▋   | 49822/75000 [47:34<39:07, 10.72it/s]

{'loss': 0.2665, 'grad_norm': 3.173273801803589, 'learning_rate': 1.6899328859060404e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49832/75000 [47:35<29:38, 14.15it/s]

{'loss': 0.2664, 'grad_norm': 2.932271957397461, 'learning_rate': 1.689261744966443e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49840/75000 [47:36<30:55, 13.56it/s]

{'loss': 0.3855, 'grad_norm': 0.962471067905426, 'learning_rate': 1.6885906040268458e-05, 'epoch': 1.99}


                                                       
 66%|██████▋   | 49851/75000 [47:37<38:53, 10.78it/s]

{'loss': 0.1943, 'grad_norm': 1.5170507431030273, 'learning_rate': 1.6879194630872483e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49861/75000 [47:38<30:43, 13.64it/s]

{'loss': 0.2243, 'grad_norm': 2.043583869934082, 'learning_rate': 1.6872483221476512e-05, 'epoch': 1.99}


                                                     
 66%|██████▋   | 49871/75000 [47:39<34:46, 12.04it/s]

{'loss': 0.3166, 'grad_norm': 2.096681833267212, 'learning_rate': 1.6865771812080537e-05, 'epoch': 1.99}


                                                       
 67%|██████▋   | 49881/75000 [47:40<59:09,  7.08it/s]  

{'loss': 0.4258, 'grad_norm': 2.744673252105713, 'learning_rate': 1.6859060402684565e-05, 'epoch': 2.0}


                                                       
 67%|██████▋   | 49891/75000 [47:41<37:34, 11.13it/s]

{'loss': 0.3772, 'grad_norm': 3.357322931289673, 'learning_rate': 1.685234899328859e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 49901/75000 [47:42<56:32,  7.40it/s]

{'loss': 0.2639, 'grad_norm': 2.6812431812286377, 'learning_rate': 1.684563758389262e-05, 'epoch': 2.0}


                                                       
 67%|██████▋   | 49913/75000 [47:43<35:41, 11.72it/s]

{'loss': 0.3845, 'grad_norm': 1.16580069065094, 'learning_rate': 1.6838926174496644e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 49921/75000 [47:44<51:17,  8.15it/s]

{'loss': 0.3076, 'grad_norm': 5.757433891296387, 'learning_rate': 1.683221476510067e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 49931/75000 [47:45<34:10, 12.22it/s]

{'loss': 0.3957, 'grad_norm': 4.001480579376221, 'learning_rate': 1.68255033557047e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 49943/75000 [47:46<31:06, 13.42it/s]

{'loss': 0.2761, 'grad_norm': 1.8030922412872314, 'learning_rate': 1.6818791946308727e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 49951/75000 [47:47<30:10, 13.84it/s]

{'loss': 0.3008, 'grad_norm': 3.420297384262085, 'learning_rate': 1.6812080536912752e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 49961/75000 [47:48<33:03, 12.62it/s]

{'loss': 0.307, 'grad_norm': 7.812896728515625, 'learning_rate': 1.680536912751678e-05, 'epoch': 2.0}


                                                       
 67%|██████▋   | 49971/75000 [47:49<1:11:50,  5.81it/s]

{'loss': 0.1994, 'grad_norm': 2.327993631362915, 'learning_rate': 1.6798657718120805e-05, 'epoch': 2.0}


                                                       
 67%|██████▋   | 49981/75000 [47:50<44:51,  9.29it/s]

{'loss': 0.3668, 'grad_norm': 3.9257748126983643, 'learning_rate': 1.679194630872483e-05, 'epoch': 2.0}


                                                       
 67%|██████▋   | 49992/75000 [47:51<38:39, 10.78it/s]

{'loss': 0.4158, 'grad_norm': 4.99882698059082, 'learning_rate': 1.6785234899328863e-05, 'epoch': 2.0}


                                                       
 67%|██████▋   | 50000/75000 [47:55<3:47:32,  1.83it/s]

{'loss': 0.3386, 'grad_norm': 4.641327857971191, 'learning_rate': 1.6778523489932888e-05, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

{'eval_loss': 0.3203533887863159, 'eval_runtime': 69.5834, 'eval_samples_per_second': 1437.124, 'eval_steps_per_second': 22.462, 'epoch': 2.0}


                                                        
 67%|██████▋   | 50011/75000 [49:06<13:36:05,  1.96s/it]

{'loss': 0.3088, 'grad_norm': 3.1670775413513184, 'learning_rate': 1.6771812080536913e-05, 'epoch': 2.0}


                                                        
 67%|██████▋   | 50021/75000 [49:07<2:38:03,  2.63it/s]

{'loss': 0.3025, 'grad_norm': 3.840909481048584, 'learning_rate': 1.676510067114094e-05, 'epoch': 2.0}


                                                       
 67%|██████▋   | 50032/75000 [49:08<51:20,  8.11it/s]

{'loss': 0.2547, 'grad_norm': 4.022068500518799, 'learning_rate': 1.6758389261744967e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50042/75000 [49:08<30:26, 13.66it/s]

{'loss': 0.3008, 'grad_norm': 3.4650542736053467, 'learning_rate': 1.6751677852348992e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50053/75000 [49:09<24:53, 16.70it/s]

{'loss': 0.23, 'grad_norm': 3.8866875171661377, 'learning_rate': 1.6744966442953024e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50061/75000 [49:09<26:39, 15.59it/s]

{'loss': 0.2849, 'grad_norm': 4.494259834289551, 'learning_rate': 1.673825503355705e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50073/75000 [49:10<28:44, 14.45it/s]

{'loss': 0.2542, 'grad_norm': 1.8485791683197021, 'learning_rate': 1.6731543624161074e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50081/75000 [49:11<30:24, 13.66it/s]

{'loss': 0.3497, 'grad_norm': 1.515212059020996, 'learning_rate': 1.6724832214765103e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50093/75000 [49:12<26:13, 15.83it/s]

{'loss': 0.2247, 'grad_norm': 3.9428441524505615, 'learning_rate': 1.6718120805369128e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50103/75000 [49:12<24:27, 16.97it/s]

{'loss': 0.2769, 'grad_norm': 0.5633134245872498, 'learning_rate': 1.6711409395973153e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50113/75000 [49:13<23:47, 17.43it/s]

{'loss': 0.3855, 'grad_norm': 6.582859992980957, 'learning_rate': 1.670469798657718e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50123/75000 [49:13<27:13, 15.23it/s]

{'loss': 0.1824, 'grad_norm': 1.6518076658248901, 'learning_rate': 1.669798657718121e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 50133/75000 [49:14<23:36, 17.55it/s]

{'loss': 0.3373, 'grad_norm': 2.3260903358459473, 'learning_rate': 1.6691275167785235e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50143/75000 [49:15<23:53, 17.33it/s]

{'loss': 0.2477, 'grad_norm': 2.132547378540039, 'learning_rate': 1.6684563758389264e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50152/75000 [49:15<23:43, 17.46it/s]

{'loss': 0.2904, 'grad_norm': 2.004852056503296, 'learning_rate': 1.667785234899329e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50162/75000 [49:16<27:44, 14.92it/s]

{'loss': 0.3268, 'grad_norm': 17.533559799194336, 'learning_rate': 1.6671140939597314e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50171/75000 [49:16<25:08, 16.45it/s]

{'loss': 0.3323, 'grad_norm': 6.635251998901367, 'learning_rate': 1.6664429530201343e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50184/75000 [49:17<22:01, 18.78it/s]

{'loss': 0.2664, 'grad_norm': 6.074926853179932, 'learning_rate': 1.665771812080537e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50192/75000 [49:17<24:26, 16.92it/s]

{'loss': 0.3021, 'grad_norm': 2.8231589794158936, 'learning_rate': 1.6651006711409396e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50202/75000 [49:18<23:48, 17.35it/s]

{'loss': 0.2705, 'grad_norm': 4.719163417816162, 'learning_rate': 1.6644295302013425e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50212/75000 [49:19<24:07, 17.12it/s]

{'loss': 0.4402, 'grad_norm': 4.618155479431152, 'learning_rate': 1.663758389261745e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50222/75000 [49:19<25:27, 16.22it/s]

{'loss': 0.2704, 'grad_norm': 2.4439401626586914, 'learning_rate': 1.663087248322148e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50232/75000 [49:20<28:05, 14.70it/s]

{'loss': 0.315, 'grad_norm': 13.795235633850098, 'learning_rate': 1.6624161073825504e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50242/75000 [49:20<25:17, 16.32it/s]

{'loss': 0.2295, 'grad_norm': 1.218342900276184, 'learning_rate': 1.661744966442953e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50252/75000 [49:21<26:31, 15.55it/s]

{'loss': 0.2855, 'grad_norm': 1.2060978412628174, 'learning_rate': 1.6610738255033557e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50262/75000 [49:22<25:11, 16.37it/s]

{'loss': 0.1992, 'grad_norm': 7.276684761047363, 'learning_rate': 1.6604026845637586e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50272/75000 [49:22<27:35, 14.94it/s]

{'loss': 0.3345, 'grad_norm': 5.181431770324707, 'learning_rate': 1.659731543624161e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50282/75000 [49:23<26:46, 15.39it/s]

{'loss': 0.3972, 'grad_norm': 1.734584927558899, 'learning_rate': 1.659060402684564e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50293/75000 [49:23<23:29, 17.53it/s]

{'loss': 0.4062, 'grad_norm': 3.4891366958618164, 'learning_rate': 1.6583892617449665e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50303/75000 [49:24<24:51, 16.56it/s]

{'loss': 0.2984, 'grad_norm': 2.603785991668701, 'learning_rate': 1.657718120805369e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50313/75000 [49:25<25:24, 16.19it/s]

{'loss': 0.345, 'grad_norm': 4.148930072784424, 'learning_rate': 1.657046979865772e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50321/75000 [49:25<25:05, 16.39it/s]

{'loss': 0.3149, 'grad_norm': 6.586978435516357, 'learning_rate': 1.6563758389261747e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50334/75000 [49:26<22:42, 18.10it/s]

{'loss': 0.4033, 'grad_norm': 5.11602258682251, 'learning_rate': 1.6557046979865772e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50343/75000 [49:26<24:11, 16.99it/s]

{'loss': 0.3791, 'grad_norm': 2.9907467365264893, 'learning_rate': 1.65503355704698e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50351/75000 [49:27<22:54, 17.93it/s]

{'loss': 0.4009, 'grad_norm': 5.492851257324219, 'learning_rate': 1.6543624161073826e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50363/75000 [49:28<23:02, 17.82it/s]

{'loss': 0.2873, 'grad_norm': 12.987896919250488, 'learning_rate': 1.653691275167785e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50372/75000 [49:28<25:16, 16.24it/s]

{'loss': 0.2402, 'grad_norm': 2.751744508743286, 'learning_rate': 1.6530201342281883e-05, 'epoch': 2.01}


                                                     
 67%|██████▋   | 50382/75000 [49:29<24:47, 16.55it/s]

{'loss': 0.3483, 'grad_norm': 1.659475564956665, 'learning_rate': 1.6523489932885908e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50392/75000 [49:29<23:56, 17.13it/s]

{'loss': 0.3863, 'grad_norm': 2.9106693267822266, 'learning_rate': 1.6516778523489933e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50403/75000 [49:30<22:52, 17.93it/s]

{'loss': 0.3791, 'grad_norm': 6.004096984863281, 'learning_rate': 1.6510067114093962e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50413/75000 [49:30<22:24, 18.28it/s]

{'loss': 0.3109, 'grad_norm': 0.8008826971054077, 'learning_rate': 1.6503355704697987e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50422/75000 [49:31<23:22, 17.52it/s]

{'loss': 0.3627, 'grad_norm': 4.532844543457031, 'learning_rate': 1.6496644295302012e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50432/75000 [49:31<22:08, 18.49it/s]

{'loss': 0.2357, 'grad_norm': 1.4427378177642822, 'learning_rate': 1.648993288590604e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50442/75000 [49:32<23:01, 17.77it/s]

{'loss': 0.279, 'grad_norm': 0.9312718510627747, 'learning_rate': 1.648322147651007e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50452/75000 [49:33<25:05, 16.31it/s]

{'loss': 0.3311, 'grad_norm': 2.310925245285034, 'learning_rate': 1.6476510067114094e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50461/75000 [49:33<24:34, 16.64it/s]

{'loss': 0.4214, 'grad_norm': 1.7083195447921753, 'learning_rate': 1.6469798657718123e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50473/75000 [49:34<21:33, 18.96it/s]

{'loss': 0.3445, 'grad_norm': 2.5801444053649902, 'learning_rate': 1.6463087248322148e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50481/75000 [49:34<25:00, 16.34it/s]

{'loss': 0.3742, 'grad_norm': 8.682577133178711, 'learning_rate': 1.6456375838926173e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50494/75000 [49:35<21:33, 18.94it/s]

{'loss': 0.2565, 'grad_norm': 2.5194754600524902, 'learning_rate': 1.6449664429530202e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50500/75000 [49:35<24:21, 16.76it/s]

{'loss': 0.3424, 'grad_norm': 4.879598617553711, 'learning_rate': 1.644295302013423e-05, 'epoch': 2.02}


                                                       
 67%|██████▋   | 50513/75000 [49:37<30:13, 13.50it/s]

{'loss': 0.2619, 'grad_norm': 1.048440933227539, 'learning_rate': 1.6436241610738256e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50522/75000 [49:37<26:31, 15.38it/s]

{'loss': 0.3866, 'grad_norm': 4.372849464416504, 'learning_rate': 1.6429530201342284e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50532/75000 [49:38<26:27, 15.41it/s]

{'loss': 0.3102, 'grad_norm': 1.2233797311782837, 'learning_rate': 1.642281879194631e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50542/75000 [49:39<23:44, 17.16it/s]

{'loss': 0.2751, 'grad_norm': 4.907714366912842, 'learning_rate': 1.6416107382550334e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50552/75000 [49:39<25:35, 15.92it/s]

{'loss': 0.4068, 'grad_norm': 6.894245147705078, 'learning_rate': 1.6409395973154363e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50562/75000 [49:40<23:59, 16.98it/s]

{'loss': 0.2348, 'grad_norm': 10.769987106323242, 'learning_rate': 1.6402684563758388e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50572/75000 [49:40<23:40, 17.20it/s]

{'loss': 0.2571, 'grad_norm': 0.6152070164680481, 'learning_rate': 1.6395973154362417e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50582/75000 [49:41<22:49, 17.83it/s]

{'loss': 0.3295, 'grad_norm': 5.376821517944336, 'learning_rate': 1.6389261744966445e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50592/75000 [49:42<22:51, 17.80it/s]

{'loss': 0.2322, 'grad_norm': 0.4685664176940918, 'learning_rate': 1.638255033557047e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50602/75000 [49:42<25:23, 16.01it/s]

{'loss': 0.3654, 'grad_norm': 8.700148582458496, 'learning_rate': 1.6375838926174496e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50612/75000 [49:43<24:04, 16.88it/s]

{'loss': 0.465, 'grad_norm': 4.472904205322266, 'learning_rate': 1.6369127516778524e-05, 'epoch': 2.02}


                                                     
 67%|██████▋   | 50622/75000 [49:43<26:18, 15.44it/s]

{'loss': 0.3257, 'grad_norm': 1.0972703695297241, 'learning_rate': 1.636241610738255e-05, 'epoch': 2.02}


                                                     
 68%|██████▊   | 50632/75000 [49:44<28:20, 14.33it/s]

{'loss': 0.2509, 'grad_norm': 4.500035285949707, 'learning_rate': 1.6355704697986578e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50642/75000 [49:45<33:27, 12.13it/s]

{'loss': 0.2754, 'grad_norm': 2.2990872859954834, 'learning_rate': 1.6348993288590606e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50652/75000 [49:46<31:16, 12.98it/s]

{'loss': 0.2858, 'grad_norm': 2.679880380630493, 'learning_rate': 1.634228187919463e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50662/75000 [49:46<27:07, 14.96it/s]

{'loss': 0.3906, 'grad_norm': 1.6283767223358154, 'learning_rate': 1.633557046979866e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50672/75000 [49:47<22:32, 17.99it/s]

{'loss': 0.3341, 'grad_norm': 3.262720823287964, 'learning_rate': 1.6328859060402685e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50683/75000 [49:47<21:52, 18.53it/s]

{'loss': 0.3695, 'grad_norm': 8.596745491027832, 'learning_rate': 1.632214765100671e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50691/75000 [49:48<22:03, 18.36it/s]

{'loss': 0.3947, 'grad_norm': 9.307564735412598, 'learning_rate': 1.631543624161074e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50703/75000 [49:48<21:45, 18.60it/s]

{'loss': 0.267, 'grad_norm': 0.6853147149085999, 'learning_rate': 1.6308724832214767e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50711/75000 [49:49<25:07, 16.12it/s]

{'loss': 0.3514, 'grad_norm': 1.3813114166259766, 'learning_rate': 1.6302013422818793e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50721/75000 [49:50<28:25, 14.24it/s]

{'loss': 0.3089, 'grad_norm': 1.209771752357483, 'learning_rate': 1.629530201342282e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50732/75000 [49:50<24:22, 16.60it/s]

{'loss': 0.3412, 'grad_norm': 1.108593463897705, 'learning_rate': 1.6288590604026846e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50742/75000 [49:51<22:51, 17.69it/s]

{'loss': 0.3243, 'grad_norm': 1.2799261808395386, 'learning_rate': 1.628187919463087e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50753/75000 [49:51<23:44, 17.02it/s]

{'loss': 0.2888, 'grad_norm': 2.4921391010284424, 'learning_rate': 1.62751677852349e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50763/75000 [49:52<21:48, 18.52it/s]

{'loss': 0.353, 'grad_norm': 1.3614763021469116, 'learning_rate': 1.626845637583893e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50771/75000 [49:52<23:10, 17.42it/s]

{'loss': 0.2006, 'grad_norm': 10.103172302246094, 'learning_rate': 1.6261744966442954e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50782/75000 [49:53<21:59, 18.36it/s]

{'loss': 0.2629, 'grad_norm': 4.502359867095947, 'learning_rate': 1.6255033557046982e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50793/75000 [49:54<21:23, 18.86it/s]

{'loss': 0.2669, 'grad_norm': 1.5549755096435547, 'learning_rate': 1.6248322147651007e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50802/75000 [49:54<21:20, 18.89it/s]

{'loss': 0.3523, 'grad_norm': 1.3628063201904297, 'learning_rate': 1.6241610738255033e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50814/75000 [49:55<20:35, 19.58it/s]

{'loss': 0.3597, 'grad_norm': 1.7880932092666626, 'learning_rate': 1.623489932885906e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50822/75000 [49:55<24:56, 16.16it/s]

{'loss': 0.4156, 'grad_norm': 2.50785231590271, 'learning_rate': 1.622818791946309e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50832/75000 [49:56<22:39, 17.77it/s]

{'loss': 0.2038, 'grad_norm': 11.09958553314209, 'learning_rate': 1.6221476510067115e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50843/75000 [49:56<21:53, 18.39it/s]

{'loss': 0.2973, 'grad_norm': 7.548918724060059, 'learning_rate': 1.6214765100671143e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50853/75000 [49:57<25:29, 15.79it/s]

{'loss': 0.2243, 'grad_norm': 1.977665662765503, 'learning_rate': 1.620805369127517e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50862/75000 [49:57<23:19, 17.25it/s]

{'loss': 0.3267, 'grad_norm': 2.581021547317505, 'learning_rate': 1.6201342281879194e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50874/75000 [49:58<20:49, 19.30it/s]

{'loss': 0.3084, 'grad_norm': 3.8757426738739014, 'learning_rate': 1.6194630872483222e-05, 'epoch': 2.03}


                                                     
 68%|██████▊   | 50882/75000 [49:59<23:18, 17.25it/s]

{'loss': 0.2944, 'grad_norm': 1.2586561441421509, 'learning_rate': 1.6187919463087247e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50892/75000 [49:59<22:59, 17.48it/s]

{'loss': 0.3599, 'grad_norm': 2.9369125366210938, 'learning_rate': 1.6181208053691276e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50903/75000 [50:00<22:06, 18.17it/s]

{'loss': 0.3723, 'grad_norm': 1.6292049884796143, 'learning_rate': 1.6174496644295304e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50913/75000 [50:00<22:13, 18.07it/s]

{'loss': 0.298, 'grad_norm': 3.6614482402801514, 'learning_rate': 1.616778523489933e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50923/75000 [50:01<22:44, 17.64it/s]

{'loss': 0.2981, 'grad_norm': 6.4923624992370605, 'learning_rate': 1.6161073825503355e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50934/75000 [50:01<21:24, 18.74it/s]

{'loss': 0.2592, 'grad_norm': 0.46300432085990906, 'learning_rate': 1.6154362416107383e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50942/75000 [50:02<21:23, 18.75it/s]

{'loss': 0.2837, 'grad_norm': 1.5965814590454102, 'learning_rate': 1.614765100671141e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50953/75000 [50:02<21:46, 18.41it/s]

{'loss': 0.3024, 'grad_norm': 7.967065811157227, 'learning_rate': 1.6140939597315437e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50963/75000 [50:03<21:29, 18.64it/s]

{'loss': 0.3242, 'grad_norm': 2.2483878135681152, 'learning_rate': 1.6134228187919466e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50972/75000 [50:03<23:14, 17.23it/s]

{'loss': 0.3374, 'grad_norm': 4.034577369689941, 'learning_rate': 1.612751677852349e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50981/75000 [50:04<21:48, 18.35it/s]

{'loss': 0.2695, 'grad_norm': 5.922946929931641, 'learning_rate': 1.6120805369127516e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 50992/75000 [50:04<20:30, 19.51it/s]

{'loss': 0.314, 'grad_norm': 4.339809894561768, 'learning_rate': 1.6114093959731544e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51000/75000 [50:05<20:47, 19.24it/s]

{'loss': 0.3273, 'grad_norm': 3.7677292823791504, 'learning_rate': 1.610738255033557e-05, 'epoch': 2.04}


                                                       
 68%|██████▊   | 51014/75000 [50:06<25:39, 15.58it/s]

{'loss': 0.3247, 'grad_norm': 1.505878210067749, 'learning_rate': 1.6100671140939598e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51023/75000 [50:07<22:04, 18.10it/s]

{'loss': 0.2788, 'grad_norm': 2.946155309677124, 'learning_rate': 1.6093959731543627e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51032/75000 [50:07<22:20, 17.87it/s]

{'loss': 0.2823, 'grad_norm': 16.355806350708008, 'learning_rate': 1.6087248322147652e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51042/75000 [50:08<22:33, 17.70it/s]

{'loss': 0.2632, 'grad_norm': 5.2692742347717285, 'learning_rate': 1.6080536912751677e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51051/75000 [50:08<22:44, 17.55it/s]

{'loss': 0.3131, 'grad_norm': 8.537171363830566, 'learning_rate': 1.6073825503355706e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51064/75000 [50:09<21:15, 18.77it/s]

{'loss': 0.3194, 'grad_norm': 3.2212295532226562, 'learning_rate': 1.606711409395973e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51072/75000 [50:10<22:08, 18.01it/s]

{'loss': 0.4018, 'grad_norm': 5.041062355041504, 'learning_rate': 1.606040268456376e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51082/75000 [50:10<21:26, 18.59it/s]

{'loss': 0.3058, 'grad_norm': 6.732369422912598, 'learning_rate': 1.6053691275167788e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51092/75000 [50:11<23:01, 17.31it/s]

{'loss': 0.2762, 'grad_norm': 3.992434501647949, 'learning_rate': 1.6046979865771813e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51102/75000 [50:11<26:07, 15.24it/s]

{'loss': 0.1907, 'grad_norm': 10.275834083557129, 'learning_rate': 1.604026845637584e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51113/75000 [50:12<22:32, 17.66it/s]

{'loss': 0.2846, 'grad_norm': 7.625926971435547, 'learning_rate': 1.6033557046979867e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51122/75000 [50:12<23:00, 17.30it/s]

{'loss': 0.1973, 'grad_norm': 3.642504930496216, 'learning_rate': 1.6026845637583892e-05, 'epoch': 2.04}


                                                     
 68%|██████▊   | 51134/75000 [50:13<19:56, 19.95it/s]

{'loss': 0.2352, 'grad_norm': 1.4290568828582764, 'learning_rate': 1.602013422818792e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51142/75000 [50:13<21:03, 18.88it/s]

{'loss': 0.3289, 'grad_norm': 1.7149889469146729, 'learning_rate': 1.601342281879195e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51154/75000 [50:14<20:34, 19.32it/s]

{'loss': 0.2838, 'grad_norm': 1.4752094745635986, 'learning_rate': 1.6006711409395974e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51162/75000 [50:15<24:23, 16.29it/s]

{'loss': 0.2219, 'grad_norm': 3.359525680541992, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51172/75000 [50:15<24:31, 16.19it/s]

{'loss': 0.255, 'grad_norm': 3.298210382461548, 'learning_rate': 1.5993288590604028e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51182/75000 [50:16<22:06, 17.95it/s]

{'loss': 0.2905, 'grad_norm': 2.965742826461792, 'learning_rate': 1.5986577181208053e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51192/75000 [50:16<20:48, 19.06it/s]

{'loss': 0.3291, 'grad_norm': 6.486969947814941, 'learning_rate': 1.597986577181208e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51201/75000 [50:17<27:42, 14.31it/s]

{'loss': 0.1759, 'grad_norm': 3.9103527069091797, 'learning_rate': 1.597315436241611e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51214/75000 [50:18<22:48, 17.38it/s]

{'loss': 0.2927, 'grad_norm': 1.7284122705459595, 'learning_rate': 1.5966442953020135e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51221/75000 [50:18<21:33, 18.38it/s]

{'loss': 0.3662, 'grad_norm': 0.9722363948822021, 'learning_rate': 1.5959731543624164e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51233/75000 [50:19<20:23, 19.43it/s]

{'loss': 0.3064, 'grad_norm': 1.1959936618804932, 'learning_rate': 1.595302013422819e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51242/75000 [50:19<21:40, 18.27it/s]

{'loss': 0.3768, 'grad_norm': 1.8560291528701782, 'learning_rate': 1.5946308724832214e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51252/75000 [50:20<21:26, 18.46it/s]

{'loss': 0.329, 'grad_norm': 7.86962890625, 'learning_rate': 1.5939597315436243e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51261/75000 [50:20<24:30, 16.14it/s]

{'loss': 0.3158, 'grad_norm': 3.1932969093322754, 'learning_rate': 1.5932885906040268e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51274/75000 [50:21<20:34, 19.22it/s]

{'loss': 0.2174, 'grad_norm': 0.6088829636573792, 'learning_rate': 1.5926174496644296e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51282/75000 [50:21<23:45, 16.64it/s]

{'loss': 0.3408, 'grad_norm': 21.25040054321289, 'learning_rate': 1.5919463087248325e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51292/75000 [50:22<24:17, 16.27it/s]

{'loss': 0.3143, 'grad_norm': 1.926766276359558, 'learning_rate': 1.591275167785235e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51302/75000 [50:23<25:26, 15.53it/s]

{'loss': 0.3235, 'grad_norm': 2.8335819244384766, 'learning_rate': 1.5906040268456375e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51312/75000 [50:23<25:54, 15.23it/s]

{'loss': 0.3065, 'grad_norm': 2.6908557415008545, 'learning_rate': 1.5899328859060404e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51323/75000 [50:24<23:21, 16.90it/s]

{'loss': 0.318, 'grad_norm': 3.3655600547790527, 'learning_rate': 1.589261744966443e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51332/75000 [50:24<24:33, 16.06it/s]

{'loss': 0.2078, 'grad_norm': 1.3021949529647827, 'learning_rate': 1.5885906040268457e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51342/75000 [50:25<22:15, 17.71it/s]

{'loss': 0.4267, 'grad_norm': 10.13025188446045, 'learning_rate': 1.5879194630872486e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51352/75000 [50:26<22:21, 17.62it/s]

{'loss': 0.2836, 'grad_norm': 2.557203531265259, 'learning_rate': 1.587248322147651e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51363/75000 [50:26<21:01, 18.74it/s]

{'loss': 0.5171, 'grad_norm': 4.3447394371032715, 'learning_rate': 1.5865771812080536e-05, 'epoch': 2.05}


                                                     
 68%|██████▊   | 51373/75000 [50:27<21:18, 18.48it/s]

{'loss': 0.279, 'grad_norm': 1.1283493041992188, 'learning_rate': 1.5859060402684565e-05, 'epoch': 2.05}


                                                     
 69%|██████▊   | 51382/75000 [50:27<24:19, 16.18it/s]

{'loss': 0.2622, 'grad_norm': 4.815130710601807, 'learning_rate': 1.585234899328859e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51392/75000 [50:28<24:41, 15.93it/s]

{'loss': 0.2844, 'grad_norm': 1.2955683469772339, 'learning_rate': 1.5845637583892615e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51402/75000 [50:28<23:06, 17.02it/s]

{'loss': 0.343, 'grad_norm': 4.290043830871582, 'learning_rate': 1.5838926174496647e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51412/75000 [50:29<25:11, 15.61it/s]

{'loss': 0.2799, 'grad_norm': 3.699934482574463, 'learning_rate': 1.5832214765100672e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51421/75000 [50:30<22:47, 17.25it/s]

{'loss': 0.2521, 'grad_norm': 3.174280881881714, 'learning_rate': 1.5825503355704697e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51432/75000 [50:30<21:15, 18.47it/s]

{'loss': 0.2182, 'grad_norm': 4.355770111083984, 'learning_rate': 1.5818791946308726e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51442/75000 [50:31<22:33, 17.41it/s]

{'loss': 0.465, 'grad_norm': 2.1716113090515137, 'learning_rate': 1.581208053691275e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51453/75000 [50:31<23:00, 17.06it/s]

{'loss': 0.3792, 'grad_norm': 4.26646614074707, 'learning_rate': 1.580536912751678e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51464/75000 [50:32<20:33, 19.08it/s]

{'loss': 0.3694, 'grad_norm': 21.35348892211914, 'learning_rate': 1.5798657718120808e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51472/75000 [50:32<23:25, 16.74it/s]

{'loss': 0.3611, 'grad_norm': 7.144538402557373, 'learning_rate': 1.5791946308724833e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51482/75000 [50:33<21:16, 18.42it/s]

{'loss': 0.2952, 'grad_norm': 1.4115970134735107, 'learning_rate': 1.578523489932886e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51491/75000 [50:33<21:39, 18.09it/s]

{'loss': 0.3382, 'grad_norm': 2.1948726177215576, 'learning_rate': 1.5778523489932887e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51500/75000 [50:34<23:09, 16.91it/s]

{'loss': 0.3261, 'grad_norm': 2.776948928833008, 'learning_rate': 1.5771812080536912e-05, 'epoch': 2.06}


                                                       
 69%|██████▊   | 51514/75000 [50:35<26:39, 14.69it/s]

{'loss': 0.268, 'grad_norm': 1.7247893810272217, 'learning_rate': 1.576510067114094e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51521/75000 [50:36<22:34, 17.33it/s]

{'loss': 0.317, 'grad_norm': 4.555445671081543, 'learning_rate': 1.575838926174497e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51533/75000 [50:37<24:55, 15.69it/s]

{'loss': 0.3288, 'grad_norm': 5.072713375091553, 'learning_rate': 1.5751677852348995e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51542/75000 [50:37<22:30, 17.37it/s]

{'loss': 0.3336, 'grad_norm': 4.906060695648193, 'learning_rate': 1.574496644295302e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51552/75000 [50:38<23:29, 16.64it/s]

{'loss': 0.2575, 'grad_norm': 5.9052815437316895, 'learning_rate': 1.5738255033557048e-05, 'epoch': 2.06}


                                                     
 69%|██████▊   | 51562/75000 [50:38<22:06, 17.67it/s]

{'loss': 0.4654, 'grad_norm': 1.307815432548523, 'learning_rate': 1.5731543624161073e-05, 'epoch': 2.06}


                                                     
 69%|██████▉   | 51574/75000 [50:39<20:28, 19.06it/s]

{'loss': 0.4297, 'grad_norm': 4.564724445343018, 'learning_rate': 1.5724832214765102e-05, 'epoch': 2.06}


                                                     
 69%|██████▉   | 51582/75000 [50:39<22:31, 17.32it/s]

{'loss': 0.3517, 'grad_norm': 2.5477070808410645, 'learning_rate': 1.5718120805369127e-05, 'epoch': 2.06}


                                                     
 69%|██████▉   | 51592/75000 [50:40<23:30, 16.60it/s]

{'loss': 0.2209, 'grad_norm': 2.5940544605255127, 'learning_rate': 1.5711409395973156e-05, 'epoch': 2.06}


                                                     
 69%|██████▉   | 51602/75000 [50:41<22:48, 17.10it/s]

{'loss': 0.2616, 'grad_norm': 2.989190101623535, 'learning_rate': 1.5704697986577184e-05, 'epoch': 2.06}


                                                     
 69%|██████▉   | 51614/75000 [50:41<21:58, 17.74it/s]

{'loss': 0.1712, 'grad_norm': 6.396759986877441, 'learning_rate': 1.569798657718121e-05, 'epoch': 2.06}


                                                     
 69%|██████▉   | 51622/75000 [50:42<21:03, 18.51it/s]

{'loss': 0.3254, 'grad_norm': 8.60769271850586, 'learning_rate': 1.5691275167785235e-05, 'epoch': 2.06}


                                                     
 69%|██████▉   | 51632/75000 [50:42<23:35, 16.50it/s]

{'loss': 0.4603, 'grad_norm': 1.7984427213668823, 'learning_rate': 1.5684563758389263e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51642/75000 [50:43<21:34, 18.05it/s]

{'loss': 0.3419, 'grad_norm': 1.2577095031738281, 'learning_rate': 1.5677852348993288e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51652/75000 [50:43<21:39, 17.96it/s]

{'loss': 0.2636, 'grad_norm': 3.7806529998779297, 'learning_rate': 1.5671140939597317e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51663/75000 [50:44<20:33, 18.92it/s]

{'loss': 0.3725, 'grad_norm': 2.4408187866210938, 'learning_rate': 1.5664429530201345e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51672/75000 [50:44<21:23, 18.18it/s]

{'loss': 0.2241, 'grad_norm': 4.366959095001221, 'learning_rate': 1.565771812080537e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51682/75000 [50:45<21:43, 17.89it/s]

{'loss': 0.3262, 'grad_norm': 6.454798221588135, 'learning_rate': 1.5651006711409396e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51693/75000 [50:46<23:06, 16.81it/s]

{'loss': 0.3572, 'grad_norm': 2.1155059337615967, 'learning_rate': 1.5644295302013424e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51702/75000 [50:46<22:07, 17.55it/s]

{'loss': 0.1948, 'grad_norm': 4.6398396492004395, 'learning_rate': 1.563758389261745e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51713/75000 [50:47<20:05, 19.31it/s]

{'loss': 0.2386, 'grad_norm': 2.809039831161499, 'learning_rate': 1.5630872483221474e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51723/75000 [50:47<20:05, 19.32it/s]

{'loss': 0.2487, 'grad_norm': 0.30623364448547363, 'learning_rate': 1.5624161073825506e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51732/75000 [50:48<21:40, 17.89it/s]

{'loss': 0.3224, 'grad_norm': 0.8196315169334412, 'learning_rate': 1.561744966442953e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51743/75000 [50:48<22:43, 17.06it/s]

{'loss': 0.4013, 'grad_norm': 3.608182907104492, 'learning_rate': 1.5610738255033557e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51753/75000 [50:49<20:53, 18.55it/s]

{'loss': 0.3327, 'grad_norm': 3.7913568019866943, 'learning_rate': 1.5604026845637585e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51762/75000 [50:49<21:53, 17.69it/s]

{'loss': 0.3051, 'grad_norm': 4.5053606033325195, 'learning_rate': 1.559731543624161e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51772/75000 [50:50<19:54, 19.45it/s]

{'loss': 0.4311, 'grad_norm': 4.780789375305176, 'learning_rate': 1.5590604026845636e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51783/75000 [50:50<21:49, 17.73it/s]

{'loss': 0.2707, 'grad_norm': 6.530449867248535, 'learning_rate': 1.5583892617449668e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51793/75000 [50:51<23:36, 16.38it/s]

{'loss': 0.3381, 'grad_norm': 0.9356743693351746, 'learning_rate': 1.5577181208053693e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51802/75000 [50:52<21:38, 17.86it/s]

{'loss': 0.2958, 'grad_norm': 5.129231929779053, 'learning_rate': 1.5570469798657718e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51812/75000 [50:52<20:38, 18.73it/s]

{'loss': 0.2175, 'grad_norm': 4.351498126983643, 'learning_rate': 1.5563758389261746e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51823/75000 [50:53<20:27, 18.89it/s]

{'loss': 0.328, 'grad_norm': 10.224101066589355, 'learning_rate': 1.555704697986577e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51832/75000 [50:53<21:06, 18.29it/s]

{'loss': 0.4907, 'grad_norm': 2.00170636177063, 'learning_rate': 1.5550335570469797e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51843/75000 [50:54<22:06, 17.45it/s]

{'loss': 0.2942, 'grad_norm': 3.499976634979248, 'learning_rate': 1.554362416107383e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51851/75000 [50:54<21:05, 18.30it/s]

{'loss': 0.3236, 'grad_norm': 3.278953790664673, 'learning_rate': 1.5536912751677854e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51864/75000 [50:55<19:25, 19.86it/s]

{'loss': 0.3217, 'grad_norm': 4.005604267120361, 'learning_rate': 1.553020134228188e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51873/75000 [50:55<21:22, 18.03it/s]

{'loss': 0.3264, 'grad_norm': 7.355592250823975, 'learning_rate': 1.5523489932885908e-05, 'epoch': 2.07}


                                                     
 69%|██████▉   | 51881/75000 [50:56<23:27, 16.43it/s]

{'loss': 0.3608, 'grad_norm': 2.544637680053711, 'learning_rate': 1.5516778523489933e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51893/75000 [50:57<20:43, 18.58it/s]

{'loss': 0.4465, 'grad_norm': 2.4242215156555176, 'learning_rate': 1.551006711409396e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51902/75000 [50:57<23:56, 16.08it/s]

{'loss': 0.3198, 'grad_norm': 1.2042961120605469, 'learning_rate': 1.5503355704697986e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51912/75000 [50:58<27:15, 14.11it/s]

{'loss': 0.3059, 'grad_norm': 3.9669928550720215, 'learning_rate': 1.5496644295302015e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51921/75000 [50:58<22:59, 16.73it/s]

{'loss': 0.2964, 'grad_norm': 3.5486228466033936, 'learning_rate': 1.548993288590604e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51932/75000 [50:59<21:25, 17.94it/s]

{'loss': 0.3153, 'grad_norm': 4.245806694030762, 'learning_rate': 1.548322147651007e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51941/75000 [50:59<22:43, 16.91it/s]

{'loss': 0.28, 'grad_norm': 1.050845742225647, 'learning_rate': 1.5476510067114094e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51951/75000 [51:00<23:44, 16.18it/s]

{'loss': 0.3266, 'grad_norm': 9.761819839477539, 'learning_rate': 1.5469798657718122e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51962/75000 [51:01<22:48, 16.83it/s]

{'loss': 0.3044, 'grad_norm': 3.3296360969543457, 'learning_rate': 1.5463087248322148e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51972/75000 [51:01<22:59, 16.69it/s]

{'loss': 0.2822, 'grad_norm': 2.3464407920837402, 'learning_rate': 1.5456375838926176e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51982/75000 [51:02<21:38, 17.73it/s]

{'loss': 0.347, 'grad_norm': 4.091149806976318, 'learning_rate': 1.54496644295302e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 51993/75000 [51:02<23:29, 16.32it/s]

{'loss': 0.3333, 'grad_norm': 3.2975246906280518, 'learning_rate': 1.544295302013423e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52000/75000 [51:03<22:17, 17.20it/s]

{'loss': 0.3546, 'grad_norm': 5.696304798126221, 'learning_rate': 1.5436241610738255e-05, 'epoch': 2.08}


                                                       
 69%|██████▉   | 52012/75000 [51:04<26:30, 14.46it/s]

{'loss': 0.3744, 'grad_norm': 1.3153667449951172, 'learning_rate': 1.5429530201342283e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52021/75000 [51:05<21:13, 18.05it/s]

{'loss': 0.2956, 'grad_norm': 4.591616153717041, 'learning_rate': 1.542281879194631e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52033/75000 [51:05<20:35, 18.58it/s]

{'loss': 0.2064, 'grad_norm': 22.66655158996582, 'learning_rate': 1.5416107382550334e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52042/75000 [51:06<20:43, 18.46it/s]

{'loss': 0.371, 'grad_norm': 6.654432773590088, 'learning_rate': 1.5409395973154366e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52052/75000 [51:06<21:09, 18.07it/s]

{'loss': 0.3712, 'grad_norm': 1.3214147090911865, 'learning_rate': 1.540268456375839e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52063/75000 [51:07<21:34, 17.72it/s]

{'loss': 0.2494, 'grad_norm': 2.228581190109253, 'learning_rate': 1.5395973154362416e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52073/75000 [51:07<20:54, 18.28it/s]

{'loss': 0.3081, 'grad_norm': 5.275297164916992, 'learning_rate': 1.5389261744966445e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52083/75000 [51:08<23:01, 16.59it/s]

{'loss': 0.2251, 'grad_norm': 0.9363311529159546, 'learning_rate': 1.538255033557047e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52091/75000 [51:09<23:57, 15.94it/s]

{'loss': 0.2861, 'grad_norm': 2.824584484100342, 'learning_rate': 1.5375838926174495e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52103/75000 [51:09<20:20, 18.76it/s]

{'loss': 0.3438, 'grad_norm': 1.9229642152786255, 'learning_rate': 1.5369127516778527e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52112/75000 [51:10<21:28, 17.76it/s]

{'loss': 0.261, 'grad_norm': 1.3701192140579224, 'learning_rate': 1.5362416107382552e-05, 'epoch': 2.08}


                                                     
 69%|██████▉   | 52122/75000 [51:10<20:50, 18.29it/s]

{'loss': 0.3818, 'grad_norm': 4.830079078674316, 'learning_rate': 1.5355704697986577e-05, 'epoch': 2.08}


                                                     
 70%|██████▉   | 52132/75000 [51:11<20:42, 18.41it/s]

{'loss': 0.3585, 'grad_norm': 3.2313051223754883, 'learning_rate': 1.5348993288590606e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52143/75000 [51:11<21:59, 17.32it/s]

{'loss': 0.2693, 'grad_norm': 8.866459846496582, 'learning_rate': 1.534228187919463e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52153/75000 [51:12<22:36, 16.85it/s]

{'loss': 0.2174, 'grad_norm': 4.068094730377197, 'learning_rate': 1.5335570469798656e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52162/75000 [51:13<21:10, 17.97it/s]

{'loss': 0.2439, 'grad_norm': 5.948298454284668, 'learning_rate': 1.5328859060402688e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52172/75000 [51:13<21:26, 17.75it/s]

{'loss': 0.3249, 'grad_norm': 2.59667706489563, 'learning_rate': 1.5322147651006713e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52183/75000 [51:14<21:50, 17.40it/s]

{'loss': 0.2386, 'grad_norm': 3.185349225997925, 'learning_rate': 1.5315436241610738e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52193/75000 [51:14<21:31, 17.65it/s]

{'loss': 0.2531, 'grad_norm': 8.949527740478516, 'learning_rate': 1.5308724832214767e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52203/75000 [51:15<22:40, 16.76it/s]

{'loss': 0.2596, 'grad_norm': 1.6429859399795532, 'learning_rate': 1.5302013422818792e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52212/75000 [51:15<23:31, 16.15it/s]

{'loss': 0.2083, 'grad_norm': 4.781710147857666, 'learning_rate': 1.5295302013422817e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52222/75000 [51:16<21:44, 17.46it/s]

{'loss': 0.3794, 'grad_norm': 3.745723247528076, 'learning_rate': 1.5288590604026846e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52231/75000 [51:16<22:29, 16.87it/s]

{'loss': 0.1636, 'grad_norm': 1.778243064880371, 'learning_rate': 1.5281879194630874e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52242/75000 [51:17<20:24, 18.58it/s]

{'loss': 0.3282, 'grad_norm': 6.643731594085693, 'learning_rate': 1.52751677852349e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52251/75000 [51:18<21:11, 17.89it/s]

{'loss': 0.3815, 'grad_norm': 3.6120221614837646, 'learning_rate': 1.5268456375838928e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52263/75000 [51:18<19:50, 19.10it/s]

{'loss': 0.3552, 'grad_norm': 4.040378570556641, 'learning_rate': 1.5261744966442953e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52273/75000 [51:19<21:25, 17.68it/s]

{'loss': 0.2487, 'grad_norm': 1.7602791786193848, 'learning_rate': 1.525503355704698e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52283/75000 [51:19<21:08, 17.91it/s]

{'loss': 0.3553, 'grad_norm': 1.4102250337600708, 'learning_rate': 1.5248322147651007e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52292/75000 [51:20<20:56, 18.07it/s]

{'loss': 0.2818, 'grad_norm': 0.9259831309318542, 'learning_rate': 1.5241610738255035e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52302/75000 [51:20<20:15, 18.68it/s]

{'loss': 0.4159, 'grad_norm': 2.792966365814209, 'learning_rate': 1.5234899328859062e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52313/75000 [51:21<20:17, 18.64it/s]

{'loss': 0.2865, 'grad_norm': 3.6665985584259033, 'learning_rate': 1.5228187919463089e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52322/75000 [51:21<20:25, 18.51it/s]

{'loss': 0.2027, 'grad_norm': 5.774407863616943, 'learning_rate': 1.5221476510067114e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52333/75000 [51:22<20:47, 18.17it/s]

{'loss': 0.3603, 'grad_norm': 4.213932514190674, 'learning_rate': 1.5214765100671141e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52342/75000 [51:22<21:24, 17.64it/s]

{'loss': 0.2613, 'grad_norm': 14.630141258239746, 'learning_rate': 1.5208053691275168e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52352/75000 [51:23<23:31, 16.04it/s]

{'loss': 0.3315, 'grad_norm': 2.994278907775879, 'learning_rate': 1.5201342281879196e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52362/75000 [51:24<20:19, 18.56it/s]

{'loss': 0.2761, 'grad_norm': 18.443567276000977, 'learning_rate': 1.5194630872483223e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52371/75000 [51:24<20:12, 18.66it/s]

{'loss': 0.3012, 'grad_norm': 4.620081901550293, 'learning_rate': 1.518791946308725e-05, 'epoch': 2.09}


                                                     
 70%|██████▉   | 52383/75000 [51:25<19:14, 19.59it/s]

{'loss': 0.3532, 'grad_norm': 11.557992935180664, 'learning_rate': 1.5181208053691275e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52392/75000 [51:25<20:47, 18.12it/s]

{'loss': 0.395, 'grad_norm': 3.9290080070495605, 'learning_rate': 1.5174496644295302e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52402/75000 [51:26<21:39, 17.39it/s]

{'loss': 0.3626, 'grad_norm': 2.5064706802368164, 'learning_rate': 1.5167785234899329e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52412/75000 [51:26<20:59, 17.94it/s]

{'loss': 0.3742, 'grad_norm': 2.74259352684021, 'learning_rate': 1.5161073825503356e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52422/75000 [51:27<21:34, 17.44it/s]

{'loss': 0.3112, 'grad_norm': 12.571529388427734, 'learning_rate': 1.5154362416107384e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52433/75000 [51:28<22:25, 16.77it/s]

{'loss': 0.4367, 'grad_norm': 3.7030882835388184, 'learning_rate': 1.5147651006711411e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52444/75000 [51:28<19:33, 19.22it/s]

{'loss': 0.3158, 'grad_norm': 2.0089633464813232, 'learning_rate': 1.5140939597315436e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52454/75000 [51:29<20:13, 18.58it/s]

{'loss': 0.3645, 'grad_norm': 8.734687805175781, 'learning_rate': 1.5134228187919463e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52463/75000 [51:29<21:19, 17.61it/s]

{'loss': 0.2826, 'grad_norm': 5.934815883636475, 'learning_rate': 1.512751677852349e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52472/75000 [51:30<23:51, 15.74it/s]

{'loss': 0.3421, 'grad_norm': 2.5119733810424805, 'learning_rate': 1.5120805369127517e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52481/75000 [51:30<21:00, 17.86it/s]

{'loss': 0.3473, 'grad_norm': 3.795386552810669, 'learning_rate': 1.5114093959731546e-05, 'epoch': 2.1}


                                                     
 70%|██████▉   | 52494/75000 [51:31<20:10, 18.60it/s]

{'loss': 0.2919, 'grad_norm': 1.8873833417892456, 'learning_rate': 1.5107382550335572e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52500/75000 [51:31<21:42, 17.27it/s]

{'loss': 0.3439, 'grad_norm': 1.6168626546859741, 'learning_rate': 1.51006711409396e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52514/75000 [51:33<24:24, 15.36it/s]

{'loss': 0.2487, 'grad_norm': 6.512723445892334, 'learning_rate': 1.5093959731543624e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52523/75000 [51:33<20:50, 17.98it/s]

{'loss': 0.258, 'grad_norm': 5.240638256072998, 'learning_rate': 1.5087248322147651e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52533/75000 [51:34<21:39, 17.29it/s]

{'loss': 0.3479, 'grad_norm': 2.318140745162964, 'learning_rate': 1.5080536912751678e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52543/75000 [51:34<22:00, 17.01it/s]

{'loss': 0.3037, 'grad_norm': 9.480192184448242, 'learning_rate': 1.5073825503355703e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52553/75000 [51:35<22:12, 16.84it/s]

{'loss': 0.3921, 'grad_norm': 1.6531575918197632, 'learning_rate': 1.5067114093959734e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52561/75000 [51:35<20:26, 18.30it/s]

{'loss': 0.2709, 'grad_norm': 2.03136944770813, 'learning_rate': 1.506040268456376e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52574/75000 [51:36<19:33, 19.11it/s]

{'loss': 0.3995, 'grad_norm': 6.080961227416992, 'learning_rate': 1.5053691275167786e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52582/75000 [51:36<20:14, 18.46it/s]

{'loss': 0.2302, 'grad_norm': 7.938789367675781, 'learning_rate': 1.5046979865771812e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52591/75000 [51:37<21:28, 17.39it/s]

{'loss': 0.2984, 'grad_norm': 3.500830888748169, 'learning_rate': 1.504026845637584e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52604/75000 [51:38<18:43, 19.94it/s]

{'loss': 0.3844, 'grad_norm': 3.9072329998016357, 'learning_rate': 1.5033557046979866e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52612/75000 [51:38<21:53, 17.05it/s]

{'loss': 0.2695, 'grad_norm': 3.4046103954315186, 'learning_rate': 1.5026845637583895e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52622/75000 [51:39<21:53, 17.04it/s]

{'loss': 0.5169, 'grad_norm': 6.415530204772949, 'learning_rate': 1.5020134228187922e-05, 'epoch': 2.1}


                                                     
 70%|███████   | 52633/75000 [51:39<19:34, 19.04it/s]

{'loss': 0.3682, 'grad_norm': 4.1584062576293945, 'learning_rate': 1.5013422818791947e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52644/75000 [51:40<20:03, 18.58it/s]

{'loss': 0.2334, 'grad_norm': 1.6689975261688232, 'learning_rate': 1.5006711409395974e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52651/75000 [51:40<19:57, 18.66it/s]

{'loss': 0.3367, 'grad_norm': 7.101025104522705, 'learning_rate': 1.5e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52662/75000 [51:41<19:56, 18.67it/s]

{'loss': 0.3948, 'grad_norm': 2.2047226428985596, 'learning_rate': 1.4993288590604027e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52671/75000 [51:41<21:31, 17.29it/s]

{'loss': 0.2189, 'grad_norm': 1.7872929573059082, 'learning_rate': 1.4986577181208056e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52682/75000 [51:42<19:10, 19.39it/s]

{'loss': 0.3145, 'grad_norm': 4.851925849914551, 'learning_rate': 1.4979865771812083e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52692/75000 [51:42<21:47, 17.06it/s]

{'loss': 0.228, 'grad_norm': 0.5957075953483582, 'learning_rate': 1.4973154362416108e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52703/75000 [51:43<21:09, 17.57it/s]

{'loss': 0.3557, 'grad_norm': 1.0975236892700195, 'learning_rate': 1.4966442953020135e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52712/75000 [51:43<20:13, 18.36it/s]

{'loss': 0.2147, 'grad_norm': 2.565312623977661, 'learning_rate': 1.4959731543624161e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52722/75000 [51:44<23:05, 16.08it/s]

{'loss': 0.2463, 'grad_norm': 2.1546030044555664, 'learning_rate': 1.4953020134228188e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52732/75000 [51:45<20:07, 18.44it/s]

{'loss': 0.2969, 'grad_norm': 1.6040693521499634, 'learning_rate': 1.4946308724832214e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52743/75000 [51:45<20:28, 18.12it/s]

{'loss': 0.2467, 'grad_norm': 4.828829288482666, 'learning_rate': 1.4939597315436244e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52751/75000 [51:46<21:19, 17.39it/s]

{'loss': 0.3011, 'grad_norm': 20.45180892944336, 'learning_rate': 1.493288590604027e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52764/75000 [51:46<18:56, 19.57it/s]

{'loss': 0.2842, 'grad_norm': 14.813343048095703, 'learning_rate': 1.4926174496644296e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52773/75000 [51:47<21:23, 17.32it/s]

{'loss': 0.269, 'grad_norm': 7.2623820304870605, 'learning_rate': 1.4919463087248323e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52781/75000 [51:47<23:39, 15.66it/s]

{'loss': 0.2829, 'grad_norm': 1.3508822917938232, 'learning_rate': 1.491275167785235e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52793/75000 [51:48<19:29, 18.98it/s]

{'loss': 0.3538, 'grad_norm': 5.5294718742370605, 'learning_rate': 1.4906040268456375e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52802/75000 [51:48<20:16, 18.25it/s]

{'loss': 0.2945, 'grad_norm': 4.8351006507873535, 'learning_rate': 1.4899328859060405e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52813/75000 [51:49<18:49, 19.65it/s]

{'loss': 0.2902, 'grad_norm': 2.630648136138916, 'learning_rate': 1.4892617449664432e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52822/75000 [51:50<20:14, 18.26it/s]

{'loss': 0.3047, 'grad_norm': 12.132680892944336, 'learning_rate': 1.4885906040268457e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52834/75000 [51:50<19:02, 19.39it/s]

{'loss': 0.276, 'grad_norm': 0.5760338306427002, 'learning_rate': 1.4879194630872484e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52842/75000 [51:51<19:42, 18.74it/s]

{'loss': 0.4001, 'grad_norm': 7.108128070831299, 'learning_rate': 1.487248322147651e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52853/75000 [51:51<20:02, 18.41it/s]

{'loss': 0.3105, 'grad_norm': 2.840344190597534, 'learning_rate': 1.4865771812080537e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52862/75000 [51:52<20:20, 18.14it/s]

{'loss': 0.2206, 'grad_norm': 4.26975154876709, 'learning_rate': 1.4859060402684563e-05, 'epoch': 2.11}


                                                     
 70%|███████   | 52872/75000 [51:52<22:52, 16.12it/s]

{'loss': 0.3485, 'grad_norm': 2.2602577209472656, 'learning_rate': 1.4852348993288593e-05, 'epoch': 2.11}


                                                     
 71%|███████   | 52882/75000 [51:53<19:55, 18.50it/s]

{'loss': 0.3027, 'grad_norm': 1.0021882057189941, 'learning_rate': 1.4845637583892618e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52892/75000 [51:53<22:54, 16.08it/s]

{'loss': 0.3513, 'grad_norm': 1.7306058406829834, 'learning_rate': 1.4838926174496645e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52901/75000 [51:54<21:56, 16.78it/s]

{'loss': 0.3302, 'grad_norm': 2.995670795440674, 'learning_rate': 1.4832214765100672e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52913/75000 [51:55<19:15, 19.12it/s]

{'loss': 0.2659, 'grad_norm': 3.4744763374328613, 'learning_rate': 1.4825503355704699e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52922/75000 [51:55<22:09, 16.61it/s]

{'loss': 0.3078, 'grad_norm': 2.2135837078094482, 'learning_rate': 1.4818791946308724e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52932/75000 [51:56<21:17, 17.27it/s]

{'loss': 0.2594, 'grad_norm': 12.200780868530273, 'learning_rate': 1.4812080536912754e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52942/75000 [51:56<19:39, 18.70it/s]

{'loss': 0.3281, 'grad_norm': 3.6382815837860107, 'learning_rate': 1.4805369127516779e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52953/75000 [51:57<20:25, 18.00it/s]

{'loss': 0.2112, 'grad_norm': 4.193390369415283, 'learning_rate': 1.4798657718120806e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52962/75000 [51:57<19:50, 18.52it/s]

{'loss': 0.4557, 'grad_norm': 6.045137882232666, 'learning_rate': 1.4791946308724833e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52972/75000 [51:58<19:01, 19.31it/s]

{'loss': 0.2255, 'grad_norm': 2.8587148189544678, 'learning_rate': 1.478523489932886e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52982/75000 [51:58<20:38, 17.78it/s]

{'loss': 0.2272, 'grad_norm': 2.0691909790039062, 'learning_rate': 1.4778523489932885e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 52994/75000 [51:59<19:50, 18.48it/s]

{'loss': 0.2778, 'grad_norm': 8.547279357910156, 'learning_rate': 1.4771812080536915e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53000/75000 [51:59<19:24, 18.90it/s]

{'loss': 0.2674, 'grad_norm': 4.28836727142334, 'learning_rate': 1.4765100671140942e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53014/75000 [52:01<22:32, 16.25it/s]

{'loss': 0.2954, 'grad_norm': 3.195711851119995, 'learning_rate': 1.4758389261744967e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53022/75000 [52:01<21:38, 16.93it/s]

{'loss': 0.3589, 'grad_norm': 1.6768896579742432, 'learning_rate': 1.4751677852348994e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53034/75000 [52:02<18:43, 19.55it/s]

{'loss': 0.2794, 'grad_norm': 4.412993431091309, 'learning_rate': 1.474496644295302e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53044/75000 [52:02<18:23, 19.90it/s]

{'loss': 0.3338, 'grad_norm': 19.395158767700195, 'learning_rate': 1.4738255033557048e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53052/75000 [52:03<20:32, 17.81it/s]

{'loss': 0.2197, 'grad_norm': 0.5273337364196777, 'learning_rate': 1.4731543624161073e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53064/75000 [52:03<18:15, 20.02it/s]

{'loss': 0.2834, 'grad_norm': 3.8573739528656006, 'learning_rate': 1.4724832214765103e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53072/75000 [52:04<21:15, 17.19it/s]

{'loss': 0.363, 'grad_norm': 5.076435565948486, 'learning_rate': 1.4718120805369128e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53083/75000 [52:04<19:26, 18.78it/s]

{'loss': 0.4328, 'grad_norm': 8.233036041259766, 'learning_rate': 1.4711409395973155e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53094/75000 [52:05<19:21, 18.87it/s]

{'loss': 0.2849, 'grad_norm': 3.6529488563537598, 'learning_rate': 1.4704697986577182e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53102/75000 [52:05<20:41, 17.64it/s]

{'loss': 0.4312, 'grad_norm': 5.798299312591553, 'learning_rate': 1.4697986577181209e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53114/75000 [52:06<18:37, 19.58it/s]

{'loss': 0.2649, 'grad_norm': 3.950092077255249, 'learning_rate': 1.4691275167785234e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53123/75000 [52:06<20:19, 17.94it/s]

{'loss': 0.3881, 'grad_norm': 2.248946189880371, 'learning_rate': 1.4684563758389264e-05, 'epoch': 2.12}


                                                     
 71%|███████   | 53131/75000 [52:07<21:52, 16.67it/s]

{'loss': 0.4735, 'grad_norm': 8.25291633605957, 'learning_rate': 1.467785234899329e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53142/75000 [52:07<19:03, 19.11it/s]

{'loss': 0.3499, 'grad_norm': 4.216649055480957, 'learning_rate': 1.4671140939597316e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53153/75000 [52:08<21:30, 16.93it/s]

{'loss': 0.2095, 'grad_norm': 5.6163458824157715, 'learning_rate': 1.4664429530201343e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53162/75000 [52:09<19:49, 18.36it/s]

{'loss': 0.4226, 'grad_norm': 4.068430423736572, 'learning_rate': 1.465771812080537e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53174/75000 [52:09<18:15, 19.92it/s]

{'loss': 0.4057, 'grad_norm': 2.3657948970794678, 'learning_rate': 1.4651006711409395e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53182/75000 [52:10<18:44, 19.40it/s]

{'loss': 0.3065, 'grad_norm': 3.3810818195343018, 'learning_rate': 1.4644295302013422e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53194/75000 [52:10<17:54, 20.30it/s]

{'loss': 0.3576, 'grad_norm': 2.059706211090088, 'learning_rate': 1.4637583892617452e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53203/75000 [52:11<19:41, 18.45it/s]

{'loss': 0.2596, 'grad_norm': 4.2805891036987305, 'learning_rate': 1.4630872483221477e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53212/75000 [52:11<18:56, 19.17it/s]

{'loss': 0.2386, 'grad_norm': 2.2696707248687744, 'learning_rate': 1.4624161073825504e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53222/75000 [52:12<18:37, 19.49it/s]

{'loss': 0.3044, 'grad_norm': 7.747654914855957, 'learning_rate': 1.4617449664429531e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53232/75000 [52:12<20:02, 18.10it/s]

{'loss': 0.2389, 'grad_norm': 4.889095306396484, 'learning_rate': 1.4610738255033556e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53242/75000 [52:13<20:26, 17.74it/s]

{'loss': 0.2984, 'grad_norm': 1.2435282468795776, 'learning_rate': 1.4604026845637583e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53252/75000 [52:13<19:42, 18.39it/s]

{'loss': 0.2464, 'grad_norm': 5.190005779266357, 'learning_rate': 1.4597315436241613e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53264/75000 [52:14<19:14, 18.82it/s]

{'loss': 0.3563, 'grad_norm': 12.231809616088867, 'learning_rate': 1.4590604026845638e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53272/75000 [52:14<20:47, 17.41it/s]

{'loss': 0.257, 'grad_norm': 1.5143065452575684, 'learning_rate': 1.4583892617449665e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53282/75000 [52:15<20:04, 18.03it/s]

{'loss': 0.2697, 'grad_norm': 2.3452553749084473, 'learning_rate': 1.4577181208053692e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53294/75000 [52:16<18:37, 19.42it/s]

{'loss': 0.3577, 'grad_norm': 23.518964767456055, 'learning_rate': 1.4570469798657719e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53302/75000 [52:16<19:42, 18.35it/s]

{'loss': 0.3508, 'grad_norm': 1.2700539827346802, 'learning_rate': 1.4563758389261744e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53314/75000 [52:17<18:03, 20.02it/s]

{'loss': 0.2573, 'grad_norm': 1.2175549268722534, 'learning_rate': 1.4557046979865774e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53322/75000 [52:17<19:16, 18.74it/s]

{'loss': 0.4092, 'grad_norm': 6.923733711242676, 'learning_rate': 1.45503355704698e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53331/75000 [52:18<19:44, 18.30it/s]

{'loss': 0.3038, 'grad_norm': 2.7805635929107666, 'learning_rate': 1.4543624161073826e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53344/75000 [52:18<18:49, 19.17it/s]

{'loss': 0.2328, 'grad_norm': 0.5571154356002808, 'learning_rate': 1.4536912751677853e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53354/75000 [52:19<18:25, 19.58it/s]

{'loss': 0.2492, 'grad_norm': 1.0347280502319336, 'learning_rate': 1.453020134228188e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53363/75000 [52:19<19:15, 18.72it/s]

{'loss': 0.2824, 'grad_norm': 2.409562826156616, 'learning_rate': 1.4523489932885905e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53373/75000 [52:20<19:59, 18.03it/s]

{'loss': 0.2245, 'grad_norm': 1.0147182941436768, 'learning_rate': 1.4516778523489932e-05, 'epoch': 2.13}


                                                     
 71%|███████   | 53382/75000 [52:20<19:22, 18.60it/s]

{'loss': 0.3652, 'grad_norm': 3.3546926975250244, 'learning_rate': 1.451006711409396e-05, 'epoch': 2.14}


                                                     
 71%|███████   | 53393/75000 [52:21<19:33, 18.42it/s]

{'loss': 0.3369, 'grad_norm': 1.1225006580352783, 'learning_rate': 1.4503355704697988e-05, 'epoch': 2.14}


                                                     
 71%|███████   | 53402/75000 [52:21<20:12, 17.81it/s]

{'loss': 0.2659, 'grad_norm': 2.0880022048950195, 'learning_rate': 1.4496644295302014e-05, 'epoch': 2.14}


                                                     
 71%|███████   | 53413/75000 [52:22<18:35, 19.35it/s]

{'loss': 0.3409, 'grad_norm': 6.609926223754883, 'learning_rate': 1.4489932885906041e-05, 'epoch': 2.14}


                                                     
 71%|███████   | 53423/75000 [52:22<18:35, 19.35it/s]

{'loss': 0.1404, 'grad_norm': 8.743509292602539, 'learning_rate': 1.4483221476510066e-05, 'epoch': 2.14}


                                                     
 71%|███████   | 53434/75000 [52:23<18:55, 18.99it/s]

{'loss': 0.3535, 'grad_norm': 4.614091396331787, 'learning_rate': 1.4476510067114093e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53443/75000 [52:24<20:15, 17.74it/s]

{'loss': 0.3607, 'grad_norm': 7.886016368865967, 'learning_rate': 1.4469798657718123e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53452/75000 [52:24<19:09, 18.75it/s]

{'loss': 0.2507, 'grad_norm': 0.4439835846424103, 'learning_rate': 1.4463087248322149e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53461/75000 [52:25<19:01, 18.86it/s]

{'loss': 0.2601, 'grad_norm': 5.389326095581055, 'learning_rate': 1.4456375838926175e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53474/75000 [52:25<18:20, 19.56it/s]

{'loss': 0.3723, 'grad_norm': 4.359541893005371, 'learning_rate': 1.4449664429530202e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53483/75000 [52:26<20:41, 17.34it/s]

{'loss': 0.2925, 'grad_norm': 1.2702158689498901, 'learning_rate': 1.444295302013423e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53492/75000 [52:26<19:25, 18.46it/s]

{'loss': 0.2187, 'grad_norm': 6.440054893493652, 'learning_rate': 1.4436241610738254e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53500/75000 [52:27<19:54, 18.00it/s]

{'loss': 0.1785, 'grad_norm': 8.324795722961426, 'learning_rate': 1.4429530201342285e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53512/75000 [52:28<25:01, 14.31it/s]

{'loss': 0.4273, 'grad_norm': 8.20262336730957, 'learning_rate': 1.442281879194631e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53522/75000 [52:28<20:14, 17.68it/s]

{'loss': 0.2117, 'grad_norm': 2.994011163711548, 'learning_rate': 1.4416107382550337e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53534/75000 [52:29<19:57, 17.93it/s]

{'loss': 0.4181, 'grad_norm': 3.5128326416015625, 'learning_rate': 1.4409395973154363e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53542/75000 [52:29<19:52, 17.99it/s]

{'loss': 0.4289, 'grad_norm': 2.7448105812072754, 'learning_rate': 1.440268456375839e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53552/75000 [52:30<21:33, 16.59it/s]

{'loss': 0.2024, 'grad_norm': 4.427102565765381, 'learning_rate': 1.4395973154362415e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53561/75000 [52:31<21:45, 16.43it/s]

{'loss': 0.3074, 'grad_norm': 3.854525327682495, 'learning_rate': 1.4389261744966442e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53573/75000 [52:31<18:49, 18.97it/s]

{'loss': 0.3155, 'grad_norm': 1.858178734779358, 'learning_rate': 1.4382550335570471e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53581/75000 [52:32<19:24, 18.40it/s]

{'loss': 0.3317, 'grad_norm': 2.684999942779541, 'learning_rate': 1.4375838926174498e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53594/75000 [52:32<18:32, 19.24it/s]

{'loss': 0.2152, 'grad_norm': 2.5237321853637695, 'learning_rate': 1.4369127516778525e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53601/75000 [52:33<19:12, 18.57it/s]

{'loss': 0.3375, 'grad_norm': 5.131639003753662, 'learning_rate': 1.4362416107382551e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53614/75000 [52:33<19:16, 18.50it/s]

{'loss': 0.2427, 'grad_norm': 0.7169638872146606, 'learning_rate': 1.4355704697986577e-05, 'epoch': 2.14}


                                                     
 71%|███████▏  | 53622/75000 [52:34<19:15, 18.50it/s]

{'loss': 0.2234, 'grad_norm': 4.250764846801758, 'learning_rate': 1.4348993288590603e-05, 'epoch': 2.14}


                                                     
 72%|███████▏  | 53632/75000 [52:34<18:39, 19.09it/s]

{'loss': 0.1973, 'grad_norm': 3.4362525939941406, 'learning_rate': 1.4342281879194634e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53642/75000 [52:35<20:18, 17.53it/s]

{'loss': 0.4793, 'grad_norm': 2.6646828651428223, 'learning_rate': 1.4335570469798659e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53653/75000 [52:36<19:48, 17.95it/s]

{'loss': 0.4646, 'grad_norm': 3.2177720069885254, 'learning_rate': 1.4328859060402686e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53663/75000 [52:36<18:48, 18.90it/s]

{'loss': 0.374, 'grad_norm': 2.6332366466522217, 'learning_rate': 1.4322147651006713e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53673/75000 [52:37<19:34, 18.16it/s]

{'loss': 0.2755, 'grad_norm': 7.203131198883057, 'learning_rate': 1.4315436241610738e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53681/75000 [52:37<18:53, 18.80it/s]

{'loss': 0.238, 'grad_norm': 5.103825092315674, 'learning_rate': 1.4308724832214765e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53694/75000 [52:38<18:05, 19.63it/s]

{'loss': 0.3919, 'grad_norm': 5.526641368865967, 'learning_rate': 1.4302013422818791e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53702/75000 [52:38<18:11, 19.51it/s]

{'loss': 0.3682, 'grad_norm': 7.065728664398193, 'learning_rate': 1.429530201342282e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53714/75000 [52:39<19:50, 17.87it/s]

{'loss': 0.4251, 'grad_norm': 5.2137770652771, 'learning_rate': 1.4288590604026847e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53723/75000 [52:39<19:14, 18.43it/s]

{'loss': 0.3089, 'grad_norm': 2.9141287803649902, 'learning_rate': 1.4281879194630874e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53733/75000 [52:40<18:50, 18.82it/s]

{'loss': 0.3041, 'grad_norm': 3.86293888092041, 'learning_rate': 1.42751677852349e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53742/75000 [52:40<18:41, 18.95it/s]

{'loss': 0.2818, 'grad_norm': 3.198775053024292, 'learning_rate': 1.4268456375838926e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53754/75000 [52:41<18:47, 18.85it/s]

{'loss': 0.2627, 'grad_norm': 5.76242733001709, 'learning_rate': 1.4261744966442953e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53762/75000 [52:41<20:48, 17.01it/s]

{'loss': 0.3406, 'grad_norm': 1.6917959451675415, 'learning_rate': 1.4255033557046981e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53773/75000 [52:42<18:09, 19.49it/s]

{'loss': 0.2461, 'grad_norm': 2.823042869567871, 'learning_rate': 1.4248322147651008e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53782/75000 [52:43<21:02, 16.80it/s]

{'loss': 0.38, 'grad_norm': 4.261473178863525, 'learning_rate': 1.4241610738255035e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53792/75000 [52:43<20:19, 17.39it/s]

{'loss': 0.2379, 'grad_norm': 2.682088851928711, 'learning_rate': 1.4234899328859062e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53803/75000 [52:44<19:26, 18.17it/s]

{'loss': 0.2201, 'grad_norm': 1.829531192779541, 'learning_rate': 1.4228187919463087e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53812/75000 [52:44<18:46, 18.82it/s]

{'loss': 0.2747, 'grad_norm': 9.462262153625488, 'learning_rate': 1.4221476510067114e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53821/75000 [52:45<21:18, 16.56it/s]

{'loss': 0.2573, 'grad_norm': 4.466641426086426, 'learning_rate': 1.4214765100671142e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53831/75000 [52:45<18:30, 19.07it/s]

{'loss': 0.3771, 'grad_norm': 4.6156110763549805, 'learning_rate': 1.4208053691275169e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53841/75000 [52:46<18:00, 19.57it/s]

{'loss': 0.2349, 'grad_norm': 1.7756808996200562, 'learning_rate': 1.4201342281879196e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53854/75000 [52:46<18:29, 19.06it/s]

{'loss': 0.3292, 'grad_norm': 3.0758562088012695, 'learning_rate': 1.4194630872483223e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53863/75000 [52:47<19:56, 17.67it/s]

{'loss': 0.4631, 'grad_norm': 2.1317176818847656, 'learning_rate': 1.4187919463087248e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53872/75000 [52:47<18:59, 18.55it/s]

{'loss': 0.3358, 'grad_norm': 6.870753765106201, 'learning_rate': 1.4181208053691275e-05, 'epoch': 2.15}


                                                     
 72%|███████▏  | 53883/75000 [52:48<19:57, 17.64it/s]

{'loss': 0.3698, 'grad_norm': 2.7531161308288574, 'learning_rate': 1.4174496644295302e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53893/75000 [52:49<21:04, 16.69it/s]

{'loss': 0.3234, 'grad_norm': 2.8536465167999268, 'learning_rate': 1.416778523489933e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53902/75000 [52:49<23:00, 15.29it/s]

{'loss': 0.28, 'grad_norm': 5.754598617553711, 'learning_rate': 1.4161073825503357e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53914/75000 [52:50<18:40, 18.82it/s]

{'loss': 0.4186, 'grad_norm': 23.243911743164062, 'learning_rate': 1.4154362416107384e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53921/75000 [52:50<18:21, 19.14it/s]

{'loss': 0.3354, 'grad_norm': 13.089303970336914, 'learning_rate': 1.4147651006711409e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53934/75000 [52:51<17:47, 19.73it/s]

{'loss': 0.207, 'grad_norm': 2.9801931381225586, 'learning_rate': 1.4140939597315436e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53941/75000 [52:51<17:59, 19.52it/s]

{'loss': 0.2563, 'grad_norm': 2.1109137535095215, 'learning_rate': 1.4134228187919463e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53954/75000 [52:52<18:30, 18.95it/s]

{'loss': 0.2852, 'grad_norm': 3.971371650695801, 'learning_rate': 1.4127516778523491e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53963/75000 [52:52<18:27, 19.00it/s]

{'loss': 0.3318, 'grad_norm': 1.4983595609664917, 'learning_rate': 1.4120805369127518e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53973/75000 [52:53<18:14, 19.20it/s]

{'loss': 0.3244, 'grad_norm': 1.604346752166748, 'learning_rate': 1.4114093959731545e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53982/75000 [52:53<20:28, 17.11it/s]

{'loss': 0.3307, 'grad_norm': 1.2918519973754883, 'learning_rate': 1.4107382550335572e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 53992/75000 [52:54<18:44, 18.68it/s]

{'loss': 0.3372, 'grad_norm': 3.2027201652526855, 'learning_rate': 1.4100671140939597e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54000/75000 [52:54<18:02, 19.39it/s]

{'loss': 0.1962, 'grad_norm': 5.895308017730713, 'learning_rate': 1.4093959731543624e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54013/75000 [52:56<22:14, 15.72it/s]

{'loss': 0.284, 'grad_norm': 3.382445812225342, 'learning_rate': 1.408724832214765e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54022/75000 [52:56<20:11, 17.31it/s]

{'loss': 0.3813, 'grad_norm': 6.532087326049805, 'learning_rate': 1.408053691275168e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54033/75000 [52:57<18:59, 18.40it/s]

{'loss': 0.2371, 'grad_norm': 2.3180322647094727, 'learning_rate': 1.4073825503355706e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54042/75000 [52:57<19:12, 18.18it/s]

{'loss': 0.2841, 'grad_norm': 2.714996099472046, 'learning_rate': 1.4067114093959733e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54052/75000 [52:58<20:28, 17.06it/s]

{'loss': 0.3803, 'grad_norm': 1.2855864763259888, 'learning_rate': 1.4060402684563758e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54062/75000 [52:58<19:52, 17.55it/s]

{'loss': 0.2752, 'grad_norm': 2.430501937866211, 'learning_rate': 1.4053691275167785e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54074/75000 [52:59<17:57, 19.42it/s]

{'loss': 0.2507, 'grad_norm': 1.7380887269973755, 'learning_rate': 1.4046979865771812e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54083/75000 [52:59<18:23, 18.96it/s]

{'loss': 0.2774, 'grad_norm': 4.247598171234131, 'learning_rate': 1.404026845637584e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54092/75000 [53:00<19:32, 17.84it/s]

{'loss': 0.2326, 'grad_norm': 6.028590202331543, 'learning_rate': 1.4033557046979867e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54102/75000 [53:00<19:55, 17.48it/s]

{'loss': 0.2123, 'grad_norm': 5.118809700012207, 'learning_rate': 1.4026845637583894e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54112/75000 [53:01<19:29, 17.86it/s]

{'loss': 0.3405, 'grad_norm': 2.895439386367798, 'learning_rate': 1.402013422818792e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54123/75000 [53:02<19:52, 17.51it/s]

{'loss': 0.4418, 'grad_norm': 7.485966682434082, 'learning_rate': 1.4013422818791946e-05, 'epoch': 2.16}


                                                     
 72%|███████▏  | 54133/75000 [53:02<19:59, 17.39it/s]

{'loss': 0.2909, 'grad_norm': 6.998027801513672, 'learning_rate': 1.4006711409395973e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54143/75000 [53:03<18:25, 18.86it/s]

{'loss': 0.2208, 'grad_norm': 8.574065208435059, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54152/75000 [53:03<18:38, 18.64it/s]

{'loss': 0.3062, 'grad_norm': 3.9313931465148926, 'learning_rate': 1.3993288590604028e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54163/75000 [53:04<18:20, 18.94it/s]

{'loss': 0.2857, 'grad_norm': 2.7634830474853516, 'learning_rate': 1.3986577181208055e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54172/75000 [53:04<18:24, 18.87it/s]

{'loss': 0.2638, 'grad_norm': 0.9747405648231506, 'learning_rate': 1.3979865771812082e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54181/75000 [53:05<17:53, 19.40it/s]

{'loss': 0.3933, 'grad_norm': 4.3719964027404785, 'learning_rate': 1.3973154362416107e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54193/75000 [53:05<19:17, 17.98it/s]

{'loss': 0.2012, 'grad_norm': 1.3018900156021118, 'learning_rate': 1.3966442953020134e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54202/75000 [53:06<18:55, 18.32it/s]

{'loss': 0.2452, 'grad_norm': 3.4445903301239014, 'learning_rate': 1.3959731543624161e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54213/75000 [53:06<20:01, 17.30it/s]

{'loss': 0.3743, 'grad_norm': 4.941854953765869, 'learning_rate': 1.395302013422819e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54222/75000 [53:07<19:28, 17.78it/s]

{'loss': 0.2973, 'grad_norm': 0.6373755931854248, 'learning_rate': 1.3946308724832216e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54233/75000 [53:08<19:27, 17.79it/s]

{'loss': 0.3827, 'grad_norm': 9.240166664123535, 'learning_rate': 1.3939597315436243e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54243/75000 [53:08<19:49, 17.46it/s]

{'loss': 0.3583, 'grad_norm': 1.3802300691604614, 'learning_rate': 1.3932885906040268e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54253/75000 [53:09<21:19, 16.22it/s]

{'loss': 0.3656, 'grad_norm': 3.727034091949463, 'learning_rate': 1.3926174496644295e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54261/75000 [53:09<23:09, 14.93it/s]

{'loss': 0.4298, 'grad_norm': 7.253148555755615, 'learning_rate': 1.3919463087248322e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54273/75000 [53:10<21:05, 16.38it/s]

{'loss': 0.2526, 'grad_norm': 2.0569586753845215, 'learning_rate': 1.391275167785235e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54283/75000 [53:11<20:48, 16.59it/s]

{'loss': 0.2292, 'grad_norm': 5.867495536804199, 'learning_rate': 1.3906040268456377e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54293/75000 [53:11<20:57, 16.46it/s]

{'loss': 0.2691, 'grad_norm': 5.86207914352417, 'learning_rate': 1.3899328859060404e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54303/75000 [53:12<22:46, 15.15it/s]

{'loss': 0.2386, 'grad_norm': 0.7786951065063477, 'learning_rate': 1.389261744966443e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54313/75000 [53:13<22:26, 15.37it/s]

{'loss': 0.3241, 'grad_norm': 2.38193941116333, 'learning_rate': 1.3885906040268456e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54323/75000 [53:13<19:17, 17.87it/s]

{'loss': 0.3192, 'grad_norm': 10.160478591918945, 'learning_rate': 1.3879194630872483e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54333/75000 [53:14<21:05, 16.33it/s]

{'loss': 0.2978, 'grad_norm': 3.5467021465301514, 'learning_rate': 1.387248322147651e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54343/75000 [53:14<21:28, 16.03it/s]

{'loss': 0.3046, 'grad_norm': 2.9884026050567627, 'learning_rate': 1.3865771812080539e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54353/75000 [53:15<19:53, 17.29it/s]

{'loss': 0.3793, 'grad_norm': 2.0500645637512207, 'learning_rate': 1.3859060402684565e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54363/75000 [53:15<20:54, 16.44it/s]

{'loss': 0.2049, 'grad_norm': 1.8359003067016602, 'learning_rate': 1.385234899328859e-05, 'epoch': 2.17}


                                                     
 72%|███████▏  | 54373/75000 [53:16<19:29, 17.63it/s]

{'loss': 0.3315, 'grad_norm': 3.1420156955718994, 'learning_rate': 1.3845637583892617e-05, 'epoch': 2.17}


                                                     
 73%|███████▎  | 54383/75000 [53:17<21:17, 16.14it/s]

{'loss': 0.1968, 'grad_norm': 4.273358345031738, 'learning_rate': 1.3838926174496644e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54393/75000 [53:17<19:15, 17.83it/s]

{'loss': 0.3759, 'grad_norm': 8.596030235290527, 'learning_rate': 1.3832214765100671e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54403/75000 [53:18<19:18, 17.79it/s]

{'loss': 0.4677, 'grad_norm': 11.92183780670166, 'learning_rate': 1.38255033557047e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54413/75000 [53:18<19:59, 17.17it/s]

{'loss': 0.1982, 'grad_norm': 3.001786708831787, 'learning_rate': 1.3818791946308727e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54423/75000 [53:19<20:16, 16.91it/s]

{'loss': 0.4, 'grad_norm': 4.904927730560303, 'learning_rate': 1.3812080536912753e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54431/75000 [53:19<20:05, 17.06it/s]

{'loss': 0.4015, 'grad_norm': 0.3585834205150604, 'learning_rate': 1.3805369127516779e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54442/75000 [53:20<18:51, 18.17it/s]

{'loss': 0.27, 'grad_norm': 3.6444272994995117, 'learning_rate': 1.3798657718120805e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54452/75000 [53:21<20:15, 16.90it/s]

{'loss': 0.2726, 'grad_norm': 4.875497817993164, 'learning_rate': 1.3791946308724832e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54462/75000 [53:21<21:57, 15.58it/s]

{'loss': 0.2919, 'grad_norm': 1.4871532917022705, 'learning_rate': 1.378523489932886e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54472/75000 [53:22<19:59, 17.12it/s]

{'loss': 0.2607, 'grad_norm': 20.02155876159668, 'learning_rate': 1.3778523489932888e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54483/75000 [53:22<21:25, 15.96it/s]

{'loss': 0.3591, 'grad_norm': 1.4292895793914795, 'learning_rate': 1.3771812080536914e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54493/75000 [53:23<19:14, 17.76it/s]

{'loss': 0.3557, 'grad_norm': 5.445595741271973, 'learning_rate': 1.376510067114094e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54500/75000 [53:23<20:48, 16.42it/s]

{'loss': 0.3241, 'grad_norm': 4.518190860748291, 'learning_rate': 1.3758389261744966e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54511/75000 [53:25<25:20, 13.47it/s]

{'loss': 0.2292, 'grad_norm': 6.450045585632324, 'learning_rate': 1.3751677852348993e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54522/75000 [53:25<20:19, 16.80it/s]

{'loss': 0.3161, 'grad_norm': 2.047412157058716, 'learning_rate': 1.374496644295302e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54533/75000 [53:26<19:25, 17.57it/s]

{'loss': 0.3205, 'grad_norm': 5.113556861877441, 'learning_rate': 1.3738255033557049e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54541/75000 [53:26<19:24, 17.56it/s]

{'loss': 0.3707, 'grad_norm': 2.5657873153686523, 'learning_rate': 1.3731543624161076e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54551/75000 [53:27<18:55, 18.01it/s]

{'loss': 0.1554, 'grad_norm': 1.3760161399841309, 'learning_rate': 1.37248322147651e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54563/75000 [53:28<20:11, 16.87it/s]

{'loss': 0.1323, 'grad_norm': 0.5239811539649963, 'learning_rate': 1.3718120805369128e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54573/75000 [53:28<19:44, 17.25it/s]

{'loss': 0.216, 'grad_norm': 4.099476337432861, 'learning_rate': 1.3711409395973154e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54583/75000 [53:29<19:45, 17.23it/s]

{'loss': 0.3968, 'grad_norm': 5.934929370880127, 'learning_rate': 1.3704697986577181e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54593/75000 [53:29<20:16, 16.77it/s]

{'loss': 0.2612, 'grad_norm': 2.229353904724121, 'learning_rate': 1.369798657718121e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54603/75000 [53:30<20:40, 16.44it/s]

{'loss': 0.3574, 'grad_norm': 11.164237976074219, 'learning_rate': 1.3691275167785237e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54613/75000 [53:31<21:31, 15.79it/s]

{'loss': 0.3253, 'grad_norm': 3.0614171028137207, 'learning_rate': 1.3684563758389264e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54623/75000 [53:31<21:18, 15.94it/s]

{'loss': 0.3257, 'grad_norm': 1.5333236455917358, 'learning_rate': 1.3677852348993289e-05, 'epoch': 2.18}


                                                     
 73%|███████▎  | 54631/75000 [53:32<21:34, 15.74it/s]

{'loss': 0.2489, 'grad_norm': 14.253534317016602, 'learning_rate': 1.3671140939597316e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54642/75000 [53:32<18:59, 17.87it/s]

{'loss': 0.3599, 'grad_norm': 0.9556788206100464, 'learning_rate': 1.3664429530201342e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54652/75000 [53:33<19:33, 17.33it/s]

{'loss': 0.2828, 'grad_norm': 0.7167569994926453, 'learning_rate': 1.3657718120805371e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54662/75000 [53:33<18:50, 17.99it/s]

{'loss': 0.1596, 'grad_norm': 2.4330151081085205, 'learning_rate': 1.3651006711409398e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54671/75000 [53:34<18:33, 18.25it/s]

{'loss': 0.2407, 'grad_norm': 3.422053337097168, 'learning_rate': 1.3644295302013425e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54683/75000 [53:35<19:14, 17.60it/s]

{'loss': 0.2935, 'grad_norm': 1.3043186664581299, 'learning_rate': 1.363758389261745e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54693/75000 [53:35<20:02, 16.88it/s]

{'loss': 0.3199, 'grad_norm': 0.23991671204566956, 'learning_rate': 1.3630872483221477e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54701/75000 [53:36<19:47, 17.09it/s]

{'loss': 0.3132, 'grad_norm': 15.449811935424805, 'learning_rate': 1.3624161073825504e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54712/75000 [53:36<19:10, 17.63it/s]

{'loss': 0.3176, 'grad_norm': 4.900258541107178, 'learning_rate': 1.361744966442953e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54723/75000 [53:37<19:45, 17.10it/s]

{'loss': 0.4389, 'grad_norm': 6.285644054412842, 'learning_rate': 1.3610738255033559e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54733/75000 [53:38<19:35, 17.25it/s]

{'loss': 0.2642, 'grad_norm': 1.9541971683502197, 'learning_rate': 1.3604026845637586e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54743/75000 [53:38<19:42, 17.12it/s]

{'loss': 0.3214, 'grad_norm': 0.6811342239379883, 'learning_rate': 1.3597315436241611e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54753/75000 [53:39<21:14, 15.89it/s]

{'loss': 0.3455, 'grad_norm': 3.3327178955078125, 'learning_rate': 1.3590604026845638e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54762/75000 [53:39<19:29, 17.31it/s]

{'loss': 0.2726, 'grad_norm': 6.8316874504089355, 'learning_rate': 1.3583892617449665e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54772/75000 [53:40<22:40, 14.87it/s]

{'loss': 0.305, 'grad_norm': 2.2277352809906006, 'learning_rate': 1.3577181208053692e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54782/75000 [53:40<20:10, 16.70it/s]

{'loss': 0.3046, 'grad_norm': 2.8715410232543945, 'learning_rate': 1.357046979865772e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54792/75000 [53:41<20:24, 16.50it/s]

{'loss': 0.3394, 'grad_norm': 5.115046977996826, 'learning_rate': 1.3563758389261747e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54803/75000 [53:42<20:27, 16.46it/s]

{'loss': 0.2193, 'grad_norm': 2.046926975250244, 'learning_rate': 1.3557046979865772e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54813/75000 [53:42<18:53, 17.81it/s]

{'loss': 0.2463, 'grad_norm': 3.6092865467071533, 'learning_rate': 1.3550335570469799e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54823/75000 [53:43<18:51, 17.83it/s]

{'loss': 0.3292, 'grad_norm': 1.2951487302780151, 'learning_rate': 1.3543624161073826e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54833/75000 [53:43<19:05, 17.60it/s]

{'loss': 0.3523, 'grad_norm': 3.8623476028442383, 'learning_rate': 1.3536912751677853e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54843/75000 [53:44<18:10, 18.49it/s]

{'loss': 0.2765, 'grad_norm': 0.5823124647140503, 'learning_rate': 1.3530201342281878e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54853/75000 [53:45<20:13, 16.60it/s]

{'loss': 0.2471, 'grad_norm': 1.8115211725234985, 'learning_rate': 1.3523489932885908e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54863/75000 [53:45<19:18, 17.38it/s]

{'loss': 0.2872, 'grad_norm': 3.0083746910095215, 'learning_rate': 1.3516778523489935e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54873/75000 [53:46<18:48, 17.84it/s]

{'loss': 0.2775, 'grad_norm': 23.303173065185547, 'learning_rate': 1.351006711409396e-05, 'epoch': 2.19}


                                                     
 73%|███████▎  | 54883/75000 [53:46<18:04, 18.55it/s]

{'loss': 0.2525, 'grad_norm': 1.8354756832122803, 'learning_rate': 1.3503355704697987e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54892/75000 [53:47<17:53, 18.74it/s]

{'loss': 0.2034, 'grad_norm': 3.0470004081726074, 'learning_rate': 1.3496644295302014e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54904/75000 [53:47<16:49, 19.90it/s]

{'loss': 0.2416, 'grad_norm': 8.30711841583252, 'learning_rate': 1.3489932885906039e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54914/75000 [53:48<18:12, 18.39it/s]

{'loss': 0.2744, 'grad_norm': 0.9496325254440308, 'learning_rate': 1.348322147651007e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54921/75000 [53:48<17:58, 18.62it/s]

{'loss': 0.289, 'grad_norm': 4.327313423156738, 'learning_rate': 1.3476510067114096e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54934/75000 [53:49<17:45, 18.84it/s]

{'loss': 0.3093, 'grad_norm': 4.501514911651611, 'learning_rate': 1.3469798657718121e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54943/75000 [53:49<18:44, 17.84it/s]

{'loss': 0.2678, 'grad_norm': 4.679005146026611, 'learning_rate': 1.3463087248322148e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54953/75000 [53:50<19:06, 17.48it/s]

{'loss': 0.3589, 'grad_norm': 5.575288772583008, 'learning_rate': 1.3456375838926175e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54962/75000 [53:50<18:05, 18.46it/s]

{'loss': 0.3343, 'grad_norm': 3.176260471343994, 'learning_rate': 1.3449664429530202e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54972/75000 [53:51<18:39, 17.89it/s]

{'loss': 0.4268, 'grad_norm': 1.9212244749069214, 'learning_rate': 1.344295302013423e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54983/75000 [53:52<18:37, 17.91it/s]

{'loss': 0.1778, 'grad_norm': 18.486099243164062, 'learning_rate': 1.3436241610738257e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 54993/75000 [53:52<17:22, 19.20it/s]

{'loss': 0.2765, 'grad_norm': 3.3532612323760986, 'learning_rate': 1.3429530201342282e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55000/75000 [53:53<18:39, 17.87it/s]

{'loss': 0.2548, 'grad_norm': 12.596073150634766, 'learning_rate': 1.3422818791946309e-05, 'epoch': 2.2}


                                                       
 73%|███████▎  | 55014/75000 [53:57<50:36,  6.58it/s]  

{'loss': 0.2832, 'grad_norm': 9.257750511169434, 'learning_rate': 1.3416107382550336e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55023/75000 [53:58<27:49, 11.97it/s]

{'loss': 0.2733, 'grad_norm': 4.757006645202637, 'learning_rate': 1.3409395973154363e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55032/75000 [53:58<21:29, 15.49it/s]

{'loss': 0.3236, 'grad_norm': 2.7941880226135254, 'learning_rate': 1.3402684563758388e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55042/75000 [53:59<21:40, 15.34it/s]

{'loss': 0.3108, 'grad_norm': 2.992339849472046, 'learning_rate': 1.3395973154362418e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55053/75000 [53:59<18:43, 17.75it/s]

{'loss': 0.3796, 'grad_norm': 1.8123143911361694, 'learning_rate': 1.3389261744966443e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55062/75000 [54:00<18:00, 18.45it/s]

{'loss': 0.3835, 'grad_norm': 11.284992218017578, 'learning_rate': 1.338255033557047e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55074/75000 [54:01<17:09, 19.36it/s]

{'loss': 0.3639, 'grad_norm': 5.032965660095215, 'learning_rate': 1.3375838926174497e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55082/75000 [54:01<17:15, 19.23it/s]

{'loss': 0.3429, 'grad_norm': 2.1844053268432617, 'learning_rate': 1.3369127516778524e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55092/75000 [54:02<17:26, 19.03it/s]

{'loss': 0.3524, 'grad_norm': 1.1695573329925537, 'learning_rate': 1.3362416107382549e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55103/75000 [54:02<17:51, 18.57it/s]

{'loss': 0.423, 'grad_norm': 2.1430234909057617, 'learning_rate': 1.335570469798658e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55112/75000 [54:03<19:08, 17.31it/s]

{'loss': 0.3193, 'grad_norm': 2.094538450241089, 'learning_rate': 1.3348993288590606e-05, 'epoch': 2.2}


                                                     
 73%|███████▎  | 55123/75000 [54:03<18:28, 17.92it/s]

{'loss': 0.3009, 'grad_norm': 4.3844380378723145, 'learning_rate': 1.3342281879194631e-05, 'epoch': 2.2}


                                                     
 74%|███████▎  | 55133/75000 [54:04<17:59, 18.41it/s]

{'loss': 0.2426, 'grad_norm': 3.5104095935821533, 'learning_rate': 1.3335570469798658e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55142/75000 [54:04<17:46, 18.62it/s]

{'loss': 0.2749, 'grad_norm': 5.914731025695801, 'learning_rate': 1.3328859060402685e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55152/75000 [54:05<17:40, 18.71it/s]

{'loss': 0.1968, 'grad_norm': 1.8149694204330444, 'learning_rate': 1.3322147651006712e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55164/75000 [54:05<16:56, 19.52it/s]

{'loss': 0.2677, 'grad_norm': 1.6255919933319092, 'learning_rate': 1.3315436241610737e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55172/75000 [54:06<19:44, 16.74it/s]

{'loss': 0.2862, 'grad_norm': 3.419236660003662, 'learning_rate': 1.3308724832214767e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55182/75000 [54:06<18:04, 18.28it/s]

{'loss': 0.3201, 'grad_norm': 7.89718770980835, 'learning_rate': 1.3302013422818793e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55193/75000 [54:07<19:03, 17.33it/s]

{'loss': 0.3948, 'grad_norm': 7.862819671630859, 'learning_rate': 1.329530201342282e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55202/75000 [54:08<18:55, 17.43it/s]

{'loss': 0.3393, 'grad_norm': 5.565108776092529, 'learning_rate': 1.3288590604026846e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55213/75000 [54:08<17:22, 18.98it/s]

{'loss': 0.323, 'grad_norm': 3.2412049770355225, 'learning_rate': 1.3281879194630873e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55223/75000 [54:09<18:03, 18.26it/s]

{'loss': 0.2677, 'grad_norm': 1.069211483001709, 'learning_rate': 1.3275167785234898e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55233/75000 [54:09<19:03, 17.29it/s]

{'loss': 0.2565, 'grad_norm': 2.3597018718719482, 'learning_rate': 1.3268456375838928e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55242/75000 [54:10<17:35, 18.71it/s]

{'loss': 0.3605, 'grad_norm': 4.173304557800293, 'learning_rate': 1.3261744966442954e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55252/75000 [54:10<17:03, 19.30it/s]

{'loss': 0.3148, 'grad_norm': 5.870636463165283, 'learning_rate': 1.325503355704698e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55264/75000 [54:11<16:46, 19.61it/s]

{'loss': 0.2397, 'grad_norm': 2.7182841300964355, 'learning_rate': 1.3248322147651007e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55272/75000 [54:11<18:06, 18.15it/s]

{'loss': 0.3046, 'grad_norm': 4.324755668640137, 'learning_rate': 1.3241610738255034e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55283/75000 [54:12<18:09, 18.10it/s]

{'loss': 0.2468, 'grad_norm': 19.014080047607422, 'learning_rate': 1.323489932885906e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55293/75000 [54:12<17:28, 18.79it/s]

{'loss': 0.3891, 'grad_norm': 0.7423162460327148, 'learning_rate': 1.322818791946309e-05, 'epoch': 2.21}


                                                     
 74%|███████▎  | 55303/75000 [54:13<18:31, 17.72it/s]

{'loss': 0.4466, 'grad_norm': 8.19005298614502, 'learning_rate': 1.3221476510067116e-05, 'epoch': 2.21}


                                                     
 74%|███████▍  | 55314/75000 [54:14<17:19, 18.93it/s]

{'loss': 0.2582, 'grad_norm': 4.950973987579346, 'learning_rate': 1.3214765100671142e-05, 'epoch': 2.21}


                                                     
 74%|███████▍  | 55322/75000 [54:14<18:48, 17.44it/s]

{'loss': 0.3212, 'grad_norm': 5.778600692749023, 'learning_rate': 1.3208053691275168e-05, 'epoch': 2.21}


                                                     
 74%|███████▍  | 55333/75000 [54:15<17:16, 18.97it/s]

{'loss': 0.406, 'grad_norm': 6.570546627044678, 'learning_rate': 1.3201342281879195e-05, 'epoch': 2.21}


                                                     
 74%|███████▍  | 55341/75000 [54:15<16:44, 19.58it/s]

{'loss': 0.2961, 'grad_norm': 1.444923996925354, 'learning_rate': 1.319463087248322e-05, 'epoch': 2.21}


                                                     
 74%|███████▍  | 55352/75000 [54:16<17:29, 18.72it/s]

{'loss': 0.318, 'grad_norm': 0.9864258766174316, 'learning_rate': 1.3187919463087247e-05, 'epoch': 2.21}


                                                     
 74%|███████▍  | 55363/75000 [54:16<18:00, 18.18it/s]

{'loss': 0.4046, 'grad_norm': 10.680691719055176, 'learning_rate': 1.3181208053691278e-05, 'epoch': 2.21}


                                                     
 74%|███████▍  | 55373/75000 [54:17<18:51, 17.34it/s]

{'loss': 0.3146, 'grad_norm': 4.211999416351318, 'learning_rate': 1.3174496644295303e-05, 'epoch': 2.21}


                                                     
 74%|███████▍  | 55382/75000 [54:17<17:34, 18.61it/s]

{'loss': 0.2146, 'grad_norm': 2.2573390007019043, 'learning_rate': 1.316778523489933e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55394/75000 [54:18<16:53, 19.34it/s]

{'loss': 0.2857, 'grad_norm': 1.9848121404647827, 'learning_rate': 1.3161073825503356e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55401/75000 [54:18<16:56, 19.28it/s]

{'loss': 0.2473, 'grad_norm': 1.258313536643982, 'learning_rate': 1.3154362416107383e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55414/75000 [54:19<16:43, 19.53it/s]

{'loss': 0.3115, 'grad_norm': 2.9552347660064697, 'learning_rate': 1.3147651006711408e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55422/75000 [54:19<18:03, 18.07it/s]

{'loss': 0.2736, 'grad_norm': 1.3333008289337158, 'learning_rate': 1.3140939597315439e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55433/75000 [54:20<17:10, 18.99it/s]

{'loss': 0.2808, 'grad_norm': 3.9507381916046143, 'learning_rate': 1.3134228187919464e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55442/75000 [54:20<17:19, 18.81it/s]

{'loss': 0.3727, 'grad_norm': 3.043694257736206, 'learning_rate': 1.312751677852349e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55454/75000 [54:21<16:34, 19.65it/s]

{'loss': 0.1736, 'grad_norm': 1.7619845867156982, 'learning_rate': 1.3120805369127518e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55462/75000 [54:22<17:55, 18.17it/s]

{'loss': 0.474, 'grad_norm': 3.046482801437378, 'learning_rate': 1.3114093959731544e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55474/75000 [54:22<17:37, 18.46it/s]

{'loss': 0.2844, 'grad_norm': 4.310312271118164, 'learning_rate': 1.310738255033557e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55483/75000 [54:23<17:18, 18.80it/s]

{'loss': 0.2761, 'grad_norm': 7.258706569671631, 'learning_rate': 1.3100671140939596e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55492/75000 [54:23<17:31, 18.55it/s]

{'loss': 0.2852, 'grad_norm': 4.444472789764404, 'learning_rate': 1.3093959731543625e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55500/75000 [54:24<18:29, 17.58it/s]

{'loss': 0.3724, 'grad_norm': 5.282463550567627, 'learning_rate': 1.3087248322147652e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55513/75000 [54:25<20:22, 15.94it/s]

{'loss': 0.3688, 'grad_norm': 13.346319198608398, 'learning_rate': 1.3080536912751679e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55522/75000 [54:25<18:03, 17.98it/s]

{'loss': 0.3144, 'grad_norm': 0.879989743232727, 'learning_rate': 1.3073825503355706e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55532/75000 [54:26<17:31, 18.52it/s]

{'loss': 0.2943, 'grad_norm': 2.5575757026672363, 'learning_rate': 1.306711409395973e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55541/75000 [54:26<18:44, 17.30it/s]

{'loss': 0.316, 'grad_norm': 1.944535255432129, 'learning_rate': 1.3060402684563758e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55554/75000 [54:27<16:30, 19.63it/s]

{'loss': 0.2984, 'grad_norm': 17.389087677001953, 'learning_rate': 1.3053691275167788e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55562/75000 [54:28<18:40, 17.35it/s]

{'loss': 0.306, 'grad_norm': 4.236956596374512, 'learning_rate': 1.3046979865771813e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55572/75000 [54:28<17:00, 19.04it/s]

{'loss': 0.3059, 'grad_norm': 1.1855939626693726, 'learning_rate': 1.304026845637584e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55583/75000 [54:29<18:22, 17.62it/s]

{'loss': 0.3057, 'grad_norm': 0.9533373713493347, 'learning_rate': 1.3033557046979867e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55592/75000 [54:29<18:17, 17.69it/s]

{'loss': 0.243, 'grad_norm': 2.2208800315856934, 'learning_rate': 1.3026845637583893e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55603/75000 [54:30<17:56, 18.01it/s]

{'loss': 0.2951, 'grad_norm': 1.834484338760376, 'learning_rate': 1.3020134228187919e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55613/75000 [54:30<17:41, 18.26it/s]

{'loss': 0.4012, 'grad_norm': 4.248534679412842, 'learning_rate': 1.3013422818791949e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55623/75000 [54:31<17:15, 18.71it/s]

{'loss': 0.3592, 'grad_norm': 8.888628959655762, 'learning_rate': 1.3006711409395974e-05, 'epoch': 2.22}


                                                     
 74%|███████▍  | 55633/75000 [54:31<17:42, 18.23it/s]

{'loss': 0.3268, 'grad_norm': 4.277370452880859, 'learning_rate': 1.3000000000000001e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55641/75000 [54:32<16:52, 19.12it/s]

{'loss': 0.2616, 'grad_norm': 5.646435737609863, 'learning_rate': 1.2993288590604028e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55654/75000 [54:33<16:44, 19.25it/s]

{'loss': 0.2752, 'grad_norm': 3.2665810585021973, 'learning_rate': 1.2986577181208055e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55663/75000 [54:33<18:13, 17.69it/s]

{'loss': 0.3815, 'grad_norm': 4.263194561004639, 'learning_rate': 1.297986577181208e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55672/75000 [54:34<17:57, 17.93it/s]

{'loss': 0.2745, 'grad_norm': 3.6730942726135254, 'learning_rate': 1.2973154362416107e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55682/75000 [54:34<18:30, 17.40it/s]

{'loss': 0.275, 'grad_norm': 2.2291040420532227, 'learning_rate': 1.2966442953020135e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55693/75000 [54:35<17:03, 18.87it/s]

{'loss': 0.283, 'grad_norm': 2.6097259521484375, 'learning_rate': 1.2959731543624162e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55702/75000 [54:35<17:25, 18.46it/s]

{'loss': 0.2429, 'grad_norm': 7.325225353240967, 'learning_rate': 1.2953020134228189e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55713/75000 [54:36<16:58, 18.94it/s]

{'loss': 0.3343, 'grad_norm': 1.3702905178070068, 'learning_rate': 1.2946308724832216e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55723/75000 [54:36<18:31, 17.34it/s]

{'loss': 0.2548, 'grad_norm': 7.403981685638428, 'learning_rate': 1.2939597315436241e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55733/75000 [54:37<17:03, 18.83it/s]

{'loss': 0.2797, 'grad_norm': 8.094199180603027, 'learning_rate': 1.2932885906040268e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55742/75000 [54:37<17:47, 18.04it/s]

{'loss': 0.2449, 'grad_norm': 3.2126455307006836, 'learning_rate': 1.2926174496644298e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55752/75000 [54:38<17:06, 18.75it/s]

{'loss': 0.2591, 'grad_norm': 1.466554045677185, 'learning_rate': 1.2919463087248323e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55761/75000 [54:38<17:51, 17.96it/s]

{'loss': 0.184, 'grad_norm': 2.976231813430786, 'learning_rate': 1.291275167785235e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55772/75000 [54:39<17:22, 18.44it/s]

{'loss': 0.347, 'grad_norm': 2.4619667530059814, 'learning_rate': 1.2906040268456377e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55783/75000 [54:40<17:32, 18.26it/s]

{'loss': 0.2945, 'grad_norm': 3.1030795574188232, 'learning_rate': 1.2899328859060402e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55793/75000 [54:40<17:06, 18.72it/s]

{'loss': 0.301, 'grad_norm': 3.487191915512085, 'learning_rate': 1.2892617449664429e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55802/75000 [54:41<18:51, 16.97it/s]

{'loss': 0.392, 'grad_norm': 7.011122226715088, 'learning_rate': 1.2885906040268459e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55811/75000 [54:41<18:07, 17.64it/s]

{'loss': 0.3361, 'grad_norm': 3.1783971786499023, 'learning_rate': 1.2879194630872484e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55824/75000 [54:42<16:01, 19.94it/s]

{'loss': 0.3682, 'grad_norm': 3.182053565979004, 'learning_rate': 1.2872483221476511e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55833/75000 [54:42<18:42, 17.08it/s]

{'loss': 0.1807, 'grad_norm': 1.4998830556869507, 'learning_rate': 1.2865771812080538e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55843/75000 [54:43<16:56, 18.84it/s]

{'loss': 0.1942, 'grad_norm': 4.425924301147461, 'learning_rate': 1.2859060402684565e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55852/75000 [54:43<17:17, 18.45it/s]

{'loss': 0.3118, 'grad_norm': 8.283442497253418, 'learning_rate': 1.285234899328859e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55864/75000 [54:44<16:27, 19.39it/s]

{'loss': 0.4369, 'grad_norm': 6.74714469909668, 'learning_rate': 1.2845637583892617e-05, 'epoch': 2.23}


                                                     
 74%|███████▍  | 55873/75000 [54:44<16:47, 18.98it/s]

{'loss': 0.2983, 'grad_norm': 2.9353630542755127, 'learning_rate': 1.2838926174496645e-05, 'epoch': 2.23}


                                                     
 75%|███████▍  | 55883/75000 [54:45<18:57, 16.80it/s]

{'loss': 0.2881, 'grad_norm': 1.6257511377334595, 'learning_rate': 1.2832214765100672e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55893/75000 [54:46<17:13, 18.49it/s]

{'loss': 0.1977, 'grad_norm': 0.9697868824005127, 'learning_rate': 1.2825503355704699e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55904/75000 [54:46<18:12, 17.48it/s]

{'loss': 0.304, 'grad_norm': 23.421083450317383, 'learning_rate': 1.2818791946308726e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55913/75000 [54:47<18:16, 17.40it/s]

{'loss': 0.2013, 'grad_norm': 1.5538619756698608, 'learning_rate': 1.2812080536912751e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55923/75000 [54:47<17:17, 18.39it/s]

{'loss': 0.3094, 'grad_norm': 7.115929126739502, 'learning_rate': 1.2805369127516778e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55932/75000 [54:48<17:33, 18.09it/s]

{'loss': 0.3296, 'grad_norm': 5.241269588470459, 'learning_rate': 1.2798657718120806e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55942/75000 [54:48<16:50, 18.86it/s]

{'loss': 0.18, 'grad_norm': 1.7894673347473145, 'learning_rate': 1.2791946308724833e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55953/75000 [54:49<16:49, 18.86it/s]

{'loss': 0.2593, 'grad_norm': 1.174811601638794, 'learning_rate': 1.278523489932886e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55962/75000 [54:49<17:53, 17.74it/s]

{'loss': 0.3123, 'grad_norm': 1.327734351158142, 'learning_rate': 1.2778523489932887e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55971/75000 [54:50<17:33, 18.06it/s]

{'loss': 0.278, 'grad_norm': 0.5524246692657471, 'learning_rate': 1.2771812080536912e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55984/75000 [54:51<16:47, 18.88it/s]

{'loss': 0.3753, 'grad_norm': 2.4235007762908936, 'learning_rate': 1.2765100671140939e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 55991/75000 [54:51<16:43, 18.94it/s]

{'loss': 0.3889, 'grad_norm': 5.266941547393799, 'learning_rate': 1.2758389261744966e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56000/75000 [54:51<17:15, 18.36it/s]

{'loss': 0.1969, 'grad_norm': 2.139763832092285, 'learning_rate': 1.2751677852348994e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56012/75000 [54:53<20:32, 15.41it/s]

{'loss': 0.1853, 'grad_norm': 1.1205952167510986, 'learning_rate': 1.2744966442953021e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56023/75000 [54:53<18:43, 16.89it/s]

{'loss': 0.2192, 'grad_norm': 5.074583530426025, 'learning_rate': 1.2738255033557048e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56033/75000 [54:54<16:46, 18.85it/s]

{'loss': 0.2456, 'grad_norm': 1.9061447381973267, 'learning_rate': 1.2731543624161073e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56041/75000 [54:54<19:06, 16.54it/s]

{'loss': 0.2949, 'grad_norm': 11.493097305297852, 'learning_rate': 1.27248322147651e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56054/75000 [54:55<16:17, 19.38it/s]

{'loss': 0.3997, 'grad_norm': 6.606407165527344, 'learning_rate': 1.2718120805369127e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56062/75000 [54:55<17:41, 17.84it/s]

{'loss': 0.2267, 'grad_norm': 0.9737808704376221, 'learning_rate': 1.2711409395973156e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56073/75000 [54:56<16:49, 18.75it/s]

{'loss': 0.3, 'grad_norm': 1.185176968574524, 'learning_rate': 1.2704697986577182e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56082/75000 [54:56<18:02, 17.47it/s]

{'loss': 0.4444, 'grad_norm': 6.417211532592773, 'learning_rate': 1.269798657718121e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56094/75000 [54:57<15:57, 19.75it/s]

{'loss': 0.3231, 'grad_norm': 3.00628662109375, 'learning_rate': 1.2691275167785236e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56102/75000 [54:58<18:08, 17.36it/s]

{'loss': 0.1918, 'grad_norm': 2.825592517852783, 'learning_rate': 1.2684563758389261e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56112/75000 [54:58<18:10, 17.33it/s]

{'loss': 0.3769, 'grad_norm': 9.970854759216309, 'learning_rate': 1.2677852348993288e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56124/75000 [54:59<16:30, 19.07it/s]

{'loss': 0.3327, 'grad_norm': 6.507930755615234, 'learning_rate': 1.2671140939597317e-05, 'epoch': 2.24}


                                                     
 75%|███████▍  | 56133/75000 [54:59<17:18, 18.16it/s]

{'loss': 0.3154, 'grad_norm': 2.223832845687866, 'learning_rate': 1.2664429530201344e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56143/75000 [55:00<17:26, 18.03it/s]

{'loss': 0.3702, 'grad_norm': 3.0214998722076416, 'learning_rate': 1.265771812080537e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56153/75000 [55:00<16:57, 18.51it/s]

{'loss': 0.2806, 'grad_norm': 2.0247044563293457, 'learning_rate': 1.2651006711409397e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56162/75000 [55:01<19:00, 16.52it/s]

{'loss': 0.2497, 'grad_norm': 3.7929656505584717, 'learning_rate': 1.2644295302013422e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56172/75000 [55:01<19:36, 16.01it/s]

{'loss': 0.3364, 'grad_norm': 13.076546669006348, 'learning_rate': 1.263758389261745e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56183/75000 [55:02<17:03, 18.38it/s]

{'loss': 0.2833, 'grad_norm': 1.8860360383987427, 'learning_rate': 1.2630872483221476e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56193/75000 [55:03<16:32, 18.95it/s]

{'loss': 0.3847, 'grad_norm': 1.91335928440094, 'learning_rate': 1.2624161073825505e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56203/75000 [55:03<16:24, 19.10it/s]

{'loss': 0.2492, 'grad_norm': 16.761865615844727, 'learning_rate': 1.2617449664429532e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56213/75000 [55:04<16:44, 18.70it/s]

{'loss': 0.2864, 'grad_norm': 1.8702853918075562, 'learning_rate': 1.2610738255033558e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56222/75000 [55:04<16:28, 18.99it/s]

{'loss': 0.3672, 'grad_norm': 8.152432441711426, 'learning_rate': 1.2604026845637584e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56233/75000 [55:05<18:04, 17.30it/s]

{'loss': 0.4342, 'grad_norm': 5.4918622970581055, 'learning_rate': 1.259731543624161e-05, 'epoch': 2.25}


                                                     
 75%|███████▍  | 56241/75000 [55:05<16:43, 18.69it/s]

{'loss': 0.3149, 'grad_norm': 8.921409606933594, 'learning_rate': 1.2590604026845637e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56254/75000 [55:06<16:09, 19.34it/s]

{'loss': 0.2489, 'grad_norm': 2.5468461513519287, 'learning_rate': 1.2583892617449666e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56264/75000 [55:06<16:00, 19.51it/s]

{'loss': 0.2737, 'grad_norm': 3.7343647480010986, 'learning_rate': 1.2577181208053693e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56273/75000 [55:07<16:22, 19.07it/s]

{'loss': 0.3604, 'grad_norm': 8.248912811279297, 'learning_rate': 1.257046979865772e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56282/75000 [55:07<18:41, 16.68it/s]

{'loss': 0.3376, 'grad_norm': 7.445686340332031, 'learning_rate': 1.2563758389261746e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56293/75000 [55:08<16:55, 18.42it/s]

{'loss': 0.3046, 'grad_norm': 2.003035068511963, 'learning_rate': 1.2557046979865772e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56303/75000 [55:08<16:32, 18.85it/s]

{'loss': 0.2787, 'grad_norm': 2.9077064990997314, 'learning_rate': 1.2550335570469798e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56313/75000 [55:09<17:43, 17.57it/s]

{'loss': 0.3904, 'grad_norm': 2.232846260070801, 'learning_rate': 1.2543624161073825e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56322/75000 [55:09<17:25, 17.86it/s]

{'loss': 0.2143, 'grad_norm': 0.8288600444793701, 'learning_rate': 1.2536912751677854e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56334/75000 [55:10<16:48, 18.52it/s]

{'loss': 0.3032, 'grad_norm': 7.964498519897461, 'learning_rate': 1.253020134228188e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56342/75000 [55:11<16:35, 18.74it/s]

{'loss': 0.314, 'grad_norm': 1.818697452545166, 'learning_rate': 1.2523489932885907e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56352/75000 [55:11<21:11, 14.67it/s]

{'loss': 0.2119, 'grad_norm': 20.152219772338867, 'learning_rate': 1.2516778523489933e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56362/75000 [55:12<22:37, 13.73it/s]

{'loss': 0.3144, 'grad_norm': 2.77243971824646, 'learning_rate': 1.251006711409396e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56372/75000 [55:13<22:01, 14.10it/s]

{'loss': 0.1988, 'grad_norm': 7.66150426864624, 'learning_rate': 1.2503355704697986e-05, 'epoch': 2.25}


                                                     
 75%|███████▌  | 56382/75000 [55:13<22:06, 14.04it/s]

{'loss': 0.4017, 'grad_norm': 3.760241746902466, 'learning_rate': 1.2496644295302013e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56392/75000 [55:14<21:24, 14.49it/s]

{'loss': 0.4035, 'grad_norm': 3.1368887424468994, 'learning_rate': 1.248993288590604e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56402/75000 [55:14<19:37, 15.79it/s]

{'loss': 0.302, 'grad_norm': 5.604199409484863, 'learning_rate': 1.2483221476510069e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56412/75000 [55:15<20:24, 15.18it/s]

{'loss': 0.2608, 'grad_norm': 4.913351058959961, 'learning_rate': 1.2476510067114094e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56422/75000 [55:16<18:56, 16.34it/s]

{'loss': 0.2392, 'grad_norm': 1.8811339139938354, 'learning_rate': 1.246979865771812e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56432/75000 [55:16<18:58, 16.31it/s]

{'loss': 0.2652, 'grad_norm': 2.6406493186950684, 'learning_rate': 1.2463087248322149e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56442/75000 [55:17<18:55, 16.34it/s]

{'loss': 0.2451, 'grad_norm': 3.6253578662872314, 'learning_rate': 1.2456375838926174e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56452/75000 [55:18<19:18, 16.01it/s]

{'loss': 0.3306, 'grad_norm': 8.614760398864746, 'learning_rate': 1.2449664429530201e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56462/75000 [55:18<19:41, 15.68it/s]

{'loss': 0.3517, 'grad_norm': 2.6974267959594727, 'learning_rate': 1.244295302013423e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56472/75000 [55:19<18:40, 16.53it/s]

{'loss': 0.2376, 'grad_norm': 2.947882652282715, 'learning_rate': 1.2436241610738255e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56483/75000 [55:19<17:55, 17.22it/s]

{'loss': 0.2228, 'grad_norm': 1.9735279083251953, 'learning_rate': 1.2429530201342282e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56491/75000 [55:20<18:29, 16.68it/s]

{'loss': 0.3424, 'grad_norm': 5.2875542640686035, 'learning_rate': 1.242281879194631e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56500/75000 [55:20<18:20, 16.82it/s]

{'loss': 0.2774, 'grad_norm': 4.008317947387695, 'learning_rate': 1.2416107382550337e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56512/75000 [55:22<21:53, 14.08it/s]

{'loss': 0.2451, 'grad_norm': 11.040691375732422, 'learning_rate': 1.2409395973154362e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56522/75000 [55:22<18:39, 16.51it/s]

{'loss': 0.2667, 'grad_norm': 1.9662739038467407, 'learning_rate': 1.240268456375839e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56532/75000 [55:23<18:14, 16.87it/s]

{'loss': 0.1858, 'grad_norm': 1.1525574922561646, 'learning_rate': 1.2395973154362418e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56543/75000 [55:24<17:38, 17.43it/s]

{'loss': 0.3089, 'grad_norm': 4.295631408691406, 'learning_rate': 1.2389261744966443e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56553/75000 [55:24<17:46, 17.30it/s]

{'loss': 0.2668, 'grad_norm': 9.821187973022461, 'learning_rate': 1.238255033557047e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56563/75000 [55:25<18:22, 16.73it/s]

{'loss': 0.4009, 'grad_norm': 4.32317590713501, 'learning_rate': 1.2375838926174498e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56571/75000 [55:25<19:12, 15.99it/s]

{'loss': 0.2873, 'grad_norm': 5.445983409881592, 'learning_rate': 1.2369127516778523e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56583/75000 [55:26<17:34, 17.46it/s]

{'loss': 0.2445, 'grad_norm': 5.616762161254883, 'learning_rate': 1.236241610738255e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56592/75000 [55:26<16:52, 18.18it/s]

{'loss': 0.215, 'grad_norm': 2.16800856590271, 'learning_rate': 1.2355704697986579e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56601/75000 [55:27<16:23, 18.71it/s]

{'loss': 0.2689, 'grad_norm': 1.9882214069366455, 'learning_rate': 1.2348993288590604e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56613/75000 [55:27<14:50, 20.66it/s]

{'loss': 0.2962, 'grad_norm': 5.174687385559082, 'learning_rate': 1.234228187919463e-05, 'epoch': 2.26}


                                                     
 75%|███████▌  | 56622/75000 [55:28<15:10, 20.19it/s]

{'loss': 0.3032, 'grad_norm': 2.0274927616119385, 'learning_rate': 1.233557046979866e-05, 'epoch': 2.26}


                                                     
 76%|███████▌  | 56634/75000 [55:28<15:18, 20.01it/s]

{'loss': 0.3981, 'grad_norm': 6.161724090576172, 'learning_rate': 1.2328859060402685e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56642/75000 [55:29<16:03, 19.05it/s]

{'loss': 0.3831, 'grad_norm': 3.803571939468384, 'learning_rate': 1.2322147651006711e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56651/75000 [55:29<15:21, 19.92it/s]

{'loss': 0.2757, 'grad_norm': 4.182573318481445, 'learning_rate': 1.231543624161074e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56664/75000 [55:30<14:42, 20.77it/s]

{'loss': 0.278, 'grad_norm': 1.5936448574066162, 'learning_rate': 1.2308724832214765e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56670/75000 [55:30<15:07, 20.19it/s]

{'loss': 0.2746, 'grad_norm': 6.825850486755371, 'learning_rate': 1.2302013422818792e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56683/75000 [55:31<16:04, 19.00it/s]

{'loss': 0.2654, 'grad_norm': 13.170461654663086, 'learning_rate': 1.229530201342282e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56693/75000 [55:32<15:55, 19.16it/s]

{'loss': 0.3286, 'grad_norm': 4.611583709716797, 'learning_rate': 1.2288590604026846e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56702/75000 [55:32<16:06, 18.93it/s]

{'loss': 0.254, 'grad_norm': 10.90008544921875, 'learning_rate': 1.2281879194630872e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56712/75000 [55:32<15:30, 19.65it/s]

{'loss': 0.2703, 'grad_norm': 5.8673176765441895, 'learning_rate': 1.22751677852349e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56723/75000 [55:33<16:15, 18.74it/s]

{'loss': 0.3721, 'grad_norm': 1.677722692489624, 'learning_rate': 1.2268456375838928e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56731/75000 [55:34<16:23, 18.58it/s]

{'loss': 0.2191, 'grad_norm': 2.239607334136963, 'learning_rate': 1.2261744966442953e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56744/75000 [55:34<16:22, 18.59it/s]

{'loss': 0.2498, 'grad_norm': 3.644374132156372, 'learning_rate': 1.225503355704698e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56754/75000 [55:35<15:45, 19.29it/s]

{'loss': 0.4789, 'grad_norm': 5.355072975158691, 'learning_rate': 1.2248322147651008e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56762/75000 [55:35<16:35, 18.32it/s]

{'loss': 0.2488, 'grad_norm': 4.410010814666748, 'learning_rate': 1.2241610738255034e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56772/75000 [55:36<18:22, 16.53it/s]

{'loss': 0.357, 'grad_norm': 5.882137298583984, 'learning_rate': 1.223489932885906e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56783/75000 [55:36<17:07, 17.72it/s]

{'loss': 0.2745, 'grad_norm': 49.77983093261719, 'learning_rate': 1.2228187919463089e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56793/75000 [55:37<16:46, 18.09it/s]

{'loss': 0.4398, 'grad_norm': 3.484384059906006, 'learning_rate': 1.2221476510067114e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56804/75000 [55:37<15:26, 19.64it/s]

{'loss': 0.2903, 'grad_norm': 1.7042338848114014, 'learning_rate': 1.2214765100671141e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56811/75000 [55:38<16:04, 18.85it/s]

{'loss': 0.2921, 'grad_norm': 1.7994232177734375, 'learning_rate': 1.220805369127517e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56821/75000 [55:38<16:08, 18.77it/s]

{'loss': 0.3577, 'grad_norm': 6.242742538452148, 'learning_rate': 1.2201342281879195e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56834/75000 [55:39<16:33, 18.29it/s]

{'loss': 0.2372, 'grad_norm': 2.678424596786499, 'learning_rate': 1.2194630872483222e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56843/75000 [55:40<16:33, 18.27it/s]

{'loss': 0.3388, 'grad_norm': 7.061399459838867, 'learning_rate': 1.218791946308725e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56852/75000 [55:40<16:04, 18.81it/s]

{'loss': 0.3403, 'grad_norm': 6.224881649017334, 'learning_rate': 1.2181208053691275e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56863/75000 [55:41<16:09, 18.72it/s]

{'loss': 0.2194, 'grad_norm': 1.7856106758117676, 'learning_rate': 1.2174496644295302e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56872/75000 [55:41<17:44, 17.02it/s]

{'loss': 0.3011, 'grad_norm': 3.1841094493865967, 'learning_rate': 1.2167785234899329e-05, 'epoch': 2.27}


                                                     
 76%|███████▌  | 56883/75000 [55:42<17:21, 17.39it/s]

{'loss': 0.2946, 'grad_norm': 0.8314830660820007, 'learning_rate': 1.2161073825503356e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56891/75000 [55:42<16:03, 18.79it/s]

{'loss': 0.253, 'grad_norm': 4.09940767288208, 'learning_rate': 1.2154362416107383e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56903/75000 [55:43<15:02, 20.05it/s]

{'loss': 0.1918, 'grad_norm': 1.570876121520996, 'learning_rate': 1.214765100671141e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56912/75000 [55:43<15:02, 20.05it/s]

{'loss': 0.3446, 'grad_norm': 15.142085075378418, 'learning_rate': 1.2140939597315436e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56923/75000 [55:44<15:28, 19.48it/s]

{'loss': 0.4348, 'grad_norm': 1.3212172985076904, 'learning_rate': 1.2134228187919463e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56931/75000 [55:44<15:49, 19.04it/s]

{'loss': 0.2382, 'grad_norm': 8.087878227233887, 'learning_rate': 1.212751677852349e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56944/75000 [55:45<14:55, 20.16it/s]

{'loss': 0.2857, 'grad_norm': 5.528975963592529, 'learning_rate': 1.2120805369127517e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56953/75000 [55:45<15:10, 19.83it/s]

{'loss': 0.2115, 'grad_norm': 3.7591774463653564, 'learning_rate': 1.2114093959731544e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56962/75000 [55:46<19:18, 15.57it/s]

{'loss': 0.4462, 'grad_norm': 4.26521110534668, 'learning_rate': 1.210738255033557e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56972/75000 [55:47<17:28, 17.19it/s]

{'loss': 0.4314, 'grad_norm': 4.517203330993652, 'learning_rate': 1.21006711409396e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56981/75000 [55:47<20:00, 15.01it/s]

{'loss': 0.2727, 'grad_norm': 1.7045341730117798, 'learning_rate': 1.2093959731543624e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 56991/75000 [55:48<16:18, 18.40it/s]

{'loss': 0.3605, 'grad_norm': 0.9770117998123169, 'learning_rate': 1.2087248322147651e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57000/75000 [55:48<15:41, 19.13it/s]

{'loss': 0.336, 'grad_norm': 2.1665544509887695, 'learning_rate': 1.208053691275168e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57013/75000 [55:49<18:51, 15.89it/s]

{'loss': 0.3371, 'grad_norm': 2.0886991024017334, 'learning_rate': 1.2073825503355705e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57021/75000 [55:50<17:43, 16.90it/s]

{'loss': 0.3026, 'grad_norm': 7.182929039001465, 'learning_rate': 1.2067114093959732e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57032/75000 [55:50<18:29, 16.19it/s]

{'loss': 0.286, 'grad_norm': 11.719376564025879, 'learning_rate': 1.206040268456376e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57043/75000 [55:51<16:02, 18.66it/s]

{'loss': 0.3386, 'grad_norm': 6.308013916015625, 'learning_rate': 1.2053691275167785e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57054/75000 [55:52<15:44, 18.99it/s]

{'loss': 0.2219, 'grad_norm': 5.834385871887207, 'learning_rate': 1.2046979865771812e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57063/75000 [55:52<16:18, 18.33it/s]

{'loss': 0.2646, 'grad_norm': 2.83123779296875, 'learning_rate': 1.204026845637584e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57071/75000 [55:53<24:32, 12.18it/s]

{'loss': 0.42, 'grad_norm': 2.728332281112671, 'learning_rate': 1.2033557046979866e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57082/75000 [55:53<17:30, 17.06it/s]

{'loss': 0.1835, 'grad_norm': 1.472012996673584, 'learning_rate': 1.2026845637583893e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57092/75000 [55:54<15:26, 19.33it/s]

{'loss': 0.3095, 'grad_norm': 2.5501372814178467, 'learning_rate': 1.202013422818792e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57101/75000 [55:54<18:22, 16.24it/s]

{'loss': 0.2076, 'grad_norm': 3.2352471351623535, 'learning_rate': 1.2013422818791947e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57114/75000 [55:55<14:53, 20.02it/s]

{'loss': 0.3357, 'grad_norm': 6.744874477386475, 'learning_rate': 1.2006711409395973e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57122/75000 [55:55<15:06, 19.72it/s]

{'loss': 0.2493, 'grad_norm': 6.94868278503418, 'learning_rate': 1.2e-05, 'epoch': 2.28}


                                                     
 76%|███████▌  | 57133/75000 [55:56<14:36, 20.38it/s]

{'loss': 0.3675, 'grad_norm': 3.757187843322754, 'learning_rate': 1.1993288590604027e-05, 'epoch': 2.29}


                                                     
 76%|███████▌  | 57143/75000 [55:56<15:12, 19.57it/s]

{'loss': 0.2258, 'grad_norm': 3.2297184467315674, 'learning_rate': 1.1986577181208054e-05, 'epoch': 2.29}


                                                     
 76%|███████▌  | 57152/75000 [55:57<16:02, 18.55it/s]

{'loss': 0.451, 'grad_norm': 1.7772321701049805, 'learning_rate': 1.1979865771812081e-05, 'epoch': 2.29}


                                                     
 76%|███████▌  | 57162/75000 [55:57<15:22, 19.33it/s]

{'loss': 0.3045, 'grad_norm': 4.853542327880859, 'learning_rate': 1.1973154362416108e-05, 'epoch': 2.29}


                                                     
 76%|███████▌  | 57173/75000 [55:58<15:52, 18.72it/s]

{'loss': 0.2414, 'grad_norm': 3.196107864379883, 'learning_rate': 1.1966442953020135e-05, 'epoch': 2.29}


                                                     
 76%|███████▌  | 57183/75000 [55:59<16:02, 18.51it/s]

{'loss': 0.2729, 'grad_norm': 5.267942428588867, 'learning_rate': 1.1959731543624161e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57191/75000 [55:59<16:29, 18.00it/s]

{'loss': 0.3782, 'grad_norm': 1.7701464891433716, 'learning_rate': 1.195302013422819e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57202/75000 [56:00<16:25, 18.06it/s]

{'loss': 0.4325, 'grad_norm': 10.357253074645996, 'learning_rate': 1.1946308724832215e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57212/75000 [56:00<15:30, 19.11it/s]

{'loss': 0.3026, 'grad_norm': 4.821307182312012, 'learning_rate': 1.1939597315436242e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57222/75000 [56:01<17:11, 17.24it/s]

{'loss': 0.3747, 'grad_norm': 6.245381832122803, 'learning_rate': 1.1932885906040269e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57230/75000 [56:01<16:54, 17.51it/s]

{'loss': 0.4408, 'grad_norm': 2.9012444019317627, 'learning_rate': 1.1926174496644296e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57242/75000 [56:02<17:24, 17.00it/s]

{'loss': 0.2678, 'grad_norm': 3.1899776458740234, 'learning_rate': 1.1919463087248323e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57252/75000 [56:03<15:16, 19.37it/s]

{'loss': 0.2692, 'grad_norm': 2.9386425018310547, 'learning_rate': 1.191275167785235e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57262/75000 [56:03<16:57, 17.44it/s]

{'loss': 0.3144, 'grad_norm': 3.91853404045105, 'learning_rate': 1.1906040268456376e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57273/75000 [56:04<15:43, 18.79it/s]

{'loss': 0.2996, 'grad_norm': 4.304131507873535, 'learning_rate': 1.1899328859060403e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57282/75000 [56:04<15:28, 19.08it/s]

{'loss': 0.2518, 'grad_norm': 1.920103669166565, 'learning_rate': 1.189261744966443e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57291/75000 [56:05<15:08, 19.50it/s]

{'loss': 0.3582, 'grad_norm': 5.959941387176514, 'learning_rate': 1.1885906040268457e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57304/75000 [56:05<14:44, 20.00it/s]

{'loss': 0.3096, 'grad_norm': 2.188699245452881, 'learning_rate': 1.1879194630872484e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57312/75000 [56:06<15:54, 18.53it/s]

{'loss': 0.2802, 'grad_norm': 2.3213093280792236, 'learning_rate': 1.187248322147651e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57323/75000 [56:06<15:14, 19.34it/s]

{'loss': 0.3491, 'grad_norm': 8.519075393676758, 'learning_rate': 1.1865771812080537e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57333/75000 [56:07<15:52, 18.54it/s]

{'loss': 0.3407, 'grad_norm': 4.313930988311768, 'learning_rate': 1.1859060402684564e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57342/75000 [56:07<17:42, 16.62it/s]

{'loss': 0.3281, 'grad_norm': 2.931128978729248, 'learning_rate': 1.1852348993288591e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57354/75000 [56:08<16:22, 17.96it/s]

{'loss': 0.2705, 'grad_norm': 4.882409572601318, 'learning_rate': 1.1845637583892618e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57362/75000 [56:08<15:06, 19.45it/s]

{'loss': 0.224, 'grad_norm': 1.5772932767868042, 'learning_rate': 1.1838926174496645e-05, 'epoch': 2.29}


                                                     
 76%|███████▋  | 57372/75000 [56:09<16:16, 18.06it/s]

{'loss': 0.1688, 'grad_norm': 2.4258711338043213, 'learning_rate': 1.1832214765100672e-05, 'epoch': 2.29}


                                                     
 77%|███████▋  | 57384/75000 [56:10<14:48, 19.83it/s]

{'loss': 0.2172, 'grad_norm': 1.8374638557434082, 'learning_rate': 1.1825503355704698e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57393/75000 [56:10<14:30, 20.23it/s]

{'loss': 0.324, 'grad_norm': 11.093463897705078, 'learning_rate': 1.1818791946308725e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57402/75000 [56:10<14:42, 19.95it/s]

{'loss': 0.2585, 'grad_norm': 1.7018014192581177, 'learning_rate': 1.1812080536912752e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57413/75000 [56:11<17:09, 17.08it/s]

{'loss': 0.2546, 'grad_norm': 3.2429299354553223, 'learning_rate': 1.1805369127516779e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57423/75000 [56:12<18:33, 15.78it/s]

{'loss': 0.468, 'grad_norm': 7.976778507232666, 'learning_rate': 1.1798657718120806e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57432/75000 [56:12<16:09, 18.11it/s]

{'loss': 0.2529, 'grad_norm': 1.2776228189468384, 'learning_rate': 1.1791946308724833e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57442/75000 [56:13<15:09, 19.30it/s]

{'loss': 0.2766, 'grad_norm': 4.585659980773926, 'learning_rate': 1.178523489932886e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57454/75000 [56:13<14:41, 19.90it/s]

{'loss': 0.2691, 'grad_norm': 8.479568481445312, 'learning_rate': 1.1778523489932886e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57462/75000 [56:14<14:59, 19.50it/s]

{'loss': 0.4455, 'grad_norm': 0.4819457232952118, 'learning_rate': 1.1771812080536913e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57473/75000 [56:14<14:05, 20.73it/s]

{'loss': 0.2432, 'grad_norm': 1.8843262195587158, 'learning_rate': 1.176510067114094e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57484/75000 [56:15<15:06, 19.33it/s]

{'loss': 0.3041, 'grad_norm': 2.5680811405181885, 'learning_rate': 1.1758389261744967e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57493/75000 [56:15<15:02, 19.40it/s]

{'loss': 0.239, 'grad_norm': 6.939138889312744, 'learning_rate': 1.1751677852348994e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57500/75000 [56:16<14:37, 19.94it/s]

{'loss': 0.3079, 'grad_norm': 3.630279064178467, 'learning_rate': 1.174496644295302e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57512/75000 [56:17<18:20, 15.89it/s]

{'loss': 0.2938, 'grad_norm': 3.839479446411133, 'learning_rate': 1.1738255033557048e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57523/75000 [56:17<16:15, 17.91it/s]

{'loss': 0.2379, 'grad_norm': 2.8063547611236572, 'learning_rate': 1.1731543624161074e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57531/75000 [56:18<17:40, 16.47it/s]

{'loss': 0.2884, 'grad_norm': 3.185885429382324, 'learning_rate': 1.1724832214765101e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57543/75000 [56:19<14:49, 19.63it/s]

{'loss': 0.2637, 'grad_norm': 4.001757621765137, 'learning_rate': 1.1718120805369128e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57552/75000 [56:19<14:19, 20.30it/s]

{'loss': 0.2631, 'grad_norm': 8.768131256103516, 'learning_rate': 1.1711409395973155e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57562/75000 [56:19<15:12, 19.11it/s]

{'loss': 0.238, 'grad_norm': 13.52148723602295, 'learning_rate': 1.1704697986577182e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57574/75000 [56:20<14:14, 20.40it/s]

{'loss': 0.4185, 'grad_norm': 6.317442417144775, 'learning_rate': 1.1697986577181209e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57581/75000 [56:20<15:31, 18.69it/s]

{'loss': 0.3575, 'grad_norm': 5.279815673828125, 'learning_rate': 1.1691275167785236e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57593/75000 [56:21<14:06, 20.57it/s]

{'loss': 0.3363, 'grad_norm': 5.032188415527344, 'learning_rate': 1.1684563758389262e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57602/75000 [56:22<14:16, 20.31it/s]

{'loss': 0.4146, 'grad_norm': 2.449014186859131, 'learning_rate': 1.167785234899329e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57614/75000 [56:22<14:06, 20.53it/s]

{'loss': 0.3252, 'grad_norm': 4.1477837562561035, 'learning_rate': 1.1671140939597316e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57622/75000 [56:23<17:18, 16.74it/s]

{'loss': 0.2418, 'grad_norm': 3.2806410789489746, 'learning_rate': 1.1664429530201343e-05, 'epoch': 2.3}


                                                     
 77%|███████▋  | 57632/75000 [56:23<19:21, 14.95it/s]

{'loss': 0.3012, 'grad_norm': 3.1715784072875977, 'learning_rate': 1.165771812080537e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57641/75000 [56:24<15:57, 18.13it/s]

{'loss': 0.3522, 'grad_norm': 1.5649878978729248, 'learning_rate': 1.1651006711409397e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57654/75000 [56:24<14:40, 19.70it/s]

{'loss': 0.3685, 'grad_norm': 1.1690763235092163, 'learning_rate': 1.1644295302013424e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57664/75000 [56:25<15:43, 18.38it/s]

{'loss': 0.3297, 'grad_norm': 3.798661470413208, 'learning_rate': 1.163758389261745e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57673/75000 [56:25<15:10, 19.03it/s]

{'loss': 0.2999, 'grad_norm': 2.0379326343536377, 'learning_rate': 1.1630872483221477e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57683/75000 [56:26<16:29, 17.49it/s]

{'loss': 0.1944, 'grad_norm': 3.4408605098724365, 'learning_rate': 1.1624161073825504e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57693/75000 [56:27<15:15, 18.90it/s]

{'loss': 0.3483, 'grad_norm': 5.961141586303711, 'learning_rate': 1.1617449664429531e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57702/75000 [56:27<14:43, 19.57it/s]

{'loss': 0.3625, 'grad_norm': 5.815366744995117, 'learning_rate': 1.1610738255033558e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57711/75000 [56:27<15:13, 18.93it/s]

{'loss': 0.2797, 'grad_norm': 0.4687230885028839, 'learning_rate': 1.1604026845637585e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57723/75000 [56:28<14:04, 20.47it/s]

{'loss': 0.2508, 'grad_norm': 5.906847953796387, 'learning_rate': 1.1597315436241611e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57732/75000 [56:28<14:47, 19.46it/s]

{'loss': 0.3755, 'grad_norm': 3.285290002822876, 'learning_rate': 1.1590604026845638e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57743/75000 [56:29<14:09, 20.31it/s]

{'loss': 0.3157, 'grad_norm': 2.771709680557251, 'learning_rate': 1.1583892617449665e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57753/75000 [56:30<14:50, 19.36it/s]

{'loss': 0.2477, 'grad_norm': 3.2769970893859863, 'learning_rate': 1.1577181208053692e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57762/75000 [56:30<15:49, 18.15it/s]

{'loss': 0.3761, 'grad_norm': 2.71287202835083, 'learning_rate': 1.1570469798657719e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57773/75000 [56:31<16:03, 17.88it/s]

{'loss': 0.3895, 'grad_norm': 1.803439736366272, 'learning_rate': 1.1563758389261746e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57783/75000 [56:31<14:31, 19.76it/s]

{'loss': 0.2469, 'grad_norm': 1.007777452468872, 'learning_rate': 1.1557046979865773e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57794/75000 [56:32<14:36, 19.64it/s]

{'loss': 0.2501, 'grad_norm': 2.3079161643981934, 'learning_rate': 1.15503355704698e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57804/75000 [56:32<14:29, 19.77it/s]

{'loss': 0.2309, 'grad_norm': 3.7182838916778564, 'learning_rate': 1.1543624161073826e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57814/75000 [56:33<15:54, 18.00it/s]

{'loss': 0.3175, 'grad_norm': 4.154588222503662, 'learning_rate': 1.1536912751677853e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57824/75000 [56:33<15:17, 18.72it/s]

{'loss': 0.2762, 'grad_norm': 2.1404731273651123, 'learning_rate': 1.153020134228188e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57834/75000 [56:34<15:11, 18.83it/s]

{'loss': 0.3469, 'grad_norm': 2.8124279975891113, 'learning_rate': 1.1523489932885907e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57843/75000 [56:34<15:17, 18.69it/s]

{'loss': 0.4314, 'grad_norm': 4.038090705871582, 'learning_rate': 1.1516778523489934e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57852/75000 [56:35<14:59, 19.07it/s]

{'loss': 0.5314, 'grad_norm': 3.0088610649108887, 'learning_rate': 1.151006711409396e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57862/75000 [56:36<19:46, 14.44it/s]

{'loss': 0.3551, 'grad_norm': 2.4433281421661377, 'learning_rate': 1.1503355704697986e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57872/75000 [56:36<17:14, 16.56it/s]

{'loss': 0.2789, 'grad_norm': 0.4890025854110718, 'learning_rate': 1.1496644295302014e-05, 'epoch': 2.31}


                                                     
 77%|███████▋  | 57882/75000 [56:37<15:02, 18.96it/s]

{'loss': 0.2862, 'grad_norm': 5.083117485046387, 'learning_rate': 1.1489932885906041e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57893/75000 [56:37<13:52, 20.55it/s]

{'loss': 0.2561, 'grad_norm': 0.7792137861251831, 'learning_rate': 1.1483221476510066e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57904/75000 [56:38<14:43, 19.35it/s]

{'loss': 0.2549, 'grad_norm': 4.279103755950928, 'learning_rate': 1.1476510067114095e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57913/75000 [56:38<14:10, 20.10it/s]

{'loss': 0.3014, 'grad_norm': 7.171991348266602, 'learning_rate': 1.1469798657718122e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57922/75000 [56:39<14:23, 19.78it/s]

{'loss': 0.2555, 'grad_norm': 2.652782678604126, 'learning_rate': 1.1463087248322147e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57934/75000 [56:39<13:49, 20.57it/s]

{'loss': 0.2921, 'grad_norm': 2.695657968521118, 'learning_rate': 1.1456375838926175e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57943/75000 [56:40<14:04, 20.19it/s]

{'loss': 0.4071, 'grad_norm': 1.5479785203933716, 'learning_rate': 1.1449664429530202e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57952/75000 [56:40<14:06, 20.15it/s]

{'loss': 0.4184, 'grad_norm': 4.502756118774414, 'learning_rate': 1.1442953020134229e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57964/75000 [56:41<13:53, 20.45it/s]

{'loss': 0.3231, 'grad_norm': 4.185515880584717, 'learning_rate': 1.1436241610738256e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57973/75000 [56:41<14:09, 20.04it/s]

{'loss': 0.3853, 'grad_norm': 4.178532123565674, 'learning_rate': 1.1429530201342283e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57982/75000 [56:42<14:06, 20.10it/s]

{'loss': 0.3131, 'grad_norm': 1.4259997606277466, 'learning_rate': 1.142281879194631e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 57994/75000 [56:42<14:00, 20.24it/s]

{'loss': 0.3061, 'grad_norm': 3.591113567352295, 'learning_rate': 1.1416107382550337e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58000/75000 [56:42<13:53, 20.39it/s]

{'loss': 0.3811, 'grad_norm': 3.3033699989318848, 'learning_rate': 1.1409395973154363e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58013/75000 [56:44<18:14, 15.52it/s]

{'loss': 0.2959, 'grad_norm': 2.6205122470855713, 'learning_rate': 1.140268456375839e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58023/75000 [56:44<16:34, 17.08it/s]

{'loss': 0.2969, 'grad_norm': 2.559792995452881, 'learning_rate': 1.1395973154362415e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58032/75000 [56:45<15:27, 18.30it/s]

{'loss': 0.3413, 'grad_norm': 4.258942604064941, 'learning_rate': 1.1389261744966444e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58041/75000 [56:45<14:29, 19.49it/s]

{'loss': 0.252, 'grad_norm': 4.7615861892700195, 'learning_rate': 1.138255033557047e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58053/75000 [56:46<15:09, 18.63it/s]

{'loss': 0.3743, 'grad_norm': 1.4225654602050781, 'learning_rate': 1.1375838926174496e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58063/75000 [56:46<14:56, 18.90it/s]

{'loss': 0.3511, 'grad_norm': 0.9061503410339355, 'learning_rate': 1.1369127516778524e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58073/75000 [56:47<15:37, 18.06it/s]

{'loss': 0.4001, 'grad_norm': 5.1590576171875, 'learning_rate': 1.1362416107382551e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58082/75000 [56:47<14:46, 19.09it/s]

{'loss': 0.3367, 'grad_norm': 1.502463936805725, 'learning_rate': 1.1355704697986577e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58094/75000 [56:48<13:38, 20.65it/s]

{'loss': 0.3028, 'grad_norm': 4.550867557525635, 'learning_rate': 1.1348993288590605e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58103/75000 [56:48<14:15, 19.75it/s]

{'loss': 0.2241, 'grad_norm': 1.4302712678909302, 'learning_rate': 1.1342281879194632e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58113/75000 [56:49<15:09, 18.57it/s]

{'loss': 0.4416, 'grad_norm': 3.0969743728637695, 'learning_rate': 1.1335570469798657e-05, 'epoch': 2.32}


                                                     
 77%|███████▋  | 58124/75000 [56:50<13:49, 20.35it/s]

{'loss': 0.3459, 'grad_norm': 2.5852837562561035, 'learning_rate': 1.1328859060402686e-05, 'epoch': 2.32}


                                                     
 78%|███████▊  | 58133/75000 [56:50<14:20, 19.60it/s]

{'loss': 0.184, 'grad_norm': 1.8942339420318604, 'learning_rate': 1.1322147651006712e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58143/75000 [56:51<14:51, 18.90it/s]

{'loss': 0.219, 'grad_norm': 2.3720998764038086, 'learning_rate': 1.1315436241610738e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58153/75000 [56:51<14:28, 19.41it/s]

{'loss': 0.2804, 'grad_norm': 3.0704073905944824, 'learning_rate': 1.1308724832214766e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58162/75000 [56:52<14:28, 19.38it/s]

{'loss': 0.3575, 'grad_norm': 3.7897608280181885, 'learning_rate': 1.1302013422818793e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58172/75000 [56:52<16:16, 17.22it/s]

{'loss': 0.351, 'grad_norm': 2.617060422897339, 'learning_rate': 1.129530201342282e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58183/75000 [56:53<15:21, 18.25it/s]

{'loss': 0.3574, 'grad_norm': 5.720829010009766, 'learning_rate': 1.1288590604026847e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58192/75000 [56:53<14:52, 18.83it/s]

{'loss': 0.211, 'grad_norm': 3.2684500217437744, 'learning_rate': 1.1281879194630874e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58203/75000 [56:54<15:13, 18.39it/s]

{'loss': 0.2964, 'grad_norm': 2.851430654525757, 'learning_rate': 1.12751677852349e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58212/75000 [56:54<15:07, 18.51it/s]

{'loss': 0.3612, 'grad_norm': 6.052224636077881, 'learning_rate': 1.1268456375838926e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58224/75000 [56:55<14:01, 19.93it/s]

{'loss': 0.3531, 'grad_norm': 2.1776366233825684, 'learning_rate': 1.1261744966442954e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58233/75000 [56:55<14:07, 19.79it/s]

{'loss': 0.2621, 'grad_norm': 7.261565685272217, 'learning_rate': 1.1255033557046981e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58243/75000 [56:56<14:55, 18.70it/s]

{'loss': 0.2781, 'grad_norm': 7.000263690948486, 'learning_rate': 1.1248322147651006e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58252/75000 [56:56<14:40, 19.02it/s]

{'loss': 0.3659, 'grad_norm': 5.820667743682861, 'learning_rate': 1.1241610738255035e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58263/75000 [56:57<14:21, 19.44it/s]

{'loss': 0.3885, 'grad_norm': 4.453564167022705, 'learning_rate': 1.1234899328859062e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58272/75000 [56:57<14:22, 19.41it/s]

{'loss': 0.2248, 'grad_norm': 1.83283531665802, 'learning_rate': 1.1228187919463087e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58282/75000 [56:58<14:56, 18.65it/s]

{'loss': 0.2704, 'grad_norm': 12.634108543395996, 'learning_rate': 1.1221476510067115e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58292/75000 [56:59<18:51, 14.77it/s]

{'loss': 0.3102, 'grad_norm': 2.201141595840454, 'learning_rate': 1.1214765100671142e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58302/75000 [56:59<16:51, 16.50it/s]

{'loss': 0.4723, 'grad_norm': 6.7949018478393555, 'learning_rate': 1.1208053691275167e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58314/75000 [57:00<13:56, 19.95it/s]

{'loss': 0.4066, 'grad_norm': 8.830379486083984, 'learning_rate': 1.1201342281879196e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58322/75000 [57:00<14:22, 19.33it/s]

{'loss': 0.2637, 'grad_norm': 8.973456382751465, 'learning_rate': 1.1194630872483223e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58334/75000 [57:01<14:28, 19.20it/s]

{'loss': 0.3727, 'grad_norm': 4.8823466300964355, 'learning_rate': 1.1187919463087248e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58344/75000 [57:01<13:47, 20.14it/s]

{'loss': 0.3001, 'grad_norm': 1.6655207872390747, 'learning_rate': 1.1181208053691276e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58352/75000 [57:02<14:07, 19.65it/s]

{'loss': 0.2914, 'grad_norm': 0.5908915400505066, 'learning_rate': 1.1174496644295303e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58361/75000 [57:02<14:23, 19.26it/s]

{'loss': 0.2225, 'grad_norm': 3.26792311668396, 'learning_rate': 1.1167785234899328e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58374/75000 [57:03<13:24, 20.66it/s]

{'loss': 0.2165, 'grad_norm': 3.7765653133392334, 'learning_rate': 1.1161073825503355e-05, 'epoch': 2.33}


                                                     
 78%|███████▊  | 58383/75000 [57:03<13:49, 20.03it/s]

{'loss': 0.4217, 'grad_norm': 3.165625810623169, 'learning_rate': 1.1154362416107384e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58392/75000 [57:04<13:51, 19.96it/s]

{'loss': 0.2839, 'grad_norm': 1.5976711511611938, 'learning_rate': 1.114765100671141e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58401/75000 [57:04<13:57, 19.82it/s]

{'loss': 0.2949, 'grad_norm': 0.6019169688224792, 'learning_rate': 1.1140939597315436e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58413/75000 [57:05<13:34, 20.36it/s]

{'loss': 0.3093, 'grad_norm': 1.2271796464920044, 'learning_rate': 1.1134228187919464e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58422/75000 [57:05<13:33, 20.37it/s]

{'loss': 0.2393, 'grad_norm': 8.118721008300781, 'learning_rate': 1.1127516778523491e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58434/75000 [57:06<13:43, 20.12it/s]

{'loss': 0.2084, 'grad_norm': 4.622836589813232, 'learning_rate': 1.1120805369127516e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58443/75000 [57:06<13:51, 19.90it/s]

{'loss': 0.2506, 'grad_norm': 4.4622721672058105, 'learning_rate': 1.1114093959731545e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58451/75000 [57:07<14:05, 19.57it/s]

{'loss': 0.329, 'grad_norm': 3.450303316116333, 'learning_rate': 1.1107382550335572e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58461/75000 [57:07<14:18, 19.26it/s]

{'loss': 0.2406, 'grad_norm': 8.177777290344238, 'learning_rate': 1.1100671140939597e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58473/75000 [57:08<13:28, 20.45it/s]

{'loss': 0.2437, 'grad_norm': 2.027125597000122, 'learning_rate': 1.1093959731543625e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58482/75000 [57:08<13:49, 19.92it/s]

{'loss': 0.3232, 'grad_norm': 1.8584957122802734, 'learning_rate': 1.1087248322147652e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58493/75000 [57:09<13:22, 20.57it/s]

{'loss': 0.4063, 'grad_norm': 2.749164342880249, 'learning_rate': 1.1080536912751677e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58500/75000 [57:09<15:13, 18.06it/s]

{'loss': 0.3524, 'grad_norm': 1.479278802871704, 'learning_rate': 1.1073825503355706e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58514/75000 [57:11<16:53, 16.26it/s]

{'loss': 0.2356, 'grad_norm': 5.834621429443359, 'learning_rate': 1.1067114093959733e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58522/75000 [57:11<16:01, 17.14it/s]

{'loss': 0.3207, 'grad_norm': 14.66981315612793, 'learning_rate': 1.1060402684563758e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58532/75000 [57:12<18:14, 15.04it/s]

{'loss': 0.2906, 'grad_norm': 4.8891754150390625, 'learning_rate': 1.1053691275167785e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58541/75000 [57:12<16:29, 16.63it/s]

{'loss': 0.2685, 'grad_norm': 3.427936315536499, 'learning_rate': 1.1046979865771813e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58554/75000 [57:13<13:52, 19.75it/s]

{'loss': 0.289, 'grad_norm': 2.2510693073272705, 'learning_rate': 1.1040268456375839e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58562/75000 [57:13<14:13, 19.25it/s]

{'loss': 0.4007, 'grad_norm': 5.076165676116943, 'learning_rate': 1.1033557046979865e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58572/75000 [57:14<15:43, 17.41it/s]

{'loss': 0.2173, 'grad_norm': 0.9629449844360352, 'learning_rate': 1.1026845637583894e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58584/75000 [57:14<14:23, 19.02it/s]

{'loss': 0.3632, 'grad_norm': 4.421663284301758, 'learning_rate': 1.102013422818792e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58594/75000 [57:15<14:00, 19.52it/s]

{'loss': 0.276, 'grad_norm': 1.072937250137329, 'learning_rate': 1.1013422818791946e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58603/75000 [57:15<14:27, 18.90it/s]

{'loss': 0.3369, 'grad_norm': 4.627259731292725, 'learning_rate': 1.1006711409395975e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58612/75000 [57:16<14:05, 19.38it/s]

{'loss': 0.3554, 'grad_norm': 5.817415714263916, 'learning_rate': 1.1000000000000001e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58621/75000 [57:16<13:45, 19.85it/s]

{'loss': 0.2274, 'grad_norm': 7.945312023162842, 'learning_rate': 1.0993288590604027e-05, 'epoch': 2.34}


                                                     
 78%|███████▊  | 58633/75000 [57:17<14:45, 18.48it/s]

{'loss': 0.3286, 'grad_norm': 2.4985671043395996, 'learning_rate': 1.0986577181208055e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58644/75000 [57:18<13:55, 19.58it/s]

{'loss': 0.3501, 'grad_norm': 3.122109889984131, 'learning_rate': 1.0979865771812082e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58653/75000 [57:18<13:52, 19.62it/s]

{'loss': 0.3167, 'grad_norm': 8.136518478393555, 'learning_rate': 1.0973154362416107e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58662/75000 [57:18<13:49, 19.69it/s]

{'loss': 0.2001, 'grad_norm': 5.260977268218994, 'learning_rate': 1.0966442953020136e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58672/75000 [57:19<14:33, 18.69it/s]

{'loss': 0.3525, 'grad_norm': 15.973980903625488, 'learning_rate': 1.0959731543624163e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58684/75000 [57:20<13:20, 20.37it/s]

{'loss': 0.387, 'grad_norm': 1.6345323324203491, 'learning_rate': 1.0953020134228188e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58693/75000 [57:20<13:29, 20.15it/s]

{'loss': 0.4024, 'grad_norm': 5.497142791748047, 'learning_rate': 1.0946308724832215e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58701/75000 [57:20<15:01, 18.07it/s]

{'loss': 0.4024, 'grad_norm': 5.215239524841309, 'learning_rate': 1.0939597315436243e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58713/75000 [57:21<13:20, 20.33it/s]

{'loss': 0.319, 'grad_norm': 0.4825302064418793, 'learning_rate': 1.0932885906040268e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58722/75000 [57:21<13:24, 20.24it/s]

{'loss': 0.2013, 'grad_norm': 0.8330471515655518, 'learning_rate': 1.0926174496644295e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58733/75000 [57:22<13:54, 19.50it/s]

{'loss': 0.3645, 'grad_norm': 14.418007850646973, 'learning_rate': 1.0919463087248324e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58743/75000 [57:23<13:57, 19.41it/s]

{'loss': 0.26, 'grad_norm': 3.4197676181793213, 'learning_rate': 1.0912751677852349e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58752/75000 [57:23<14:49, 18.27it/s]

{'loss': 0.1517, 'grad_norm': 0.5876306295394897, 'learning_rate': 1.0906040268456376e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58762/75000 [57:24<13:58, 19.36it/s]

{'loss': 0.3875, 'grad_norm': 5.069260597229004, 'learning_rate': 1.0899328859060404e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58773/75000 [57:24<14:00, 19.31it/s]

{'loss': 0.1863, 'grad_norm': 0.9425206780433655, 'learning_rate': 1.089261744966443e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58783/75000 [57:25<15:13, 17.75it/s]

{'loss': 0.3739, 'grad_norm': 2.258692979812622, 'learning_rate': 1.0885906040268456e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58793/75000 [57:25<14:19, 18.85it/s]

{'loss': 0.3543, 'grad_norm': 14.720544815063477, 'learning_rate': 1.0879194630872485e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58802/75000 [57:26<13:46, 19.61it/s]

{'loss': 0.3578, 'grad_norm': 8.168656349182129, 'learning_rate': 1.087248322147651e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58813/75000 [57:26<15:08, 17.81it/s]

{'loss': 0.219, 'grad_norm': 8.454512596130371, 'learning_rate': 1.0865771812080537e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58824/75000 [57:27<13:39, 19.74it/s]

{'loss': 0.3648, 'grad_norm': 2.9432666301727295, 'learning_rate': 1.0859060402684565e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58833/75000 [57:27<13:35, 19.81it/s]

{'loss': 0.3823, 'grad_norm': 4.476864814758301, 'learning_rate': 1.0852348993288592e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58842/75000 [57:28<13:54, 19.37it/s]

{'loss': 0.3044, 'grad_norm': 6.737762451171875, 'learning_rate': 1.0845637583892617e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58853/75000 [57:28<13:29, 19.94it/s]

{'loss': 0.3449, 'grad_norm': 6.251192569732666, 'learning_rate': 1.0838926174496644e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58863/75000 [57:29<13:25, 20.04it/s]

{'loss': 0.3789, 'grad_norm': 4.184727668762207, 'learning_rate': 1.0832214765100673e-05, 'epoch': 2.35}


                                                     
 78%|███████▊  | 58871/75000 [57:29<13:34, 19.81it/s]

{'loss': 0.2714, 'grad_norm': 1.89129638671875, 'learning_rate': 1.0825503355704698e-05, 'epoch': 2.35}


                                                     
 79%|███████▊  | 58883/75000 [57:30<14:33, 18.45it/s]

{'loss': 0.3101, 'grad_norm': 1.8676856756210327, 'learning_rate': 1.0818791946308725e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58892/75000 [57:30<14:08, 18.99it/s]

{'loss': 0.37, 'grad_norm': 2.4741413593292236, 'learning_rate': 1.0812080536912753e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58902/75000 [57:31<13:42, 19.58it/s]

{'loss': 0.2377, 'grad_norm': 2.50528883934021, 'learning_rate': 1.0805369127516778e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58913/75000 [57:31<14:06, 19.00it/s]

{'loss': 0.276, 'grad_norm': 0.7360959053039551, 'learning_rate': 1.0798657718120805e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58923/75000 [57:32<14:27, 18.52it/s]

{'loss': 0.2615, 'grad_norm': 4.411472797393799, 'learning_rate': 1.0791946308724834e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58932/75000 [57:32<14:26, 18.55it/s]

{'loss': 0.3303, 'grad_norm': 4.629745006561279, 'learning_rate': 1.0785234899328859e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58944/75000 [57:33<12:59, 20.61it/s]

{'loss': 0.2679, 'grad_norm': 0.786339521408081, 'learning_rate': 1.0778523489932886e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58953/75000 [57:33<13:27, 19.88it/s]

{'loss': 0.4421, 'grad_norm': 5.006807804107666, 'learning_rate': 1.0771812080536914e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58964/75000 [57:34<13:18, 20.08it/s]

{'loss': 0.2067, 'grad_norm': 5.036550045013428, 'learning_rate': 1.076510067114094e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58972/75000 [57:35<14:38, 18.25it/s]

{'loss': 0.3315, 'grad_norm': 2.6796422004699707, 'learning_rate': 1.0758389261744966e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58982/75000 [57:35<14:49, 18.00it/s]

{'loss': 0.3063, 'grad_norm': 1.2399924993515015, 'learning_rate': 1.0751677852348995e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 58992/75000 [57:36<14:11, 18.80it/s]

{'loss': 0.2701, 'grad_norm': 3.2505815029144287, 'learning_rate': 1.074496644295302e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 59000/75000 [57:36<13:36, 19.60it/s]

{'loss': 0.2188, 'grad_norm': 2.087297201156616, 'learning_rate': 1.0738255033557047e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 59013/75000 [57:37<16:37, 16.03it/s]

{'loss': 0.294, 'grad_norm': 2.695974111557007, 'learning_rate': 1.0731543624161074e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 59021/75000 [57:38<15:22, 17.32it/s]

{'loss': 0.3208, 'grad_norm': 5.997664451599121, 'learning_rate': 1.07248322147651e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 59033/75000 [57:38<14:56, 17.81it/s]

{'loss': 0.257, 'grad_norm': 1.381671667098999, 'learning_rate': 1.0718120805369128e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 59042/75000 [57:39<14:24, 18.45it/s]

{'loss': 0.3572, 'grad_norm': 7.8907670974731445, 'learning_rate': 1.0711409395973154e-05, 'epoch': 2.36}


                                                     
 79%|███████▊  | 59052/75000 [57:39<13:42, 19.40it/s]

{'loss': 0.1661, 'grad_norm': 3.3894736766815186, 'learning_rate': 1.0704697986577181e-05, 'epoch': 2.36}


                                                     
 79%|███████▉  | 59064/75000 [57:40<13:09, 20.19it/s]

{'loss': 0.3384, 'grad_norm': 2.7297682762145996, 'learning_rate': 1.0697986577181208e-05, 'epoch': 2.36}


                                                     
 79%|███████▉  | 59073/75000 [57:40<13:06, 20.26it/s]

{'loss': 0.2691, 'grad_norm': 1.017262578010559, 'learning_rate': 1.0691275167785235e-05, 'epoch': 2.36}


                                                     
 79%|███████▉  | 59082/75000 [57:41<13:05, 20.25it/s]

{'loss': 0.399, 'grad_norm': 22.265518188476562, 'learning_rate': 1.0684563758389264e-05, 'epoch': 2.36}


                                                     
 79%|███████▉  | 59094/75000 [57:42<13:33, 19.56it/s]

{'loss': 0.3675, 'grad_norm': 12.42797565460205, 'learning_rate': 1.0677852348993289e-05, 'epoch': 2.36}


                                                     
 79%|███████▉  | 59103/75000 [57:42<13:21, 19.82it/s]

{'loss': 0.2781, 'grad_norm': 1.6838454008102417, 'learning_rate': 1.0671140939597316e-05, 'epoch': 2.36}


                                                     
 79%|███████▉  | 59112/75000 [57:42<13:14, 19.99it/s]

{'loss': 0.3301, 'grad_norm': 9.204493522644043, 'learning_rate': 1.0664429530201344e-05, 'epoch': 2.36}


                                                     
 79%|███████▉  | 59121/75000 [57:43<14:02, 18.84it/s]

{'loss': 0.3311, 'grad_norm': 4.317111492156982, 'learning_rate': 1.065771812080537e-05, 'epoch': 2.36}


                                                     
 79%|███████▉  | 59134/75000 [57:44<13:10, 20.07it/s]

{'loss': 0.3002, 'grad_norm': 3.6204886436462402, 'learning_rate': 1.0651006711409396e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59143/75000 [57:44<13:10, 20.05it/s]

{'loss': 0.3486, 'grad_norm': 2.2159981727600098, 'learning_rate': 1.0644295302013425e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59152/75000 [57:44<13:04, 20.21it/s]

{'loss': 0.3388, 'grad_norm': 6.784678936004639, 'learning_rate': 1.063758389261745e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59162/75000 [57:45<14:36, 18.08it/s]

{'loss': 0.4032, 'grad_norm': 4.721554756164551, 'learning_rate': 1.0630872483221477e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59173/75000 [57:46<14:52, 17.74it/s]

{'loss': 0.3447, 'grad_norm': 3.1433863639831543, 'learning_rate': 1.0624161073825503e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59181/75000 [57:46<13:44, 19.18it/s]

{'loss': 0.36, 'grad_norm': 2.962639331817627, 'learning_rate': 1.061744966442953e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59193/75000 [57:47<14:57, 17.61it/s]

{'loss': 0.2999, 'grad_norm': 11.451214790344238, 'learning_rate': 1.0610738255033557e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59203/75000 [57:47<13:41, 19.24it/s]

{'loss': 0.19, 'grad_norm': 0.8811101913452148, 'learning_rate': 1.0604026845637584e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59213/75000 [57:48<14:50, 17.72it/s]

{'loss': 0.3061, 'grad_norm': 2.2787349224090576, 'learning_rate': 1.0597315436241611e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59223/75000 [57:48<15:48, 16.64it/s]

{'loss': 0.3661, 'grad_norm': 2.0249762535095215, 'learning_rate': 1.0590604026845638e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59233/75000 [57:49<15:58, 16.44it/s]

{'loss': 0.222, 'grad_norm': 6.5979485511779785, 'learning_rate': 1.0583892617449665e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59243/75000 [57:50<16:28, 15.93it/s]

{'loss': 0.2667, 'grad_norm': 5.044346809387207, 'learning_rate': 1.0577181208053691e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59253/75000 [57:50<15:35, 16.84it/s]

{'loss': 0.2904, 'grad_norm': 3.0598926544189453, 'learning_rate': 1.0570469798657718e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59261/75000 [57:51<15:04, 17.39it/s]

{'loss': 0.2779, 'grad_norm': 1.803756833076477, 'learning_rate': 1.0563758389261745e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59272/75000 [57:51<16:37, 15.77it/s]

{'loss': 0.2503, 'grad_norm': 3.2872214317321777, 'learning_rate': 1.0557046979865772e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59282/75000 [57:52<16:05, 16.29it/s]

{'loss': 0.3181, 'grad_norm': 16.505998611450195, 'learning_rate': 1.0550335570469799e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59292/75000 [57:53<15:59, 16.37it/s]

{'loss': 0.3754, 'grad_norm': 6.273672103881836, 'learning_rate': 1.0543624161073826e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59304/75000 [57:53<14:12, 18.42it/s]

{'loss': 0.2428, 'grad_norm': 0.7995092868804932, 'learning_rate': 1.0536912751677854e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59312/75000 [57:54<14:38, 17.86it/s]

{'loss': 0.2869, 'grad_norm': 4.879308700561523, 'learning_rate': 1.053020134228188e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59322/75000 [57:54<15:33, 16.79it/s]

{'loss': 0.2393, 'grad_norm': 3.7925755977630615, 'learning_rate': 1.0523489932885906e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59332/75000 [57:55<14:21, 18.18it/s]

{'loss': 0.3179, 'grad_norm': 5.915748596191406, 'learning_rate': 1.0516778523489933e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59342/75000 [57:55<16:15, 16.05it/s]

{'loss': 0.2595, 'grad_norm': 2.7733774185180664, 'learning_rate': 1.051006711409396e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59352/75000 [57:56<15:33, 16.77it/s]

{'loss': 0.2336, 'grad_norm': 1.2531682252883911, 'learning_rate': 1.0503355704697987e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59362/75000 [57:57<16:00, 16.28it/s]

{'loss': 0.272, 'grad_norm': 6.567750453948975, 'learning_rate': 1.0496644295302014e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59372/75000 [57:57<15:50, 16.44it/s]

{'loss': 0.283, 'grad_norm': 4.8581624031066895, 'learning_rate': 1.048993288590604e-05, 'epoch': 2.37}


                                                     
 79%|███████▉  | 59382/75000 [57:58<17:30, 14.87it/s]

{'loss': 0.2892, 'grad_norm': 3.754730463027954, 'learning_rate': 1.0483221476510067e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59392/75000 [57:58<14:50, 17.53it/s]

{'loss': 0.1851, 'grad_norm': 4.131990432739258, 'learning_rate': 1.0476510067114094e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59402/75000 [57:59<14:02, 18.51it/s]

{'loss': 0.2313, 'grad_norm': 19.344573974609375, 'learning_rate': 1.0469798657718121e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59413/75000 [58:00<13:55, 18.65it/s]

{'loss': 0.3736, 'grad_norm': 3.5381643772125244, 'learning_rate': 1.0463087248322148e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59424/75000 [58:00<14:07, 18.37it/s]

{'loss': 0.3865, 'grad_norm': 4.461852073669434, 'learning_rate': 1.0456375838926175e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59431/75000 [58:01<13:54, 18.65it/s]

{'loss': 0.2931, 'grad_norm': 4.935776233673096, 'learning_rate': 1.0449664429530202e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59442/75000 [58:01<13:15, 19.55it/s]

{'loss': 0.2882, 'grad_norm': 2.82437801361084, 'learning_rate': 1.0442953020134229e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59453/75000 [58:02<13:45, 18.83it/s]

{'loss': 0.3886, 'grad_norm': 4.322309494018555, 'learning_rate': 1.0436241610738255e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59462/75000 [58:02<14:01, 18.47it/s]

{'loss': 0.2895, 'grad_norm': 15.457886695861816, 'learning_rate': 1.0429530201342282e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59472/75000 [58:03<13:55, 18.58it/s]

{'loss': 0.2167, 'grad_norm': 5.833699703216553, 'learning_rate': 1.0422818791946309e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59484/75000 [58:03<12:56, 19.97it/s]

{'loss': 0.2954, 'grad_norm': 11.237370491027832, 'learning_rate': 1.0416107382550336e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59492/75000 [58:04<13:37, 18.96it/s]

{'loss': 0.2584, 'grad_norm': 2.713653087615967, 'learning_rate': 1.0409395973154363e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59500/75000 [58:04<13:35, 19.00it/s]

{'loss': 0.3057, 'grad_norm': 1.1747395992279053, 'learning_rate': 1.040268456375839e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59514/75000 [58:06<17:08, 15.05it/s]

{'loss': 0.3072, 'grad_norm': 3.890133857727051, 'learning_rate': 1.0395973154362416e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59523/75000 [58:06<15:32, 16.59it/s]

{'loss': 0.1959, 'grad_norm': 2.9545769691467285, 'learning_rate': 1.0389261744966443e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59533/75000 [58:07<13:49, 18.64it/s]

{'loss': 0.2182, 'grad_norm': 3.6306145191192627, 'learning_rate': 1.038255033557047e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59543/75000 [58:07<14:10, 18.17it/s]

{'loss': 0.3006, 'grad_norm': 3.1774933338165283, 'learning_rate': 1.0375838926174497e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59553/75000 [58:08<13:43, 18.75it/s]

{'loss': 0.3288, 'grad_norm': 1.5253113508224487, 'learning_rate': 1.0369127516778524e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59562/75000 [58:08<13:29, 19.08it/s]

{'loss': 0.392, 'grad_norm': 1.071944236755371, 'learning_rate': 1.036241610738255e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59571/75000 [58:09<15:05, 17.03it/s]

{'loss': 0.2657, 'grad_norm': 1.9511983394622803, 'learning_rate': 1.0355704697986578e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59583/75000 [58:09<13:25, 19.13it/s]

{'loss': 0.3131, 'grad_norm': 5.900038242340088, 'learning_rate': 1.0348993288590604e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59593/75000 [58:10<13:11, 19.47it/s]

{'loss': 0.356, 'grad_norm': 2.1611742973327637, 'learning_rate': 1.0342281879194631e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59602/75000 [58:10<13:34, 18.91it/s]

{'loss': 0.2884, 'grad_norm': 1.6086525917053223, 'learning_rate': 1.0335570469798658e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59612/75000 [58:11<13:04, 19.61it/s]

{'loss': 0.3149, 'grad_norm': 2.60020112991333, 'learning_rate': 1.0328859060402685e-05, 'epoch': 2.38}


                                                     
 79%|███████▉  | 59622/75000 [58:11<14:04, 18.20it/s]

{'loss': 0.3043, 'grad_norm': 3.0049729347229004, 'learning_rate': 1.0322147651006712e-05, 'epoch': 2.38}


                                                     
 80%|███████▉  | 59633/75000 [58:12<13:18, 19.24it/s]

{'loss': 0.308, 'grad_norm': 4.107614517211914, 'learning_rate': 1.0315436241610739e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59642/75000 [58:13<14:24, 17.77it/s]

{'loss': 0.3195, 'grad_norm': 3.330869197845459, 'learning_rate': 1.0308724832214766e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59654/75000 [58:13<12:47, 19.99it/s]

{'loss': 0.2136, 'grad_norm': 3.6097521781921387, 'learning_rate': 1.0302013422818792e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59662/75000 [58:14<14:22, 17.79it/s]

{'loss': 0.2695, 'grad_norm': 7.513698101043701, 'learning_rate': 1.029530201342282e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59672/75000 [58:14<13:08, 19.44it/s]

{'loss': 0.2862, 'grad_norm': 2.908437967300415, 'learning_rate': 1.0288590604026846e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59684/75000 [58:15<12:54, 19.77it/s]

{'loss': 0.2756, 'grad_norm': 8.788610458374023, 'learning_rate': 1.0281879194630873e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59693/75000 [58:15<13:39, 18.68it/s]

{'loss': 0.5031, 'grad_norm': 5.329591274261475, 'learning_rate': 1.02751677852349e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59701/75000 [58:16<13:50, 18.42it/s]

{'loss': 0.2249, 'grad_norm': 3.5598762035369873, 'learning_rate': 1.0268456375838927e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59712/75000 [58:16<13:40, 18.64it/s]

{'loss': 0.2504, 'grad_norm': 1.2599729299545288, 'learning_rate': 1.0261744966442954e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59723/75000 [58:17<13:21, 19.05it/s]

{'loss': 0.3405, 'grad_norm': 4.0029778480529785, 'learning_rate': 1.025503355704698e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59732/75000 [58:17<13:11, 19.30it/s]

{'loss': 0.2718, 'grad_norm': 0.855315089225769, 'learning_rate': 1.0248322147651007e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59743/75000 [58:18<15:27, 16.45it/s]

{'loss': 0.2319, 'grad_norm': 15.205212593078613, 'learning_rate': 1.0241610738255034e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59752/75000 [58:18<13:53, 18.30it/s]

{'loss': 0.2651, 'grad_norm': 1.9938424825668335, 'learning_rate': 1.0234899328859061e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59764/75000 [58:19<13:00, 19.52it/s]

{'loss': 0.2992, 'grad_norm': 5.4389967918396, 'learning_rate': 1.0228187919463088e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59772/75000 [58:19<13:22, 18.97it/s]

{'loss': 0.299, 'grad_norm': 11.673300743103027, 'learning_rate': 1.0221476510067115e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59782/75000 [58:20<12:57, 19.56it/s]

{'loss': 0.3281, 'grad_norm': 1.5339393615722656, 'learning_rate': 1.0214765100671142e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59792/75000 [58:20<13:08, 19.28it/s]

{'loss': 0.226, 'grad_norm': 2.7744979858398438, 'learning_rate': 1.0208053691275168e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59802/75000 [58:21<15:12, 16.65it/s]

{'loss': 0.2203, 'grad_norm': 1.7500145435333252, 'learning_rate': 1.0201342281879195e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59814/75000 [58:22<13:21, 18.95it/s]

{'loss': 0.1858, 'grad_norm': 3.2962307929992676, 'learning_rate': 1.0194630872483222e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59822/75000 [58:22<13:01, 19.43it/s]

{'loss': 0.5504, 'grad_norm': 5.260996341705322, 'learning_rate': 1.0187919463087249e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59832/75000 [58:23<13:28, 18.76it/s]

{'loss': 0.3084, 'grad_norm': 11.028231620788574, 'learning_rate': 1.0181208053691276e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59844/75000 [58:23<12:37, 20.02it/s]

{'loss': 0.3457, 'grad_norm': 3.3217110633850098, 'learning_rate': 1.0174496644295303e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59853/75000 [58:24<12:51, 19.64it/s]

{'loss': 0.2959, 'grad_norm': 15.464507102966309, 'learning_rate': 1.016778523489933e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59863/75000 [58:24<13:01, 19.36it/s]

{'loss': 0.29, 'grad_norm': 3.6575396060943604, 'learning_rate': 1.0161073825503356e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59871/75000 [58:25<13:29, 18.68it/s]

{'loss': 0.3857, 'grad_norm': 9.494500160217285, 'learning_rate': 1.0154362416107383e-05, 'epoch': 2.39}


                                                     
 80%|███████▉  | 59882/75000 [58:25<13:49, 18.23it/s]

{'loss': 0.3468, 'grad_norm': 1.6832317113876343, 'learning_rate': 1.014765100671141e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59893/75000 [58:26<14:01, 17.95it/s]

{'loss': 0.289, 'grad_norm': 2.2245681285858154, 'learning_rate': 1.0140939597315437e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59902/75000 [58:26<14:23, 17.48it/s]

{'loss': 0.2371, 'grad_norm': 11.188074111938477, 'learning_rate': 1.0134228187919464e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59911/75000 [58:27<13:51, 18.14it/s]

{'loss': 0.2716, 'grad_norm': 1.4494473934173584, 'learning_rate': 1.012751677852349e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59922/75000 [58:28<15:24, 16.31it/s]

{'loss': 0.3043, 'grad_norm': 5.793928623199463, 'learning_rate': 1.0120805369127517e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59934/75000 [58:28<13:07, 19.13it/s]

{'loss': 0.3578, 'grad_norm': 1.3528566360473633, 'learning_rate': 1.0114093959731544e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59942/75000 [58:29<13:40, 18.34it/s]

{'loss': 0.3582, 'grad_norm': 8.85605239868164, 'learning_rate': 1.0107382550335571e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59954/75000 [58:29<13:12, 18.98it/s]

{'loss': 0.3949, 'grad_norm': 5.963283538818359, 'learning_rate': 1.0100671140939598e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59962/75000 [58:30<13:18, 18.84it/s]

{'loss': 0.2736, 'grad_norm': 4.9025397300720215, 'learning_rate': 1.0093959731543625e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59973/75000 [58:30<14:15, 17.57it/s]

{'loss': 0.3628, 'grad_norm': 6.829416275024414, 'learning_rate': 1.0087248322147652e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59981/75000 [58:31<13:31, 18.50it/s]

{'loss': 0.3127, 'grad_norm': 6.222788333892822, 'learning_rate': 1.0080536912751679e-05, 'epoch': 2.4}


                                                     
 80%|███████▉  | 59991/75000 [58:31<13:52, 18.03it/s]

{'loss': 0.4053, 'grad_norm': 0.7801113128662109, 'learning_rate': 1.0073825503355705e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60000/75000 [58:32<13:07, 19.05it/s]

{'loss': 0.3084, 'grad_norm': 1.3863657712936401, 'learning_rate': 1.006711409395973e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60014/75000 [58:33<15:33, 16.05it/s]

{'loss': 0.2766, 'grad_norm': 1.8644977807998657, 'learning_rate': 1.0060402684563759e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60022/75000 [58:34<14:56, 16.71it/s]

{'loss': 0.3125, 'grad_norm': 1.544533371925354, 'learning_rate': 1.0053691275167786e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60032/75000 [58:34<13:36, 18.34it/s]

{'loss': 0.3016, 'grad_norm': 17.4085693359375, 'learning_rate': 1.0046979865771811e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60042/75000 [58:35<13:54, 17.93it/s]

{'loss': 0.3064, 'grad_norm': 3.8332295417785645, 'learning_rate': 1.004026845637584e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60052/75000 [58:35<12:36, 19.76it/s]

{'loss': 0.3627, 'grad_norm': 1.6072696447372437, 'learning_rate': 1.0033557046979867e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60062/75000 [58:36<13:14, 18.79it/s]

{'loss': 0.3675, 'grad_norm': 5.447608947753906, 'learning_rate': 1.0026845637583893e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60073/75000 [58:36<14:05, 17.65it/s]

{'loss': 0.4457, 'grad_norm': 9.202383041381836, 'learning_rate': 1.002013422818792e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60083/75000 [58:37<13:33, 18.34it/s]

{'loss': 0.2849, 'grad_norm': 2.6839773654937744, 'learning_rate': 1.0013422818791947e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60092/75000 [58:37<13:13, 18.78it/s]

{'loss': 0.2754, 'grad_norm': 4.22305154800415, 'learning_rate': 1.0006711409395974e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60104/75000 [58:38<13:02, 19.05it/s]

{'loss': 0.1724, 'grad_norm': 5.11702299118042, 'learning_rate': 1e-05, 'epoch': 2.4}


                                                     
 80%|████████  | 60113/75000 [58:39<14:02, 17.68it/s]

{'loss': 0.3326, 'grad_norm': 1.6204783916473389, 'learning_rate': 9.993288590604028e-06, 'epoch': 2.4}


                                                     
 80%|████████  | 60123/75000 [58:39<14:39, 16.91it/s] 

{'loss': 0.3443, 'grad_norm': 11.392704963684082, 'learning_rate': 9.986577181208055e-06, 'epoch': 2.4}


                                                     
 80%|████████  | 60132/75000 [58:40<13:32, 18.30it/s] 

{'loss': 0.3128, 'grad_norm': 11.875858306884766, 'learning_rate': 9.979865771812081e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60142/75000 [58:40<13:11, 18.77it/s] 

{'loss': 0.2335, 'grad_norm': 3.63394832611084, 'learning_rate': 9.973154362416108e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60153/75000 [58:41<13:45, 17.99it/s] 

{'loss': 0.2177, 'grad_norm': 3.4816620349884033, 'learning_rate': 9.966442953020135e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60161/75000 [58:41<13:45, 17.97it/s] 

{'loss': 0.29, 'grad_norm': 3.692959785461426, 'learning_rate': 9.95973154362416e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60174/75000 [58:42<12:35, 19.62it/s] 

{'loss': 0.2839, 'grad_norm': 9.40295696258545, 'learning_rate': 9.953020134228189e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60184/75000 [58:42<13:12, 18.69it/s] 

{'loss': 0.3647, 'grad_norm': 4.370217800140381, 'learning_rate': 9.946308724832216e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60192/75000 [58:43<14:28, 17.05it/s] 

{'loss': 0.1826, 'grad_norm': 0.3718712031841278, 'learning_rate': 9.93959731543624e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60202/75000 [58:43<13:19, 18.52it/s] 

{'loss': 0.2991, 'grad_norm': 4.961189270019531, 'learning_rate': 9.93288590604027e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60214/75000 [58:44<13:16, 18.57it/s] 

{'loss': 0.4356, 'grad_norm': 22.20639991760254, 'learning_rate': 9.926174496644296e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60223/75000 [58:44<12:52, 19.14it/s] 

{'loss': 0.2782, 'grad_norm': 1.7597042322158813, 'learning_rate': 9.919463087248321e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60232/75000 [58:45<13:08, 18.74it/s] 

{'loss': 0.2283, 'grad_norm': 8.546893119812012, 'learning_rate': 9.91275167785235e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60243/75000 [58:46<13:34, 18.11it/s] 

{'loss': 0.3772, 'grad_norm': 2.2785089015960693, 'learning_rate': 9.906040268456377e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60251/75000 [58:46<13:27, 18.25it/s] 

{'loss': 0.3778, 'grad_norm': 0.9142265319824219, 'learning_rate': 9.899328859060402e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60264/75000 [58:47<13:15, 18.54it/s] 

{'loss': 0.3738, 'grad_norm': 3.173631429672241, 'learning_rate': 9.89261744966443e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60272/75000 [58:47<12:45, 19.23it/s] 

{'loss': 0.2541, 'grad_norm': 1.841921091079712, 'learning_rate': 9.885906040268457e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60283/75000 [58:48<13:13, 18.54it/s] 

{'loss': 0.3379, 'grad_norm': 6.314149856567383, 'learning_rate': 9.879194630872484e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60292/75000 [58:48<13:44, 17.84it/s] 

{'loss': 0.2474, 'grad_norm': 2.5286569595336914, 'learning_rate': 9.872483221476511e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60302/75000 [58:49<13:49, 17.73it/s] 

{'loss': 0.2597, 'grad_norm': 5.397023677825928, 'learning_rate': 9.865771812080538e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60313/75000 [58:49<12:45, 19.20it/s] 

{'loss': 0.3629, 'grad_norm': 7.682268142700195, 'learning_rate': 9.859060402684565e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60324/75000 [58:50<13:00, 18.81it/s] 

{'loss': 0.2399, 'grad_norm': 1.4344764947891235, 'learning_rate': 9.85234899328859e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60333/75000 [58:50<12:53, 18.97it/s] 

{'loss': 0.4046, 'grad_norm': 6.841324806213379, 'learning_rate': 9.845637583892618e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60343/75000 [58:51<12:56, 18.88it/s] 

{'loss': 0.2224, 'grad_norm': 5.190829753875732, 'learning_rate': 9.838926174496645e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60352/75000 [58:51<14:01, 17.40it/s] 

{'loss': 0.37, 'grad_norm': 2.889831781387329, 'learning_rate': 9.83221476510067e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60361/75000 [58:52<13:49, 17.65it/s] 

{'loss': 0.3564, 'grad_norm': 7.209746837615967, 'learning_rate': 9.825503355704699e-06, 'epoch': 2.41}


                                                     
 80%|████████  | 60373/75000 [58:53<12:49, 19.00it/s] 

{'loss': 0.3112, 'grad_norm': 1.1214240789413452, 'learning_rate': 9.818791946308726e-06, 'epoch': 2.41}


                                                     
 81%|████████  | 60383/75000 [58:53<12:40, 19.23it/s] 

{'loss': 0.1931, 'grad_norm': 1.4947500228881836, 'learning_rate': 9.812080536912751e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60393/75000 [58:54<12:32, 19.42it/s] 

{'loss': 0.1809, 'grad_norm': 9.220479011535645, 'learning_rate': 9.80536912751678e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60403/75000 [58:54<13:33, 17.95it/s] 

{'loss': 0.3794, 'grad_norm': 7.896046161651611, 'learning_rate': 9.798657718120806e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60413/75000 [58:55<13:54, 17.48it/s] 

{'loss': 0.2599, 'grad_norm': 1.0002578496932983, 'learning_rate': 9.791946308724832e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60423/75000 [58:55<13:22, 18.17it/s] 

{'loss': 0.2235, 'grad_norm': 3.265638589859009, 'learning_rate': 9.78523489932886e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60433/75000 [58:56<13:30, 17.98it/s] 

{'loss': 0.3082, 'grad_norm': 8.199447631835938, 'learning_rate': 9.778523489932887e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60443/75000 [58:56<13:05, 18.53it/s] 

{'loss': 0.2698, 'grad_norm': 4.656691074371338, 'learning_rate': 9.771812080536912e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60452/75000 [58:57<13:40, 17.72it/s] 

{'loss': 0.3997, 'grad_norm': 1.7221304178237915, 'learning_rate': 9.76510067114094e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60462/75000 [58:57<12:37, 19.18it/s] 

{'loss': 0.437, 'grad_norm': 3.1663362979888916, 'learning_rate': 9.758389261744968e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60474/75000 [58:58<12:56, 18.70it/s] 

{'loss': 0.3034, 'grad_norm': 7.508614540100098, 'learning_rate': 9.751677852348993e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60482/75000 [58:59<13:46, 17.58it/s] 

{'loss': 0.287, 'grad_norm': 0.7981505393981934, 'learning_rate': 9.74496644295302e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60492/75000 [58:59<13:38, 17.72it/s] 

{'loss': 0.326, 'grad_norm': 8.099554061889648, 'learning_rate': 9.738255033557048e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60500/75000 [58:59<12:40, 19.08it/s] 

{'loss': 0.2288, 'grad_norm': 9.987056732177734, 'learning_rate': 9.731543624161075e-06, 'epoch': 2.42}


                                                       
 81%|████████  | 60514/75000 [59:04<30:16,  7.97it/s] 

{'loss': 0.3079, 'grad_norm': 1.8787344694137573, 'learning_rate': 9.7248322147651e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60522/75000 [59:05<18:53, 12.78it/s] 

{'loss': 0.2976, 'grad_norm': 6.82067346572876, 'learning_rate': 9.718120805369129e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60532/75000 [59:05<14:26, 16.70it/s] 

{'loss': 0.3303, 'grad_norm': 5.708021640777588, 'learning_rate': 9.711409395973155e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60542/75000 [59:06<12:57, 18.60it/s] 

{'loss': 0.3917, 'grad_norm': 2.163160562515259, 'learning_rate': 9.70469798657718e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60553/75000 [59:06<12:53, 18.68it/s] 

{'loss': 0.3936, 'grad_norm': 3.173175096511841, 'learning_rate': 9.69798657718121e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60562/75000 [59:07<12:19, 19.52it/s] 

{'loss': 0.3495, 'grad_norm': 1.8873263597488403, 'learning_rate': 9.691275167785236e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60572/75000 [59:07<14:13, 16.91it/s] 

{'loss': 0.2067, 'grad_norm': 1.139980435371399, 'learning_rate': 9.684563758389261e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60583/75000 [59:08<13:05, 18.36it/s] 

{'loss': 0.3403, 'grad_norm': 10.473482131958008, 'learning_rate': 9.67785234899329e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60592/75000 [59:08<12:41, 18.91it/s] 

{'loss': 0.2418, 'grad_norm': 2.7639269828796387, 'learning_rate': 9.671140939597317e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60602/75000 [59:09<13:32, 17.72it/s] 

{'loss': 0.3357, 'grad_norm': 3.42126727104187, 'learning_rate': 9.664429530201342e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60612/75000 [59:09<12:42, 18.88it/s] 

{'loss': 0.3379, 'grad_norm': 5.214052200317383, 'learning_rate': 9.65771812080537e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60622/75000 [59:10<14:15, 16.81it/s] 

{'loss': 0.3097, 'grad_norm': 2.3081605434417725, 'learning_rate': 9.651006711409397e-06, 'epoch': 2.42}


                                                     
 81%|████████  | 60633/75000 [59:11<14:05, 17.00it/s] 

{'loss': 0.3187, 'grad_norm': 2.924147844314575, 'learning_rate': 9.644295302013422e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60643/75000 [59:11<12:44, 18.79it/s] 

{'loss': 0.3528, 'grad_norm': 4.717482089996338, 'learning_rate': 9.637583892617451e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60653/75000 [59:12<12:45, 18.75it/s] 

{'loss': 0.1987, 'grad_norm': 5.7096848487854, 'learning_rate': 9.630872483221478e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60662/75000 [59:12<13:58, 17.09it/s] 

{'loss': 0.3732, 'grad_norm': 2.3408355712890625, 'learning_rate': 9.624161073825503e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60673/75000 [59:13<12:27, 19.17it/s] 

{'loss': 0.3412, 'grad_norm': 6.779079914093018, 'learning_rate': 9.61744966442953e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60682/75000 [59:13<12:25, 19.21it/s] 

{'loss': 0.215, 'grad_norm': 1.6640831232070923, 'learning_rate': 9.610738255033558e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60694/75000 [59:14<12:21, 19.29it/s] 

{'loss': 0.2574, 'grad_norm': 0.6449772715568542, 'learning_rate': 9.604026845637583e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60703/75000 [59:14<12:27, 19.13it/s] 

{'loss': 0.375, 'grad_norm': 3.9077858924865723, 'learning_rate': 9.59731543624161e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60712/75000 [59:15<12:45, 18.67it/s] 

{'loss': 0.2548, 'grad_norm': 1.6048047542572021, 'learning_rate': 9.590604026845639e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60723/75000 [59:15<12:20, 19.29it/s] 

{'loss': 0.3286, 'grad_norm': 6.325371265411377, 'learning_rate': 9.583892617449666e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60732/75000 [59:16<12:33, 18.93it/s] 

{'loss': 0.3465, 'grad_norm': 5.242615222930908, 'learning_rate': 9.577181208053691e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60742/75000 [59:16<13:39, 17.41it/s] 

{'loss': 0.2855, 'grad_norm': 2.7648260593414307, 'learning_rate': 9.57046979865772e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60752/75000 [59:17<14:22, 16.51it/s] 

{'loss': 0.3626, 'grad_norm': 6.6340250968933105, 'learning_rate': 9.563758389261746e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60761/75000 [59:18<12:56, 18.33it/s] 

{'loss': 0.263, 'grad_norm': 8.584539413452148, 'learning_rate': 9.557046979865771e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60773/75000 [59:18<14:20, 16.53it/s] 

{'loss': 0.4131, 'grad_norm': 1.9538557529449463, 'learning_rate': 9.5503355704698e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60782/75000 [59:19<13:10, 18.00it/s] 

{'loss': 0.2909, 'grad_norm': 2.5091066360473633, 'learning_rate': 9.543624161073827e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60792/75000 [59:19<13:04, 18.11it/s] 

{'loss': 0.3223, 'grad_norm': 4.80134916305542, 'learning_rate': 9.536912751677852e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60803/75000 [59:20<13:46, 17.17it/s] 

{'loss': 0.273, 'grad_norm': 0.819895327091217, 'learning_rate': 9.53020134228188e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60812/75000 [59:20<12:44, 18.55it/s] 

{'loss': 0.3145, 'grad_norm': 3.5710911750793457, 'learning_rate': 9.523489932885907e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60824/75000 [59:21<12:02, 19.61it/s] 

{'loss': 0.3983, 'grad_norm': 4.743318557739258, 'learning_rate': 9.516778523489933e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60832/75000 [59:21<12:09, 19.43it/s] 

{'loss': 0.2701, 'grad_norm': 21.90610122680664, 'learning_rate': 9.51006711409396e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60844/75000 [59:22<12:14, 19.27it/s] 

{'loss': 0.2607, 'grad_norm': 1.326792597770691, 'learning_rate': 9.503355704697988e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60854/75000 [59:23<12:45, 18.49it/s] 

{'loss': 0.2815, 'grad_norm': 9.021342277526855, 'learning_rate': 9.496644295302013e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60862/75000 [59:23<12:59, 18.14it/s] 

{'loss': 0.3261, 'grad_norm': 8.456184387207031, 'learning_rate': 9.48993288590604e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60872/75000 [59:24<12:20, 19.09it/s] 

{'loss': 0.2976, 'grad_norm': 11.812979698181152, 'learning_rate': 9.483221476510069e-06, 'epoch': 2.43}


                                                     
 81%|████████  | 60882/75000 [59:24<13:46, 17.08it/s] 

{'loss': 0.3938, 'grad_norm': 0.39420008659362793, 'learning_rate': 9.476510067114094e-06, 'epoch': 2.44}


                                                     
 81%|████████  | 60892/75000 [59:25<12:21, 19.03it/s] 

{'loss': 0.2244, 'grad_norm': 4.363471984863281, 'learning_rate': 9.46979865771812e-06, 'epoch': 2.44}


                                                     
 81%|████████  | 60903/75000 [59:25<12:01, 19.53it/s] 

{'loss': 0.4849, 'grad_norm': 11.05412769317627, 'learning_rate': 9.463087248322149e-06, 'epoch': 2.44}


                                                     
 81%|████████  | 60911/75000 [59:26<11:57, 19.65it/s] 

{'loss': 0.2093, 'grad_norm': 1.7924739122390747, 'learning_rate': 9.456375838926174e-06, 'epoch': 2.44}


                                                     
 81%|████████  | 60922/75000 [59:26<12:44, 18.43it/s] 

{'loss': 0.2636, 'grad_norm': 2.771655321121216, 'learning_rate': 9.449664429530201e-06, 'epoch': 2.44}


                                                     
 81%|████████  | 60931/75000 [59:27<12:18, 19.05it/s] 

{'loss': 0.2163, 'grad_norm': 1.6130826473236084, 'learning_rate': 9.44295302013423e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 60944/75000 [59:27<11:43, 19.99it/s] 

{'loss': 0.2289, 'grad_norm': 4.899050235748291, 'learning_rate': 9.436241610738256e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 60954/75000 [59:28<12:05, 19.36it/s] 

{'loss': 0.3536, 'grad_norm': 2.7371628284454346, 'learning_rate': 9.429530201342282e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 60961/75000 [59:28<13:08, 17.79it/s] 

{'loss': 0.2573, 'grad_norm': 4.09152889251709, 'learning_rate': 9.42281879194631e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 60972/75000 [59:29<12:40, 18.44it/s] 

{'loss': 0.3624, 'grad_norm': 7.181490898132324, 'learning_rate': 9.416107382550337e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 60981/75000 [59:29<12:25, 18.82it/s] 

{'loss': 0.3038, 'grad_norm': 0.9944884181022644, 'learning_rate': 9.409395973154362e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 60994/75000 [59:30<11:46, 19.83it/s] 

{'loss': 0.4923, 'grad_norm': 6.216956615447998, 'learning_rate': 9.402684563758389e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61000/75000 [59:30<13:06, 17.80it/s] 

{'loss': 0.1682, 'grad_norm': 1.3302373886108398, 'learning_rate': 9.395973154362418e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61014/75000 [59:32<15:00, 15.52it/s] 

{'loss': 0.3833, 'grad_norm': 2.997126579284668, 'learning_rate': 9.389261744966443e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61022/75000 [59:32<14:13, 16.37it/s] 

{'loss': 0.201, 'grad_norm': 3.898749589920044, 'learning_rate': 9.38255033557047e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61032/75000 [59:33<12:48, 18.19it/s] 

{'loss': 0.2363, 'grad_norm': 4.131404876708984, 'learning_rate': 9.375838926174498e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61042/75000 [59:33<13:04, 17.80it/s] 

{'loss': 0.3584, 'grad_norm': 4.536296844482422, 'learning_rate': 9.369127516778523e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61052/75000 [59:34<12:53, 18.02it/s] 

{'loss': 0.3536, 'grad_norm': 2.475386381149292, 'learning_rate': 9.36241610738255e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61062/75000 [59:34<13:32, 17.15it/s] 

{'loss': 0.4018, 'grad_norm': 10.373482704162598, 'learning_rate': 9.355704697986579e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61073/75000 [59:35<12:01, 19.31it/s] 

{'loss': 0.3607, 'grad_norm': 4.473167419433594, 'learning_rate': 9.348993288590604e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61084/75000 [59:36<12:00, 19.32it/s] 

{'loss': 0.3584, 'grad_norm': 5.472954750061035, 'learning_rate': 9.34228187919463e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61092/75000 [59:36<11:43, 19.78it/s] 

{'loss': 0.3322, 'grad_norm': 6.532740592956543, 'learning_rate': 9.33557046979866e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61102/75000 [59:37<12:28, 18.57it/s] 

{'loss': 0.1982, 'grad_norm': 5.609873294830322, 'learning_rate': 9.328859060402684e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61113/75000 [59:37<12:22, 18.72it/s] 

{'loss': 0.3204, 'grad_norm': 1.3487317562103271, 'learning_rate': 9.322147651006711e-06, 'epoch': 2.44}


                                                     
 81%|████████▏ | 61123/75000 [59:38<12:22, 18.69it/s] 

{'loss': 0.3361, 'grad_norm': 11.256670951843262, 'learning_rate': 9.31543624161074e-06, 'epoch': 2.44}


                                                     
 82%|████████▏ | 61132/75000 [59:38<13:21, 17.30it/s] 

{'loss': 0.263, 'grad_norm': 0.5063266158103943, 'learning_rate': 9.308724832214765e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61142/75000 [59:39<12:09, 19.00it/s] 

{'loss': 0.1908, 'grad_norm': 5.15357780456543, 'learning_rate': 9.302013422818792e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61153/75000 [59:39<12:52, 17.92it/s] 

{'loss': 0.4409, 'grad_norm': 1.3047794103622437, 'learning_rate': 9.295302013422819e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61162/75000 [59:40<12:15, 18.81it/s] 

{'loss': 0.3169, 'grad_norm': 3.531764268875122, 'learning_rate': 9.288590604026846e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61173/75000 [59:40<12:23, 18.59it/s] 

{'loss': 0.2982, 'grad_norm': 3.7193188667297363, 'learning_rate': 9.281879194630872e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61183/75000 [59:41<12:23, 18.58it/s] 

{'loss': 0.3821, 'grad_norm': 4.022677421569824, 'learning_rate': 9.2751677852349e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61192/75000 [59:41<12:05, 19.04it/s] 

{'loss': 0.1994, 'grad_norm': 1.3642218112945557, 'learning_rate': 9.268456375838928e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61204/75000 [59:42<11:44, 19.58it/s] 

{'loss': 0.3288, 'grad_norm': 6.254345417022705, 'learning_rate': 9.261744966442953e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61212/75000 [59:42<12:00, 19.13it/s] 

{'loss': 0.4634, 'grad_norm': 1.1498569250106812, 'learning_rate': 9.25503355704698e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61222/75000 [59:43<11:57, 19.21it/s] 

{'loss': 0.2811, 'grad_norm': 1.6643275022506714, 'learning_rate': 9.248322147651008e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61233/75000 [59:43<11:56, 19.21it/s] 

{'loss': 0.2855, 'grad_norm': 1.8679416179656982, 'learning_rate': 9.241610738255034e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61243/75000 [59:44<12:23, 18.51it/s] 

{'loss': 0.3923, 'grad_norm': 5.362154483795166, 'learning_rate': 9.23489932885906e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61253/75000 [59:45<12:10, 18.81it/s] 

{'loss': 0.3968, 'grad_norm': 13.025181770324707, 'learning_rate': 9.228187919463089e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61262/75000 [59:45<12:00, 19.08it/s] 

{'loss': 0.2058, 'grad_norm': 4.992537021636963, 'learning_rate': 9.221476510067114e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61274/75000 [59:46<12:11, 18.75it/s] 

{'loss': 0.2517, 'grad_norm': 5.258659839630127, 'learning_rate': 9.214765100671141e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61281/75000 [59:46<12:00, 19.03it/s] 

{'loss': 0.2786, 'grad_norm': 3.913393259048462, 'learning_rate': 9.20805369127517e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61292/75000 [59:47<11:31, 19.82it/s] 

{'loss': 0.1874, 'grad_norm': 1.8671083450317383, 'learning_rate': 9.201342281879195e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61303/75000 [59:47<11:47, 19.37it/s] 

{'loss': 0.3216, 'grad_norm': 4.430680751800537, 'learning_rate': 9.194630872483221e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61313/75000 [59:48<13:13, 17.24it/s] 

{'loss': 0.347, 'grad_norm': 7.373463153839111, 'learning_rate': 9.187919463087248e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61322/75000 [59:48<14:41, 15.51it/s] 

{'loss': 0.3002, 'grad_norm': 0.8209002017974854, 'learning_rate': 9.181208053691275e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61332/75000 [59:49<13:06, 17.38it/s] 

{'loss': 0.2769, 'grad_norm': 3.6525914669036865, 'learning_rate': 9.174496644295302e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61344/75000 [59:49<11:32, 19.71it/s] 

{'loss': 0.2678, 'grad_norm': 0.587697446346283, 'learning_rate': 9.167785234899329e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61354/75000 [59:50<11:29, 19.79it/s] 

{'loss': 0.3465, 'grad_norm': 3.4602062702178955, 'learning_rate': 9.161073825503356e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61362/75000 [59:50<12:10, 18.68it/s] 

{'loss': 0.3444, 'grad_norm': 4.404130935668945, 'learning_rate': 9.154362416107383e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61372/75000 [59:51<13:00, 17.47it/s] 

{'loss': 0.2956, 'grad_norm': 4.807060241699219, 'learning_rate': 9.14765100671141e-06, 'epoch': 2.45}


                                                     
 82%|████████▏ | 61382/75000 [59:51<12:10, 18.65it/s] 

{'loss': 0.2843, 'grad_norm': 4.000133514404297, 'learning_rate': 9.140939597315436e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61392/75000 [59:52<12:32, 18.09it/s] 

{'loss': 0.3541, 'grad_norm': 2.7866454124450684, 'learning_rate': 9.134228187919463e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61402/75000 [59:53<12:06, 18.71it/s] 

{'loss': 0.2189, 'grad_norm': 4.302423000335693, 'learning_rate': 9.12751677852349e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61414/75000 [59:53<11:30, 19.68it/s] 

{'loss': 0.3171, 'grad_norm': 1.4968034029006958, 'learning_rate': 9.120805369127519e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61422/75000 [59:54<12:18, 18.39it/s] 

{'loss': 0.2954, 'grad_norm': 3.7987194061279297, 'learning_rate': 9.114093959731544e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61433/75000 [59:54<11:51, 19.08it/s] 

{'loss': 0.2968, 'grad_norm': 5.987823963165283, 'learning_rate': 9.10738255033557e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61442/75000 [59:55<12:35, 17.96it/s] 

{'loss': 0.3396, 'grad_norm': 1.7376528978347778, 'learning_rate': 9.100671140939599e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61453/75000 [59:55<11:52, 19.01it/s] 

{'loss': 0.2733, 'grad_norm': 1.81931471824646, 'learning_rate': 9.093959731543624e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61464/75000 [59:56<11:56, 18.89it/s] 

{'loss': 0.2586, 'grad_norm': 2.708185911178589, 'learning_rate': 9.087248322147651e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61474/75000 [59:56<11:44, 19.21it/s] 

{'loss': 0.2016, 'grad_norm': 1.1100472211837769, 'learning_rate': 9.080536912751678e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61483/75000 [59:57<12:21, 18.23it/s] 

{'loss': 0.3782, 'grad_norm': 0.5787386894226074, 'learning_rate': 9.073825503355705e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61493/75000 [59:57<12:06, 18.59it/s] 

{'loss': 0.323, 'grad_norm': 1.884966254234314, 'learning_rate': 9.067114093959732e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61500/75000 [59:58<12:40, 17.76it/s] 

{'loss': 0.2521, 'grad_norm': 1.7968615293502808, 'learning_rate': 9.060402684563759e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61513/75000 [59:59<14:40, 15.32it/s] 

{'loss': 0.3842, 'grad_norm': 6.92318058013916, 'learning_rate': 9.053691275167785e-06, 'epoch': 2.46}


                                                     
 82%|████████▏ | 61522/75000 [59:59<12:35, 17.83it/s] 

{'loss': 0.22, 'grad_norm': 11.325932502746582, 'learning_rate': 9.046979865771812e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61534/75000 [1:00:00<11:43, 19.14it/s]

{'loss': 0.2938, 'grad_norm': 1.0048753023147583, 'learning_rate': 9.040268456375839e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61543/75000 [1:00:01<11:52, 18.88it/s]

{'loss': 0.1538, 'grad_norm': 1.4500603675842285, 'learning_rate': 9.033557046979866e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61554/75000 [1:00:01<11:19, 19.78it/s]

{'loss': 0.332, 'grad_norm': 2.522794008255005, 'learning_rate': 9.026845637583893e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61563/75000 [1:00:02<11:36, 19.28it/s]

{'loss': 0.2853, 'grad_norm': 2.1097066402435303, 'learning_rate': 9.02013422818792e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61573/75000 [1:00:02<12:31, 17.87it/s]

{'loss': 0.2987, 'grad_norm': 0.5204262137413025, 'learning_rate': 9.013422818791947e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61583/75000 [1:00:03<12:15, 18.24it/s]

{'loss': 0.2371, 'grad_norm': 0.6346638202667236, 'learning_rate': 9.006711409395973e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61591/75000 [1:00:03<13:47, 16.21it/s]

{'loss': 0.2465, 'grad_norm': 4.82681131362915, 'learning_rate': 9e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61602/75000 [1:00:04<11:50, 18.85it/s]

{'loss': 0.2694, 'grad_norm': 6.731383800506592, 'learning_rate': 8.993288590604027e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61613/75000 [1:00:04<12:06, 18.42it/s]

{'loss': 0.5393, 'grad_norm': 3.1464216709136963, 'learning_rate': 8.986577181208054e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61621/75000 [1:00:05<11:54, 18.72it/s]

{'loss': 0.4398, 'grad_norm': 1.642345905303955, 'learning_rate': 8.97986577181208e-06, 'epoch': 2.46}


                                                       
 82%|████████▏ | 61634/75000 [1:00:05<11:16, 19.77it/s]

{'loss': 0.3768, 'grad_norm': 3.1274592876434326, 'learning_rate': 8.973154362416108e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61643/75000 [1:00:06<12:15, 18.15it/s]

{'loss': 0.2002, 'grad_norm': 4.033443450927734, 'learning_rate': 8.966442953020134e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61652/75000 [1:00:06<11:55, 18.67it/s]

{'loss': 0.2347, 'grad_norm': 3.7187602519989014, 'learning_rate': 8.959731543624161e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61663/75000 [1:00:07<12:45, 17.41it/s]

{'loss': 0.2543, 'grad_norm': 0.6215647459030151, 'learning_rate': 8.953020134228188e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61671/75000 [1:00:07<12:03, 18.43it/s]

{'loss': 0.219, 'grad_norm': 3.9261839389801025, 'learning_rate': 8.946308724832215e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61683/75000 [1:00:08<11:26, 19.40it/s]

{'loss': 0.4092, 'grad_norm': 4.247605800628662, 'learning_rate': 8.939597315436242e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61693/75000 [1:00:09<11:54, 18.62it/s]

{'loss': 0.3134, 'grad_norm': 5.703134059906006, 'learning_rate': 8.932885906040269e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61702/75000 [1:00:09<11:49, 18.74it/s]

{'loss': 0.2473, 'grad_norm': 3.5695245265960693, 'learning_rate': 8.926174496644296e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61712/75000 [1:00:10<13:31, 16.37it/s]

{'loss': 0.277, 'grad_norm': 1.2473363876342773, 'learning_rate': 8.919463087248322e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61723/75000 [1:00:10<12:12, 18.13it/s]

{'loss': 0.3197, 'grad_norm': 7.863103866577148, 'learning_rate': 8.91275167785235e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61733/75000 [1:00:11<11:54, 18.57it/s]

{'loss': 0.2979, 'grad_norm': 3.0379629135131836, 'learning_rate': 8.906040268456376e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61743/75000 [1:00:11<12:01, 18.37it/s]

{'loss': 0.286, 'grad_norm': 7.854949951171875, 'learning_rate': 8.899328859060403e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61754/75000 [1:00:12<11:09, 19.79it/s]

{'loss': 0.2693, 'grad_norm': 3.128878355026245, 'learning_rate': 8.89261744966443e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61763/75000 [1:00:13<11:56, 18.47it/s]

{'loss': 0.4116, 'grad_norm': 3.67183518409729, 'learning_rate': 8.885906040268457e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61771/75000 [1:00:13<12:17, 17.94it/s]

{'loss': 0.2415, 'grad_norm': 2.483884334564209, 'learning_rate': 8.879194630872484e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61783/75000 [1:00:14<12:00, 18.34it/s]

{'loss': 0.3197, 'grad_norm': 5.924315452575684, 'learning_rate': 8.87248322147651e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61792/75000 [1:00:14<11:31, 19.10it/s]

{'loss': 0.3441, 'grad_norm': 3.5741493701934814, 'learning_rate': 8.865771812080537e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61802/75000 [1:00:15<11:58, 18.38it/s]

{'loss': 0.4032, 'grad_norm': 5.006310939788818, 'learning_rate': 8.859060402684564e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61812/75000 [1:00:15<11:39, 18.85it/s]

{'loss': 0.2865, 'grad_norm': 3.985109329223633, 'learning_rate': 8.852348993288591e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61823/75000 [1:00:16<11:04, 19.82it/s]

{'loss': 0.2615, 'grad_norm': 0.7162013053894043, 'learning_rate': 8.845637583892618e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61832/75000 [1:00:16<11:15, 19.49it/s]

{'loss': 0.3137, 'grad_norm': 9.401004791259766, 'learning_rate': 8.838926174496645e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61843/75000 [1:00:17<11:35, 18.92it/s]

{'loss': 0.1873, 'grad_norm': 1.4146844148635864, 'learning_rate': 8.832214765100672e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61852/75000 [1:00:17<11:48, 18.56it/s]

{'loss': 0.3396, 'grad_norm': 1.677173137664795, 'learning_rate': 8.825503355704698e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61862/75000 [1:00:18<12:25, 17.62it/s]

{'loss': 0.2664, 'grad_norm': 1.2411956787109375, 'learning_rate': 8.818791946308725e-06, 'epoch': 2.47}


                                                       
 82%|████████▏ | 61873/75000 [1:00:18<12:45, 17.14it/s]

{'loss': 0.3671, 'grad_norm': 2.3901352882385254, 'learning_rate': 8.812080536912752e-06, 'epoch': 2.47}


                                                       
 83%|████████▎ | 61883/75000 [1:00:19<12:02, 18.16it/s]

{'loss': 0.3291, 'grad_norm': 9.943564414978027, 'learning_rate': 8.805369127516779e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61892/75000 [1:00:19<12:18, 17.75it/s]

{'loss': 0.3343, 'grad_norm': 15.620528221130371, 'learning_rate': 8.798657718120806e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61902/75000 [1:00:20<11:31, 18.93it/s]

{'loss': 0.2638, 'grad_norm': 2.22562313079834, 'learning_rate': 8.791946308724833e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61913/75000 [1:00:21<12:10, 17.92it/s]

{'loss': 0.2988, 'grad_norm': 27.4511775970459, 'learning_rate': 8.78523489932886e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61922/75000 [1:00:21<11:37, 18.75it/s]

{'loss': 0.2895, 'grad_norm': 3.8359322547912598, 'learning_rate': 8.778523489932886e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61932/75000 [1:00:22<12:05, 18.02it/s]

{'loss': 0.3588, 'grad_norm': 2.727325201034546, 'learning_rate': 8.771812080536913e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61941/75000 [1:00:22<13:38, 15.96it/s]

{'loss': 0.283, 'grad_norm': 5.435727119445801, 'learning_rate': 8.76510067114094e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61954/75000 [1:00:23<11:44, 18.51it/s]

{'loss': 0.2786, 'grad_norm': 1.2835559844970703, 'learning_rate': 8.758389261744967e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61962/75000 [1:00:23<12:19, 17.64it/s]

{'loss': 0.2597, 'grad_norm': 3.868173599243164, 'learning_rate': 8.751677852348994e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61971/75000 [1:00:24<12:05, 17.96it/s]

{'loss': 0.3537, 'grad_norm': 3.0871551036834717, 'learning_rate': 8.74496644295302e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61984/75000 [1:00:25<11:43, 18.50it/s]

{'loss': 0.2414, 'grad_norm': 2.116584539413452, 'learning_rate': 8.738255033557047e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 61992/75000 [1:00:25<11:42, 18.51it/s]

{'loss': 0.2911, 'grad_norm': 3.838632345199585, 'learning_rate': 8.731543624161074e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62000/75000 [1:00:25<11:21, 19.07it/s]

{'loss': 0.1991, 'grad_norm': 3.0383687019348145, 'learning_rate': 8.724832214765101e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62014/75000 [1:00:27<13:16, 16.30it/s]

{'loss': 0.2936, 'grad_norm': 1.7926124334335327, 'learning_rate': 8.718120805369128e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62022/75000 [1:00:27<13:10, 16.41it/s]

{'loss': 0.2699, 'grad_norm': 1.1971355676651, 'learning_rate': 8.711409395973155e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62032/75000 [1:00:28<12:29, 17.29it/s]

{'loss': 0.3047, 'grad_norm': 3.2628915309906006, 'learning_rate': 8.704697986577182e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62042/75000 [1:00:28<11:19, 19.06it/s]

{'loss': 0.3418, 'grad_norm': 3.5896027088165283, 'learning_rate': 8.697986577181209e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62052/75000 [1:00:29<11:07, 19.39it/s]

{'loss': 0.2857, 'grad_norm': 2.6422574520111084, 'learning_rate': 8.691275167785235e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62063/75000 [1:00:29<11:46, 18.32it/s]

{'loss': 0.3801, 'grad_norm': 5.687050819396973, 'learning_rate': 8.684563758389262e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62072/75000 [1:00:30<11:25, 18.87it/s]

{'loss': 0.1521, 'grad_norm': 0.8518714904785156, 'learning_rate': 8.67785234899329e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62082/75000 [1:00:30<12:15, 17.57it/s]

{'loss': 0.2724, 'grad_norm': 7.975417137145996, 'learning_rate': 8.671140939597316e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62092/75000 [1:00:31<11:32, 18.63it/s]

{'loss': 0.2457, 'grad_norm': 2.680366039276123, 'learning_rate': 8.664429530201343e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62102/75000 [1:00:31<12:07, 17.74it/s]

{'loss': 0.248, 'grad_norm': 5.043059349060059, 'learning_rate': 8.65771812080537e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62112/75000 [1:00:32<11:49, 18.17it/s]

{'loss': 0.2832, 'grad_norm': 1.3412460088729858, 'learning_rate': 8.651006711409397e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62123/75000 [1:00:33<12:21, 17.36it/s]

{'loss': 0.3456, 'grad_norm': 2.6224586963653564, 'learning_rate': 8.644295302013423e-06, 'epoch': 2.48}


                                                       
 83%|████████▎ | 62133/75000 [1:00:33<11:24, 18.79it/s]

{'loss': 0.4246, 'grad_norm': 8.141789436340332, 'learning_rate': 8.63758389261745e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62143/75000 [1:00:34<11:44, 18.24it/s]

{'loss': 0.3313, 'grad_norm': 3.220487356185913, 'learning_rate': 8.630872483221475e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62152/75000 [1:00:34<11:50, 18.08it/s]

{'loss': 0.3474, 'grad_norm': 5.830982208251953, 'learning_rate': 8.624161073825504e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62162/75000 [1:00:35<12:18, 17.37it/s]

{'loss': 0.2184, 'grad_norm': 2.7472546100616455, 'learning_rate': 8.617449664429531e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62172/75000 [1:00:35<11:03, 19.35it/s]

{'loss': 0.2774, 'grad_norm': 2.4233081340789795, 'learning_rate': 8.610738255033558e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62181/75000 [1:00:36<12:36, 16.96it/s]

{'loss': 0.3556, 'grad_norm': 0.7318183779716492, 'learning_rate': 8.604026845637585e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62193/75000 [1:00:36<11:41, 18.25it/s]

{'loss': 0.3134, 'grad_norm': 1.39055597782135, 'learning_rate': 8.597315436241611e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62203/75000 [1:00:37<11:10, 19.09it/s]

{'loss': 0.3614, 'grad_norm': 21.506315231323242, 'learning_rate': 8.590604026845638e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62213/75000 [1:00:38<12:01, 17.72it/s]

{'loss': 0.2071, 'grad_norm': 3.747403144836426, 'learning_rate': 8.583892617449665e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62222/75000 [1:00:38<11:26, 18.60it/s]

{'loss': 0.227, 'grad_norm': 3.7295920848846436, 'learning_rate': 8.577181208053692e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62233/75000 [1:00:39<11:40, 18.23it/s]

{'loss': 0.3992, 'grad_norm': 1.7075953483581543, 'learning_rate': 8.570469798657719e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62241/75000 [1:00:39<11:32, 18.43it/s]

{'loss': 0.2799, 'grad_norm': 3.1096816062927246, 'learning_rate': 8.563758389261746e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62252/75000 [1:00:40<11:19, 18.75it/s]

{'loss': 0.2872, 'grad_norm': 2.313450574874878, 'learning_rate': 8.557046979865773e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62263/75000 [1:00:40<11:10, 19.00it/s]

{'loss': 0.3024, 'grad_norm': 4.205334186553955, 'learning_rate': 8.5503355704698e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62273/75000 [1:00:41<10:56, 19.39it/s]

{'loss': 0.3723, 'grad_norm': 8.876453399658203, 'learning_rate': 8.543624161073826e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62283/75000 [1:00:41<11:28, 18.46it/s]

{'loss': 0.2691, 'grad_norm': 3.3144586086273193, 'learning_rate': 8.536912751677853e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62292/75000 [1:00:42<11:32, 18.35it/s]

{'loss': 0.4227, 'grad_norm': 1.1762903928756714, 'learning_rate': 8.53020134228188e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62301/75000 [1:00:42<11:14, 18.82it/s]

{'loss': 0.3091, 'grad_norm': 11.035667419433594, 'learning_rate': 8.523489932885905e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62312/75000 [1:00:43<11:31, 18.34it/s]

{'loss': 0.3165, 'grad_norm': 4.738178253173828, 'learning_rate': 8.516778523489934e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62322/75000 [1:00:43<12:10, 17.37it/s]

{'loss': 0.3646, 'grad_norm': 7.542205810546875, 'learning_rate': 8.51006711409396e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62332/75000 [1:00:44<11:16, 18.72it/s]

{'loss': 0.3309, 'grad_norm': 2.6774308681488037, 'learning_rate': 8.503355704697986e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62342/75000 [1:00:44<11:05, 19.02it/s]

{'loss': 0.3697, 'grad_norm': 0.7692452669143677, 'learning_rate': 8.496644295302014e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62354/75000 [1:00:45<10:48, 19.50it/s]

{'loss': 0.3044, 'grad_norm': 1.961158275604248, 'learning_rate': 8.489932885906041e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62362/75000 [1:00:45<11:05, 19.00it/s]

{'loss': 0.2219, 'grad_norm': 1.3290313482284546, 'learning_rate': 8.483221476510066e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62373/75000 [1:00:46<11:02, 19.06it/s]

{'loss': 0.3722, 'grad_norm': 2.3206522464752197, 'learning_rate': 8.476510067114095e-06, 'epoch': 2.49}


                                                       
 83%|████████▎ | 62382/75000 [1:00:46<11:19, 18.58it/s]

{'loss': 0.3188, 'grad_norm': 3.444164276123047, 'learning_rate': 8.469798657718122e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62394/75000 [1:00:47<10:37, 19.77it/s]

{'loss': 0.3138, 'grad_norm': 1.3955062627792358, 'learning_rate': 8.463087248322148e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62403/75000 [1:00:48<10:55, 19.22it/s]

{'loss': 0.2701, 'grad_norm': 1.4140901565551758, 'learning_rate': 8.456375838926175e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62413/75000 [1:00:48<11:01, 19.03it/s]

{'loss': 0.2438, 'grad_norm': 2.298321485519409, 'learning_rate': 8.449664429530202e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62423/75000 [1:00:49<11:25, 18.36it/s]

{'loss': 0.3512, 'grad_norm': 8.110042572021484, 'learning_rate': 8.442953020134229e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62433/75000 [1:00:49<11:25, 18.33it/s]

{'loss': 0.2569, 'grad_norm': 5.833204746246338, 'learning_rate': 8.436241610738256e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62442/75000 [1:00:50<11:21, 18.43it/s]

{'loss': 0.3105, 'grad_norm': 8.605605125427246, 'learning_rate': 8.429530201342283e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62451/75000 [1:00:50<11:24, 18.35it/s]

{'loss': 0.3315, 'grad_norm': 2.6724514961242676, 'learning_rate': 8.42281879194631e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62464/75000 [1:00:51<10:19, 20.25it/s]

{'loss': 0.2716, 'grad_norm': 6.173460960388184, 'learning_rate': 8.416107382550335e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62474/75000 [1:00:51<10:45, 19.39it/s]

{'loss': 0.2675, 'grad_norm': 0.5973357558250427, 'learning_rate': 8.409395973154363e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62484/75000 [1:00:52<10:38, 19.61it/s]

{'loss': 0.3487, 'grad_norm': 8.897712707519531, 'learning_rate': 8.40268456375839e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62493/75000 [1:00:52<10:52, 19.18it/s]

{'loss': 0.2642, 'grad_norm': 3.71511173248291, 'learning_rate': 8.395973154362415e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62500/75000 [1:00:53<11:19, 18.40it/s]

{'loss': 0.2723, 'grad_norm': 1.09040367603302, 'learning_rate': 8.389261744966444e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62511/75000 [1:00:54<15:35, 13.35it/s]

{'loss': 0.3108, 'grad_norm': 1.377474308013916, 'learning_rate': 8.38255033557047e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62524/75000 [1:00:55<11:00, 18.88it/s]

{'loss': 0.3827, 'grad_norm': 2.1856322288513184, 'learning_rate': 8.375838926174496e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62532/75000 [1:00:55<11:35, 17.93it/s]

{'loss': 0.314, 'grad_norm': 1.456505298614502, 'learning_rate': 8.369127516778524e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62542/75000 [1:00:56<12:54, 16.09it/s]

{'loss': 0.2925, 'grad_norm': 3.883197546005249, 'learning_rate': 8.362416107382551e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62552/75000 [1:00:56<12:35, 16.48it/s]

{'loss': 0.319, 'grad_norm': 1.1605581045150757, 'learning_rate': 8.355704697986576e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62563/75000 [1:00:57<11:28, 18.07it/s]

{'loss': 0.2418, 'grad_norm': 3.8466076850891113, 'learning_rate': 8.348993288590605e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62573/75000 [1:00:57<11:15, 18.41it/s]

{'loss': 0.2779, 'grad_norm': 8.657683372497559, 'learning_rate': 8.342281879194632e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62581/75000 [1:00:58<11:27, 18.07it/s]

{'loss': 0.3296, 'grad_norm': 2.3866405487060547, 'learning_rate': 8.335570469798657e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62592/75000 [1:00:58<11:10, 18.50it/s]

{'loss': 0.3213, 'grad_norm': 10.34543228149414, 'learning_rate': 8.328859060402686e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62603/75000 [1:00:59<10:48, 19.11it/s]

{'loss': 0.2754, 'grad_norm': 3.38061261177063, 'learning_rate': 8.322147651006712e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62614/75000 [1:01:00<10:56, 18.88it/s]

{'loss': 0.2447, 'grad_norm': 2.4043753147125244, 'learning_rate': 8.31543624161074e-06, 'epoch': 2.5}


                                                       
 83%|████████▎ | 62623/75000 [1:01:00<11:41, 17.63it/s]

{'loss': 0.2387, 'grad_norm': 1.2398391962051392, 'learning_rate': 8.308724832214764e-06, 'epoch': 2.5}


                                                       
 84%|████████▎ | 62633/75000 [1:01:01<12:34, 16.40it/s]

{'loss': 0.2682, 'grad_norm': 10.529264450073242, 'learning_rate': 8.302013422818793e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62644/75000 [1:01:01<10:42, 19.24it/s]

{'loss': 0.2646, 'grad_norm': 11.951614379882812, 'learning_rate': 8.29530201342282e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62652/75000 [1:01:02<10:54, 18.87it/s]

{'loss': 0.2562, 'grad_norm': 2.8495771884918213, 'learning_rate': 8.288590604026845e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62661/75000 [1:01:02<11:28, 17.92it/s]

{'loss': 0.2682, 'grad_norm': 6.82599401473999, 'learning_rate': 8.281879194630874e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62673/75000 [1:01:03<10:25, 19.69it/s]

{'loss': 0.3566, 'grad_norm': 1.7583818435668945, 'learning_rate': 8.2751677852349e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62682/75000 [1:01:03<11:36, 17.70it/s]

{'loss': 0.2627, 'grad_norm': 0.5185921788215637, 'learning_rate': 8.268456375838926e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62692/75000 [1:01:04<10:50, 18.93it/s]

{'loss': 0.2565, 'grad_norm': 3.794299602508545, 'learning_rate': 8.261744966442954e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62703/75000 [1:01:04<11:23, 17.99it/s]

{'loss': 0.2451, 'grad_norm': 10.882458686828613, 'learning_rate': 8.255033557046981e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62712/75000 [1:01:05<10:55, 18.74it/s]

{'loss': 0.315, 'grad_norm': 3.583470344543457, 'learning_rate': 8.248322147651006e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62721/75000 [1:01:05<11:43, 17.45it/s]

{'loss': 0.2132, 'grad_norm': 4.667368412017822, 'learning_rate': 8.241610738255035e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62733/75000 [1:01:06<10:37, 19.24it/s]

{'loss': 0.3858, 'grad_norm': 7.613881587982178, 'learning_rate': 8.234899328859061e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62741/75000 [1:01:06<10:31, 19.42it/s]

{'loss': 0.3835, 'grad_norm': 5.307513236999512, 'learning_rate': 8.228187919463087e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62754/75000 [1:01:07<10:08, 20.14it/s]

{'loss': 0.3426, 'grad_norm': 3.182680606842041, 'learning_rate': 8.221476510067115e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62762/75000 [1:01:07<10:30, 19.40it/s]

{'loss': 0.2846, 'grad_norm': 6.97776985168457, 'learning_rate': 8.214765100671142e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62774/75000 [1:01:08<10:45, 18.94it/s]

{'loss': 0.3083, 'grad_norm': 7.116309642791748, 'learning_rate': 8.208053691275167e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62783/75000 [1:01:09<10:29, 19.42it/s]

{'loss': 0.288, 'grad_norm': 0.741104781627655, 'learning_rate': 8.201342281879194e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62792/75000 [1:01:09<11:16, 18.03it/s]

{'loss': 0.2482, 'grad_norm': 1.5267378091812134, 'learning_rate': 8.194630872483223e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62802/75000 [1:01:10<10:40, 19.03it/s]

{'loss': 0.2859, 'grad_norm': 2.592818260192871, 'learning_rate': 8.187919463087248e-06, 'epoch': 2.51}


                                                       
 84%|████████▎ | 62811/75000 [1:01:10<11:55, 17.05it/s]

{'loss': 0.3327, 'grad_norm': 1.8583499193191528, 'learning_rate': 8.181208053691275e-06, 'epoch': 2.51}


                                                       
 84%|████████▍ | 62822/75000 [1:01:11<10:40, 19.02it/s]

{'loss': 0.3636, 'grad_norm': 5.077148914337158, 'learning_rate': 8.174496644295303e-06, 'epoch': 2.51}


                                                       
 84%|████████▍ | 62831/75000 [1:01:11<10:39, 19.04it/s]

{'loss': 0.2725, 'grad_norm': 5.560995101928711, 'learning_rate': 8.16778523489933e-06, 'epoch': 2.51}


                                                       
 84%|████████▍ | 62842/75000 [1:01:12<10:50, 18.69it/s]

{'loss': 0.3236, 'grad_norm': 13.101714134216309, 'learning_rate': 8.161073825503355e-06, 'epoch': 2.51}


                                                       
 84%|████████▍ | 62851/75000 [1:01:12<10:52, 18.61it/s]

{'loss': 0.1797, 'grad_norm': 3.5768725872039795, 'learning_rate': 8.154362416107384e-06, 'epoch': 2.51}


                                                       
 84%|████████▍ | 62864/75000 [1:01:13<10:23, 19.45it/s]

{'loss': 0.291, 'grad_norm': 3.4948830604553223, 'learning_rate': 8.14765100671141e-06, 'epoch': 2.51}


                                                       
 84%|████████▍ | 62871/75000 [1:01:13<11:35, 17.44it/s]

{'loss': 0.2531, 'grad_norm': 1.711556315422058, 'learning_rate': 8.140939597315436e-06, 'epoch': 2.51}


                                                       
 84%|████████▍ | 62882/75000 [1:01:14<10:41, 18.90it/s]

{'loss': 0.2615, 'grad_norm': 1.340492844581604, 'learning_rate': 8.134228187919464e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62893/75000 [1:01:15<11:50, 17.03it/s]

{'loss': 0.2616, 'grad_norm': 1.1819370985031128, 'learning_rate': 8.127516778523491e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62902/75000 [1:01:15<11:12, 18.00it/s]

{'loss': 0.2837, 'grad_norm': 3.5339226722717285, 'learning_rate': 8.120805369127516e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62914/75000 [1:01:16<10:29, 19.20it/s]

{'loss': 0.2753, 'grad_norm': 2.249887228012085, 'learning_rate': 8.114093959731545e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62921/75000 [1:01:16<11:19, 17.76it/s]

{'loss': 0.252, 'grad_norm': 3.0716629028320312, 'learning_rate': 8.107382550335572e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62932/75000 [1:01:17<10:58, 18.33it/s]

{'loss': 0.3047, 'grad_norm': 7.83740234375, 'learning_rate': 8.100671140939597e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62942/75000 [1:01:17<11:56, 16.82it/s]

{'loss': 0.2921, 'grad_norm': 2.0271167755126953, 'learning_rate': 8.093959731543624e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62952/75000 [1:01:18<10:43, 18.73it/s]

{'loss': 0.3686, 'grad_norm': 12.630874633789062, 'learning_rate': 8.087248322147652e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62964/75000 [1:01:18<10:13, 19.61it/s]

{'loss': 0.4712, 'grad_norm': 0.9272594451904297, 'learning_rate': 8.080536912751677e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62971/75000 [1:01:19<10:12, 19.64it/s]

{'loss': 0.2915, 'grad_norm': 16.167734146118164, 'learning_rate': 8.073825503355704e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62981/75000 [1:01:19<10:11, 19.66it/s]

{'loss': 0.2742, 'grad_norm': 4.972487449645996, 'learning_rate': 8.067114093959733e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 62992/75000 [1:01:20<10:25, 19.19it/s]

{'loss': 0.2031, 'grad_norm': 2.4018349647521973, 'learning_rate': 8.060402684563758e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63000/75000 [1:01:20<09:57, 20.07it/s]

{'loss': 0.3545, 'grad_norm': 0.5858672857284546, 'learning_rate': 8.053691275167785e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63014/75000 [1:01:22<12:55, 15.45it/s]

{'loss': 0.2139, 'grad_norm': 1.7668061256408691, 'learning_rate': 8.046979865771813e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63023/75000 [1:01:22<10:58, 18.20it/s]

{'loss': 0.3813, 'grad_norm': 1.5673003196716309, 'learning_rate': 8.040268456375839e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63033/75000 [1:01:23<12:21, 16.13it/s]

{'loss': 0.2438, 'grad_norm': 0.6661591529846191, 'learning_rate': 8.033557046979865e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63042/75000 [1:01:23<10:53, 18.29it/s]

{'loss': 0.208, 'grad_norm': 9.928655624389648, 'learning_rate': 8.026845637583894e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63052/75000 [1:01:24<10:52, 18.32it/s]

{'loss': 0.2216, 'grad_norm': 5.310250759124756, 'learning_rate': 8.02013422818792e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63063/75000 [1:01:24<10:36, 18.76it/s]

{'loss': 0.2548, 'grad_norm': 4.456537246704102, 'learning_rate': 8.013422818791946e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63072/75000 [1:01:25<10:49, 18.37it/s]

{'loss': 0.3245, 'grad_norm': 1.0697664022445679, 'learning_rate': 8.006711409395974e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63084/75000 [1:01:25<09:59, 19.87it/s]

{'loss': 0.3, 'grad_norm': 1.4393647909164429, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63093/75000 [1:01:26<10:40, 18.60it/s]

{'loss': 0.3758, 'grad_norm': 2.662278413772583, 'learning_rate': 7.993288590604026e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63103/75000 [1:01:26<10:48, 18.36it/s]

{'loss': 0.3089, 'grad_norm': 4.686275482177734, 'learning_rate': 7.986577181208055e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63111/75000 [1:01:27<10:33, 18.76it/s]

{'loss': 0.2951, 'grad_norm': 5.324670314788818, 'learning_rate': 7.979865771812082e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63123/75000 [1:01:27<09:51, 20.07it/s]

{'loss': 0.3718, 'grad_norm': 7.983702659606934, 'learning_rate': 7.973154362416107e-06, 'epoch': 2.52}


                                                       
 84%|████████▍ | 63131/75000 [1:01:28<10:05, 19.61it/s]

{'loss': 0.2113, 'grad_norm': 5.018199920654297, 'learning_rate': 7.966442953020134e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63141/75000 [1:01:28<10:37, 18.61it/s]

{'loss': 0.2842, 'grad_norm': 2.335594892501831, 'learning_rate': 7.959731543624162e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63152/75000 [1:01:29<10:46, 18.34it/s]

{'loss': 0.3591, 'grad_norm': 5.452169895172119, 'learning_rate': 7.953020134228188e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63163/75000 [1:01:30<10:36, 18.61it/s]

{'loss': 0.4377, 'grad_norm': 7.924773216247559, 'learning_rate': 7.946308724832214e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63172/75000 [1:01:30<10:51, 18.16it/s]

{'loss': 0.3468, 'grad_norm': 6.132851600646973, 'learning_rate': 7.939597315436243e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63184/75000 [1:01:31<10:19, 19.08it/s]

{'loss': 0.277, 'grad_norm': 4.398427963256836, 'learning_rate': 7.932885906040268e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63194/75000 [1:01:31<09:59, 19.69it/s]

{'loss': 0.2188, 'grad_norm': 1.083250641822815, 'learning_rate': 7.926174496644295e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63202/75000 [1:01:32<10:50, 18.15it/s]

{'loss': 0.3584, 'grad_norm': 12.86214542388916, 'learning_rate': 7.919463087248324e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63212/75000 [1:01:32<11:55, 16.48it/s]

{'loss': 0.3152, 'grad_norm': 0.8732895851135254, 'learning_rate': 7.912751677852349e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63222/75000 [1:01:33<10:47, 18.19it/s]

{'loss': 0.2861, 'grad_norm': 1.0787807703018188, 'learning_rate': 7.906040268456376e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63231/75000 [1:01:33<11:53, 16.49it/s]

{'loss': 0.3115, 'grad_norm': 3.497128963470459, 'learning_rate': 7.899328859060404e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63242/75000 [1:01:34<10:20, 18.96it/s]

{'loss': 0.3085, 'grad_norm': 7.19075870513916, 'learning_rate': 7.89261744966443e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63252/75000 [1:01:34<10:03, 19.47it/s]

{'loss': 0.3299, 'grad_norm': 8.095046043395996, 'learning_rate': 7.885906040268456e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63263/75000 [1:01:35<10:13, 19.12it/s]

{'loss': 0.2088, 'grad_norm': 5.368488311767578, 'learning_rate': 7.879194630872485e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63273/75000 [1:01:36<10:29, 18.62it/s]

{'loss': 0.2481, 'grad_norm': 3.1221725940704346, 'learning_rate': 7.87248322147651e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63282/75000 [1:01:36<10:20, 18.87it/s]

{'loss': 0.2809, 'grad_norm': 5.279118537902832, 'learning_rate': 7.865771812080537e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63293/75000 [1:01:37<10:45, 18.15it/s]

{'loss': 0.3898, 'grad_norm': 10.087167739868164, 'learning_rate': 7.859060402684564e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63304/75000 [1:01:37<09:53, 19.71it/s]

{'loss': 0.2168, 'grad_norm': 1.6834888458251953, 'learning_rate': 7.852348993288592e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63312/75000 [1:01:38<10:12, 19.07it/s]

{'loss': 0.2226, 'grad_norm': 1.137502670288086, 'learning_rate': 7.845637583892617e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63322/75000 [1:01:38<10:11, 19.10it/s]

{'loss': 0.3296, 'grad_norm': 1.8272415399551392, 'learning_rate': 7.838926174496644e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63333/75000 [1:01:39<10:49, 17.97it/s]

{'loss': 0.297, 'grad_norm': 3.284996509552002, 'learning_rate': 7.832214765100673e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63343/75000 [1:01:39<10:39, 18.23it/s]

{'loss': 0.2948, 'grad_norm': 0.9425747394561768, 'learning_rate': 7.825503355704698e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63353/75000 [1:01:40<10:04, 19.26it/s]

{'loss': 0.3475, 'grad_norm': 1.5120383501052856, 'learning_rate': 7.818791946308725e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63362/75000 [1:01:40<10:30, 18.45it/s]

{'loss': 0.2215, 'grad_norm': 4.791747093200684, 'learning_rate': 7.812080536912753e-06, 'epoch': 2.53}


                                                       
 84%|████████▍ | 63374/75000 [1:01:41<10:01, 19.34it/s]

{'loss': 0.3027, 'grad_norm': 2.2639989852905273, 'learning_rate': 7.805369127516778e-06, 'epoch': 2.53}


                                                       
 85%|████████▍ | 63384/75000 [1:01:41<09:45, 19.85it/s]

{'loss': 0.2925, 'grad_norm': 4.547506332397461, 'learning_rate': 7.798657718120805e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63392/75000 [1:01:42<10:01, 19.29it/s]

{'loss': 0.2245, 'grad_norm': 2.3061320781707764, 'learning_rate': 7.791946308724834e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63403/75000 [1:01:42<10:37, 18.19it/s]

{'loss': 0.332, 'grad_norm': 5.453913688659668, 'learning_rate': 7.785234899328859e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63412/75000 [1:01:43<10:22, 18.61it/s]

{'loss': 0.3306, 'grad_norm': 3.7351720333099365, 'learning_rate': 7.778523489932886e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63422/75000 [1:01:43<10:34, 18.24it/s]

{'loss': 0.27, 'grad_norm': 6.196408271789551, 'learning_rate': 7.771812080536914e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63433/75000 [1:01:44<10:35, 18.20it/s]

{'loss': 0.2929, 'grad_norm': 6.393050193786621, 'learning_rate': 7.76510067114094e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63443/75000 [1:01:45<11:13, 17.15it/s]

{'loss': 0.3101, 'grad_norm': 4.536973476409912, 'learning_rate': 7.758389261744966e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63452/75000 [1:01:45<10:47, 17.85it/s]

{'loss': 0.2943, 'grad_norm': 3.8331055641174316, 'learning_rate': 7.751677852348993e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63463/75000 [1:01:46<11:27, 16.79it/s]

{'loss': 0.2978, 'grad_norm': 4.672941207885742, 'learning_rate': 7.74496644295302e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63473/75000 [1:01:46<10:51, 17.68it/s]

{'loss': 0.2786, 'grad_norm': 9.122527122497559, 'learning_rate': 7.738255033557047e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63484/75000 [1:01:47<09:47, 19.60it/s]

{'loss': 0.3171, 'grad_norm': 3.5198769569396973, 'learning_rate': 7.731543624161074e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63493/75000 [1:01:47<10:42, 17.91it/s]

{'loss': 0.2698, 'grad_norm': 3.371835470199585, 'learning_rate': 7.7248322147651e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63500/75000 [1:01:48<10:05, 18.98it/s]

{'loss': 0.2699, 'grad_norm': 3.360811948776245, 'learning_rate': 7.718120805369127e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63513/75000 [1:01:49<12:31, 15.28it/s]

{'loss': 0.2363, 'grad_norm': 2.18143630027771, 'learning_rate': 7.711409395973154e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63524/75000 [1:01:50<10:24, 18.36it/s]

{'loss': 0.3457, 'grad_norm': 5.891334533691406, 'learning_rate': 7.704697986577183e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63533/75000 [1:01:50<11:14, 17.01it/s]

{'loss': 0.2365, 'grad_norm': 3.2282841205596924, 'learning_rate': 7.697986577181208e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63542/75000 [1:01:51<10:15, 18.62it/s]

{'loss': 0.3971, 'grad_norm': 4.1450042724609375, 'learning_rate': 7.691275167785235e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63552/75000 [1:01:51<10:45, 17.73it/s]

{'loss': 0.2591, 'grad_norm': 3.105917453765869, 'learning_rate': 7.684563758389263e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63563/75000 [1:01:52<10:24, 18.30it/s]

{'loss': 0.3247, 'grad_norm': 1.0949041843414307, 'learning_rate': 7.677852348993289e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63573/75000 [1:01:52<10:43, 17.75it/s]

{'loss': 0.1856, 'grad_norm': 2.953864574432373, 'learning_rate': 7.671140939597315e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63583/75000 [1:01:53<10:23, 18.30it/s]

{'loss': 0.3693, 'grad_norm': 10.952201843261719, 'learning_rate': 7.664429530201344e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63593/75000 [1:01:54<09:59, 19.03it/s]

{'loss': 0.2575, 'grad_norm': 2.193636894226074, 'learning_rate': 7.657718120805369e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63603/75000 [1:01:54<10:20, 18.37it/s]

{'loss': 0.3137, 'grad_norm': 4.401527404785156, 'learning_rate': 7.651006711409396e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63612/75000 [1:01:55<10:42, 17.72it/s]

{'loss': 0.3439, 'grad_norm': 6.235651016235352, 'learning_rate': 7.644295302013423e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63624/75000 [1:01:55<10:02, 18.89it/s]

{'loss': 0.373, 'grad_norm': 2.448301076889038, 'learning_rate': 7.63758389261745e-06, 'epoch': 2.54}


                                                       
 85%|████████▍ | 63634/75000 [1:01:56<09:44, 19.45it/s]

{'loss': 0.2872, 'grad_norm': 7.4724884033203125, 'learning_rate': 7.630872483221477e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63643/75000 [1:01:56<10:20, 18.30it/s]

{'loss': 0.2602, 'grad_norm': 0.6761742234230042, 'learning_rate': 7.624161073825503e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63653/75000 [1:01:57<10:54, 17.33it/s]

{'loss': 0.3661, 'grad_norm': 5.350281715393066, 'learning_rate': 7.617449664429531e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63662/75000 [1:01:57<10:15, 18.43it/s]

{'loss': 0.3814, 'grad_norm': 8.803608894348145, 'learning_rate': 7.610738255033557e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63673/75000 [1:01:58<10:16, 18.36it/s]

{'loss': 0.2747, 'grad_norm': 6.663046836853027, 'learning_rate': 7.604026845637584e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63683/75000 [1:01:59<10:42, 17.62it/s]

{'loss': 0.2274, 'grad_norm': 12.365743637084961, 'learning_rate': 7.597315436241612e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63692/75000 [1:01:59<09:59, 18.87it/s]

{'loss': 0.2846, 'grad_norm': 0.2875560522079468, 'learning_rate': 7.590604026845638e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63701/75000 [1:01:59<09:51, 19.11it/s]

{'loss': 0.251, 'grad_norm': 10.593164443969727, 'learning_rate': 7.5838926174496645e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63714/75000 [1:02:00<09:56, 18.94it/s]

{'loss': 0.2458, 'grad_norm': 10.908656120300293, 'learning_rate': 7.577181208053692e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63722/75000 [1:02:01<10:11, 18.45it/s]

{'loss': 0.3191, 'grad_norm': 2.0320589542388916, 'learning_rate': 7.570469798657718e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63733/75000 [1:02:01<09:49, 19.10it/s]

{'loss': 0.2323, 'grad_norm': 3.5918664932250977, 'learning_rate': 7.563758389261745e-06, 'epoch': 2.55}


                                                       
 85%|████████▍ | 63743/75000 [1:02:02<10:27, 17.93it/s]

{'loss': 0.3246, 'grad_norm': 3.359506130218506, 'learning_rate': 7.557046979865773e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63751/75000 [1:02:02<09:57, 18.84it/s]

{'loss': 0.332, 'grad_norm': 3.4706978797912598, 'learning_rate': 7.5503355704698e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63762/75000 [1:02:03<11:36, 16.14it/s]

{'loss': 0.2261, 'grad_norm': 6.759657859802246, 'learning_rate': 7.543624161073826e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63772/75000 [1:02:03<10:44, 17.42it/s]

{'loss': 0.391, 'grad_norm': 3.362058162689209, 'learning_rate': 7.536912751677852e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63782/75000 [1:02:04<10:10, 18.37it/s]

{'loss': 0.2735, 'grad_norm': 4.182936191558838, 'learning_rate': 7.53020134228188e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63792/75000 [1:02:04<09:47, 19.07it/s]

{'loss': 0.2878, 'grad_norm': 1.7878109216690063, 'learning_rate': 7.523489932885906e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63801/75000 [1:02:05<10:32, 17.72it/s]

{'loss': 0.3143, 'grad_norm': 3.915334939956665, 'learning_rate': 7.516778523489933e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63812/75000 [1:02:05<11:17, 16.51it/s]

{'loss': 0.2729, 'grad_norm': 2.692507028579712, 'learning_rate': 7.510067114093961e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63822/75000 [1:02:06<13:02, 14.29it/s]

{'loss': 0.3097, 'grad_norm': 7.020455360412598, 'learning_rate': 7.503355704697987e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63831/75000 [1:02:07<10:45, 17.30it/s]

{'loss': 0.3744, 'grad_norm': 6.681194305419922, 'learning_rate': 7.496644295302014e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63843/75000 [1:02:07<10:04, 18.46it/s]

{'loss': 0.2506, 'grad_norm': 0.8716518878936768, 'learning_rate': 7.489932885906041e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63852/75000 [1:02:08<09:41, 19.18it/s]

{'loss': 0.3029, 'grad_norm': 10.6482572555542, 'learning_rate': 7.483221476510067e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63862/75000 [1:02:08<09:35, 19.36it/s]

{'loss': 0.1944, 'grad_norm': 2.328670024871826, 'learning_rate': 7.476510067114094e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63871/75000 [1:02:09<09:20, 19.87it/s]

{'loss': 0.3255, 'grad_norm': 1.5344680547714233, 'learning_rate': 7.469798657718122e-06, 'epoch': 2.55}


                                                       
 85%|████████▌ | 63882/75000 [1:02:09<09:40, 19.15it/s]

{'loss': 0.451, 'grad_norm': 2.93540620803833, 'learning_rate': 7.463087248322148e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63893/75000 [1:02:10<09:24, 19.67it/s]

{'loss': 0.3541, 'grad_norm': 9.053183555603027, 'learning_rate': 7.456375838926175e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63904/75000 [1:02:10<09:27, 19.54it/s]

{'loss': 0.2164, 'grad_norm': 0.746790885925293, 'learning_rate': 7.4496644295302024e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63913/75000 [1:02:11<09:16, 19.91it/s]

{'loss': 0.2146, 'grad_norm': 5.198404312133789, 'learning_rate': 7.4429530201342284e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63922/75000 [1:02:11<09:04, 20.33it/s]

{'loss': 0.2312, 'grad_norm': 1.8234401941299438, 'learning_rate': 7.436241610738255e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63934/75000 [1:02:12<08:57, 20.57it/s]

{'loss': 0.1754, 'grad_norm': 3.8821022510528564, 'learning_rate': 7.429530201342281e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63943/75000 [1:02:12<09:09, 20.12it/s]

{'loss': 0.2457, 'grad_norm': 3.311462879180908, 'learning_rate': 7.422818791946309e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63952/75000 [1:02:13<09:19, 19.76it/s]

{'loss': 0.255, 'grad_norm': 2.4981496334075928, 'learning_rate': 7.416107382550336e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63963/75000 [1:02:13<09:14, 19.89it/s]

{'loss': 0.2284, 'grad_norm': 4.374796390533447, 'learning_rate': 7.409395973154362e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63972/75000 [1:02:14<09:09, 20.06it/s]

{'loss': 0.3433, 'grad_norm': 2.8340861797332764, 'learning_rate': 7.4026845637583896e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63984/75000 [1:02:14<08:59, 20.41it/s]

{'loss': 0.2389, 'grad_norm': 0.9013609290122986, 'learning_rate': 7.395973154362416e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 63992/75000 [1:02:15<09:51, 18.62it/s]

{'loss': 0.3801, 'grad_norm': 1.6746678352355957, 'learning_rate': 7.389261744966442e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64000/75000 [1:02:15<09:09, 20.01it/s]

{'loss': 0.3927, 'grad_norm': 9.486668586730957, 'learning_rate': 7.382550335570471e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64012/75000 [1:02:17<13:13, 13.85it/s]

{'loss': 0.3974, 'grad_norm': 6.05322265625, 'learning_rate': 7.375838926174497e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64022/75000 [1:02:17<10:26, 17.53it/s]

{'loss': 0.2451, 'grad_norm': 3.78263783454895, 'learning_rate': 7.369127516778524e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64032/75000 [1:02:18<10:28, 17.45it/s]

{'loss': 0.3789, 'grad_norm': 1.4371223449707031, 'learning_rate': 7.3624161073825515e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64041/75000 [1:02:18<09:40, 18.88it/s]

{'loss': 0.2693, 'grad_norm': 2.6886894702911377, 'learning_rate': 7.3557046979865775e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64054/75000 [1:02:19<08:55, 20.43it/s]

{'loss': 0.3153, 'grad_norm': 1.3453868627548218, 'learning_rate': 7.348993288590604e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64063/75000 [1:02:19<09:00, 20.23it/s]

{'loss': 0.2455, 'grad_norm': 3.2437307834625244, 'learning_rate': 7.342281879194632e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64072/75000 [1:02:20<09:03, 20.10it/s]

{'loss': 0.3161, 'grad_norm': 4.116611957550049, 'learning_rate': 7.335570469798658e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64084/75000 [1:02:20<08:55, 20.37it/s]

{'loss': 0.3648, 'grad_norm': 5.529750823974609, 'learning_rate': 7.328859060402685e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64092/75000 [1:02:21<09:13, 19.70it/s]

{'loss': 0.2854, 'grad_norm': 8.081027030944824, 'learning_rate': 7.322147651006711e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64103/75000 [1:02:21<09:47, 18.54it/s]

{'loss': 0.4261, 'grad_norm': 26.654115676879883, 'learning_rate': 7.315436241610739e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64112/75000 [1:02:22<09:19, 19.47it/s]

{'loss': 0.2821, 'grad_norm': 2.9429855346679688, 'learning_rate': 7.3087248322147655e-06, 'epoch': 2.56}


                                                       
 85%|████████▌ | 64124/75000 [1:02:22<09:32, 18.99it/s]

{'loss': 0.2079, 'grad_norm': 0.6999497413635254, 'learning_rate': 7.3020134228187915e-06, 'epoch': 2.56}


                                                       
 86%|████████▌ | 64133/75000 [1:02:23<09:35, 18.89it/s]

{'loss': 0.2872, 'grad_norm': 2.1286823749542236, 'learning_rate': 7.295302013422819e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64141/75000 [1:02:23<09:23, 19.27it/s]

{'loss': 0.2882, 'grad_norm': 3.303889751434326, 'learning_rate': 7.288590604026846e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64152/75000 [1:02:24<09:53, 18.28it/s]

{'loss': 0.2181, 'grad_norm': 10.824252128601074, 'learning_rate': 7.281879194630872e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64164/75000 [1:02:24<08:50, 20.42it/s]

{'loss': 0.3281, 'grad_norm': 4.4483256340026855, 'learning_rate': 7.2751677852349e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64172/75000 [1:02:25<09:36, 18.79it/s]

{'loss': 0.3719, 'grad_norm': 1.5935261249542236, 'learning_rate': 7.268456375838927e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64182/75000 [1:02:25<09:16, 19.42it/s]

{'loss': 0.1794, 'grad_norm': 3.406507968902588, 'learning_rate': 7.261744966442953e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64191/75000 [1:02:26<09:38, 18.68it/s]

{'loss': 0.2695, 'grad_norm': 0.9113039374351501, 'learning_rate': 7.25503355704698e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64202/75000 [1:02:26<09:00, 19.97it/s]

{'loss': 0.1532, 'grad_norm': 1.755829095840454, 'learning_rate': 7.248322147651007e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64213/75000 [1:02:27<09:31, 18.86it/s]

{'loss': 0.1729, 'grad_norm': 2.120087146759033, 'learning_rate': 7.241610738255033e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64223/75000 [1:02:27<09:19, 19.26it/s]

{'loss': 0.3364, 'grad_norm': 5.41347074508667, 'learning_rate': 7.234899328859062e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64231/75000 [1:02:28<09:36, 18.67it/s]

{'loss': 0.3616, 'grad_norm': 7.168728351593018, 'learning_rate': 7.228187919463088e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64243/75000 [1:02:28<08:59, 19.92it/s]

{'loss': 0.3574, 'grad_norm': 1.3408467769622803, 'learning_rate': 7.221476510067115e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64252/75000 [1:02:29<09:16, 19.31it/s]

{'loss': 0.2902, 'grad_norm': 3.2301907539367676, 'learning_rate': 7.214765100671142e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64264/75000 [1:02:30<08:50, 20.24it/s]

{'loss': 0.313, 'grad_norm': 4.974121570587158, 'learning_rate': 7.208053691275168e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64272/75000 [1:02:30<10:02, 17.80it/s]

{'loss': 0.231, 'grad_norm': 10.037224769592285, 'learning_rate': 7.201342281879195e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64283/75000 [1:02:31<09:09, 19.52it/s]

{'loss': 0.2254, 'grad_norm': 4.556638717651367, 'learning_rate': 7.194630872483221e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64294/75000 [1:02:31<09:21, 19.08it/s]

{'loss': 0.4292, 'grad_norm': 2.34024715423584, 'learning_rate': 7.187919463087249e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64302/75000 [1:02:32<09:02, 19.74it/s]

{'loss': 0.2073, 'grad_norm': 2.1440744400024414, 'learning_rate': 7.181208053691276e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64313/75000 [1:02:32<08:53, 20.05it/s]

{'loss': 0.239, 'grad_norm': 9.519523620605469, 'learning_rate': 7.174496644295302e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64322/75000 [1:02:33<08:57, 19.88it/s]

{'loss': 0.2996, 'grad_norm': 4.043999671936035, 'learning_rate': 7.167785234899329e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64331/75000 [1:02:33<09:51, 18.03it/s]

{'loss': 0.348, 'grad_norm': 5.581386089324951, 'learning_rate': 7.161073825503356e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64342/75000 [1:02:34<09:13, 19.26it/s]

{'loss': 0.3008, 'grad_norm': 2.0100347995758057, 'learning_rate': 7.154362416107382e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64351/75000 [1:02:34<09:02, 19.64it/s]

{'loss': 0.2333, 'grad_norm': 0.4390959143638611, 'learning_rate': 7.14765100671141e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64363/75000 [1:02:35<08:30, 20.84it/s]

{'loss': 0.2579, 'grad_norm': 6.526824474334717, 'learning_rate': 7.140939597315437e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64372/75000 [1:02:35<08:45, 20.23it/s]

{'loss': 0.3172, 'grad_norm': 5.782339096069336, 'learning_rate': 7.134228187919463e-06, 'epoch': 2.57}


                                                       
 86%|████████▌ | 64382/75000 [1:02:36<09:13, 19.18it/s]

{'loss': 0.3037, 'grad_norm': 2.989793539047241, 'learning_rate': 7.1275167785234905e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64392/75000 [1:02:36<09:15, 19.08it/s]

{'loss': 0.3105, 'grad_norm': 2.0220723152160645, 'learning_rate': 7.120805369127517e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64401/75000 [1:02:37<08:57, 19.73it/s]

{'loss': 0.4408, 'grad_norm': 7.970328330993652, 'learning_rate': 7.114093959731543e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64414/75000 [1:02:37<08:47, 20.06it/s]

{'loss': 0.2307, 'grad_norm': 4.350165843963623, 'learning_rate': 7.107382550335571e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64423/75000 [1:02:38<09:14, 19.08it/s]

{'loss': 0.2648, 'grad_norm': 3.0802619457244873, 'learning_rate': 7.100671140939598e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64432/75000 [1:02:38<08:50, 19.91it/s]

{'loss': 0.2664, 'grad_norm': 11.905723571777344, 'learning_rate': 7.093959731543624e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64441/75000 [1:02:39<08:44, 20.13it/s]

{'loss': 0.3113, 'grad_norm': 5.407517910003662, 'learning_rate': 7.087248322147651e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64453/75000 [1:02:39<08:33, 20.55it/s]

{'loss': 0.248, 'grad_norm': 2.6564371585845947, 'learning_rate': 7.0805369127516785e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64462/75000 [1:02:40<08:40, 20.26it/s]

{'loss': 0.2979, 'grad_norm': 6.425622940063477, 'learning_rate': 7.0738255033557045e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64474/75000 [1:02:40<08:36, 20.36it/s]

{'loss': 0.3865, 'grad_norm': 3.9193339347839355, 'learning_rate': 7.067114093959731e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64483/75000 [1:02:41<08:34, 20.44it/s]

{'loss': 0.3113, 'grad_norm': 3.409421682357788, 'learning_rate': 7.060402684563759e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64494/75000 [1:02:41<08:53, 19.68it/s]

{'loss': 0.4022, 'grad_norm': 3.3324484825134277, 'learning_rate': 7.053691275167786e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64500/75000 [1:02:42<08:28, 20.66it/s]

{'loss': 0.3606, 'grad_norm': 4.1192946434021, 'learning_rate': 7.046979865771812e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64512/75000 [1:02:43<12:59, 13.46it/s]

{'loss': 0.3173, 'grad_norm': 1.8974158763885498, 'learning_rate': 7.04026845637584e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64523/75000 [1:02:43<10:15, 17.01it/s]

{'loss': 0.1712, 'grad_norm': 2.4174978733062744, 'learning_rate': 7.0335570469798665e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64532/75000 [1:02:44<10:04, 17.31it/s]

{'loss': 0.3711, 'grad_norm': 2.271634578704834, 'learning_rate': 7.0268456375838925e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64542/75000 [1:02:45<10:06, 17.23it/s]

{'loss': 0.4356, 'grad_norm': 11.777588844299316, 'learning_rate': 7.02013422818792e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64551/75000 [1:02:45<09:22, 18.56it/s]

{'loss': 0.3975, 'grad_norm': 2.4873666763305664, 'learning_rate': 7.013422818791947e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64563/75000 [1:02:46<08:38, 20.14it/s]

{'loss': 0.2904, 'grad_norm': 7.04632568359375, 'learning_rate': 7.006711409395973e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64572/75000 [1:02:46<08:32, 20.36it/s]

{'loss': 0.2354, 'grad_norm': 2.1460068225860596, 'learning_rate': 7.000000000000001e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64581/75000 [1:02:46<09:06, 19.07it/s]

{'loss': 0.3215, 'grad_norm': 5.763335704803467, 'learning_rate': 6.993288590604028e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64594/75000 [1:02:47<08:30, 20.37it/s]

{'loss': 0.2935, 'grad_norm': 3.384053945541382, 'learning_rate': 6.986577181208054e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64602/75000 [1:02:48<09:16, 18.70it/s]

{'loss': 0.1549, 'grad_norm': 0.971971333026886, 'learning_rate': 6.9798657718120805e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64611/75000 [1:02:48<08:38, 20.03it/s]

{'loss': 0.2688, 'grad_norm': 2.2277255058288574, 'learning_rate': 6.973154362416108e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64624/75000 [1:02:49<08:44, 19.80it/s]

{'loss': 0.2044, 'grad_norm': 1.0838912725448608, 'learning_rate': 6.966442953020134e-06, 'epoch': 2.58}


                                                       
 86%|████████▌ | 64634/75000 [1:02:49<08:30, 20.30it/s]

{'loss': 0.2311, 'grad_norm': 4.076871871948242, 'learning_rate': 6.959731543624161e-06, 'epoch': 2.59}


                                                       
 86%|████████▌ | 64643/75000 [1:02:50<09:13, 18.71it/s]

{'loss': 0.3866, 'grad_norm': 4.623733043670654, 'learning_rate': 6.953020134228189e-06, 'epoch': 2.59}


                                                       
 86%|████████▌ | 64653/75000 [1:02:50<08:55, 19.34it/s]

{'loss': 0.2254, 'grad_norm': 3.7023608684539795, 'learning_rate': 6.946308724832215e-06, 'epoch': 2.59}


                                                       
 86%|████████▌ | 64663/75000 [1:02:51<08:50, 19.47it/s]

{'loss': 0.4459, 'grad_norm': 1.0070246458053589, 'learning_rate': 6.9395973154362416e-06, 'epoch': 2.59}


                                                       
 86%|████████▌ | 64671/75000 [1:02:51<08:45, 19.67it/s]

{'loss': 0.3219, 'grad_norm': 3.6941614151000977, 'learning_rate': 6.932885906040269e-06, 'epoch': 2.59}


                                                       
 86%|████████▌ | 64684/75000 [1:02:52<08:45, 19.65it/s]

{'loss': 0.3651, 'grad_norm': 1.8901556730270386, 'learning_rate': 6.926174496644295e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64693/75000 [1:02:52<08:53, 19.33it/s]

{'loss': 0.4074, 'grad_norm': 2.847033739089966, 'learning_rate': 6.919463087248322e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64701/75000 [1:02:53<09:30, 18.06it/s]

{'loss': 0.2616, 'grad_norm': 2.622565269470215, 'learning_rate': 6.91275167785235e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64712/75000 [1:02:53<08:25, 20.34it/s]

{'loss': 0.2964, 'grad_norm': 2.0776913166046143, 'learning_rate': 6.906040268456377e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64721/75000 [1:02:54<09:29, 18.04it/s]

{'loss': 0.3027, 'grad_norm': 2.2630422115325928, 'learning_rate': 6.899328859060403e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64733/75000 [1:02:54<09:10, 18.64it/s]

{'loss': 0.3305, 'grad_norm': 2.170056104660034, 'learning_rate': 6.89261744966443e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64742/75000 [1:02:55<08:49, 19.36it/s]

{'loss': 0.2644, 'grad_norm': 1.0630905628204346, 'learning_rate': 6.885906040268457e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64753/75000 [1:02:55<08:53, 19.22it/s]

{'loss': 0.2929, 'grad_norm': 5.985653400421143, 'learning_rate': 6.879194630872483e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64762/75000 [1:02:56<08:57, 19.04it/s]

{'loss': 0.3844, 'grad_norm': 5.577304363250732, 'learning_rate': 6.87248322147651e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64771/75000 [1:02:56<09:39, 17.67it/s]

{'loss': 0.3027, 'grad_norm': 3.3597841262817383, 'learning_rate': 6.865771812080538e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64782/75000 [1:02:57<08:30, 20.02it/s]

{'loss': 0.2481, 'grad_norm': 5.737802982330322, 'learning_rate': 6.859060402684564e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64793/75000 [1:02:57<09:04, 18.74it/s]

{'loss': 0.2076, 'grad_norm': 3.387543201446533, 'learning_rate': 6.852348993288591e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64803/75000 [1:02:58<09:09, 18.55it/s]

{'loss': 0.2342, 'grad_norm': 2.357227087020874, 'learning_rate': 6.845637583892618e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64813/75000 [1:02:59<08:38, 19.64it/s]

{'loss': 0.262, 'grad_norm': 2.276867628097534, 'learning_rate': 6.838926174496644e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64822/75000 [1:02:59<08:32, 19.85it/s]

{'loss': 0.2959, 'grad_norm': 4.926451206207275, 'learning_rate': 6.832214765100671e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64833/75000 [1:03:00<08:28, 19.99it/s]

{'loss': 0.3651, 'grad_norm': 7.054794788360596, 'learning_rate': 6.825503355704699e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64842/75000 [1:03:00<08:42, 19.42it/s]

{'loss': 0.282, 'grad_norm': 2.641038179397583, 'learning_rate': 6.818791946308725e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64851/75000 [1:03:00<08:29, 19.93it/s]

{'loss': 0.341, 'grad_norm': 3.8267648220062256, 'learning_rate': 6.812080536912752e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64864/75000 [1:03:01<08:33, 19.72it/s]

{'loss': 0.3953, 'grad_norm': 4.404625415802002, 'learning_rate': 6.8053691275167795e-06, 'epoch': 2.59}


                                                       
 86%|████████▋ | 64873/75000 [1:03:02<08:53, 18.99it/s]

{'loss': 0.2572, 'grad_norm': 4.8904547691345215, 'learning_rate': 6.7986577181208055e-06, 'epoch': 2.59}


                                                       
 87%|████████▋ | 64882/75000 [1:03:02<08:57, 18.81it/s]

{'loss': 0.3013, 'grad_norm': 13.18942928314209, 'learning_rate': 6.791946308724832e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64894/75000 [1:03:03<08:24, 20.03it/s]

{'loss': 0.2528, 'grad_norm': 0.7455040216445923, 'learning_rate': 6.78523489932886e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64902/75000 [1:03:03<09:18, 18.07it/s]

{'loss': 0.3046, 'grad_norm': 5.575438976287842, 'learning_rate': 6.778523489932886e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64914/75000 [1:03:04<08:20, 20.15it/s]

{'loss': 0.3107, 'grad_norm': 5.852664470672607, 'learning_rate': 6.771812080536913e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64924/75000 [1:03:04<08:38, 19.42it/s]

{'loss': 0.3521, 'grad_norm': 7.372004985809326, 'learning_rate': 6.765100671140939e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64930/75000 [1:03:05<08:09, 20.56it/s]

{'loss': 0.1886, 'grad_norm': 5.156833648681641, 'learning_rate': 6.7583892617449675e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64944/75000 [1:03:05<08:15, 20.28it/s]

{'loss': 0.4167, 'grad_norm': 5.84256649017334, 'learning_rate': 6.7516778523489935e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64953/75000 [1:03:06<08:37, 19.40it/s]

{'loss': 0.2479, 'grad_norm': 4.171980857849121, 'learning_rate': 6.7449664429530195e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64961/75000 [1:03:06<08:30, 19.66it/s]

{'loss': 0.3545, 'grad_norm': 10.117718696594238, 'learning_rate': 6.738255033557048e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64974/75000 [1:03:07<08:31, 19.61it/s]

{'loss': 0.3036, 'grad_norm': 8.22131633758545, 'learning_rate': 6.731543624161074e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64984/75000 [1:03:07<08:49, 18.91it/s]

{'loss': 0.2335, 'grad_norm': 4.71046781539917, 'learning_rate': 6.724832214765101e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 64994/75000 [1:03:08<08:24, 19.85it/s]

{'loss': 0.3541, 'grad_norm': 3.162785291671753, 'learning_rate': 6.7181208053691286e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65000/75000 [1:03:08<09:08, 18.25it/s]

{'loss': 0.4398, 'grad_norm': 0.572527289390564, 'learning_rate': 6.7114093959731546e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65012/75000 [1:03:09<11:41, 14.24it/s]

{'loss': 0.4452, 'grad_norm': 7.590449333190918, 'learning_rate': 6.7046979865771814e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65024/75000 [1:03:10<09:09, 18.14it/s]

{'loss': 0.2728, 'grad_norm': 8.688742637634277, 'learning_rate': 6.697986577181209e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65031/75000 [1:03:10<08:58, 18.53it/s]

{'loss': 0.1695, 'grad_norm': 1.629207730293274, 'learning_rate': 6.691275167785235e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65041/75000 [1:03:11<08:27, 19.63it/s]

{'loss': 0.2682, 'grad_norm': 4.020195007324219, 'learning_rate': 6.684563758389262e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65054/75000 [1:03:12<08:05, 20.47it/s]

{'loss': 0.351, 'grad_norm': 4.136592388153076, 'learning_rate': 6.67785234899329e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65063/75000 [1:03:12<08:16, 20.02it/s]

{'loss': 0.2766, 'grad_norm': 1.9663312435150146, 'learning_rate': 6.671140939597316e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65072/75000 [1:03:13<08:15, 20.03it/s]

{'loss': 0.2357, 'grad_norm': 2.3709256649017334, 'learning_rate': 6.6644295302013425e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65081/75000 [1:03:13<08:09, 20.25it/s]

{'loss': 0.3459, 'grad_norm': 1.677689552307129, 'learning_rate': 6.6577181208053686e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65093/75000 [1:03:14<08:05, 20.39it/s]

{'loss': 0.3069, 'grad_norm': 4.541982173919678, 'learning_rate': 6.651006711409396e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65102/75000 [1:03:14<08:18, 19.86it/s]

{'loss': 0.3131, 'grad_norm': 1.5863430500030518, 'learning_rate': 6.644295302013423e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65114/75000 [1:03:15<07:58, 20.65it/s]

{'loss': 0.2887, 'grad_norm': 2.762526512145996, 'learning_rate': 6.637583892617449e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65123/75000 [1:03:15<08:17, 19.84it/s]

{'loss': 0.2602, 'grad_norm': 2.01715350151062, 'learning_rate': 6.630872483221477e-06, 'epoch': 2.6}


                                                       
 87%|████████▋ | 65132/75000 [1:03:16<08:31, 19.29it/s]

{'loss': 0.3148, 'grad_norm': 3.8231167793273926, 'learning_rate': 6.624161073825504e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65144/75000 [1:03:16<08:03, 20.37it/s]

{'loss': 0.302, 'grad_norm': 8.769762992858887, 'learning_rate': 6.61744966442953e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65152/75000 [1:03:17<08:45, 18.76it/s]

{'loss': 0.2201, 'grad_norm': 1.613516092300415, 'learning_rate': 6.610738255033558e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65163/75000 [1:03:17<08:25, 19.46it/s]

{'loss': 0.4107, 'grad_norm': 5.228559494018555, 'learning_rate': 6.604026845637584e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65173/75000 [1:03:18<08:22, 19.55it/s]

{'loss': 0.3215, 'grad_norm': 3.5792317390441895, 'learning_rate': 6.59731543624161e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65182/75000 [1:03:18<09:18, 17.58it/s]

{'loss': 0.3196, 'grad_norm': 3.2586417198181152, 'learning_rate': 6.590604026845639e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65192/75000 [1:03:19<08:31, 19.16it/s]

{'loss': 0.3011, 'grad_norm': 7.27795934677124, 'learning_rate': 6.583892617449665e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65204/75000 [1:03:19<07:53, 20.70it/s]

{'loss': 0.2284, 'grad_norm': 6.724256992340088, 'learning_rate': 6.577181208053692e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65213/75000 [1:03:20<07:56, 20.52it/s]

{'loss': 0.255, 'grad_norm': 4.439640045166016, 'learning_rate': 6.570469798657719e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65222/75000 [1:03:20<08:20, 19.55it/s]

{'loss': 0.3417, 'grad_norm': 6.70403528213501, 'learning_rate': 6.563758389261745e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65233/75000 [1:03:21<08:02, 20.26it/s]

{'loss': 0.3085, 'grad_norm': 1.6596580743789673, 'learning_rate': 6.557046979865772e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65242/75000 [1:03:21<08:02, 20.20it/s]

{'loss': 0.3365, 'grad_norm': 2.3477554321289062, 'learning_rate': 6.550335570469798e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65252/75000 [1:03:22<08:16, 19.64it/s]

{'loss': 0.3481, 'grad_norm': 5.097607612609863, 'learning_rate': 6.543624161073826e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65264/75000 [1:03:22<08:22, 19.37it/s]

{'loss': 0.3945, 'grad_norm': 2.2338640689849854, 'learning_rate': 6.536912751677853e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65274/75000 [1:03:23<08:05, 20.04it/s]

{'loss': 0.3134, 'grad_norm': 4.544944763183594, 'learning_rate': 6.530201342281879e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65283/75000 [1:03:23<08:02, 20.15it/s]

{'loss': 0.2924, 'grad_norm': 3.2830965518951416, 'learning_rate': 6.5234899328859065e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65292/75000 [1:03:24<07:53, 20.50it/s]

{'loss': 0.3713, 'grad_norm': 4.038314342498779, 'learning_rate': 6.516778523489933e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65303/75000 [1:03:24<08:24, 19.20it/s]

{'loss': 0.4126, 'grad_norm': 4.539689064025879, 'learning_rate': 6.510067114093959e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65312/75000 [1:03:25<08:05, 19.97it/s]

{'loss': 0.2991, 'grad_norm': 6.482113361358643, 'learning_rate': 6.503355704697987e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65324/75000 [1:03:25<07:37, 21.16it/s]

{'loss': 0.4528, 'grad_norm': 3.651698112487793, 'learning_rate': 6.496644295302014e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65333/75000 [1:03:26<07:57, 20.23it/s]

{'loss': 0.2421, 'grad_norm': 5.657691955566406, 'learning_rate': 6.48993288590604e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65342/75000 [1:03:26<07:54, 20.37it/s]

{'loss': 0.2279, 'grad_norm': 2.4110584259033203, 'learning_rate': 6.483221476510068e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65351/75000 [1:03:27<07:53, 20.39it/s]

{'loss': 0.289, 'grad_norm': 0.6002044677734375, 'learning_rate': 6.4765100671140944e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65363/75000 [1:03:27<07:43, 20.77it/s]

{'loss': 0.2838, 'grad_norm': 11.501481056213379, 'learning_rate': 6.4697986577181204e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65372/75000 [1:03:28<08:15, 19.42it/s]

{'loss': 0.3379, 'grad_norm': 4.464791297912598, 'learning_rate': 6.463087248322149e-06, 'epoch': 2.61}


                                                       
 87%|████████▋ | 65384/75000 [1:03:28<07:54, 20.26it/s]

{'loss': 0.2479, 'grad_norm': 3.3315625190734863, 'learning_rate': 6.456375838926175e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65392/75000 [1:03:29<08:45, 18.27it/s]

{'loss': 0.3329, 'grad_norm': 1.7277958393096924, 'learning_rate': 6.449664429530201e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65403/75000 [1:03:29<08:33, 18.69it/s]

{'loss': 0.2671, 'grad_norm': 4.195756435394287, 'learning_rate': 6.4429530201342295e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65414/75000 [1:03:30<07:48, 20.45it/s]

{'loss': 0.3052, 'grad_norm': 2.224379777908325, 'learning_rate': 6.4362416107382556e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65424/75000 [1:03:30<08:20, 19.13it/s]

{'loss': 0.3182, 'grad_norm': 1.0668013095855713, 'learning_rate': 6.429530201342282e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65434/75000 [1:03:31<08:29, 18.79it/s]

{'loss': 0.215, 'grad_norm': 7.94727087020874, 'learning_rate': 6.422818791946308e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65442/75000 [1:03:31<08:12, 19.42it/s]

{'loss': 0.3212, 'grad_norm': 5.477234840393066, 'learning_rate': 6.416107382550336e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65454/75000 [1:03:32<07:42, 20.62it/s]

{'loss': 0.2903, 'grad_norm': 4.384069919586182, 'learning_rate': 6.409395973154363e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65464/75000 [1:03:32<08:09, 19.47it/s]

{'loss': 0.2782, 'grad_norm': 1.514292597770691, 'learning_rate': 6.402684563758389e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65473/75000 [1:03:33<08:05, 19.63it/s]

{'loss': 0.2317, 'grad_norm': 3.272249698638916, 'learning_rate': 6.395973154362417e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65483/75000 [1:03:33<08:17, 19.14it/s]

{'loss': 0.2818, 'grad_norm': 2.796236991882324, 'learning_rate': 6.3892617449664435e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65493/75000 [1:03:34<09:01, 17.56it/s]

{'loss': 0.3195, 'grad_norm': 8.898595809936523, 'learning_rate': 6.3825503355704695e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65500/75000 [1:03:34<08:29, 18.65it/s]

{'loss': 0.3189, 'grad_norm': 1.3357462882995605, 'learning_rate': 6.375838926174497e-06, 'epoch': 2.62}


                                                         
 87%|████████▋ | 65513/75000 [1:03:40<30:27,  5.19it/s]

{'loss': 0.3475, 'grad_norm': 5.639209270477295, 'learning_rate': 6.369127516778524e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65522/75000 [1:03:41<14:54, 10.60it/s]

{'loss': 0.322, 'grad_norm': 2.2635345458984375, 'learning_rate': 6.36241610738255e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65534/75000 [1:03:41<09:17, 16.99it/s]

{'loss': 0.3069, 'grad_norm': 3.514378547668457, 'learning_rate': 6.355704697986578e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65542/75000 [1:03:42<09:10, 17.17it/s]

{'loss': 0.2753, 'grad_norm': 0.6744799613952637, 'learning_rate': 6.348993288590605e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65551/75000 [1:03:42<08:01, 19.62it/s]

{'loss': 0.328, 'grad_norm': 8.172935485839844, 'learning_rate': 6.342281879194631e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65563/75000 [1:03:43<07:38, 20.60it/s]

{'loss': 0.3279, 'grad_norm': 9.225102424621582, 'learning_rate': 6.335570469798658e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65572/75000 [1:03:43<08:16, 19.00it/s]

{'loss': 0.3646, 'grad_norm': 1.1004492044448853, 'learning_rate': 6.328859060402685e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65582/75000 [1:03:44<08:02, 19.51it/s]

{'loss': 0.3088, 'grad_norm': 5.870691299438477, 'learning_rate': 6.322147651006711e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65592/75000 [1:03:44<08:00, 19.60it/s]

{'loss': 0.2853, 'grad_norm': 3.8299078941345215, 'learning_rate': 6.315436241610738e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65601/75000 [1:03:45<08:17, 18.90it/s]

{'loss': 0.3519, 'grad_norm': 8.59855842590332, 'learning_rate': 6.308724832214766e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65613/75000 [1:03:45<07:39, 20.41it/s]

{'loss': 0.3512, 'grad_norm': 2.6736700534820557, 'learning_rate': 6.302013422818792e-06, 'epoch': 2.62}


                                                       
 87%|████████▋ | 65622/75000 [1:03:46<08:00, 19.51it/s]

{'loss': 0.3176, 'grad_norm': 6.860157489776611, 'learning_rate': 6.295302013422819e-06, 'epoch': 2.62}


                                                       
 88%|████████▊ | 65631/75000 [1:03:46<07:48, 19.98it/s]

{'loss': 0.2351, 'grad_norm': 3.0304713249206543, 'learning_rate': 6.288590604026846e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65643/75000 [1:03:47<08:31, 18.30it/s]

{'loss': 0.2598, 'grad_norm': 2.5052132606506348, 'learning_rate': 6.281879194630873e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65652/75000 [1:03:48<07:57, 19.56it/s]

{'loss': 0.2687, 'grad_norm': 4.375302314758301, 'learning_rate': 6.275167785234899e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65662/75000 [1:03:48<08:08, 19.13it/s]

{'loss': 0.3942, 'grad_norm': 5.323150157928467, 'learning_rate': 6.268456375838927e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65672/75000 [1:03:49<07:46, 19.98it/s]

{'loss': 0.2582, 'grad_norm': 5.136777877807617, 'learning_rate': 6.261744966442954e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65684/75000 [1:03:49<07:27, 20.84it/s]

{'loss': 0.336, 'grad_norm': 1.640820860862732, 'learning_rate': 6.25503355704698e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65692/75000 [1:03:50<08:19, 18.63it/s]

{'loss': 0.3051, 'grad_norm': 3.2167277336120605, 'learning_rate': 6.248322147651007e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65704/75000 [1:03:50<07:39, 20.25it/s]

{'loss': 0.4049, 'grad_norm': 2.109086275100708, 'learning_rate': 6.241610738255034e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65713/75000 [1:03:51<07:31, 20.56it/s]

{'loss': 0.2305, 'grad_norm': 3.808608293533325, 'learning_rate': 6.23489932885906e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65722/75000 [1:03:51<08:28, 18.23it/s]

{'loss': 0.4304, 'grad_norm': 2.660336494445801, 'learning_rate': 6.228187919463087e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65732/75000 [1:03:52<07:58, 19.37it/s]

{'loss': 0.3187, 'grad_norm': 5.818310737609863, 'learning_rate': 6.221476510067115e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65741/75000 [1:03:52<07:39, 20.13it/s]

{'loss': 0.3204, 'grad_norm': 6.501341819763184, 'learning_rate': 6.214765100671141e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65753/75000 [1:03:53<07:22, 20.89it/s]

{'loss': 0.1552, 'grad_norm': 3.5623161792755127, 'learning_rate': 6.2080536912751686e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65762/75000 [1:03:53<07:45, 19.83it/s]

{'loss': 0.3435, 'grad_norm': 1.6195825338363647, 'learning_rate': 6.201342281879195e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65774/75000 [1:03:54<07:21, 20.89it/s]

{'loss': 0.2532, 'grad_norm': 3.602788209915161, 'learning_rate': 6.194630872483221e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65783/75000 [1:03:54<07:58, 19.27it/s]

{'loss': 0.4677, 'grad_norm': 6.332003593444824, 'learning_rate': 6.187919463087249e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65794/75000 [1:03:55<07:34, 20.26it/s]

{'loss': 0.3525, 'grad_norm': 3.4856350421905518, 'learning_rate': 6.181208053691275e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65803/75000 [1:03:55<07:55, 19.34it/s]

{'loss': 0.2856, 'grad_norm': 2.4963104724884033, 'learning_rate': 6.174496644295302e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65812/75000 [1:03:56<07:36, 20.11it/s]

{'loss': 0.2437, 'grad_norm': 4.360060691833496, 'learning_rate': 6.16778523489933e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65823/75000 [1:03:56<08:14, 18.55it/s]

{'loss': 0.3299, 'grad_norm': 6.922514915466309, 'learning_rate': 6.161073825503356e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65832/75000 [1:03:57<07:59, 19.10it/s]

{'loss': 0.365, 'grad_norm': 5.309759140014648, 'learning_rate': 6.1543624161073825e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65842/75000 [1:03:57<07:52, 19.38it/s]

{'loss': 0.3647, 'grad_norm': 2.5802927017211914, 'learning_rate': 6.14765100671141e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65851/75000 [1:03:58<08:34, 17.79it/s]

{'loss': 0.3728, 'grad_norm': 2.838430404663086, 'learning_rate': 6.140939597315436e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65862/75000 [1:03:58<07:41, 19.79it/s]

{'loss': 0.3154, 'grad_norm': 2.986365795135498, 'learning_rate': 6.134228187919464e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65871/75000 [1:03:59<07:31, 20.22it/s]

{'loss': 0.3352, 'grad_norm': 2.4427149295806885, 'learning_rate': 6.12751677852349e-06, 'epoch': 2.63}


                                                       
 88%|████████▊ | 65883/75000 [1:03:59<07:27, 20.38it/s]

{'loss': 0.3532, 'grad_norm': 7.121947765350342, 'learning_rate': 6.120805369127517e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65892/75000 [1:04:00<07:32, 20.13it/s]

{'loss': 0.4874, 'grad_norm': 7.568346977233887, 'learning_rate': 6.1140939597315445e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65904/75000 [1:04:00<07:25, 20.43it/s]

{'loss': 0.2108, 'grad_norm': 4.127390384674072, 'learning_rate': 6.1073825503355705e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65913/75000 [1:04:01<07:34, 20.00it/s]

{'loss': 0.3962, 'grad_norm': 3.678748607635498, 'learning_rate': 6.100671140939597e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65922/75000 [1:04:01<07:25, 20.40it/s]

{'loss': 0.4242, 'grad_norm': 7.1349263191223145, 'learning_rate': 6.093959731543625e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65932/75000 [1:04:02<08:12, 18.40it/s]

{'loss': 0.2445, 'grad_norm': 5.027209281921387, 'learning_rate': 6.087248322147651e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65942/75000 [1:04:02<08:16, 18.23it/s]

{'loss': 0.2974, 'grad_norm': 1.6162681579589844, 'learning_rate': 6.080536912751678e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65952/75000 [1:04:03<07:43, 19.53it/s]

{'loss': 0.2409, 'grad_norm': 3.944058895111084, 'learning_rate': 6.073825503355705e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65963/75000 [1:04:03<07:07, 21.11it/s]

{'loss': 0.306, 'grad_norm': 5.2382025718688965, 'learning_rate': 6.067114093959732e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65972/75000 [1:04:04<07:24, 20.31it/s]

{'loss': 0.2102, 'grad_norm': 5.690935134887695, 'learning_rate': 6.0604026845637585e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65984/75000 [1:04:04<07:09, 21.00it/s]

{'loss': 0.1772, 'grad_norm': 2.994652509689331, 'learning_rate': 6.053691275167785e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 65992/75000 [1:04:05<08:00, 18.75it/s]

{'loss': 0.2753, 'grad_norm': 4.238667964935303, 'learning_rate': 6.046979865771812e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66000/75000 [1:04:05<07:29, 20.00it/s]

{'loss': 0.2129, 'grad_norm': 5.296739101409912, 'learning_rate': 6.04026845637584e-06, 'epoch': 2.64}


                                                         
 88%|████████▊ | 66014/75000 [1:04:10<24:33,  6.10it/s]

{'loss': 0.3301, 'grad_norm': 3.359576463699341, 'learning_rate': 6.033557046979866e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66023/75000 [1:04:11<12:25, 12.04it/s]

{'loss': 0.3398, 'grad_norm': 8.752313613891602, 'learning_rate': 6.026845637583893e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66032/75000 [1:04:11<09:09, 16.32it/s]

{'loss': 0.2899, 'grad_norm': 2.166064739227295, 'learning_rate': 6.02013422818792e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66044/75000 [1:04:12<07:42, 19.34it/s]

{'loss': 0.3744, 'grad_norm': 5.050760746002197, 'learning_rate': 6.0134228187919464e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66052/75000 [1:04:12<07:44, 19.28it/s]

{'loss': 0.2749, 'grad_norm': 4.542212963104248, 'learning_rate': 6.006711409395973e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66061/75000 [1:04:13<07:49, 19.05it/s]

{'loss': 0.2951, 'grad_norm': 4.286739826202393, 'learning_rate': 6e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66073/75000 [1:04:13<07:13, 20.58it/s]

{'loss': 0.3319, 'grad_norm': 2.8824734687805176, 'learning_rate': 5.993288590604027e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66082/75000 [1:04:14<07:49, 18.99it/s]

{'loss': 0.281, 'grad_norm': 3.6117520332336426, 'learning_rate': 5.986577181208054e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66094/75000 [1:04:14<07:27, 19.91it/s]

{'loss': 0.2843, 'grad_norm': 1.9433685541152954, 'learning_rate': 5.979865771812081e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66102/75000 [1:04:15<07:43, 19.18it/s]

{'loss': 0.4881, 'grad_norm': 2.513212203979492, 'learning_rate': 5.9731543624161076e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66113/75000 [1:04:15<07:56, 18.65it/s]

{'loss': 0.34, 'grad_norm': 4.291930198669434, 'learning_rate': 5.966442953020134e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66122/75000 [1:04:16<07:23, 20.00it/s]

{'loss': 0.3453, 'grad_norm': 0.5945534110069275, 'learning_rate': 5.959731543624161e-06, 'epoch': 2.64}


                                                       
 88%|████████▊ | 66133/75000 [1:04:16<07:29, 19.73it/s]

{'loss': 0.2593, 'grad_norm': 1.9223238229751587, 'learning_rate': 5.953020134228188e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66142/75000 [1:04:17<07:23, 19.99it/s]

{'loss': 0.3371, 'grad_norm': 13.340499877929688, 'learning_rate': 5.946308724832215e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66154/75000 [1:04:17<07:10, 20.55it/s]

{'loss': 0.2082, 'grad_norm': 6.581479072570801, 'learning_rate': 5.939597315436242e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66163/75000 [1:04:18<07:08, 20.60it/s]

{'loss': 0.5036, 'grad_norm': 11.534536361694336, 'learning_rate': 5.932885906040269e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66172/75000 [1:04:18<07:10, 20.53it/s]

{'loss': 0.3547, 'grad_norm': 3.139819622039795, 'learning_rate': 5.9261744966442955e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66181/75000 [1:04:19<07:32, 19.50it/s]

{'loss': 0.2302, 'grad_norm': 1.505308747291565, 'learning_rate': 5.919463087248322e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66193/75000 [1:04:19<07:29, 19.57it/s]

{'loss': 0.2271, 'grad_norm': 1.2528175115585327, 'learning_rate': 5.912751677852349e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66204/75000 [1:04:20<07:17, 20.09it/s]

{'loss': 0.4672, 'grad_norm': 5.846228122711182, 'learning_rate': 5.906040268456376e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66214/75000 [1:04:20<07:18, 20.02it/s]

{'loss': 0.3146, 'grad_norm': 2.42583966255188, 'learning_rate': 5.899328859060403e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66223/75000 [1:04:21<07:13, 20.23it/s]

{'loss': 0.3724, 'grad_norm': 5.740593433380127, 'learning_rate': 5.89261744966443e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66232/75000 [1:04:21<07:06, 20.57it/s]

{'loss': 0.3014, 'grad_norm': 4.302870273590088, 'learning_rate': 5.885906040268457e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66243/75000 [1:04:22<07:47, 18.75it/s]

{'loss': 0.3089, 'grad_norm': 2.150707483291626, 'learning_rate': 5.8791946308724835e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66251/75000 [1:04:22<07:22, 19.76it/s]

{'loss': 0.2487, 'grad_norm': 2.092761754989624, 'learning_rate': 5.87248322147651e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66261/75000 [1:04:23<07:28, 19.49it/s]

{'loss': 0.2181, 'grad_norm': 5.698524475097656, 'learning_rate': 5.865771812080537e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66273/75000 [1:04:23<06:59, 20.82it/s]

{'loss': 0.2861, 'grad_norm': 3.2257068157196045, 'learning_rate': 5.859060402684564e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66282/75000 [1:04:24<07:01, 20.69it/s]

{'loss': 0.1957, 'grad_norm': 3.219599962234497, 'learning_rate': 5.852348993288591e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66294/75000 [1:04:24<07:23, 19.63it/s]

{'loss': 0.2884, 'grad_norm': 0.46621355414390564, 'learning_rate': 5.845637583892618e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66303/75000 [1:04:25<07:10, 20.20it/s]

{'loss': 0.3293, 'grad_norm': 4.79702091217041, 'learning_rate': 5.838926174496645e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66314/75000 [1:04:25<07:25, 19.50it/s]

{'loss': 0.3107, 'grad_norm': 6.083933353424072, 'learning_rate': 5.8322147651006715e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66323/75000 [1:04:26<07:06, 20.34it/s]

{'loss': 0.321, 'grad_norm': 4.59491491317749, 'learning_rate': 5.825503355704698e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66332/75000 [1:04:26<07:12, 20.04it/s]

{'loss': 0.2402, 'grad_norm': 2.688211679458618, 'learning_rate': 5.818791946308725e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66342/75000 [1:04:27<07:35, 19.02it/s]

{'loss': 0.2066, 'grad_norm': 1.3761165142059326, 'learning_rate': 5.812080536912752e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66354/75000 [1:04:27<07:08, 20.20it/s]

{'loss': 0.2512, 'grad_norm': 5.0763702392578125, 'learning_rate': 5.805369127516779e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66362/75000 [1:04:28<07:32, 19.09it/s]

{'loss': 0.4113, 'grad_norm': 2.592320203781128, 'learning_rate': 5.798657718120806e-06, 'epoch': 2.65}


                                                       
 88%|████████▊ | 66373/75000 [1:04:28<07:29, 19.18it/s]

{'loss': 0.3954, 'grad_norm': 2.523496150970459, 'learning_rate': 5.791946308724833e-06, 'epoch': 2.65}


                                                       
 89%|████████▊ | 66381/75000 [1:04:29<07:23, 19.43it/s]

{'loss': 0.2179, 'grad_norm': 4.888179302215576, 'learning_rate': 5.7852348993288594e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66394/75000 [1:04:30<07:09, 20.03it/s]

{'loss': 0.3389, 'grad_norm': 4.155853271484375, 'learning_rate': 5.778523489932886e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66402/75000 [1:04:30<08:02, 17.84it/s]

{'loss': 0.3344, 'grad_norm': 6.242702007293701, 'learning_rate': 5.771812080536913e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66412/75000 [1:04:30<07:23, 19.37it/s]

{'loss': 0.409, 'grad_norm': 5.603274822235107, 'learning_rate': 5.76510067114094e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66421/75000 [1:04:31<07:39, 18.66it/s]

{'loss': 0.2317, 'grad_norm': 3.2410709857940674, 'learning_rate': 5.758389261744967e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66433/75000 [1:04:32<07:01, 20.33it/s]

{'loss': 0.3043, 'grad_norm': 3.5266966819763184, 'learning_rate': 5.751677852348993e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66442/75000 [1:04:32<06:50, 20.85it/s]

{'loss': 0.2473, 'grad_norm': 0.35403701663017273, 'learning_rate': 5.7449664429530206e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66454/75000 [1:04:33<06:49, 20.85it/s]

{'loss': 0.299, 'grad_norm': 5.818825721740723, 'learning_rate': 5.738255033557047e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66463/75000 [1:04:33<07:07, 19.96it/s]

{'loss': 0.3755, 'grad_norm': 1.5822936296463013, 'learning_rate': 5.7315436241610734e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66472/75000 [1:04:33<06:58, 20.36it/s]

{'loss': 0.3229, 'grad_norm': 4.509469985961914, 'learning_rate': 5.724832214765101e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66482/75000 [1:04:34<07:24, 19.18it/s]

{'loss': 0.2133, 'grad_norm': 5.560707092285156, 'learning_rate': 5.718120805369128e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66492/75000 [1:04:35<07:33, 18.74it/s]

{'loss': 0.3026, 'grad_norm': 2.966531276702881, 'learning_rate': 5.711409395973155e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66500/75000 [1:04:35<07:17, 19.45it/s]

{'loss': 0.2029, 'grad_norm': 0.42597517371177673, 'learning_rate': 5.704697986577182e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66514/75000 [1:04:36<09:19, 15.17it/s]

{'loss': 0.3509, 'grad_norm': 16.475467681884766, 'learning_rate': 5.697986577181208e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66522/75000 [1:04:37<09:12, 15.33it/s]

{'loss': 0.3026, 'grad_norm': 1.513426423072815, 'learning_rate': 5.691275167785235e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66532/75000 [1:04:37<07:36, 18.56it/s]

{'loss': 0.2726, 'grad_norm': 8.186041831970215, 'learning_rate': 5.684563758389262e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66542/75000 [1:04:38<07:27, 18.92it/s]

{'loss': 0.3107, 'grad_norm': 4.29866886138916, 'learning_rate': 5.677852348993288e-06, 'epoch': 2.66}


                                                       
 89%|████████▊ | 66554/75000 [1:04:38<06:58, 20.18it/s]

{'loss': 0.3438, 'grad_norm': 6.067606449127197, 'learning_rate': 5.671140939597316e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 66563/75000 [1:04:39<06:54, 20.33it/s]

{'loss': 0.3466, 'grad_norm': 1.660862922668457, 'learning_rate': 5.664429530201343e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 66572/75000 [1:04:39<07:21, 19.11it/s]

{'loss': 0.2412, 'grad_norm': 3.168311834335327, 'learning_rate': 5.657718120805369e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 66583/75000 [1:04:40<06:47, 20.67it/s]

{'loss': 0.338, 'grad_norm': 4.647418975830078, 'learning_rate': 5.6510067114093965e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 66592/75000 [1:04:40<06:48, 20.58it/s]

{'loss': 0.2039, 'grad_norm': 4.405855655670166, 'learning_rate': 5.644295302013423e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 66602/75000 [1:04:41<07:22, 18.96it/s]

{'loss': 0.3993, 'grad_norm': 4.712143421173096, 'learning_rate': 5.63758389261745e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 66614/75000 [1:04:41<06:45, 20.69it/s]

{'loss': 0.3521, 'grad_norm': 4.215439796447754, 'learning_rate': 5.630872483221477e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 66623/75000 [1:04:42<07:26, 18.77it/s]

{'loss': 0.2622, 'grad_norm': 0.8646674156188965, 'learning_rate': 5.624161073825503e-06, 'epoch': 2.66}


                                                       
 89%|████████▉ | 66633/75000 [1:04:42<07:13, 19.31it/s]

{'loss': 0.2568, 'grad_norm': 1.5679938793182373, 'learning_rate': 5.617449664429531e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66642/75000 [1:04:43<07:05, 19.65it/s]

{'loss': 0.3136, 'grad_norm': 1.6838234663009644, 'learning_rate': 5.610738255033558e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66653/75000 [1:04:43<07:32, 18.45it/s]

{'loss': 0.2716, 'grad_norm': 4.110801696777344, 'learning_rate': 5.604026845637584e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66664/75000 [1:04:44<06:59, 19.85it/s]

{'loss': 0.2562, 'grad_norm': 3.0597734451293945, 'learning_rate': 5.597315436241611e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66674/75000 [1:04:45<07:09, 19.39it/s]

{'loss': 0.1705, 'grad_norm': 6.248417377471924, 'learning_rate': 5.590604026845638e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66682/75000 [1:04:45<06:58, 19.88it/s]

{'loss': 0.3266, 'grad_norm': 7.184589862823486, 'learning_rate': 5.583892617449664e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66694/75000 [1:04:45<06:47, 20.39it/s]

{'loss': 0.3327, 'grad_norm': 6.468574523925781, 'learning_rate': 5.577181208053692e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66703/75000 [1:04:46<06:58, 19.84it/s]

{'loss': 0.4984, 'grad_norm': 6.217138767242432, 'learning_rate': 5.570469798657718e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66712/75000 [1:04:46<06:52, 20.09it/s]

{'loss': 0.2834, 'grad_norm': 6.876789093017578, 'learning_rate': 5.563758389261746e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66724/75000 [1:04:47<06:52, 20.07it/s]

{'loss': 0.2406, 'grad_norm': 5.044064998626709, 'learning_rate': 5.5570469798657725e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66733/75000 [1:04:47<06:47, 20.27it/s]

{'loss': 0.3669, 'grad_norm': 2.3800253868103027, 'learning_rate': 5.5503355704697985e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66744/75000 [1:04:48<07:07, 19.30it/s]

{'loss': 0.2372, 'grad_norm': 4.0534257888793945, 'learning_rate': 5.543624161073826e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66752/75000 [1:04:48<06:58, 19.70it/s]

{'loss': 0.3686, 'grad_norm': 3.7895236015319824, 'learning_rate': 5.536912751677853e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66763/75000 [1:04:49<06:58, 19.70it/s]

{'loss': 0.2595, 'grad_norm': 3.656489849090576, 'learning_rate': 5.530201342281879e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66772/75000 [1:04:49<07:06, 19.28it/s]

{'loss': 0.4061, 'grad_norm': 4.207433223724365, 'learning_rate': 5.523489932885907e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66783/75000 [1:04:50<07:01, 19.50it/s]

{'loss': 0.294, 'grad_norm': 2.1723639965057373, 'learning_rate': 5.516778523489933e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66794/75000 [1:04:51<07:10, 19.06it/s]

{'loss': 0.2716, 'grad_norm': 2.556218147277832, 'learning_rate': 5.51006711409396e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66803/75000 [1:04:51<07:15, 18.81it/s]

{'loss': 0.4366, 'grad_norm': 3.395146131515503, 'learning_rate': 5.503355704697987e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66812/75000 [1:04:51<06:46, 20.14it/s]

{'loss': 0.2359, 'grad_norm': 6.714902400970459, 'learning_rate': 5.496644295302013e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66821/75000 [1:04:52<06:39, 20.49it/s]

{'loss': 0.302, 'grad_norm': 4.1562628746032715, 'learning_rate': 5.489932885906041e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66833/75000 [1:04:53<06:47, 20.05it/s]

{'loss': 0.3897, 'grad_norm': 5.794327259063721, 'learning_rate': 5.483221476510068e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66842/75000 [1:04:53<06:42, 20.26it/s]

{'loss': 0.2345, 'grad_norm': 4.1156487464904785, 'learning_rate': 5.476510067114094e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66854/75000 [1:04:54<06:50, 19.84it/s]

{'loss': 0.296, 'grad_norm': 3.36350417137146, 'learning_rate': 5.4697986577181215e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66863/75000 [1:04:54<07:17, 18.59it/s]

{'loss': 0.2502, 'grad_norm': 6.838335990905762, 'learning_rate': 5.4630872483221475e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66872/75000 [1:04:55<07:34, 17.90it/s]

{'loss': 0.2007, 'grad_norm': 4.583625316619873, 'learning_rate': 5.456375838926174e-06, 'epoch': 2.67}


                                                       
 89%|████████▉ | 66884/75000 [1:04:55<06:54, 19.56it/s]

{'loss': 0.4114, 'grad_norm': 3.8331363201141357, 'learning_rate': 5.449664429530202e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66890/75000 [1:04:55<06:38, 20.34it/s]

{'loss': 0.2912, 'grad_norm': 5.9321088790893555, 'learning_rate': 5.442953020134228e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66904/75000 [1:04:56<06:49, 19.79it/s]

{'loss': 0.2505, 'grad_norm': 1.9008022546768188, 'learning_rate': 5.436241610738255e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66912/75000 [1:04:57<07:24, 18.21it/s]

{'loss': 0.329, 'grad_norm': 4.572299003601074, 'learning_rate': 5.429530201342283e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66922/75000 [1:04:57<08:51, 15.20it/s]

{'loss': 0.2447, 'grad_norm': 2.142174005508423, 'learning_rate': 5.422818791946309e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66934/75000 [1:04:58<06:54, 19.47it/s]

{'loss': 0.3227, 'grad_norm': 2.2490897178649902, 'learning_rate': 5.416107382550336e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66944/75000 [1:04:58<06:55, 19.40it/s]

{'loss': 0.3428, 'grad_norm': 4.856297969818115, 'learning_rate': 5.409395973154362e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66953/75000 [1:04:59<06:39, 20.14it/s]

{'loss': 0.181, 'grad_norm': 4.361599445343018, 'learning_rate': 5.402684563758389e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66962/75000 [1:04:59<06:46, 19.77it/s]

{'loss': 0.3586, 'grad_norm': 8.158926963806152, 'learning_rate': 5.395973154362417e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66973/75000 [1:05:00<06:56, 19.28it/s]

{'loss': 0.3073, 'grad_norm': 4.426591396331787, 'learning_rate': 5.389261744966443e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66982/75000 [1:05:01<09:33, 13.98it/s]

{'loss': 0.3607, 'grad_norm': 5.584609508514404, 'learning_rate': 5.38255033557047e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 66992/75000 [1:05:01<07:37, 17.50it/s]

{'loss': 0.3541, 'grad_norm': 3.1889126300811768, 'learning_rate': 5.3758389261744975e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67000/75000 [1:05:02<07:02, 18.96it/s]

{'loss': 0.261, 'grad_norm': 1.3369550704956055, 'learning_rate': 5.3691275167785235e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67014/75000 [1:05:03<08:49, 15.07it/s]

{'loss': 0.2807, 'grad_norm': 3.6301918029785156, 'learning_rate': 5.36241610738255e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67021/75000 [1:05:03<09:39, 13.78it/s]

{'loss': 0.3208, 'grad_norm': 3.117506980895996, 'learning_rate': 5.355704697986577e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67032/75000 [1:05:04<07:41, 17.28it/s]

{'loss': 0.3057, 'grad_norm': 5.02893590927124, 'learning_rate': 5.348993288590604e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67042/75000 [1:05:05<07:12, 18.38it/s]

{'loss': 0.4474, 'grad_norm': 5.237446308135986, 'learning_rate': 5.342281879194632e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67053/75000 [1:05:05<07:13, 18.33it/s]

{'loss': 0.187, 'grad_norm': 3.7435970306396484, 'learning_rate': 5.335570469798658e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67061/75000 [1:05:06<08:50, 14.97it/s]

{'loss': 0.3046, 'grad_norm': 2.0521209239959717, 'learning_rate': 5.328859060402685e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67072/75000 [1:05:06<07:17, 18.11it/s]

{'loss': 0.343, 'grad_norm': 2.8583059310913086, 'learning_rate': 5.322147651006712e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67084/75000 [1:05:07<07:17, 18.10it/s]

{'loss': 0.2652, 'grad_norm': 3.1494319438934326, 'learning_rate': 5.315436241610738e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67091/75000 [1:05:07<08:19, 15.82it/s]

{'loss': 0.4404, 'grad_norm': 4.517883777618408, 'learning_rate': 5.308724832214765e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67104/75000 [1:05:08<07:32, 17.46it/s]

{'loss': 0.3563, 'grad_norm': 2.2736222743988037, 'learning_rate': 5.302013422818792e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67112/75000 [1:05:09<07:35, 17.33it/s]

{'loss': 0.3095, 'grad_norm': 3.1626105308532715, 'learning_rate': 5.295302013422819e-06, 'epoch': 2.68}


                                                       
 89%|████████▉ | 67122/75000 [1:05:09<07:21, 17.84it/s]

{'loss': 0.2891, 'grad_norm': 3.457875967025757, 'learning_rate': 5.288590604026846e-06, 'epoch': 2.68}


                                                       
 90%|████████▉ | 67132/75000 [1:05:10<07:44, 16.94it/s]

{'loss': 0.4073, 'grad_norm': 10.634384155273438, 'learning_rate': 5.281879194630873e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67141/75000 [1:05:10<06:55, 18.90it/s]

{'loss': 0.3599, 'grad_norm': 1.4436116218566895, 'learning_rate': 5.2751677852348994e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67154/75000 [1:05:11<06:43, 19.42it/s]

{'loss': 0.467, 'grad_norm': 6.738625526428223, 'learning_rate': 5.268456375838927e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67163/75000 [1:05:12<08:41, 15.04it/s]

{'loss': 0.2568, 'grad_norm': 7.574765205383301, 'learning_rate': 5.261744966442953e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67171/75000 [1:05:12<08:28, 15.40it/s]

{'loss': 0.3709, 'grad_norm': 4.75532865524292, 'learning_rate': 5.25503355704698e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67182/75000 [1:05:13<08:19, 15.67it/s]

{'loss': 0.4288, 'grad_norm': 2.677666187286377, 'learning_rate': 5.248322147651007e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67192/75000 [1:05:14<07:56, 16.38it/s]

{'loss': 0.2074, 'grad_norm': 5.643229961395264, 'learning_rate': 5.241610738255034e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67203/75000 [1:05:14<07:03, 18.41it/s]

{'loss': 0.2682, 'grad_norm': 9.921619415283203, 'learning_rate': 5.2348993288590606e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67213/75000 [1:05:15<08:38, 15.03it/s]

{'loss': 0.4298, 'grad_norm': 5.6080145835876465, 'learning_rate': 5.228187919463087e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67221/75000 [1:05:15<07:32, 17.21it/s]

{'loss': 0.2845, 'grad_norm': 4.768444538116455, 'learning_rate': 5.221476510067114e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67232/75000 [1:05:16<08:17, 15.60it/s]

{'loss': 0.2928, 'grad_norm': 3.5605368614196777, 'learning_rate': 5.214765100671141e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67243/75000 [1:05:17<07:09, 18.06it/s]

{'loss': 0.2166, 'grad_norm': 2.2348313331604004, 'learning_rate': 5.208053691275168e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67253/75000 [1:05:17<07:46, 16.61it/s]

{'loss': 0.2967, 'grad_norm': 10.154671669006348, 'learning_rate': 5.201342281879195e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67263/75000 [1:05:18<09:53, 13.04it/s]

{'loss': 0.2406, 'grad_norm': 5.364288330078125, 'learning_rate': 5.194630872483222e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67271/75000 [1:05:19<11:20, 11.36it/s]

{'loss': 0.3762, 'grad_norm': 1.8464875221252441, 'learning_rate': 5.1879194630872485e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67281/75000 [1:05:20<10:02, 12.82it/s]

{'loss': 0.3422, 'grad_norm': 3.79409122467041, 'learning_rate': 5.181208053691275e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67291/75000 [1:05:20<09:42, 13.24it/s]

{'loss': 0.222, 'grad_norm': 1.9448527097702026, 'learning_rate': 5.174496644295302e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67302/75000 [1:05:21<09:03, 14.17it/s]

{'loss': 0.256, 'grad_norm': 7.013862133026123, 'learning_rate': 5.167785234899329e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67312/75000 [1:05:22<08:08, 15.74it/s]

{'loss': 0.2352, 'grad_norm': 2.2630908489227295, 'learning_rate': 5.161073825503356e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67320/75000 [1:05:22<08:12, 15.59it/s]

{'loss': 0.3477, 'grad_norm': 2.8845582008361816, 'learning_rate': 5.154362416107383e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67332/75000 [1:05:24<11:34, 11.04it/s]

{'loss': 0.2167, 'grad_norm': 0.8413193225860596, 'learning_rate': 5.14765100671141e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67344/75000 [1:05:24<07:26, 17.14it/s]

{'loss': 0.3306, 'grad_norm': 0.6767158508300781, 'learning_rate': 5.1409395973154365e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67352/75000 [1:05:25<07:15, 17.54it/s]

{'loss': 0.389, 'grad_norm': 5.646878242492676, 'learning_rate': 5.134228187919463e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67362/75000 [1:05:25<09:53, 12.87it/s]

{'loss': 0.3183, 'grad_norm': 3.3330860137939453, 'learning_rate': 5.12751677852349e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67372/75000 [1:05:26<07:26, 17.07it/s]

{'loss': 0.3554, 'grad_norm': 3.4632391929626465, 'learning_rate': 5.120805369127517e-06, 'epoch': 2.69}


                                                       
 90%|████████▉ | 67381/75000 [1:05:26<07:02, 18.05it/s]

{'loss': 0.2938, 'grad_norm': 3.9374866485595703, 'learning_rate': 5.114093959731544e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67393/75000 [1:05:27<07:55, 16.01it/s]

{'loss': 0.2269, 'grad_norm': 8.579296112060547, 'learning_rate': 5.107382550335571e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67403/75000 [1:05:28<07:21, 17.19it/s]

{'loss': 0.3302, 'grad_norm': 10.620387077331543, 'learning_rate': 5.100671140939598e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67411/75000 [1:05:28<07:00, 18.03it/s]

{'loss': 0.3099, 'grad_norm': 2.898350238800049, 'learning_rate': 5.0939597315436245e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67420/75000 [1:05:29<09:43, 12.99it/s]

{'loss': 0.3423, 'grad_norm': 6.845304012298584, 'learning_rate': 5.087248322147651e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67432/75000 [1:05:31<13:04,  9.64it/s]

{'loss': 0.3807, 'grad_norm': 4.361284255981445, 'learning_rate': 5.080536912751678e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67442/75000 [1:05:31<08:24, 14.97it/s]

{'loss': 0.3544, 'grad_norm': 4.508750915527344, 'learning_rate': 5.073825503355705e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67452/75000 [1:05:32<09:53, 12.72it/s]

{'loss': 0.3336, 'grad_norm': 6.036231517791748, 'learning_rate': 5.067114093959732e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67462/75000 [1:05:33<09:25, 13.34it/s]

{'loss': 0.2447, 'grad_norm': 2.1946675777435303, 'learning_rate': 5.060402684563759e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67472/75000 [1:05:34<09:42, 12.93it/s]

{'loss': 0.2078, 'grad_norm': 1.9405126571655273, 'learning_rate': 5.053691275167786e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67482/75000 [1:05:34<10:04, 12.44it/s]

{'loss': 0.2525, 'grad_norm': 0.7597811818122864, 'learning_rate': 5.0469798657718124e-06, 'epoch': 2.7}


                                                       
 90%|████████▉ | 67491/75000 [1:05:35<11:15, 11.12it/s]

{'loss': 0.2895, 'grad_norm': 6.96610164642334, 'learning_rate': 5.040268456375839e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67500/75000 [1:05:36<09:35, 13.03it/s]

{'loss': 0.3586, 'grad_norm': 12.691880226135254, 'learning_rate': 5.033557046979865e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67513/75000 [1:05:38<11:00, 11.33it/s]

{'loss': 0.3279, 'grad_norm': 5.22946310043335, 'learning_rate': 5.026845637583893e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67523/75000 [1:05:38<08:08, 15.32it/s]

{'loss': 0.2886, 'grad_norm': 6.8978986740112305, 'learning_rate': 5.02013422818792e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67533/75000 [1:05:39<08:36, 14.47it/s]

{'loss': 0.2112, 'grad_norm': 3.3685076236724854, 'learning_rate': 5.013422818791947e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67543/75000 [1:05:40<10:32, 11.79it/s]

{'loss': 0.2154, 'grad_norm': 3.1611127853393555, 'learning_rate': 5.0067114093959736e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67553/75000 [1:05:41<07:55, 15.66it/s]

{'loss': 0.3541, 'grad_norm': 3.0039875507354736, 'learning_rate': 5e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67561/75000 [1:05:41<08:25, 14.71it/s]

{'loss': 0.2994, 'grad_norm': 2.9731085300445557, 'learning_rate': 4.993288590604027e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67571/75000 [1:05:42<11:55, 10.38it/s]

{'loss': 0.3594, 'grad_norm': 1.2817732095718384, 'learning_rate': 4.986577181208054e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67582/75000 [1:05:44<13:19,  9.27it/s]

{'loss': 0.2228, 'grad_norm': 4.421581745147705, 'learning_rate': 4.97986577181208e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67592/75000 [1:05:45<07:58, 15.49it/s]

{'loss': 0.3374, 'grad_norm': 0.7540955543518066, 'learning_rate': 4.973154362416108e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67603/75000 [1:05:45<06:54, 17.83it/s]

{'loss': 0.4352, 'grad_norm': 11.619264602661133, 'learning_rate': 4.966442953020135e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67613/75000 [1:05:46<06:59, 17.59it/s]

{'loss': 0.3938, 'grad_norm': 3.64560604095459, 'learning_rate': 4.959731543624161e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67622/75000 [1:05:46<07:27, 16.49it/s]

{'loss': 0.4386, 'grad_norm': 0.5791916251182556, 'learning_rate': 4.953020134228188e-06, 'epoch': 2.7}


                                                       
 90%|█████████ | 67630/75000 [1:05:47<10:15, 11.97it/s]

{'loss': 0.2889, 'grad_norm': 1.0060744285583496, 'learning_rate': 4.946308724832215e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67642/75000 [1:05:48<09:21, 13.10it/s]

{'loss': 0.3814, 'grad_norm': 12.84108829498291, 'learning_rate': 4.939597315436242e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67652/75000 [1:05:49<08:46, 13.96it/s]

{'loss': 0.3407, 'grad_norm': 3.7535769939422607, 'learning_rate': 4.932885906040269e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67660/75000 [1:05:50<20:52,  5.86it/s]

{'loss': 0.312, 'grad_norm': 15.885896682739258, 'learning_rate': 4.926174496644295e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67672/75000 [1:05:51<10:05, 12.10it/s]

{'loss': 0.149, 'grad_norm': 1.1243205070495605, 'learning_rate': 4.919463087248323e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67682/75000 [1:05:52<08:41, 14.04it/s]

{'loss': 0.2386, 'grad_norm': 5.950118064880371, 'learning_rate': 4.9127516778523495e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67692/75000 [1:05:53<12:55,  9.42it/s]

{'loss': 0.266, 'grad_norm': 9.829071998596191, 'learning_rate': 4.9060402684563755e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67702/75000 [1:05:53<09:37, 12.64it/s]

{'loss': 0.3147, 'grad_norm': 9.317793846130371, 'learning_rate': 4.899328859060403e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67711/75000 [1:05:54<07:09, 16.98it/s]

{'loss': 0.3071, 'grad_norm': 2.443067789077759, 'learning_rate': 4.89261744966443e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67722/75000 [1:05:54<06:21, 19.05it/s]

{'loss': 0.2587, 'grad_norm': 1.4985508918762207, 'learning_rate': 4.885906040268456e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67733/75000 [1:05:55<07:31, 16.11it/s]

{'loss': 0.337, 'grad_norm': 5.5121307373046875, 'learning_rate': 4.879194630872484e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67743/75000 [1:05:56<07:02, 17.18it/s]

{'loss': 0.3503, 'grad_norm': 3.1043121814727783, 'learning_rate': 4.87248322147651e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67751/75000 [1:05:56<07:16, 16.60it/s]

{'loss': 0.2687, 'grad_norm': 1.757604718208313, 'learning_rate': 4.8657718120805375e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67761/75000 [1:05:57<10:26, 11.55it/s]

{'loss': 0.213, 'grad_norm': 1.301774501800537, 'learning_rate': 4.859060402684564e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67773/75000 [1:05:59<10:32, 11.42it/s]

{'loss': 0.2795, 'grad_norm': 15.439950942993164, 'learning_rate': 4.85234899328859e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67783/75000 [1:05:59<08:16, 14.53it/s]

{'loss': 0.3636, 'grad_norm': 2.6819136142730713, 'learning_rate': 4.845637583892618e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67791/75000 [1:06:00<07:40, 15.66it/s]

{'loss': 0.4184, 'grad_norm': 6.077089309692383, 'learning_rate': 4.838926174496645e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67803/75000 [1:06:01<07:42, 15.55it/s]

{'loss': 0.334, 'grad_norm': 5.394206523895264, 'learning_rate': 4.832214765100671e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67811/75000 [1:06:01<07:30, 15.96it/s]

{'loss': 0.2699, 'grad_norm': 3.8088345527648926, 'learning_rate': 4.825503355704699e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67823/75000 [1:06:02<07:13, 16.55it/s]

{'loss': 0.2709, 'grad_norm': 2.678800344467163, 'learning_rate': 4.8187919463087254e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67831/75000 [1:06:02<07:40, 15.56it/s]

{'loss': 0.2211, 'grad_norm': 3.0483052730560303, 'learning_rate': 4.8120805369127514e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67843/75000 [1:06:03<07:35, 15.72it/s]

{'loss': 0.3298, 'grad_norm': 3.8671607971191406, 'learning_rate': 4.805369127516779e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67852/75000 [1:06:04<06:32, 18.21it/s]

{'loss': 0.3296, 'grad_norm': 2.2715837955474854, 'learning_rate': 4.798657718120805e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67862/75000 [1:06:04<06:14, 19.06it/s]

{'loss': 0.3401, 'grad_norm': 2.89856219291687, 'learning_rate': 4.791946308724833e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 67871/75000 [1:06:05<07:11, 16.52it/s]

{'loss': 0.3259, 'grad_norm': 3.385446071624756, 'learning_rate': 4.78523489932886e-06, 'epoch': 2.71}


                                                       
 91%|█████████ | 67882/75000 [1:06:05<06:07, 19.37it/s]

{'loss': 0.2309, 'grad_norm': 3.2460060119628906, 'learning_rate': 4.778523489932886e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67891/75000 [1:06:06<06:10, 19.21it/s]

{'loss': 0.3027, 'grad_norm': 3.129371404647827, 'learning_rate': 4.771812080536913e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67902/75000 [1:06:06<06:05, 19.39it/s]

{'loss': 0.3142, 'grad_norm': 2.1615982055664062, 'learning_rate': 4.76510067114094e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67912/75000 [1:06:07<05:57, 19.80it/s]

{'loss': 0.3391, 'grad_norm': 4.991008758544922, 'learning_rate': 4.758389261744966e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67922/75000 [1:06:07<06:10, 19.12it/s]

{'loss': 0.278, 'grad_norm': 3.28995418548584, 'learning_rate': 4.751677852348994e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67933/75000 [1:06:08<05:52, 20.02it/s]

{'loss': 0.3342, 'grad_norm': 1.6969265937805176, 'learning_rate': 4.74496644295302e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67942/75000 [1:06:08<05:54, 19.91it/s]

{'loss': 0.3177, 'grad_norm': 2.868452787399292, 'learning_rate': 4.738255033557047e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67954/75000 [1:06:09<05:38, 20.84it/s]

{'loss': 0.2849, 'grad_norm': 1.5226261615753174, 'learning_rate': 4.7315436241610745e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67963/75000 [1:06:09<05:44, 20.44it/s]

{'loss': 0.3379, 'grad_norm': 2.9493420124053955, 'learning_rate': 4.7248322147651005e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67973/75000 [1:06:10<06:12, 18.86it/s]

{'loss': 0.2686, 'grad_norm': 3.3638362884521484, 'learning_rate': 4.718120805369128e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67981/75000 [1:06:10<05:59, 19.50it/s]

{'loss': 0.2758, 'grad_norm': 0.8747844696044922, 'learning_rate': 4.711409395973155e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 67994/75000 [1:06:11<06:06, 19.13it/s]

{'loss': 0.2119, 'grad_norm': 2.127375841140747, 'learning_rate': 4.704697986577181e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68000/75000 [1:06:11<05:47, 20.14it/s]

{'loss': 0.3764, 'grad_norm': 1.5413058996200562, 'learning_rate': 4.697986577181209e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68014/75000 [1:06:12<07:20, 15.87it/s]

{'loss': 0.2376, 'grad_norm': 2.5488102436065674, 'learning_rate': 4.691275167785235e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68023/75000 [1:06:13<06:11, 18.80it/s]

{'loss': 0.1473, 'grad_norm': 0.9495855569839478, 'learning_rate': 4.684563758389262e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68032/75000 [1:06:13<05:50, 19.87it/s]

{'loss': 0.3317, 'grad_norm': 0.7929129600524902, 'learning_rate': 4.677852348993289e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68044/75000 [1:06:14<05:37, 20.61it/s]

{'loss': 0.2868, 'grad_norm': 1.7588186264038086, 'learning_rate': 4.671140939597315e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68053/75000 [1:06:14<05:34, 20.77it/s]

{'loss': 0.3298, 'grad_norm': 2.3088841438293457, 'learning_rate': 4.664429530201342e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68062/75000 [1:06:15<05:50, 19.80it/s]

{'loss': 0.3047, 'grad_norm': 1.617421269416809, 'learning_rate': 4.65771812080537e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68073/75000 [1:06:15<05:42, 20.20it/s]

{'loss': 0.3226, 'grad_norm': 8.013385772705078, 'learning_rate': 4.651006711409396e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68082/75000 [1:06:16<05:34, 20.66it/s]

{'loss': 0.3342, 'grad_norm': 3.7293214797973633, 'learning_rate': 4.644295302013423e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68094/75000 [1:06:16<05:34, 20.67it/s]

{'loss': 0.3491, 'grad_norm': 6.07092809677124, 'learning_rate': 4.63758389261745e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68103/75000 [1:06:17<05:45, 19.93it/s]

{'loss': 0.2457, 'grad_norm': 2.1729447841644287, 'learning_rate': 4.6308724832214765e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68113/75000 [1:06:17<05:54, 19.41it/s]

{'loss': 0.3607, 'grad_norm': 3.799555540084839, 'learning_rate': 4.624161073825504e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68122/75000 [1:06:18<06:33, 17.49it/s]

{'loss': 0.2563, 'grad_norm': 2.851602077484131, 'learning_rate': 4.61744966442953e-06, 'epoch': 2.72}


                                                       
 91%|█████████ | 68133/75000 [1:06:19<06:31, 17.55it/s]

{'loss': 0.3238, 'grad_norm': 3.908994674682617, 'learning_rate': 4.610738255033557e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68143/75000 [1:06:19<06:53, 16.59it/s]

{'loss': 0.3327, 'grad_norm': 3.0353734493255615, 'learning_rate': 4.604026845637585e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68153/75000 [1:06:20<06:46, 16.83it/s]

{'loss': 0.2763, 'grad_norm': 3.455179452896118, 'learning_rate': 4.597315436241611e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68163/75000 [1:06:20<06:09, 18.51it/s]

{'loss': 0.2436, 'grad_norm': 3.0320940017700195, 'learning_rate': 4.590604026845638e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68172/75000 [1:06:21<06:03, 18.79it/s]

{'loss': 0.3223, 'grad_norm': 4.913464069366455, 'learning_rate': 4.5838926174496645e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68182/75000 [1:06:21<06:18, 18.02it/s]

{'loss': 0.2796, 'grad_norm': 3.3720014095306396, 'learning_rate': 4.577181208053691e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68193/75000 [1:06:22<06:10, 18.38it/s]

{'loss': 0.3309, 'grad_norm': 5.628574848175049, 'learning_rate': 4.570469798657718e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68203/75000 [1:06:22<06:02, 18.76it/s]

{'loss': 0.1789, 'grad_norm': 2.047586679458618, 'learning_rate': 4.563758389261745e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68213/75000 [1:06:23<06:05, 18.55it/s]

{'loss': 0.358, 'grad_norm': 2.9363725185394287, 'learning_rate': 4.557046979865772e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68222/75000 [1:06:23<06:14, 18.09it/s]

{'loss': 0.2591, 'grad_norm': 3.872386932373047, 'learning_rate': 4.5503355704697996e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68233/75000 [1:06:24<06:01, 18.70it/s]

{'loss': 0.4164, 'grad_norm': 4.269956588745117, 'learning_rate': 4.5436241610738256e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68242/75000 [1:06:24<06:04, 18.53it/s]

{'loss': 0.1547, 'grad_norm': 4.414559841156006, 'learning_rate': 4.536912751677852e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68252/75000 [1:06:25<06:30, 17.27it/s]

{'loss': 0.289, 'grad_norm': 1.5083447694778442, 'learning_rate': 4.530201342281879e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68264/75000 [1:06:26<05:56, 18.90it/s]

{'loss': 0.2205, 'grad_norm': 2.480806350708008, 'learning_rate': 4.523489932885906e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68273/75000 [1:06:26<05:45, 19.46it/s]

{'loss': 0.4072, 'grad_norm': 2.4770090579986572, 'learning_rate': 4.516778523489933e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68282/75000 [1:06:27<05:59, 18.70it/s]

{'loss': 0.2183, 'grad_norm': 9.75462532043457, 'learning_rate': 4.51006711409396e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68292/75000 [1:06:27<06:00, 18.59it/s]

{'loss': 0.2535, 'grad_norm': 12.437777519226074, 'learning_rate': 4.503355704697987e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68302/75000 [1:06:28<05:59, 18.65it/s]

{'loss': 0.2419, 'grad_norm': 2.04787015914917, 'learning_rate': 4.4966442953020135e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68312/75000 [1:06:28<06:17, 17.73it/s]

{'loss': 0.2522, 'grad_norm': 3.2448055744171143, 'learning_rate': 4.48993288590604e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68322/75000 [1:06:29<06:39, 16.73it/s]

{'loss': 0.2443, 'grad_norm': 1.4360102415084839, 'learning_rate': 4.483221476510067e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68332/75000 [1:06:29<06:27, 17.23it/s]

{'loss': 0.2322, 'grad_norm': 18.490657806396484, 'learning_rate': 4.476510067114094e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68342/75000 [1:06:30<06:32, 16.96it/s]

{'loss': 0.2801, 'grad_norm': 2.223031997680664, 'learning_rate': 4.469798657718121e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68352/75000 [1:06:31<06:27, 17.17it/s]

{'loss': 0.4135, 'grad_norm': 8.838656425476074, 'learning_rate': 4.463087248322148e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68362/75000 [1:06:31<06:41, 16.52it/s]

{'loss': 0.3726, 'grad_norm': 8.54216480255127, 'learning_rate': 4.456375838926175e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68372/75000 [1:06:32<06:31, 16.93it/s]

{'loss': 0.3318, 'grad_norm': 2.409757137298584, 'learning_rate': 4.4496644295302015e-06, 'epoch': 2.73}


                                                       
 91%|█████████ | 68382/75000 [1:06:32<06:24, 17.22it/s]

{'loss': 0.3072, 'grad_norm': 6.642215728759766, 'learning_rate': 4.442953020134228e-06, 'epoch': 2.74}


                                                       
 91%|█████████ | 68392/75000 [1:06:33<06:29, 16.95it/s]

{'loss': 0.2395, 'grad_norm': 3.7010293006896973, 'learning_rate': 4.436241610738255e-06, 'epoch': 2.74}


                                                       
 91%|█████████ | 68403/75000 [1:06:33<06:02, 18.18it/s]

{'loss': 0.323, 'grad_norm': 2.977592945098877, 'learning_rate': 4.429530201342282e-06, 'epoch': 2.74}


                                                       
 91%|█████████ | 68414/75000 [1:06:34<06:16, 17.48it/s]

{'loss': 0.2606, 'grad_norm': 2.767394781112671, 'learning_rate': 4.422818791946309e-06, 'epoch': 2.74}


                                                       
 91%|█████████ | 68423/75000 [1:06:35<05:56, 18.44it/s]

{'loss': 0.2487, 'grad_norm': 2.9986002445220947, 'learning_rate': 4.416107382550336e-06, 'epoch': 2.74}


                                                       
 91%|█████████ | 68433/75000 [1:06:35<06:23, 17.12it/s]

{'loss': 0.3071, 'grad_norm': 7.495667457580566, 'learning_rate': 4.409395973154363e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68443/75000 [1:06:36<06:52, 15.89it/s]

{'loss': 0.2513, 'grad_norm': 1.1947957277297974, 'learning_rate': 4.4026845637583895e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68453/75000 [1:06:36<06:33, 16.62it/s]

{'loss': 0.3329, 'grad_norm': 4.5440521240234375, 'learning_rate': 4.395973154362416e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68463/75000 [1:06:37<06:16, 17.37it/s]

{'loss': 0.2759, 'grad_norm': 1.594184160232544, 'learning_rate': 4.389261744966443e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68473/75000 [1:06:38<06:28, 16.80it/s]

{'loss': 0.3322, 'grad_norm': 5.077215194702148, 'learning_rate': 4.38255033557047e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68481/75000 [1:06:38<06:47, 16.00it/s]

{'loss': 0.3589, 'grad_norm': 4.761589527130127, 'learning_rate': 4.375838926174497e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68493/75000 [1:06:39<06:29, 16.69it/s]

{'loss': 0.4585, 'grad_norm': 11.601518630981445, 'learning_rate': 4.369127516778524e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68500/75000 [1:06:39<06:09, 17.58it/s]

{'loss': 0.2399, 'grad_norm': 0.7813499569892883, 'learning_rate': 4.362416107382551e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68511/75000 [1:06:41<08:05, 13.37it/s]

{'loss': 0.2529, 'grad_norm': 22.883146286010742, 'learning_rate': 4.3557046979865775e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68522/75000 [1:06:41<06:18, 17.11it/s]

{'loss': 0.3477, 'grad_norm': 1.5019913911819458, 'learning_rate': 4.348993288590604e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68532/75000 [1:06:42<06:57, 15.50it/s]

{'loss': 0.2627, 'grad_norm': 1.6720402240753174, 'learning_rate': 4.342281879194631e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68542/75000 [1:06:42<06:48, 15.82it/s]

{'loss': 0.3332, 'grad_norm': 2.086261510848999, 'learning_rate': 4.335570469798658e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68552/75000 [1:06:43<06:39, 16.14it/s]

{'loss': 0.3053, 'grad_norm': 8.426180839538574, 'learning_rate': 4.328859060402685e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68562/75000 [1:06:44<06:07, 17.53it/s]

{'loss': 0.2955, 'grad_norm': 2.4964826107025146, 'learning_rate': 4.322147651006712e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68571/75000 [1:06:44<05:56, 18.05it/s]

{'loss': 0.368, 'grad_norm': 3.4389288425445557, 'learning_rate': 4.315436241610738e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68582/75000 [1:06:45<05:53, 18.16it/s]

{'loss': 0.2645, 'grad_norm': 3.507625102996826, 'learning_rate': 4.3087248322147654e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68592/75000 [1:06:45<05:34, 19.16it/s]

{'loss': 0.2885, 'grad_norm': 3.591440200805664, 'learning_rate': 4.302013422818792e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68602/75000 [1:06:46<05:44, 18.55it/s]

{'loss': 0.361, 'grad_norm': 2.8364410400390625, 'learning_rate': 4.295302013422819e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68613/75000 [1:06:46<05:33, 19.15it/s]

{'loss': 0.2791, 'grad_norm': 2.3416597843170166, 'learning_rate': 4.288590604026846e-06, 'epoch': 2.74}


                                                       
 91%|█████████▏| 68623/75000 [1:06:47<05:27, 19.46it/s]

{'loss': 0.2524, 'grad_norm': 1.7008659839630127, 'learning_rate': 4.281879194630873e-06, 'epoch': 2.74}


                                                       
 92%|█████████▏| 68632/75000 [1:06:47<06:12, 17.12it/s]

{'loss': 0.364, 'grad_norm': 2.0850281715393066, 'learning_rate': 4.2751677852349e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68641/75000 [1:06:48<05:47, 18.30it/s]

{'loss': 0.2414, 'grad_norm': 1.8894785642623901, 'learning_rate': 4.2684563758389265e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68654/75000 [1:06:49<05:17, 20.00it/s]

{'loss': 0.2777, 'grad_norm': 1.1912719011306763, 'learning_rate': 4.2617449664429526e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68663/75000 [1:06:49<05:32, 19.07it/s]

{'loss': 0.3033, 'grad_norm': 7.41215181350708, 'learning_rate': 4.25503355704698e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68671/75000 [1:06:49<05:35, 18.86it/s]

{'loss': 0.2518, 'grad_norm': 4.853457450866699, 'learning_rate': 4.248322147651007e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68684/75000 [1:06:50<05:13, 20.17it/s]

{'loss': 0.366, 'grad_norm': 4.252333164215088, 'learning_rate': 4.241610738255033e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68692/75000 [1:06:50<05:21, 19.65it/s]

{'loss': 0.3459, 'grad_norm': 6.662158966064453, 'learning_rate': 4.234899328859061e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68704/75000 [1:06:51<05:04, 20.67it/s]

{'loss': 0.2628, 'grad_norm': 2.4680542945861816, 'learning_rate': 4.228187919463088e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68713/75000 [1:06:52<05:19, 19.68it/s]

{'loss': 0.3817, 'grad_norm': 7.504144668579102, 'learning_rate': 4.2214765100671145e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68723/75000 [1:06:52<05:08, 20.37it/s]

{'loss': 0.2865, 'grad_norm': 2.5368614196777344, 'learning_rate': 4.214765100671141e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68731/75000 [1:06:52<05:22, 19.42it/s]

{'loss': 0.2507, 'grad_norm': 9.9821195602417, 'learning_rate': 4.208053691275167e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68743/75000 [1:06:53<05:20, 19.53it/s]

{'loss': 0.3533, 'grad_norm': 6.667605400085449, 'learning_rate': 4.201342281879195e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68752/75000 [1:06:54<05:28, 19.02it/s]

{'loss': 0.3147, 'grad_norm': 8.414443016052246, 'learning_rate': 4.194630872483222e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68764/75000 [1:06:54<05:06, 20.36it/s]

{'loss': 0.2825, 'grad_norm': 2.2485132217407227, 'learning_rate': 4.187919463087248e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68772/75000 [1:06:55<05:39, 18.35it/s]

{'loss': 0.1967, 'grad_norm': 2.5848984718322754, 'learning_rate': 4.181208053691276e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68781/75000 [1:06:55<05:38, 18.39it/s]

{'loss': 0.3049, 'grad_norm': 1.94340181350708, 'learning_rate': 4.1744966442953025e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68793/75000 [1:06:56<05:30, 18.79it/s]

{'loss': 0.2249, 'grad_norm': 2.9832329750061035, 'learning_rate': 4.1677852348993285e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68801/75000 [1:06:56<05:15, 19.62it/s]

{'loss': 0.4082, 'grad_norm': 5.659689426422119, 'learning_rate': 4.161073825503356e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68814/75000 [1:06:57<05:10, 19.95it/s]

{'loss': 0.29, 'grad_norm': 15.703454971313477, 'learning_rate': 4.154362416107382e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68821/75000 [1:06:57<05:21, 19.19it/s]

{'loss': 0.2688, 'grad_norm': 7.3426618576049805, 'learning_rate': 4.14765100671141e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68832/75000 [1:06:58<05:12, 19.74it/s]

{'loss': 0.3394, 'grad_norm': 2.545013904571533, 'learning_rate': 4.140939597315437e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68842/75000 [1:06:58<05:16, 19.44it/s]

{'loss': 0.4039, 'grad_norm': 5.234137058258057, 'learning_rate': 4.134228187919463e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68854/75000 [1:06:59<05:02, 20.32it/s]

{'loss': 0.3043, 'grad_norm': 0.35344964265823364, 'learning_rate': 4.1275167785234905e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68863/75000 [1:06:59<05:09, 19.85it/s]

{'loss': 0.3408, 'grad_norm': 3.0916666984558105, 'learning_rate': 4.120805369127517e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68871/75000 [1:07:00<05:32, 18.46it/s]

{'loss': 0.3172, 'grad_norm': 4.038034915924072, 'learning_rate': 4.114093959731543e-06, 'epoch': 2.75}


                                                       
 92%|█████████▏| 68882/75000 [1:07:00<05:26, 18.76it/s]

{'loss': 0.3255, 'grad_norm': 4.807170391082764, 'learning_rate': 4.107382550335571e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68892/75000 [1:07:01<05:28, 18.62it/s]

{'loss': 0.1917, 'grad_norm': 2.9149820804595947, 'learning_rate': 4.100671140939597e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68903/75000 [1:07:01<05:35, 18.18it/s]

{'loss': 0.1591, 'grad_norm': 1.5512826442718506, 'learning_rate': 4.093959731543624e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68914/75000 [1:07:02<05:09, 19.68it/s]

{'loss': 0.3789, 'grad_norm': 2.2462446689605713, 'learning_rate': 4.0872483221476516e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68922/75000 [1:07:02<05:43, 17.67it/s]

{'loss': 0.3409, 'grad_norm': 4.106489181518555, 'learning_rate': 4.080536912751678e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68934/75000 [1:07:03<05:02, 20.07it/s]

{'loss': 0.273, 'grad_norm': 3.8078508377075195, 'learning_rate': 4.073825503355705e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68943/75000 [1:07:04<05:38, 17.88it/s]

{'loss': 0.2975, 'grad_norm': 10.197856903076172, 'learning_rate': 4.067114093959732e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68951/75000 [1:07:04<05:21, 18.79it/s]

{'loss': 0.2597, 'grad_norm': 0.3876955509185791, 'learning_rate': 4.060402684563758e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68963/75000 [1:07:05<05:30, 18.28it/s]

{'loss': 0.368, 'grad_norm': 5.263922214508057, 'learning_rate': 4.053691275167786e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68972/75000 [1:07:05<05:04, 19.80it/s]

{'loss': 0.3117, 'grad_norm': 1.9884501695632935, 'learning_rate': 4.046979865771812e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68982/75000 [1:07:06<05:24, 18.56it/s]

{'loss': 0.27, 'grad_norm': 2.6083855628967285, 'learning_rate': 4.040268456375839e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 68994/75000 [1:07:06<05:10, 19.35it/s]

{'loss': 0.2102, 'grad_norm': 4.197698593139648, 'learning_rate': 4.033557046979866e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69000/75000 [1:07:07<05:12, 19.19it/s]

{'loss': 0.3013, 'grad_norm': 10.819659233093262, 'learning_rate': 4.026845637583892e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69014/75000 [1:07:08<06:24, 15.59it/s]

{'loss': 0.2939, 'grad_norm': 1.9568594694137573, 'learning_rate': 4.020134228187919e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69022/75000 [1:07:08<05:59, 16.64it/s]

{'loss': 0.2259, 'grad_norm': 1.523289442062378, 'learning_rate': 4.013422818791947e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69033/75000 [1:07:09<05:32, 17.92it/s]

{'loss': 0.3714, 'grad_norm': 3.5502636432647705, 'learning_rate': 4.006711409395973e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69041/75000 [1:07:09<05:12, 19.07it/s]

{'loss': 0.2462, 'grad_norm': 2.8625528812408447, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69052/75000 [1:07:10<05:06, 19.43it/s]

{'loss': 0.4141, 'grad_norm': 6.664095401763916, 'learning_rate': 3.9932885906040275e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69063/75000 [1:07:11<05:24, 18.27it/s]

{'loss': 0.2756, 'grad_norm': 1.5381486415863037, 'learning_rate': 3.9865771812080535e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69072/75000 [1:07:11<05:12, 18.94it/s]

{'loss': 0.3204, 'grad_norm': 2.011946201324463, 'learning_rate': 3.979865771812081e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69083/75000 [1:07:12<05:21, 18.40it/s]

{'loss': 0.2604, 'grad_norm': 2.10689377784729, 'learning_rate': 3.973154362416107e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69091/75000 [1:07:12<05:10, 19.01it/s]

{'loss': 0.2515, 'grad_norm': 4.901753902435303, 'learning_rate': 3.966442953020134e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69102/75000 [1:07:13<05:13, 18.82it/s]

{'loss': 0.3265, 'grad_norm': 2.18874454498291, 'learning_rate': 3.959731543624162e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69114/75000 [1:07:13<04:49, 20.30it/s]

{'loss': 0.2996, 'grad_norm': 2.616675853729248, 'learning_rate': 3.953020134228188e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69123/75000 [1:07:14<04:51, 20.17it/s]

{'loss': 0.2745, 'grad_norm': 7.330686569213867, 'learning_rate': 3.946308724832215e-06, 'epoch': 2.76}


                                                       
 92%|█████████▏| 69134/75000 [1:07:14<04:53, 19.99it/s]

{'loss': 0.2989, 'grad_norm': 3.6164355278015137, 'learning_rate': 3.939597315436242e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69144/75000 [1:07:15<04:55, 19.84it/s]

{'loss': 0.28, 'grad_norm': 6.45367956161499, 'learning_rate': 3.932885906040268e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69153/75000 [1:07:15<04:56, 19.75it/s]

{'loss': 0.3198, 'grad_norm': 1.2522755861282349, 'learning_rate': 3.926174496644296e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69163/75000 [1:07:16<05:05, 19.09it/s]

{'loss': 0.3272, 'grad_norm': 2.662705659866333, 'learning_rate': 3.919463087248322e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69171/75000 [1:07:16<05:18, 18.28it/s]

{'loss': 0.3564, 'grad_norm': 4.794504165649414, 'learning_rate': 3.912751677852349e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69182/75000 [1:07:17<04:54, 19.77it/s]

{'loss': 0.1999, 'grad_norm': 4.463900089263916, 'learning_rate': 3.906040268456377e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69192/75000 [1:07:17<05:20, 18.10it/s]

{'loss': 0.2481, 'grad_norm': 1.6067101955413818, 'learning_rate': 3.899328859060403e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69201/75000 [1:07:18<05:42, 16.95it/s]

{'loss': 0.2444, 'grad_norm': 9.495635032653809, 'learning_rate': 3.8926174496644295e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69214/75000 [1:07:19<04:46, 20.18it/s]

{'loss': 0.2461, 'grad_norm': 4.780450344085693, 'learning_rate': 3.885906040268457e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69223/75000 [1:07:19<05:08, 18.73it/s]

{'loss': 0.2987, 'grad_norm': 2.4409801959991455, 'learning_rate': 3.879194630872483e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69233/75000 [1:07:19<04:55, 19.53it/s]

{'loss': 0.2414, 'grad_norm': 1.216138482093811, 'learning_rate': 3.87248322147651e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69242/75000 [1:07:20<05:19, 18.03it/s]

{'loss': 0.2999, 'grad_norm': 6.025100231170654, 'learning_rate': 3.865771812080537e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69253/75000 [1:07:21<05:03, 18.93it/s]

{'loss': 0.3719, 'grad_norm': 3.731989622116089, 'learning_rate': 3.859060402684564e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69261/75000 [1:07:21<04:49, 19.86it/s]

{'loss': 0.4034, 'grad_norm': 2.246450662612915, 'learning_rate': 3.8523489932885914e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69272/75000 [1:07:22<05:07, 18.61it/s]

{'loss': 0.2978, 'grad_norm': 19.351581573486328, 'learning_rate': 3.8456375838926174e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69282/75000 [1:07:22<04:59, 19.09it/s]

{'loss': 0.356, 'grad_norm': 4.852204322814941, 'learning_rate': 3.838926174496644e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69294/75000 [1:07:23<04:44, 20.08it/s]

{'loss': 0.2434, 'grad_norm': 1.278339147567749, 'learning_rate': 3.832214765100672e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69302/75000 [1:07:23<05:23, 17.59it/s]

{'loss': 0.3621, 'grad_norm': 7.365505695343018, 'learning_rate': 3.825503355704698e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69311/75000 [1:07:24<04:59, 18.99it/s]

{'loss': 0.2387, 'grad_norm': 3.2969565391540527, 'learning_rate': 3.818791946308725e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69323/75000 [1:07:24<04:56, 19.12it/s]

{'loss': 0.4025, 'grad_norm': 0.5656530857086182, 'learning_rate': 3.8120805369127517e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69332/75000 [1:07:25<04:58, 18.96it/s]

{'loss': 0.3088, 'grad_norm': 0.4861004650592804, 'learning_rate': 3.8053691275167786e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69342/75000 [1:07:25<04:55, 19.14it/s]

{'loss': 0.2712, 'grad_norm': 2.986452102661133, 'learning_rate': 3.798657718120806e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69352/75000 [1:07:26<05:09, 18.25it/s]

{'loss': 0.3618, 'grad_norm': 1.4158039093017578, 'learning_rate': 3.7919463087248323e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69362/75000 [1:07:26<04:49, 19.48it/s]

{'loss': 0.4512, 'grad_norm': 3.600985527038574, 'learning_rate': 3.785234899328859e-06, 'epoch': 2.77}


                                                       
 92%|█████████▏| 69371/75000 [1:07:27<04:55, 19.02it/s]

{'loss': 0.3218, 'grad_norm': 3.68350887298584, 'learning_rate': 3.7785234899328864e-06, 'epoch': 2.77}


                                                       
 93%|█████████▎| 69381/75000 [1:07:27<04:43, 19.79it/s]

{'loss': 0.3642, 'grad_norm': 4.7769036293029785, 'learning_rate': 3.771812080536913e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69391/75000 [1:07:28<05:11, 18.00it/s]

{'loss': 0.282, 'grad_norm': 1.8076201677322388, 'learning_rate': 3.76510067114094e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69403/75000 [1:07:29<05:20, 17.45it/s]

{'loss': 0.2799, 'grad_norm': 2.923276424407959, 'learning_rate': 3.7583892617449665e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69412/75000 [1:07:29<05:42, 16.31it/s]

{'loss': 0.316, 'grad_norm': 1.2602800130844116, 'learning_rate': 3.7516778523489934e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69422/75000 [1:07:30<05:14, 17.75it/s]

{'loss': 0.2295, 'grad_norm': 1.1588960886001587, 'learning_rate': 3.7449664429530207e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69433/75000 [1:07:30<04:52, 19.04it/s]

{'loss': 0.3578, 'grad_norm': 0.4908628761768341, 'learning_rate': 3.738255033557047e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69443/75000 [1:07:31<04:51, 19.09it/s]

{'loss': 0.2873, 'grad_norm': 6.6786580085754395, 'learning_rate': 3.731543624161074e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69451/75000 [1:07:31<04:46, 19.34it/s]

{'loss': 0.4201, 'grad_norm': 3.2321882247924805, 'learning_rate': 3.7248322147651012e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69463/75000 [1:07:32<04:53, 18.85it/s]

{'loss': 0.2225, 'grad_norm': 1.264284610748291, 'learning_rate': 3.7181208053691276e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69473/75000 [1:07:32<05:34, 16.53it/s]

{'loss': 0.2687, 'grad_norm': 1.719333291053772, 'learning_rate': 3.7114093959731545e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69483/75000 [1:07:33<04:55, 18.67it/s]

{'loss': 0.3936, 'grad_norm': 9.13147258758545, 'learning_rate': 3.704697986577181e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69491/75000 [1:07:33<05:08, 17.86it/s]

{'loss': 0.2519, 'grad_norm': 4.417064189910889, 'learning_rate': 3.697986577181208e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69500/75000 [1:07:34<05:20, 17.18it/s]

{'loss': 0.2518, 'grad_norm': 2.994514226913452, 'learning_rate': 3.6912751677852355e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69514/75000 [1:07:35<06:08, 14.88it/s]

{'loss': 0.1984, 'grad_norm': 0.6281306743621826, 'learning_rate': 3.684563758389262e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69522/75000 [1:07:36<05:54, 15.47it/s]

{'loss': 0.2925, 'grad_norm': 8.088163375854492, 'learning_rate': 3.6778523489932888e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69532/75000 [1:07:37<05:07, 17.76it/s]

{'loss': 0.2448, 'grad_norm': 4.716533184051514, 'learning_rate': 3.671140939597316e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69542/75000 [1:07:37<05:01, 18.10it/s]

{'loss': 0.3121, 'grad_norm': 8.595173835754395, 'learning_rate': 3.6644295302013425e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69552/75000 [1:07:38<04:57, 18.34it/s]

{'loss': 0.4256, 'grad_norm': 6.7548394203186035, 'learning_rate': 3.6577181208053693e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69561/75000 [1:07:38<05:08, 17.65it/s]

{'loss': 0.2447, 'grad_norm': 2.5253665447235107, 'learning_rate': 3.6510067114093958e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69571/75000 [1:07:39<05:49, 15.53it/s]

{'loss': 0.2888, 'grad_norm': 1.8375352621078491, 'learning_rate': 3.644295302013423e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69582/75000 [1:07:39<05:01, 17.97it/s]

{'loss': 0.2876, 'grad_norm': 6.2701640129089355, 'learning_rate': 3.63758389261745e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69592/75000 [1:07:40<05:42, 15.78it/s]

{'loss': 0.2641, 'grad_norm': 1.2128905057907104, 'learning_rate': 3.6308724832214763e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69601/75000 [1:07:40<05:02, 17.85it/s]

{'loss': 0.2443, 'grad_norm': 3.6938350200653076, 'learning_rate': 3.6241610738255036e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69613/75000 [1:07:41<04:41, 19.16it/s]

{'loss': 0.2703, 'grad_norm': 3.1902339458465576, 'learning_rate': 3.617449664429531e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69621/75000 [1:07:42<04:45, 18.82it/s]

{'loss': 0.3201, 'grad_norm': 9.300202369689941, 'learning_rate': 3.6107382550335573e-06, 'epoch': 2.78}


                                                       
 93%|█████████▎| 69632/75000 [1:07:42<04:33, 19.66it/s]

{'loss': 0.2705, 'grad_norm': 1.7930549383163452, 'learning_rate': 3.604026845637584e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69642/75000 [1:07:43<05:26, 16.40it/s]

{'loss': 0.2593, 'grad_norm': 4.8805766105651855, 'learning_rate': 3.5973154362416106e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69652/75000 [1:07:43<04:57, 17.99it/s]

{'loss': 0.3265, 'grad_norm': 4.990687370300293, 'learning_rate': 3.590604026845638e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69660/75000 [1:07:44<04:58, 17.90it/s]

{'loss': 0.2479, 'grad_norm': 3.938164710998535, 'learning_rate': 3.5838926174496647e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69672/75000 [1:07:45<06:34, 13.51it/s]

{'loss': 0.2756, 'grad_norm': 2.671290397644043, 'learning_rate': 3.577181208053691e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69682/75000 [1:07:45<05:59, 14.80it/s]

{'loss': 0.4079, 'grad_norm': 4.292843818664551, 'learning_rate': 3.5704697986577184e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69693/75000 [1:07:46<05:10, 17.11it/s]

{'loss': 0.3536, 'grad_norm': 11.702274322509766, 'learning_rate': 3.5637583892617453e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69702/75000 [1:07:46<05:09, 17.13it/s]

{'loss': 0.3981, 'grad_norm': 0.9995912313461304, 'learning_rate': 3.5570469798657717e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69712/75000 [1:07:47<05:00, 17.60it/s]

{'loss': 0.3107, 'grad_norm': 10.489819526672363, 'learning_rate': 3.550335570469799e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69722/75000 [1:07:48<05:16, 16.69it/s]

{'loss': 0.3131, 'grad_norm': 9.334428787231445, 'learning_rate': 3.5436241610738254e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69732/75000 [1:07:48<05:18, 16.54it/s]

{'loss': 0.2883, 'grad_norm': 1.7086889743804932, 'learning_rate': 3.5369127516778523e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69742/75000 [1:07:49<05:20, 16.43it/s]

{'loss': 0.3072, 'grad_norm': 5.415833950042725, 'learning_rate': 3.5302013422818795e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69753/75000 [1:07:49<05:11, 16.86it/s]

{'loss': 0.3309, 'grad_norm': 1.2250826358795166, 'learning_rate': 3.523489932885906e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69763/75000 [1:07:50<04:54, 17.79it/s]

{'loss': 0.3571, 'grad_norm': 2.3239076137542725, 'learning_rate': 3.5167785234899332e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69773/75000 [1:07:51<04:47, 18.20it/s]

{'loss': 0.3556, 'grad_norm': 2.1776974201202393, 'learning_rate': 3.51006711409396e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69781/75000 [1:07:51<05:27, 15.94it/s]

{'loss': 0.3666, 'grad_norm': 2.72617769241333, 'learning_rate': 3.5033557046979865e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69793/75000 [1:07:52<05:32, 15.67it/s]

{'loss': 0.4641, 'grad_norm': 11.03509521484375, 'learning_rate': 3.496644295302014e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69800/75000 [1:07:52<05:02, 17.17it/s]

{'loss': 0.3214, 'grad_norm': 1.3514063358306885, 'learning_rate': 3.4899328859060402e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69813/75000 [1:07:53<06:04, 14.25it/s]

{'loss': 0.4062, 'grad_norm': 14.347567558288574, 'learning_rate': 3.483221476510067e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69822/75000 [1:07:54<05:07, 16.86it/s]

{'loss': 0.3354, 'grad_norm': 4.321389198303223, 'learning_rate': 3.4765100671140944e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69832/75000 [1:07:55<05:50, 14.74it/s]

{'loss': 0.2243, 'grad_norm': 2.718977928161621, 'learning_rate': 3.4697986577181208e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69842/75000 [1:07:55<04:55, 17.44it/s]

{'loss': 0.2515, 'grad_norm': 4.781578063964844, 'learning_rate': 3.4630872483221476e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69852/75000 [1:07:56<05:55, 14.47it/s]

{'loss': 0.2576, 'grad_norm': 2.4589169025421143, 'learning_rate': 3.456375838926175e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69862/75000 [1:07:57<05:57, 14.38it/s]

{'loss': 0.2851, 'grad_norm': 2.641267776489258, 'learning_rate': 3.4496644295302013e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69872/75000 [1:07:57<05:35, 15.28it/s]

{'loss': 0.2829, 'grad_norm': 6.2560224533081055, 'learning_rate': 3.4429530201342286e-06, 'epoch': 2.79}


                                                       
 93%|█████████▎| 69882/75000 [1:07:58<06:13, 13.72it/s]

{'loss': 0.3493, 'grad_norm': 2.975855827331543, 'learning_rate': 3.436241610738255e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69892/75000 [1:07:59<07:10, 11.85it/s]

{'loss': 0.374, 'grad_norm': 4.391942501068115, 'learning_rate': 3.429530201342282e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69902/75000 [1:08:00<09:18,  9.12it/s]

{'loss': 0.2832, 'grad_norm': 1.7646300792694092, 'learning_rate': 3.422818791946309e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69912/75000 [1:08:01<06:51, 12.36it/s]

{'loss': 0.3997, 'grad_norm': 6.652344703674316, 'learning_rate': 3.4161073825503356e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69922/75000 [1:08:01<05:12, 16.27it/s]

{'loss': 0.3403, 'grad_norm': 5.984941005706787, 'learning_rate': 3.4093959731543625e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69933/75000 [1:08:02<04:42, 17.92it/s]

{'loss': 0.2759, 'grad_norm': 4.020820140838623, 'learning_rate': 3.4026845637583897e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69942/75000 [1:08:02<05:37, 14.99it/s]

{'loss': 0.263, 'grad_norm': 6.522061824798584, 'learning_rate': 3.395973154362416e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69952/75000 [1:08:03<06:17, 13.37it/s]

{'loss': 0.2914, 'grad_norm': 1.8490453958511353, 'learning_rate': 3.389261744966443e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69962/75000 [1:08:04<05:10, 16.20it/s]

{'loss': 0.4018, 'grad_norm': 2.4157309532165527, 'learning_rate': 3.3825503355704695e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69972/75000 [1:08:05<07:41, 10.89it/s]

{'loss': 0.2389, 'grad_norm': 5.005209445953369, 'learning_rate': 3.3758389261744967e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69980/75000 [1:08:05<06:29, 12.87it/s]

{'loss': 0.3146, 'grad_norm': 4.061185359954834, 'learning_rate': 3.369127516778524e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 69992/75000 [1:08:06<06:26, 12.96it/s]

{'loss': 0.4147, 'grad_norm': 2.28704833984375, 'learning_rate': 3.3624161073825504e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70000/75000 [1:08:07<05:06, 16.30it/s]

{'loss': 0.3291, 'grad_norm': 3.4089345932006836, 'learning_rate': 3.3557046979865773e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70013/75000 [1:08:09<07:06, 11.68it/s]

{'loss': 0.28, 'grad_norm': 1.516281008720398, 'learning_rate': 3.3489932885906046e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70020/75000 [1:08:10<08:54,  9.32it/s]

{'loss': 0.2074, 'grad_norm': 4.115298271179199, 'learning_rate': 3.342281879194631e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70033/75000 [1:08:11<05:33, 14.91it/s]

{'loss': 0.264, 'grad_norm': 1.4279377460479736, 'learning_rate': 3.335570469798658e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70043/75000 [1:08:11<05:00, 16.52it/s]

{'loss': 0.1772, 'grad_norm': 2.0549213886260986, 'learning_rate': 3.3288590604026843e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70051/75000 [1:08:12<05:01, 16.39it/s]

{'loss': 0.2903, 'grad_norm': 1.4680591821670532, 'learning_rate': 3.3221476510067116e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70061/75000 [1:08:13<04:54, 16.79it/s]

{'loss': 0.3974, 'grad_norm': 4.14675760269165, 'learning_rate': 3.3154362416107384e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70071/75000 [1:08:14<07:14, 11.34it/s]

{'loss': 0.296, 'grad_norm': 5.448907375335693, 'learning_rate': 3.308724832214765e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70081/75000 [1:08:14<05:38, 14.52it/s]

{'loss': 0.2774, 'grad_norm': 2.6235694885253906, 'learning_rate': 3.302013422818792e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70091/75000 [1:08:15<08:33,  9.56it/s]

{'loss': 0.3243, 'grad_norm': 4.324428081512451, 'learning_rate': 3.2953020134228194e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70103/75000 [1:08:17<07:18, 11.16it/s]

{'loss': 0.3129, 'grad_norm': 8.880027770996094, 'learning_rate': 3.288590604026846e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70112/75000 [1:08:17<06:12, 13.12it/s]

{'loss': 0.342, 'grad_norm': 8.98106861114502, 'learning_rate': 3.2818791946308727e-06, 'epoch': 2.8}


                                                       
 93%|█████████▎| 70122/75000 [1:08:18<05:03, 16.10it/s]

{'loss': 0.2826, 'grad_norm': 4.239741325378418, 'learning_rate': 3.275167785234899e-06, 'epoch': 2.8}


                                                       
 94%|█████████▎| 70130/75000 [1:08:18<04:44, 17.10it/s]

{'loss': 0.2562, 'grad_norm': 3.795214891433716, 'learning_rate': 3.2684563758389264e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70142/75000 [1:08:20<07:04, 11.44it/s]

{'loss': 0.2642, 'grad_norm': 2.391961097717285, 'learning_rate': 3.2617449664429532e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70152/75000 [1:08:20<04:54, 16.49it/s]

{'loss': 0.304, 'grad_norm': 4.448770999908447, 'learning_rate': 3.2550335570469797e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70162/75000 [1:08:21<04:45, 16.96it/s]

{'loss': 0.2199, 'grad_norm': 6.937138557434082, 'learning_rate': 3.248322147651007e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70170/75000 [1:08:21<05:36, 14.34it/s]

{'loss': 0.3238, 'grad_norm': 2.72395658493042, 'learning_rate': 3.241610738255034e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70180/75000 [1:08:23<11:54,  6.75it/s]

{'loss': 0.3483, 'grad_norm': 3.1227688789367676, 'learning_rate': 3.2348993288590602e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70193/75000 [1:08:24<06:23, 12.52it/s]

{'loss': 0.3197, 'grad_norm': 4.592179775238037, 'learning_rate': 3.2281879194630875e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70203/75000 [1:08:25<04:49, 16.56it/s]

{'loss': 0.3031, 'grad_norm': 1.3649204969406128, 'learning_rate': 3.2214765100671148e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70211/75000 [1:08:25<04:39, 17.16it/s]

{'loss': 0.3026, 'grad_norm': 1.8864941596984863, 'learning_rate': 3.214765100671141e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70223/75000 [1:08:27<08:01,  9.93it/s]

{'loss': 0.3082, 'grad_norm': 5.63084077835083, 'learning_rate': 3.208053691275168e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70231/75000 [1:08:27<05:23, 14.75it/s]

{'loss': 0.3062, 'grad_norm': 9.451394081115723, 'learning_rate': 3.2013422818791945e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70241/75000 [1:08:28<08:09,  9.72it/s]

{'loss': 0.2688, 'grad_norm': 1.856925129890442, 'learning_rate': 3.1946308724832218e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70251/75000 [1:08:30<09:01,  8.77it/s]

{'loss': 0.3188, 'grad_norm': 3.0080199241638184, 'learning_rate': 3.1879194630872486e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70263/75000 [1:08:31<05:18, 14.85it/s]

{'loss': 0.3202, 'grad_norm': 2.4325599670410156, 'learning_rate': 3.181208053691275e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70271/75000 [1:08:31<05:14, 15.06it/s]

{'loss': 0.358, 'grad_norm': 1.6715977191925049, 'learning_rate': 3.1744966442953023e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70281/75000 [1:08:32<08:00,  9.83it/s]

{'loss': 0.3229, 'grad_norm': 1.422249436378479, 'learning_rate': 3.167785234899329e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70293/75000 [1:08:33<04:58, 15.78it/s]

{'loss': 0.3123, 'grad_norm': 4.5031867027282715, 'learning_rate': 3.1610738255033556e-06, 'epoch': 2.81}


                                                       
 94%|█████████▎| 70301/75000 [1:08:34<05:52, 13.34it/s]

{'loss': 0.3636, 'grad_norm': 2.338576555252075, 'learning_rate': 3.154362416107383e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 70313/75000 [1:08:34<04:47, 16.33it/s]

{'loss': 0.2679, 'grad_norm': 3.496779680252075, 'learning_rate': 3.1476510067114093e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 70321/75000 [1:08:35<05:10, 15.06it/s]

{'loss': 0.2601, 'grad_norm': 5.622708320617676, 'learning_rate': 3.1409395973154366e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 70331/75000 [1:08:36<05:55, 13.14it/s]

{'loss': 0.3442, 'grad_norm': 1.5819246768951416, 'learning_rate': 3.1342281879194634e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 70343/75000 [1:08:37<06:13, 12.47it/s]

{'loss': 0.2061, 'grad_norm': 3.518723487854004, 'learning_rate': 3.12751677852349e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 70351/75000 [1:08:37<05:48, 13.33it/s]

{'loss': 0.2423, 'grad_norm': 10.332902908325195, 'learning_rate': 3.120805369127517e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 70361/75000 [1:08:38<05:39, 13.67it/s]

{'loss': 0.1835, 'grad_norm': 1.7954672574996948, 'learning_rate': 3.1140939597315436e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 70373/75000 [1:08:39<04:27, 17.29it/s]

{'loss': 0.2, 'grad_norm': 1.0638668537139893, 'learning_rate': 3.1073825503355704e-06, 'epoch': 2.81}


                                                       
 94%|█████████▍| 70383/75000 [1:08:39<04:07, 18.64it/s]

{'loss': 0.2961, 'grad_norm': 4.86647891998291, 'learning_rate': 3.1006711409395977e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70392/75000 [1:08:40<04:09, 18.49it/s]

{'loss': 0.22, 'grad_norm': 4.9986395835876465, 'learning_rate': 3.0939597315436246e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70401/75000 [1:08:40<04:00, 19.16it/s]

{'loss': 0.3698, 'grad_norm': 5.40242862701416, 'learning_rate': 3.087248322147651e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70414/75000 [1:08:41<03:47, 20.17it/s]

{'loss': 0.2618, 'grad_norm': 2.7073137760162354, 'learning_rate': 3.080536912751678e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70423/75000 [1:08:41<04:03, 18.82it/s]

{'loss': 0.4311, 'grad_norm': 3.542893171310425, 'learning_rate': 3.073825503355705e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70432/75000 [1:08:42<03:58, 19.12it/s]

{'loss': 0.2264, 'grad_norm': 1.6312801837921143, 'learning_rate': 3.067114093959732e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70442/75000 [1:08:42<03:53, 19.51it/s]

{'loss': 0.2942, 'grad_norm': 0.5157151818275452, 'learning_rate': 3.0604026845637584e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70453/75000 [1:08:43<04:11, 18.07it/s]

{'loss': 0.3045, 'grad_norm': 7.756524562835693, 'learning_rate': 3.0536912751677853e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70464/75000 [1:08:44<03:47, 19.97it/s]

{'loss': 0.2374, 'grad_norm': 5.582319259643555, 'learning_rate': 3.0469798657718125e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70472/75000 [1:08:44<03:48, 19.78it/s]

{'loss': 0.2835, 'grad_norm': 3.955362319946289, 'learning_rate': 3.040268456375839e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70482/75000 [1:08:44<03:58, 18.94it/s]

{'loss': 0.2898, 'grad_norm': 3.664889335632324, 'learning_rate': 3.033557046979866e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70491/75000 [1:08:45<03:58, 18.89it/s]

{'loss': 0.3262, 'grad_norm': 1.804445743560791, 'learning_rate': 3.0268456375838927e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70500/75000 [1:08:45<03:48, 19.70it/s]

{'loss': 0.2945, 'grad_norm': 3.657341718673706, 'learning_rate': 3.02013422818792e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70513/75000 [1:08:47<04:52, 15.32it/s]

{'loss': 0.3281, 'grad_norm': 12.281418800354004, 'learning_rate': 3.0134228187919464e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70523/75000 [1:08:47<04:36, 16.17it/s]

{'loss': 0.3613, 'grad_norm': 49.78177261352539, 'learning_rate': 3.0067114093959732e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70532/75000 [1:08:48<04:02, 18.45it/s]

{'loss': 0.3008, 'grad_norm': 5.193946838378906, 'learning_rate': 3e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70544/75000 [1:08:48<03:40, 20.19it/s]

{'loss': 0.2828, 'grad_norm': 3.4687740802764893, 'learning_rate': 2.993288590604027e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70553/75000 [1:08:49<03:40, 20.15it/s]

{'loss': 0.3092, 'grad_norm': 3.911839723587036, 'learning_rate': 2.9865771812080538e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70562/75000 [1:08:49<03:36, 20.48it/s]

{'loss': 0.3804, 'grad_norm': 2.302485942840576, 'learning_rate': 2.9798657718120806e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70573/75000 [1:08:50<03:42, 19.91it/s]

{'loss': 0.3893, 'grad_norm': 1.1031763553619385, 'learning_rate': 2.9731543624161075e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70584/75000 [1:08:50<03:36, 20.36it/s]

{'loss': 0.4998, 'grad_norm': 3.657790184020996, 'learning_rate': 2.9664429530201343e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70593/75000 [1:08:51<03:38, 20.15it/s]

{'loss': 0.3941, 'grad_norm': 2.889798879623413, 'learning_rate': 2.959731543624161e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70602/75000 [1:08:51<03:34, 20.50it/s]

{'loss': 0.3255, 'grad_norm': 6.018143177032471, 'learning_rate': 2.953020134228188e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70614/75000 [1:08:52<03:30, 20.82it/s]

{'loss': 0.2797, 'grad_norm': 0.9619797468185425, 'learning_rate': 2.946308724832215e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70623/75000 [1:08:52<03:30, 20.80it/s]

{'loss': 0.2277, 'grad_norm': 4.822149753570557, 'learning_rate': 2.9395973154362418e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 70632/75000 [1:08:53<03:36, 20.15it/s]

{'loss': 0.2793, 'grad_norm': 7.272536277770996, 'learning_rate': 2.9328859060402686e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70644/75000 [1:08:53<03:32, 20.50it/s]

{'loss': 0.3243, 'grad_norm': 2.107184648513794, 'learning_rate': 2.9261744966442955e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70653/75000 [1:08:54<03:35, 20.20it/s]

{'loss': 0.2918, 'grad_norm': 1.5165987014770508, 'learning_rate': 2.9194630872483223e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70663/75000 [1:08:54<03:42, 19.48it/s]

{'loss': 0.2061, 'grad_norm': 4.29420804977417, 'learning_rate': 2.912751677852349e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70672/75000 [1:08:55<03:53, 18.56it/s]

{'loss': 0.2313, 'grad_norm': 1.6189124584197998, 'learning_rate': 2.906040268456376e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70682/75000 [1:08:55<03:55, 18.34it/s]

{'loss': 0.3163, 'grad_norm': 1.6124099493026733, 'learning_rate': 2.899328859060403e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70693/75000 [1:08:56<04:22, 16.42it/s]

{'loss': 0.2812, 'grad_norm': 5.235788345336914, 'learning_rate': 2.8926174496644297e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70703/75000 [1:08:56<03:51, 18.55it/s]

{'loss': 0.207, 'grad_norm': 3.985990047454834, 'learning_rate': 2.8859060402684566e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70712/75000 [1:08:57<03:50, 18.57it/s]

{'loss': 0.2818, 'grad_norm': 10.580121040344238, 'learning_rate': 2.8791946308724834e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70724/75000 [1:08:57<03:40, 19.42it/s]

{'loss': 0.2697, 'grad_norm': 2.03267765045166, 'learning_rate': 2.8724832214765103e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70731/75000 [1:08:58<03:44, 18.98it/s]

{'loss': 0.3355, 'grad_norm': 4.894826889038086, 'learning_rate': 2.8657718120805367e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70743/75000 [1:08:58<03:41, 19.22it/s]

{'loss': 0.2757, 'grad_norm': 4.469209671020508, 'learning_rate': 2.859060402684564e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70752/75000 [1:08:59<03:45, 18.80it/s]

{'loss': 0.2613, 'grad_norm': 11.608039855957031, 'learning_rate': 2.852348993288591e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70763/75000 [1:09:00<03:39, 19.28it/s]

{'loss': 0.3528, 'grad_norm': 5.361412525177002, 'learning_rate': 2.8456375838926177e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70772/75000 [1:09:00<03:48, 18.49it/s]

{'loss': 0.322, 'grad_norm': 3.6248185634613037, 'learning_rate': 2.838926174496644e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70783/75000 [1:09:01<03:39, 19.21it/s]

{'loss': 0.3082, 'grad_norm': 5.055802822113037, 'learning_rate': 2.8322147651006714e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70793/75000 [1:09:01<04:08, 16.94it/s]

{'loss': 0.4167, 'grad_norm': 4.759724140167236, 'learning_rate': 2.8255033557046983e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70803/75000 [1:09:02<04:02, 17.29it/s]

{'loss': 0.3163, 'grad_norm': 4.877895355224609, 'learning_rate': 2.818791946308725e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70812/75000 [1:09:02<03:56, 17.73it/s]

{'loss': 0.341, 'grad_norm': 2.8792967796325684, 'learning_rate': 2.8120805369127515e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70823/75000 [1:09:03<03:42, 18.76it/s]

{'loss': 0.3833, 'grad_norm': 4.207837104797363, 'learning_rate': 2.805369127516779e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70832/75000 [1:09:03<03:40, 18.87it/s]

{'loss': 0.2996, 'grad_norm': 2.716674566268921, 'learning_rate': 2.7986577181208057e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70843/75000 [1:09:04<03:41, 18.78it/s]

{'loss': 0.3021, 'grad_norm': 1.1591169834136963, 'learning_rate': 2.791946308724832e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70852/75000 [1:09:04<03:41, 18.70it/s]

{'loss': 0.3111, 'grad_norm': 1.8559367656707764, 'learning_rate': 2.785234899328859e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70862/75000 [1:09:05<03:46, 18.25it/s]

{'loss': 0.288, 'grad_norm': 2.992720603942871, 'learning_rate': 2.7785234899328862e-06, 'epoch': 2.83}


                                                       
 94%|█████████▍| 70874/75000 [1:09:05<03:31, 19.51it/s]

{'loss': 0.3536, 'grad_norm': 1.195757508277893, 'learning_rate': 2.771812080536913e-06, 'epoch': 2.83}


                                                       
 95%|█████████▍| 70883/75000 [1:09:06<03:31, 19.43it/s]

{'loss': 0.3433, 'grad_norm': 7.882687568664551, 'learning_rate': 2.7651006711409395e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70892/75000 [1:09:06<03:35, 19.03it/s]

{'loss': 0.2753, 'grad_norm': 1.548593521118164, 'learning_rate': 2.7583892617449664e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70903/75000 [1:09:07<03:34, 19.13it/s]

{'loss': 0.2403, 'grad_norm': 2.575162887573242, 'learning_rate': 2.7516778523489936e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70912/75000 [1:09:07<03:29, 19.47it/s]

{'loss': 0.2746, 'grad_norm': 1.422866702079773, 'learning_rate': 2.7449664429530205e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70922/75000 [1:09:08<03:28, 19.55it/s]

{'loss': 0.2301, 'grad_norm': 6.28750467300415, 'learning_rate': 2.738255033557047e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70933/75000 [1:09:09<03:37, 18.68it/s]

{'loss': 0.2122, 'grad_norm': 1.9313973188400269, 'learning_rate': 2.7315436241610738e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70943/75000 [1:09:09<03:40, 18.40it/s]

{'loss': 0.2569, 'grad_norm': 12.998820304870605, 'learning_rate': 2.724832214765101e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70953/75000 [1:09:10<03:37, 18.60it/s]

{'loss': 0.2612, 'grad_norm': 2.2401602268218994, 'learning_rate': 2.7181208053691275e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70963/75000 [1:09:10<03:30, 19.15it/s]

{'loss': 0.3993, 'grad_norm': 6.211731433868408, 'learning_rate': 2.7114093959731543e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70972/75000 [1:09:11<03:34, 18.74it/s]

{'loss': 0.2976, 'grad_norm': 1.2349560260772705, 'learning_rate': 2.704697986577181e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70982/75000 [1:09:11<03:34, 18.73it/s]

{'loss': 0.2749, 'grad_norm': 4.911622047424316, 'learning_rate': 2.6979865771812085e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 70991/75000 [1:09:12<03:31, 19.00it/s]

{'loss': 0.3372, 'grad_norm': 12.091444969177246, 'learning_rate': 2.691275167785235e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71000/75000 [1:09:12<03:22, 19.79it/s]

{'loss': 0.3901, 'grad_norm': 2.320737838745117, 'learning_rate': 2.6845637583892617e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71012/75000 [1:09:16<09:45,  6.81it/s]

{'loss': 0.2954, 'grad_norm': 2.7191548347473145, 'learning_rate': 2.6778523489932886e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71021/75000 [1:09:17<05:13, 12.71it/s]

{'loss': 0.3016, 'grad_norm': 1.1813600063323975, 'learning_rate': 2.671140939597316e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71034/75000 [1:09:17<03:32, 18.68it/s]

{'loss': 0.2017, 'grad_norm': 0.9533759355545044, 'learning_rate': 2.6644295302013423e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71043/75000 [1:09:18<03:23, 19.40it/s]

{'loss': 0.2601, 'grad_norm': 1.2981321811676025, 'learning_rate': 2.657718120805369e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71052/75000 [1:09:18<03:21, 19.56it/s]

{'loss': 0.251, 'grad_norm': 14.802119255065918, 'learning_rate': 2.651006711409396e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71062/75000 [1:09:19<03:19, 19.71it/s]

{'loss': 0.1895, 'grad_norm': 1.9615259170532227, 'learning_rate': 2.644295302013423e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71074/75000 [1:09:20<03:24, 19.23it/s]

{'loss': 0.2977, 'grad_norm': 2.4313220977783203, 'learning_rate': 2.6375838926174497e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71082/75000 [1:09:20<03:24, 19.19it/s]

{'loss': 0.3808, 'grad_norm': 7.4321465492248535, 'learning_rate': 2.6308724832214766e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71093/75000 [1:09:20<03:14, 20.05it/s]

{'loss': 0.3136, 'grad_norm': 4.928846836090088, 'learning_rate': 2.6241610738255034e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71104/75000 [1:09:21<03:27, 18.74it/s]

{'loss': 0.3478, 'grad_norm': 5.7764997482299805, 'learning_rate': 2.6174496644295303e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71113/75000 [1:09:22<03:29, 18.59it/s]

{'loss': 0.2939, 'grad_norm': 1.3984369039535522, 'learning_rate': 2.610738255033557e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71124/75000 [1:09:22<03:24, 18.94it/s]

{'loss': 0.2399, 'grad_norm': 4.22440767288208, 'learning_rate': 2.604026845637584e-06, 'epoch': 2.84}


                                                       
 95%|█████████▍| 71132/75000 [1:09:23<03:23, 19.01it/s]

{'loss': 0.2163, 'grad_norm': 3.9685263633728027, 'learning_rate': 2.597315436241611e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71142/75000 [1:09:23<03:21, 19.12it/s]

{'loss': 0.265, 'grad_norm': 4.90508508682251, 'learning_rate': 2.5906040268456377e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71154/75000 [1:09:24<03:10, 20.20it/s]

{'loss': 0.4039, 'grad_norm': 3.4402976036071777, 'learning_rate': 2.5838926174496645e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71163/75000 [1:09:24<03:10, 20.15it/s]

{'loss': 0.3711, 'grad_norm': 4.675008773803711, 'learning_rate': 2.5771812080536914e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71172/75000 [1:09:25<03:17, 19.43it/s]

{'loss': 0.2482, 'grad_norm': 3.927837610244751, 'learning_rate': 2.5704697986577182e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71181/75000 [1:09:25<03:26, 18.53it/s]

{'loss': 0.2904, 'grad_norm': 1.4926962852478027, 'learning_rate': 2.563758389261745e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71194/75000 [1:09:26<03:05, 20.56it/s]

{'loss': 0.2622, 'grad_norm': 0.4266597628593445, 'learning_rate': 2.557046979865772e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71203/75000 [1:09:26<03:12, 19.72it/s]

{'loss': 0.2995, 'grad_norm': 7.706092357635498, 'learning_rate': 2.550335570469799e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71211/75000 [1:09:27<03:20, 18.86it/s]

{'loss': 0.2414, 'grad_norm': 0.5487259030342102, 'learning_rate': 2.5436241610738257e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71223/75000 [1:09:27<03:22, 18.67it/s]

{'loss': 0.1734, 'grad_norm': 0.5644258260726929, 'learning_rate': 2.5369127516778525e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71233/75000 [1:09:28<03:09, 19.92it/s]

{'loss': 0.3768, 'grad_norm': 8.312752723693848, 'learning_rate': 2.5302013422818794e-06, 'epoch': 2.85}


                                                       
 95%|█████████▍| 71242/75000 [1:09:28<03:11, 19.65it/s]

{'loss': 0.2041, 'grad_norm': 3.0397069454193115, 'learning_rate': 2.5234899328859062e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71251/75000 [1:09:29<03:13, 19.39it/s]

{'loss': 0.4145, 'grad_norm': 2.428999900817871, 'learning_rate': 2.5167785234899326e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71261/75000 [1:09:29<03:20, 18.63it/s]

{'loss': 0.2122, 'grad_norm': 6.219394683837891, 'learning_rate': 2.51006711409396e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71271/75000 [1:09:30<03:13, 19.31it/s]

{'loss': 0.3275, 'grad_norm': 4.1158766746521, 'learning_rate': 2.5033557046979868e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71282/75000 [1:09:30<03:22, 18.32it/s]

{'loss': 0.3181, 'grad_norm': 10.670137405395508, 'learning_rate': 2.4966442953020136e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71291/75000 [1:09:31<03:19, 18.55it/s]

{'loss': 0.3203, 'grad_norm': 16.1990909576416, 'learning_rate': 2.48993288590604e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71301/75000 [1:09:31<03:25, 17.96it/s]

{'loss': 0.289, 'grad_norm': 1.3112460374832153, 'learning_rate': 2.4832214765100673e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71314/75000 [1:09:32<03:02, 20.20it/s]

{'loss': 0.2801, 'grad_norm': 15.042449951171875, 'learning_rate': 2.476510067114094e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71323/75000 [1:09:32<03:16, 18.73it/s]

{'loss': 0.2814, 'grad_norm': 19.395042419433594, 'learning_rate': 2.469798657718121e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71333/75000 [1:09:33<03:21, 18.22it/s]

{'loss': 0.3213, 'grad_norm': 4.536520481109619, 'learning_rate': 2.4630872483221475e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71342/75000 [1:09:33<03:11, 19.07it/s]

{'loss': 0.1655, 'grad_norm': 1.4572829008102417, 'learning_rate': 2.4563758389261747e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71352/75000 [1:09:34<03:25, 17.78it/s]

{'loss': 0.4438, 'grad_norm': 3.843191385269165, 'learning_rate': 2.4496644295302016e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71363/75000 [1:09:35<03:16, 18.54it/s]

{'loss': 0.199, 'grad_norm': 4.5035719871521, 'learning_rate': 2.442953020134228e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71372/75000 [1:09:35<03:24, 17.78it/s]

{'loss': 0.3527, 'grad_norm': 3.4506356716156006, 'learning_rate': 2.436241610738255e-06, 'epoch': 2.85}


                                                       
 95%|█████████▌| 71382/75000 [1:09:36<03:16, 18.40it/s]

{'loss': 0.2835, 'grad_norm': 7.186670303344727, 'learning_rate': 2.429530201342282e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71392/75000 [1:09:36<03:04, 19.53it/s]

{'loss': 0.3758, 'grad_norm': 6.6699018478393555, 'learning_rate': 2.422818791946309e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71402/75000 [1:09:37<03:09, 18.97it/s]

{'loss': 0.2556, 'grad_norm': 8.526065826416016, 'learning_rate': 2.4161073825503354e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71412/75000 [1:09:37<03:07, 19.17it/s]

{'loss': 0.2712, 'grad_norm': 7.458531379699707, 'learning_rate': 2.4093959731543627e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71422/75000 [1:09:38<03:03, 19.47it/s]

{'loss': 0.2816, 'grad_norm': 8.180383682250977, 'learning_rate': 2.4026845637583896e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71432/75000 [1:09:38<03:03, 19.43it/s]

{'loss': 0.2585, 'grad_norm': 2.180838108062744, 'learning_rate': 2.3959731543624164e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71442/75000 [1:09:39<03:22, 17.59it/s]

{'loss': 0.3655, 'grad_norm': 1.5821408033370972, 'learning_rate': 2.389261744966443e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71453/75000 [1:09:39<03:09, 18.68it/s]

{'loss': 0.2615, 'grad_norm': 5.064964294433594, 'learning_rate': 2.38255033557047e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71464/75000 [1:09:40<03:11, 18.48it/s]

{'loss': 0.3035, 'grad_norm': 5.751307964324951, 'learning_rate': 2.375838926174497e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71472/75000 [1:09:40<03:15, 18.04it/s]

{'loss': 0.2291, 'grad_norm': 2.7406933307647705, 'learning_rate': 2.3691275167785234e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71483/75000 [1:09:41<02:59, 19.56it/s]

{'loss': 0.3058, 'grad_norm': 6.579704284667969, 'learning_rate': 2.3624161073825503e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71491/75000 [1:09:41<03:18, 17.70it/s]

{'loss': 0.3288, 'grad_norm': 2.725358009338379, 'learning_rate': 2.3557046979865775e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71500/75000 [1:09:42<02:58, 19.61it/s]

{'loss': 0.3109, 'grad_norm': 4.631503105163574, 'learning_rate': 2.3489932885906044e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71513/75000 [1:09:43<04:04, 14.28it/s]

{'loss': 0.2371, 'grad_norm': 3.4543933868408203, 'learning_rate': 2.342281879194631e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71521/75000 [1:09:44<03:34, 16.23it/s]

{'loss': 0.2633, 'grad_norm': 5.838211536407471, 'learning_rate': 2.3355704697986577e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71534/75000 [1:09:44<03:05, 18.68it/s]

{'loss': 0.3602, 'grad_norm': 3.1073625087738037, 'learning_rate': 2.328859060402685e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71541/75000 [1:09:45<03:09, 18.27it/s]

{'loss': 0.29, 'grad_norm': 3.2453606128692627, 'learning_rate': 2.3221476510067114e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71551/75000 [1:09:45<03:04, 18.68it/s]

{'loss': 0.3407, 'grad_norm': 1.3927295207977295, 'learning_rate': 2.3154362416107382e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71562/75000 [1:09:46<03:05, 18.52it/s]

{'loss': 0.332, 'grad_norm': 0.9334685802459717, 'learning_rate': 2.308724832214765e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71572/75000 [1:09:46<03:00, 18.99it/s]

{'loss': 0.2404, 'grad_norm': 8.125650405883789, 'learning_rate': 2.3020134228187924e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71584/75000 [1:09:47<02:51, 19.90it/s]

{'loss': 0.2356, 'grad_norm': 3.8644068241119385, 'learning_rate': 2.295302013422819e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71594/75000 [1:09:48<03:06, 18.22it/s]

{'loss': 0.4303, 'grad_norm': 3.0420854091644287, 'learning_rate': 2.2885906040268457e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71602/75000 [1:09:48<03:06, 18.18it/s]

{'loss': 0.3363, 'grad_norm': 3.813640594482422, 'learning_rate': 2.2818791946308725e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71613/75000 [1:09:49<02:52, 19.58it/s]

{'loss': 0.4659, 'grad_norm': 11.442947387695312, 'learning_rate': 2.2751677852348998e-06, 'epoch': 2.86}


                                                       
 95%|█████████▌| 71622/75000 [1:09:49<03:03, 18.46it/s]

{'loss': 0.2121, 'grad_norm': 8.199698448181152, 'learning_rate': 2.268456375838926e-06, 'epoch': 2.86}


                                                       
 96%|█████████▌| 71633/75000 [1:09:50<02:57, 18.93it/s]

{'loss': 0.3564, 'grad_norm': 6.991686820983887, 'learning_rate': 2.261744966442953e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71643/75000 [1:09:50<02:51, 19.61it/s]

{'loss': 0.2955, 'grad_norm': 6.819721221923828, 'learning_rate': 2.25503355704698e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71654/75000 [1:09:51<02:52, 19.36it/s]

{'loss': 0.3187, 'grad_norm': 3.89542293548584, 'learning_rate': 2.2483221476510068e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71663/75000 [1:09:51<02:53, 19.22it/s]

{'loss': 0.2319, 'grad_norm': 7.954959869384766, 'learning_rate': 2.2416107382550336e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71673/75000 [1:09:52<02:50, 19.49it/s]

{'loss': 0.3085, 'grad_norm': 9.350489616394043, 'learning_rate': 2.2348993288590605e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71684/75000 [1:09:52<02:55, 18.87it/s]

{'loss': 0.241, 'grad_norm': 2.4598615169525146, 'learning_rate': 2.2281879194630873e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71693/75000 [1:09:53<02:52, 19.21it/s]

{'loss': 0.3432, 'grad_norm': 4.693829536437988, 'learning_rate': 2.221476510067114e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71701/75000 [1:09:53<02:51, 19.23it/s]

{'loss': 0.319, 'grad_norm': 6.142760753631592, 'learning_rate': 2.214765100671141e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71713/75000 [1:09:54<02:54, 18.85it/s]

{'loss': 0.3591, 'grad_norm': 4.515807628631592, 'learning_rate': 2.208053691275168e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71722/75000 [1:09:54<03:03, 17.85it/s]

{'loss': 0.2489, 'grad_norm': 0.8377220034599304, 'learning_rate': 2.2013422818791947e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71734/75000 [1:09:55<02:45, 19.76it/s]

{'loss': 0.3144, 'grad_norm': 0.721742570400238, 'learning_rate': 2.1946308724832216e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71743/75000 [1:09:56<02:54, 18.63it/s]

{'loss': 0.2829, 'grad_norm': 4.122429847717285, 'learning_rate': 2.1879194630872484e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71753/75000 [1:09:56<02:57, 18.26it/s]

{'loss': 0.3829, 'grad_norm': 6.212968826293945, 'learning_rate': 2.1812080536912753e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71762/75000 [1:09:57<02:48, 19.17it/s]

{'loss': 0.4324, 'grad_norm': 11.502455711364746, 'learning_rate': 2.174496644295302e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71773/75000 [1:09:57<03:06, 17.29it/s]

{'loss': 0.3257, 'grad_norm': 3.619509696960449, 'learning_rate': 2.167785234899329e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71782/75000 [1:09:58<03:07, 17.18it/s]

{'loss': 0.2434, 'grad_norm': 4.7292046546936035, 'learning_rate': 2.161073825503356e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71794/75000 [1:09:58<02:42, 19.77it/s]

{'loss': 0.2478, 'grad_norm': 1.622219204902649, 'learning_rate': 2.1543624161073827e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71803/75000 [1:09:59<02:42, 19.64it/s]

{'loss': 0.2542, 'grad_norm': 2.0924136638641357, 'learning_rate': 2.1476510067114096e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71812/75000 [1:09:59<02:46, 19.09it/s]

{'loss': 0.2286, 'grad_norm': 5.371546745300293, 'learning_rate': 2.1409395973154364e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71824/75000 [1:10:00<02:42, 19.59it/s]

{'loss': 0.2021, 'grad_norm': 0.7406103014945984, 'learning_rate': 2.1342281879194633e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71834/75000 [1:10:00<02:42, 19.49it/s]

{'loss': 0.3427, 'grad_norm': 3.4884254932403564, 'learning_rate': 2.12751677852349e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71844/75000 [1:10:01<02:37, 20.07it/s]

{'loss': 0.1896, 'grad_norm': 3.7774343490600586, 'learning_rate': 2.1208053691275166e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71853/75000 [1:10:01<02:51, 18.39it/s]

{'loss': 0.2908, 'grad_norm': 6.672311305999756, 'learning_rate': 2.114093959731544e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71864/75000 [1:10:02<02:38, 19.74it/s]

{'loss': 0.3306, 'grad_norm': 2.594876527786255, 'learning_rate': 2.1073825503355707e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71872/75000 [1:10:02<02:46, 18.74it/s]

{'loss': 0.3032, 'grad_norm': 5.44064998626709, 'learning_rate': 2.1006711409395975e-06, 'epoch': 2.87}


                                                       
 96%|█████████▌| 71884/75000 [1:10:03<02:41, 19.25it/s]

{'loss': 0.3511, 'grad_norm': 4.788936614990234, 'learning_rate': 2.093959731543624e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71893/75000 [1:10:04<02:54, 17.80it/s]

{'loss': 0.3591, 'grad_norm': 8.213852882385254, 'learning_rate': 2.0872483221476512e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71901/75000 [1:10:04<03:05, 16.72it/s]

{'loss': 0.2554, 'grad_norm': 1.0584555864334106, 'learning_rate': 2.080536912751678e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71913/75000 [1:10:05<02:46, 18.51it/s]

{'loss': 0.2935, 'grad_norm': 6.503808975219727, 'learning_rate': 2.073825503355705e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71922/75000 [1:10:05<02:47, 18.36it/s]

{'loss': 0.334, 'grad_norm': 16.9772891998291, 'learning_rate': 2.0671140939597314e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71932/75000 [1:10:06<02:58, 17.22it/s]

{'loss': 0.4064, 'grad_norm': 3.877732753753662, 'learning_rate': 2.0604026845637587e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71941/75000 [1:10:06<02:42, 18.83it/s]

{'loss': 0.2029, 'grad_norm': 10.365859031677246, 'learning_rate': 2.0536912751677855e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71953/75000 [1:10:07<02:34, 19.77it/s]

{'loss': 0.2552, 'grad_norm': 2.7372806072235107, 'learning_rate': 2.046979865771812e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71964/75000 [1:10:07<02:34, 19.67it/s]

{'loss': 0.3286, 'grad_norm': 12.942495346069336, 'learning_rate': 2.040268456375839e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71971/75000 [1:10:08<02:40, 18.90it/s]

{'loss': 0.3422, 'grad_norm': 0.8915894031524658, 'learning_rate': 2.033557046979866e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71984/75000 [1:10:08<02:38, 19.02it/s]

{'loss': 0.3185, 'grad_norm': 4.274179458618164, 'learning_rate': 2.026845637583893e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 71994/75000 [1:10:09<02:30, 19.98it/s]

{'loss': 0.237, 'grad_norm': 2.914513111114502, 'learning_rate': 2.0201342281879194e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72000/75000 [1:10:09<02:31, 19.86it/s]

{'loss': 0.2329, 'grad_norm': 3.8692924976348877, 'learning_rate': 2.013422818791946e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72013/75000 [1:10:10<03:11, 15.60it/s]

{'loss': 0.3184, 'grad_norm': 2.7652924060821533, 'learning_rate': 2.0067114093959735e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72023/75000 [1:10:11<02:53, 17.16it/s]

{'loss': 0.2782, 'grad_norm': 1.5503052473068237, 'learning_rate': 2.0000000000000003e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72033/75000 [1:10:12<02:48, 17.65it/s]

{'loss': 0.3064, 'grad_norm': 2.9021334648132324, 'learning_rate': 1.9932885906040268e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72041/75000 [1:10:12<02:45, 17.83it/s]

{'loss': 0.3734, 'grad_norm': 2.2532174587249756, 'learning_rate': 1.9865771812080536e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72052/75000 [1:10:13<02:35, 18.99it/s]

{'loss': 0.2877, 'grad_norm': 2.273263931274414, 'learning_rate': 1.979865771812081e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72064/75000 [1:10:13<02:34, 19.00it/s]

{'loss': 0.3796, 'grad_norm': 2.2876195907592773, 'learning_rate': 1.9731543624161073e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72073/75000 [1:10:14<02:47, 17.46it/s]

{'loss': 0.2662, 'grad_norm': 7.542128562927246, 'learning_rate': 1.966442953020134e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72081/75000 [1:10:14<02:34, 18.89it/s]

{'loss': 0.2716, 'grad_norm': 7.309977054595947, 'learning_rate': 1.959731543624161e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72092/75000 [1:10:15<02:23, 20.24it/s]

{'loss': 0.2773, 'grad_norm': 2.0072968006134033, 'learning_rate': 1.9530201342281883e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72101/75000 [1:10:15<02:25, 19.96it/s]

{'loss': 0.3604, 'grad_norm': 9.817337989807129, 'learning_rate': 1.9463087248322147e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72113/75000 [1:10:16<02:22, 20.23it/s]

{'loss': 0.3536, 'grad_norm': 3.5405497550964355, 'learning_rate': 1.9395973154362416e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72121/75000 [1:10:16<02:39, 18.04it/s]

{'loss': 0.442, 'grad_norm': 5.158262252807617, 'learning_rate': 1.9328859060402684e-06, 'epoch': 2.88}


                                                       
 96%|█████████▌| 72133/75000 [1:10:17<02:30, 19.06it/s]

{'loss': 0.3024, 'grad_norm': 5.215395450592041, 'learning_rate': 1.9261744966442957e-06, 'epoch': 2.89}


                                                       
 96%|█████████▌| 72142/75000 [1:10:17<02:24, 19.78it/s]

{'loss': 0.3133, 'grad_norm': 1.2789913415908813, 'learning_rate': 1.919463087248322e-06, 'epoch': 2.89}


                                                       
 96%|█████████▌| 72151/75000 [1:10:18<02:23, 19.79it/s]

{'loss': 0.3259, 'grad_norm': 7.076742649078369, 'learning_rate': 1.912751677852349e-06, 'epoch': 2.89}


                                                       
 96%|█████████▌| 72162/75000 [1:10:18<02:28, 19.10it/s]

{'loss': 0.3894, 'grad_norm': 14.100753784179688, 'learning_rate': 1.9060402684563759e-06, 'epoch': 2.89}


                                                       
 96%|█████████▌| 72172/75000 [1:10:19<02:25, 19.46it/s]

{'loss': 0.4563, 'grad_norm': 1.2047384977340698, 'learning_rate': 1.899328859060403e-06, 'epoch': 2.89}


                                                       
 96%|█████████▌| 72183/75000 [1:10:19<02:27, 19.06it/s]

{'loss': 0.3151, 'grad_norm': 2.3057661056518555, 'learning_rate': 1.8926174496644296e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72192/75000 [1:10:20<02:36, 17.96it/s]

{'loss': 0.3332, 'grad_norm': 1.6092396974563599, 'learning_rate': 1.8859060402684564e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72202/75000 [1:10:20<02:25, 19.20it/s]

{'loss': 0.2302, 'grad_norm': 5.333515644073486, 'learning_rate': 1.8791946308724833e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72212/75000 [1:10:21<02:27, 18.91it/s]

{'loss': 0.3088, 'grad_norm': 1.5609546899795532, 'learning_rate': 1.8724832214765103e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72221/75000 [1:10:22<02:48, 16.50it/s]

{'loss': 0.2207, 'grad_norm': 7.334081172943115, 'learning_rate': 1.865771812080537e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72232/75000 [1:10:22<02:27, 18.81it/s]

{'loss': 0.282, 'grad_norm': 2.7930946350097656, 'learning_rate': 1.8590604026845638e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72243/75000 [1:10:23<02:35, 17.77it/s]

{'loss': 0.4002, 'grad_norm': 4.1594014167785645, 'learning_rate': 1.8523489932885905e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72254/75000 [1:10:23<02:20, 19.55it/s]

{'loss': 0.1892, 'grad_norm': 0.7927892208099365, 'learning_rate': 1.8456375838926177e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72263/75000 [1:10:24<02:23, 19.09it/s]

{'loss': 0.3605, 'grad_norm': 2.0926809310913086, 'learning_rate': 1.8389261744966444e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72274/75000 [1:10:24<02:14, 20.31it/s]

{'loss': 0.3828, 'grad_norm': 9.58640193939209, 'learning_rate': 1.8322147651006712e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72282/75000 [1:10:25<02:19, 19.45it/s]

{'loss': 0.3767, 'grad_norm': 5.429275035858154, 'learning_rate': 1.8255033557046979e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72292/75000 [1:10:25<02:34, 17.47it/s]

{'loss': 0.3541, 'grad_norm': 11.642178535461426, 'learning_rate': 1.818791946308725e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72302/75000 [1:10:26<02:21, 19.07it/s]

{'loss': 0.3548, 'grad_norm': 0.9568902254104614, 'learning_rate': 1.8120805369127518e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72313/75000 [1:10:26<02:23, 18.69it/s]

{'loss': 0.1944, 'grad_norm': 0.912955641746521, 'learning_rate': 1.8053691275167786e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72323/75000 [1:10:27<02:19, 19.22it/s]

{'loss': 0.3595, 'grad_norm': 5.004377841949463, 'learning_rate': 1.7986577181208053e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72333/75000 [1:10:27<02:16, 19.58it/s]

{'loss': 0.2804, 'grad_norm': 1.505914568901062, 'learning_rate': 1.7919463087248324e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72343/75000 [1:10:28<02:21, 18.82it/s]

{'loss': 0.2912, 'grad_norm': 5.358508110046387, 'learning_rate': 1.7852348993288592e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72352/75000 [1:10:28<02:21, 18.74it/s]

{'loss': 0.1949, 'grad_norm': 2.3596034049987793, 'learning_rate': 1.7785234899328858e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72363/75000 [1:10:29<02:24, 18.24it/s]

{'loss': 0.3548, 'grad_norm': 13.115593910217285, 'learning_rate': 1.7718120805369127e-06, 'epoch': 2.89}


                                                       
 96%|█████████▋| 72372/75000 [1:10:29<02:16, 19.20it/s]

{'loss': 0.2513, 'grad_norm': 5.530623912811279, 'learning_rate': 1.7651006711409398e-06, 'epoch': 2.89}


                                                       
 97%|█████████▋| 72384/75000 [1:10:30<02:14, 19.48it/s]

{'loss': 0.2559, 'grad_norm': 7.062488079071045, 'learning_rate': 1.7583892617449666e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72391/75000 [1:10:31<02:28, 17.57it/s]

{'loss': 0.213, 'grad_norm': 3.5199828147888184, 'learning_rate': 1.7516778523489933e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72404/75000 [1:10:31<02:11, 19.79it/s]

{'loss': 0.2826, 'grad_norm': 4.919629096984863, 'learning_rate': 1.7449664429530201e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72412/75000 [1:10:32<02:26, 17.63it/s]

{'loss': 0.2568, 'grad_norm': 1.4882521629333496, 'learning_rate': 1.7382550335570472e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72424/75000 [1:10:32<02:11, 19.55it/s]

{'loss': 0.3306, 'grad_norm': 0.8517472743988037, 'learning_rate': 1.7315436241610738e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72433/75000 [1:10:33<02:19, 18.45it/s]

{'loss': 0.3157, 'grad_norm': 5.306608200073242, 'learning_rate': 1.7248322147651007e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72443/75000 [1:10:33<02:20, 18.21it/s]

{'loss': 0.2392, 'grad_norm': 4.809325218200684, 'learning_rate': 1.7181208053691275e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72452/75000 [1:10:34<02:19, 18.31it/s]

{'loss': 0.4081, 'grad_norm': 8.037684440612793, 'learning_rate': 1.7114093959731546e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72461/75000 [1:10:34<02:10, 19.48it/s]

{'loss': 0.2856, 'grad_norm': 6.82735013961792, 'learning_rate': 1.7046979865771812e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72473/75000 [1:10:35<02:09, 19.49it/s]

{'loss': 0.2613, 'grad_norm': 4.664314270019531, 'learning_rate': 1.697986577181208e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72483/75000 [1:10:35<02:12, 19.06it/s]

{'loss': 0.224, 'grad_norm': 3.325568675994873, 'learning_rate': 1.6912751677852347e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72492/75000 [1:10:36<02:09, 19.37it/s]

{'loss': 0.3175, 'grad_norm': 2.1972219944000244, 'learning_rate': 1.684563758389262e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72500/75000 [1:10:36<02:08, 19.50it/s]

{'loss': 0.2286, 'grad_norm': 1.119147539138794, 'learning_rate': 1.6778523489932886e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72511/75000 [1:10:38<03:03, 13.53it/s]

{'loss': 0.2131, 'grad_norm': 5.340183258056641, 'learning_rate': 1.6711409395973155e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72522/75000 [1:10:38<02:25, 17.00it/s]

{'loss': 0.2204, 'grad_norm': 0.9092201590538025, 'learning_rate': 1.6644295302013421e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72533/75000 [1:10:39<02:27, 16.68it/s]

{'loss': 0.2776, 'grad_norm': 1.8455407619476318, 'learning_rate': 1.6577181208053692e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72542/75000 [1:10:39<02:12, 18.60it/s]

{'loss': 0.3724, 'grad_norm': 2.872345209121704, 'learning_rate': 1.651006711409396e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72551/75000 [1:10:40<02:12, 18.44it/s]

{'loss': 0.1873, 'grad_norm': 0.6982057094573975, 'learning_rate': 1.644295302013423e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72564/75000 [1:10:40<02:00, 20.27it/s]

{'loss': 0.2042, 'grad_norm': 2.216188907623291, 'learning_rate': 1.6375838926174496e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72572/75000 [1:10:41<02:01, 19.92it/s]

{'loss': 0.2979, 'grad_norm': 0.8015717267990112, 'learning_rate': 1.6308724832214766e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72582/75000 [1:10:41<02:08, 18.75it/s]

{'loss': 0.3042, 'grad_norm': 7.088464260101318, 'learning_rate': 1.6241610738255035e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72592/75000 [1:10:42<02:04, 19.39it/s]

{'loss': 0.332, 'grad_norm': 3.53230619430542, 'learning_rate': 1.6174496644295301e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72604/75000 [1:10:42<01:57, 20.40it/s]

{'loss': 0.3413, 'grad_norm': 2.305183172225952, 'learning_rate': 1.6107382550335574e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72612/75000 [1:10:43<02:09, 18.47it/s]

{'loss': 0.4808, 'grad_norm': 3.8292479515075684, 'learning_rate': 1.604026845637584e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72622/75000 [1:10:43<02:05, 18.88it/s]

{'loss': 0.2784, 'grad_norm': 6.119190692901611, 'learning_rate': 1.5973154362416109e-06, 'epoch': 2.9}


                                                       
 97%|█████████▋| 72632/75000 [1:10:44<02:00, 19.70it/s]

{'loss': 0.3879, 'grad_norm': 1.891926884651184, 'learning_rate': 1.5906040268456375e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72643/75000 [1:10:44<02:01, 19.36it/s]

{'loss': 0.3499, 'grad_norm': 1.014298915863037, 'learning_rate': 1.5838926174496646e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72653/75000 [1:10:45<02:05, 18.72it/s]

{'loss': 0.2832, 'grad_norm': 19.808622360229492, 'learning_rate': 1.5771812080536914e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72662/75000 [1:10:46<02:04, 18.72it/s]

{'loss': 0.3355, 'grad_norm': 14.204097747802734, 'learning_rate': 1.5704697986577183e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72674/75000 [1:10:46<01:57, 19.82it/s]

{'loss': 0.3086, 'grad_norm': 4.074960231781006, 'learning_rate': 1.563758389261745e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72683/75000 [1:10:47<01:57, 19.75it/s]

{'loss': 0.4791, 'grad_norm': 1.7623471021652222, 'learning_rate': 1.5570469798657718e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72692/75000 [1:10:47<02:07, 18.06it/s]

{'loss': 0.3373, 'grad_norm': 3.10921311378479, 'learning_rate': 1.5503355704697989e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72703/75000 [1:10:48<02:00, 19.02it/s]

{'loss': 0.4214, 'grad_norm': 5.343735694885254, 'learning_rate': 1.5436241610738255e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72714/75000 [1:10:48<01:52, 20.26it/s]

{'loss': 0.3768, 'grad_norm': 5.700018882751465, 'learning_rate': 1.5369127516778526e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72723/75000 [1:10:49<01:55, 19.80it/s]

{'loss': 0.3463, 'grad_norm': 4.765997409820557, 'learning_rate': 1.5302013422818792e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72734/75000 [1:10:49<01:56, 19.40it/s]

{'loss': 0.3437, 'grad_norm': 5.89150857925415, 'learning_rate': 1.5234899328859063e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72742/75000 [1:10:50<02:03, 18.24it/s]

{'loss': 0.2756, 'grad_norm': 2.1091790199279785, 'learning_rate': 1.516778523489933e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72753/75000 [1:10:50<02:01, 18.43it/s]

{'loss': 0.3744, 'grad_norm': 10.37922477722168, 'learning_rate': 1.51006711409396e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72762/75000 [1:10:51<01:57, 19.12it/s]

{'loss': 0.1903, 'grad_norm': 4.470376968383789, 'learning_rate': 1.5033557046979866e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72771/75000 [1:10:51<02:05, 17.78it/s]

{'loss': 0.1734, 'grad_norm': 6.650960922241211, 'learning_rate': 1.4966442953020135e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72784/75000 [1:10:52<01:48, 20.36it/s]

{'loss': 0.1877, 'grad_norm': 2.460123062133789, 'learning_rate': 1.4899328859060403e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72791/75000 [1:10:52<02:03, 17.94it/s]

{'loss': 0.3498, 'grad_norm': 1.9195564985275269, 'learning_rate': 1.4832214765100672e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72803/75000 [1:10:53<02:00, 18.25it/s]

{'loss': 0.2848, 'grad_norm': 5.049772262573242, 'learning_rate': 1.476510067114094e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72812/75000 [1:10:53<01:55, 18.95it/s]

{'loss': 0.3057, 'grad_norm': 4.396749496459961, 'learning_rate': 1.4697986577181209e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72824/75000 [1:10:54<01:51, 19.54it/s]

{'loss': 0.1961, 'grad_norm': 1.4076762199401855, 'learning_rate': 1.4630872483221477e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72833/75000 [1:10:54<01:50, 19.55it/s]

{'loss': 0.2184, 'grad_norm': 6.260054111480713, 'learning_rate': 1.4563758389261746e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72841/75000 [1:10:55<01:52, 19.12it/s]

{'loss': 0.2647, 'grad_norm': 1.7747489213943481, 'learning_rate': 1.4496644295302014e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72853/75000 [1:10:55<01:46, 20.23it/s]

{'loss': 0.161, 'grad_norm': 0.47138991951942444, 'learning_rate': 1.4429530201342283e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72862/75000 [1:10:56<01:50, 19.27it/s]

{'loss': 0.303, 'grad_norm': 21.741432189941406, 'learning_rate': 1.4362416107382551e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72873/75000 [1:10:56<01:49, 19.44it/s]

{'loss': 0.2486, 'grad_norm': 6.875839710235596, 'learning_rate': 1.429530201342282e-06, 'epoch': 2.91}


                                                       
 97%|█████████▋| 72882/75000 [1:10:57<01:52, 18.82it/s]

{'loss': 0.2445, 'grad_norm': 20.76885223388672, 'learning_rate': 1.4228187919463088e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72891/75000 [1:10:57<01:47, 19.54it/s]

{'loss': 0.373, 'grad_norm': 4.790870189666748, 'learning_rate': 1.4161073825503357e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72904/75000 [1:10:58<01:50, 19.03it/s]

{'loss': 0.3491, 'grad_norm': 1.998701572418213, 'learning_rate': 1.4093959731543626e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72911/75000 [1:10:59<01:51, 18.78it/s]

{'loss': 0.2159, 'grad_norm': 9.252227783203125, 'learning_rate': 1.4026845637583894e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72924/75000 [1:10:59<01:41, 20.41it/s]

{'loss': 0.3721, 'grad_norm': 2.1699068546295166, 'learning_rate': 1.395973154362416e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72933/75000 [1:11:00<01:45, 19.68it/s]

{'loss': 0.1603, 'grad_norm': 3.9826769828796387, 'learning_rate': 1.3892617449664431e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72943/75000 [1:11:00<01:46, 19.34it/s]

{'loss': 0.2743, 'grad_norm': 2.549513816833496, 'learning_rate': 1.3825503355704698e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72952/75000 [1:11:01<01:57, 17.37it/s]

{'loss': 0.2853, 'grad_norm': 15.611611366271973, 'learning_rate': 1.3758389261744968e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72961/75000 [1:11:01<01:48, 18.78it/s]

{'loss': 0.2257, 'grad_norm': 1.864893913269043, 'learning_rate': 1.3691275167785235e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72972/75000 [1:11:02<01:46, 19.00it/s]

{'loss': 0.3412, 'grad_norm': 0.4187816381454468, 'learning_rate': 1.3624161073825505e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72982/75000 [1:11:02<01:54, 17.66it/s]

{'loss': 0.4122, 'grad_norm': 3.5597751140594482, 'learning_rate': 1.3557046979865772e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 72991/75000 [1:11:03<01:56, 17.25it/s]

{'loss': 0.2801, 'grad_norm': 2.035902976989746, 'learning_rate': 1.3489932885906042e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73000/75000 [1:11:03<01:47, 18.69it/s]

{'loss': 0.3239, 'grad_norm': 10.996273040771484, 'learning_rate': 1.3422818791946309e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73014/75000 [1:11:05<02:07, 15.62it/s]

{'loss': 0.2552, 'grad_norm': 3.9158806800842285, 'learning_rate': 1.335570469798658e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73022/75000 [1:11:05<02:07, 15.47it/s]

{'loss': 0.2783, 'grad_norm': 3.968656539916992, 'learning_rate': 1.3288590604026846e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73033/75000 [1:11:06<01:46, 18.51it/s]

{'loss': 0.2059, 'grad_norm': 1.0996222496032715, 'learning_rate': 1.3221476510067114e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73042/75000 [1:11:06<01:39, 19.74it/s]

{'loss': 0.4374, 'grad_norm': 5.832247734069824, 'learning_rate': 1.3154362416107383e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73053/75000 [1:11:07<01:46, 18.23it/s]

{'loss': 0.2749, 'grad_norm': 1.886702537536621, 'learning_rate': 1.3087248322147651e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73064/75000 [1:11:07<01:38, 19.73it/s]

{'loss': 0.3218, 'grad_norm': 1.6162614822387695, 'learning_rate': 1.302013422818792e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73072/75000 [1:11:08<01:39, 19.32it/s]

{'loss': 0.2731, 'grad_norm': 0.7830122113227844, 'learning_rate': 1.2953020134228188e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73083/75000 [1:11:08<01:41, 18.84it/s]

{'loss': 0.4031, 'grad_norm': 2.938275098800659, 'learning_rate': 1.2885906040268457e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73093/75000 [1:11:09<01:40, 19.03it/s]

{'loss': 0.3348, 'grad_norm': 2.0428807735443115, 'learning_rate': 1.2818791946308726e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73103/75000 [1:11:09<01:41, 18.60it/s]

{'loss': 0.2876, 'grad_norm': 5.978455543518066, 'learning_rate': 1.2751677852348994e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73112/75000 [1:11:10<01:39, 18.97it/s]

{'loss': 0.3768, 'grad_norm': 4.099696636199951, 'learning_rate': 1.2684563758389263e-06, 'epoch': 2.92}


                                                       
 97%|█████████▋| 73123/75000 [1:11:10<01:37, 19.33it/s]

{'loss': 0.3958, 'grad_norm': 2.8232996463775635, 'learning_rate': 1.2617449664429531e-06, 'epoch': 2.92}


                                                       
 98%|█████████▊| 73131/75000 [1:11:11<01:35, 19.56it/s]

{'loss': 0.3425, 'grad_norm': 4.687546253204346, 'learning_rate': 1.25503355704698e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73143/75000 [1:11:11<01:35, 19.39it/s]

{'loss': 0.2254, 'grad_norm': 4.222133159637451, 'learning_rate': 1.2483221476510068e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73153/75000 [1:11:12<01:37, 19.01it/s]

{'loss': 0.3049, 'grad_norm': 3.8646481037139893, 'learning_rate': 1.2416107382550337e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73162/75000 [1:11:12<01:35, 19.20it/s]

{'loss': 0.2269, 'grad_norm': 18.282136917114258, 'learning_rate': 1.2348993288590605e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73171/75000 [1:11:13<01:35, 19.16it/s]

{'loss': 0.2222, 'grad_norm': 2.3918187618255615, 'learning_rate': 1.2281879194630874e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73184/75000 [1:11:14<01:30, 20.04it/s]

{'loss': 0.3398, 'grad_norm': 3.7291910648345947, 'learning_rate': 1.221476510067114e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73192/75000 [1:11:14<01:32, 19.54it/s]

{'loss': 0.2613, 'grad_norm': 2.069620132446289, 'learning_rate': 1.214765100671141e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73202/75000 [1:11:14<01:34, 19.03it/s]

{'loss': 0.2608, 'grad_norm': 2.316862106323242, 'learning_rate': 1.2080536912751677e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73212/75000 [1:11:15<01:41, 17.67it/s]

{'loss': 0.2972, 'grad_norm': 1.2043613195419312, 'learning_rate': 1.2013422818791948e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73222/75000 [1:11:16<01:34, 18.86it/s]

{'loss': 0.292, 'grad_norm': 0.8875536322593689, 'learning_rate': 1.1946308724832214e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73234/75000 [1:11:16<01:27, 20.20it/s]

{'loss': 0.1847, 'grad_norm': 4.052999019622803, 'learning_rate': 1.1879194630872485e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73244/75000 [1:11:17<01:26, 20.20it/s]

{'loss': 0.3493, 'grad_norm': 1.9419147968292236, 'learning_rate': 1.1812080536912751e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73253/75000 [1:11:17<01:31, 19.14it/s]

{'loss': 0.2552, 'grad_norm': 3.9880363941192627, 'learning_rate': 1.1744966442953022e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73263/75000 [1:11:18<01:31, 19.07it/s]

{'loss': 0.4347, 'grad_norm': 2.246324062347412, 'learning_rate': 1.1677852348993288e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73273/75000 [1:11:18<01:35, 18.15it/s]

{'loss': 0.278, 'grad_norm': 4.387347221374512, 'learning_rate': 1.1610738255033557e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73281/75000 [1:11:19<01:31, 18.76it/s]

{'loss': 0.3495, 'grad_norm': 2.7258729934692383, 'learning_rate': 1.1543624161073825e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73292/75000 [1:11:19<01:26, 19.80it/s]

{'loss': 0.313, 'grad_norm': 1.2735986709594727, 'learning_rate': 1.1476510067114094e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73301/75000 [1:11:20<01:27, 19.44it/s]

{'loss': 0.3282, 'grad_norm': 0.7354384660720825, 'learning_rate': 1.1409395973154363e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73312/75000 [1:11:20<01:27, 19.19it/s]

{'loss': 0.303, 'grad_norm': 4.162538528442383, 'learning_rate': 1.134228187919463e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73322/75000 [1:11:21<01:27, 19.10it/s]

{'loss': 0.2741, 'grad_norm': 7.434755802154541, 'learning_rate': 1.12751677852349e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73334/75000 [1:11:21<01:24, 19.61it/s]

{'loss': 0.3112, 'grad_norm': 6.483675956726074, 'learning_rate': 1.1208053691275168e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73342/75000 [1:11:22<01:24, 19.58it/s]

{'loss': 0.2573, 'grad_norm': 10.577089309692383, 'learning_rate': 1.1140939597315437e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73354/75000 [1:11:22<01:24, 19.45it/s]

{'loss': 0.4258, 'grad_norm': 6.696925163269043, 'learning_rate': 1.1073825503355705e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73364/75000 [1:11:23<01:22, 19.82it/s]

{'loss': 0.4522, 'grad_norm': 2.116286039352417, 'learning_rate': 1.1006711409395974e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73373/75000 [1:11:23<01:22, 19.72it/s]

{'loss': 0.2253, 'grad_norm': 3.6018638610839844, 'learning_rate': 1.0939597315436242e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 73381/75000 [1:11:24<01:22, 19.52it/s]

{'loss': 0.3279, 'grad_norm': 4.676231384277344, 'learning_rate': 1.087248322147651e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73393/75000 [1:11:24<01:23, 19.35it/s]

{'loss': 0.4076, 'grad_norm': 15.126261711120605, 'learning_rate': 1.080536912751678e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73404/75000 [1:11:25<01:19, 20.17it/s]

{'loss': 0.3531, 'grad_norm': 2.2020442485809326, 'learning_rate': 1.0738255033557048e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73413/75000 [1:11:25<01:21, 19.44it/s]

{'loss': 0.3763, 'grad_norm': 3.626272678375244, 'learning_rate': 1.0671140939597316e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73424/75000 [1:11:26<01:20, 19.51it/s]

{'loss': 0.3272, 'grad_norm': 1.564147710800171, 'learning_rate': 1.0604026845637583e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73432/75000 [1:11:26<01:25, 18.26it/s]

{'loss': 0.2568, 'grad_norm': 20.822616577148438, 'learning_rate': 1.0536912751677853e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73442/75000 [1:11:27<01:22, 18.88it/s]

{'loss': 0.2943, 'grad_norm': 3.1677095890045166, 'learning_rate': 1.046979865771812e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73453/75000 [1:11:28<01:23, 18.57it/s]

{'loss': 0.2827, 'grad_norm': 1.82504141330719, 'learning_rate': 1.040268456375839e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73462/75000 [1:11:28<01:20, 19.17it/s]

{'loss': 0.3172, 'grad_norm': 3.376007318496704, 'learning_rate': 1.0335570469798657e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73473/75000 [1:11:29<01:23, 18.21it/s]

{'loss': 0.1765, 'grad_norm': 4.202696800231934, 'learning_rate': 1.0268456375838928e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73483/75000 [1:11:29<01:16, 19.74it/s]

{'loss': 0.3453, 'grad_norm': 4.931803226470947, 'learning_rate': 1.0201342281879194e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73493/75000 [1:11:30<01:25, 17.70it/s]

{'loss': 0.2727, 'grad_norm': 1.1505134105682373, 'learning_rate': 1.0134228187919465e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73500/75000 [1:11:30<01:19, 18.93it/s]

{'loss': 0.318, 'grad_norm': 2.8090951442718506, 'learning_rate': 1.006711409395973e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73513/75000 [1:11:31<01:39, 14.94it/s]

{'loss': 0.3096, 'grad_norm': 1.1183501482009888, 'learning_rate': 1.0000000000000002e-06, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73523/75000 [1:11:32<01:35, 15.43it/s]

{'loss': 0.2913, 'grad_norm': 1.2260971069335938, 'learning_rate': 9.932885906040268e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73532/75000 [1:11:32<01:22, 17.83it/s]

{'loss': 0.3761, 'grad_norm': 1.0504149198532104, 'learning_rate': 9.865771812080537e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73542/75000 [1:11:33<01:23, 17.48it/s]

{'loss': 0.314, 'grad_norm': 9.254612922668457, 'learning_rate': 9.798657718120805e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73552/75000 [1:11:34<01:23, 17.43it/s]

{'loss': 0.2485, 'grad_norm': 3.9379868507385254, 'learning_rate': 9.731543624161074e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73561/75000 [1:11:34<01:17, 18.61it/s]

{'loss': 0.2096, 'grad_norm': 2.625023365020752, 'learning_rate': 9.664429530201342e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73572/75000 [1:11:35<01:21, 17.60it/s]

{'loss': 0.2979, 'grad_norm': 3.488941192626953, 'learning_rate': 9.59731543624161e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73583/75000 [1:11:35<01:21, 17.43it/s]

{'loss': 0.1502, 'grad_norm': 2.0230939388275146, 'learning_rate': 9.530201342281879e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73592/75000 [1:11:36<01:16, 18.30it/s]

{'loss': 0.1779, 'grad_norm': 1.8092503547668457, 'learning_rate': 9.463087248322148e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73602/75000 [1:11:36<01:20, 17.34it/s]

{'loss': 0.3109, 'grad_norm': 1.2222899198532104, 'learning_rate': 9.395973154362416e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73614/75000 [1:11:37<01:10, 19.58it/s]

{'loss': 0.2565, 'grad_norm': 0.5342735648155212, 'learning_rate': 9.328859060402685e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73624/75000 [1:11:37<01:11, 19.18it/s]

{'loss': 0.2918, 'grad_norm': 1.757837176322937, 'learning_rate': 9.261744966442952e-07, 'epoch': 2.94}


                                                       
 98%|█████████▊| 73631/75000 [1:11:38<01:11, 19.22it/s]

{'loss': 0.2383, 'grad_norm': 4.076611042022705, 'learning_rate': 9.194630872483222e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73644/75000 [1:11:38<01:07, 19.98it/s]

{'loss': 0.3565, 'grad_norm': 8.013849258422852, 'learning_rate': 9.127516778523489e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73653/75000 [1:11:39<01:07, 20.03it/s]

{'loss': 0.2586, 'grad_norm': 5.291793346405029, 'learning_rate': 9.060402684563759e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73662/75000 [1:11:39<01:08, 19.61it/s]

{'loss': 0.2652, 'grad_norm': 4.420945644378662, 'learning_rate': 8.993288590604026e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73671/75000 [1:11:40<01:17, 17.18it/s]

{'loss': 0.2698, 'grad_norm': 3.737797260284424, 'learning_rate': 8.926174496644296e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73681/75000 [1:11:40<01:11, 18.33it/s]

{'loss': 0.3288, 'grad_norm': 4.243398666381836, 'learning_rate': 8.859060402684564e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73694/75000 [1:11:41<01:06, 19.62it/s]

{'loss': 0.2432, 'grad_norm': 11.552287101745605, 'learning_rate': 8.791946308724833e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73704/75000 [1:11:42<01:05, 19.88it/s]

{'loss': 0.366, 'grad_norm': 5.461968421936035, 'learning_rate': 8.724832214765101e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73711/75000 [1:11:42<01:07, 19.06it/s]

{'loss': 0.3422, 'grad_norm': 6.049304962158203, 'learning_rate': 8.657718120805369e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73723/75000 [1:11:43<01:07, 19.00it/s]

{'loss': 0.361, 'grad_norm': 2.471221685409546, 'learning_rate': 8.590604026845638e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73734/75000 [1:11:43<01:04, 19.77it/s]

{'loss': 0.2953, 'grad_norm': 3.5932223796844482, 'learning_rate': 8.523489932885906e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73743/75000 [1:11:44<01:07, 18.70it/s]

{'loss': 0.3515, 'grad_norm': 3.416182279586792, 'learning_rate': 8.456375838926174e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73753/75000 [1:11:44<01:03, 19.60it/s]

{'loss': 0.2073, 'grad_norm': 3.285918712615967, 'learning_rate': 8.389261744966443e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73761/75000 [1:11:44<01:04, 19.15it/s]

{'loss': 0.3269, 'grad_norm': 0.9927441477775574, 'learning_rate': 8.322147651006711e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73771/75000 [1:11:45<01:08, 17.86it/s]

{'loss': 0.4206, 'grad_norm': 3.042994737625122, 'learning_rate': 8.25503355704698e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73784/75000 [1:11:46<01:01, 19.68it/s]

{'loss': 0.2621, 'grad_norm': 6.4398393630981445, 'learning_rate': 8.187919463087248e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73794/75000 [1:11:46<00:59, 20.21it/s]

{'loss': 0.391, 'grad_norm': 1.9190709590911865, 'learning_rate': 8.120805369127517e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73803/75000 [1:11:47<01:00, 19.85it/s]

{'loss': 0.3296, 'grad_norm': 1.4045605659484863, 'learning_rate': 8.053691275167787e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73812/75000 [1:11:47<01:02, 19.06it/s]

{'loss': 0.308, 'grad_norm': 0.763160228729248, 'learning_rate': 7.986577181208054e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73822/75000 [1:11:48<01:01, 19.02it/s]

{'loss': 0.2338, 'grad_norm': 5.130430698394775, 'learning_rate': 7.919463087248323e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73831/75000 [1:11:48<00:58, 19.87it/s]

{'loss': 0.3567, 'grad_norm': 7.180255889892578, 'learning_rate': 7.852348993288591e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73843/75000 [1:11:49<00:58, 19.93it/s]

{'loss': 0.351, 'grad_norm': 5.829549312591553, 'learning_rate': 7.785234899328859e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73852/75000 [1:11:49<00:58, 19.58it/s]

{'loss': 0.3348, 'grad_norm': 5.058860778808594, 'learning_rate': 7.718120805369127e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73863/75000 [1:11:50<00:58, 19.30it/s]

{'loss': 0.2348, 'grad_norm': 1.9051377773284912, 'learning_rate': 7.651006711409396e-07, 'epoch': 2.95}


                                                       
 98%|█████████▊| 73872/75000 [1:11:50<00:57, 19.69it/s]

{'loss': 0.4365, 'grad_norm': 8.399506568908691, 'learning_rate': 7.583892617449665e-07, 'epoch': 2.95}


                                                       
 99%|█████████▊| 73883/75000 [1:11:51<00:57, 19.28it/s]

{'loss': 0.327, 'grad_norm': 2.8175666332244873, 'learning_rate': 7.516778523489933e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73894/75000 [1:11:51<00:53, 20.61it/s]

{'loss': 0.3399, 'grad_norm': 5.574191570281982, 'learning_rate': 7.449664429530202e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73902/75000 [1:11:52<00:59, 18.51it/s]

{'loss': 0.3542, 'grad_norm': 7.343011379241943, 'learning_rate': 7.38255033557047e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73914/75000 [1:11:52<00:55, 19.47it/s]

{'loss': 0.2908, 'grad_norm': 2.5842843055725098, 'learning_rate': 7.315436241610739e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73922/75000 [1:11:53<00:55, 19.58it/s]

{'loss': 0.3249, 'grad_norm': 0.479449063539505, 'learning_rate': 7.248322147651007e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73932/75000 [1:11:53<00:54, 19.44it/s]

{'loss': 0.3073, 'grad_norm': 2.957550525665283, 'learning_rate': 7.181208053691276e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73943/75000 [1:11:54<00:58, 18.15it/s]

{'loss': 0.4553, 'grad_norm': 7.946589469909668, 'learning_rate': 7.114093959731544e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73952/75000 [1:11:54<00:56, 18.60it/s]

{'loss': 0.2373, 'grad_norm': 10.69526481628418, 'learning_rate': 7.046979865771813e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73961/75000 [1:11:55<00:58, 17.81it/s]

{'loss': 0.2768, 'grad_norm': 2.2643351554870605, 'learning_rate': 6.97986577181208e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73973/75000 [1:11:55<00:51, 20.04it/s]

{'loss': 0.2894, 'grad_norm': 1.0870513916015625, 'learning_rate': 6.912751677852349e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73982/75000 [1:11:56<00:50, 19.98it/s]

{'loss': 0.3085, 'grad_norm': 2.826976776123047, 'learning_rate': 6.845637583892617e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 73992/75000 [1:11:56<00:52, 19.07it/s]

{'loss': 0.2823, 'grad_norm': 6.00839900970459, 'learning_rate': 6.778523489932886e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 74000/75000 [1:11:57<00:50, 19.76it/s]

{'loss': 0.2556, 'grad_norm': 3.399681329727173, 'learning_rate': 6.711409395973154e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 74013/75000 [1:11:58<01:10, 13.95it/s]

{'loss': 0.2535, 'grad_norm': 3.7324612140655518, 'learning_rate': 6.644295302013423e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 74023/75000 [1:11:59<01:00, 16.26it/s]

{'loss': 0.4006, 'grad_norm': 5.685683727264404, 'learning_rate': 6.577181208053691e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 74032/75000 [1:11:59<00:53, 18.05it/s]

{'loss': 0.3266, 'grad_norm': 1.306639313697815, 'learning_rate': 6.51006711409396e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 74043/75000 [1:12:00<00:51, 18.58it/s]

{'loss': 0.3615, 'grad_norm': 1.5071829557418823, 'learning_rate': 6.442953020134228e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 74052/75000 [1:12:00<00:49, 18.99it/s]

{'loss': 0.3171, 'grad_norm': 9.78322982788086, 'learning_rate': 6.375838926174497e-07, 'epoch': 2.96}


                                                       
 99%|█████████▊| 74062/75000 [1:12:01<00:54, 17.12it/s]

{'loss': 0.2081, 'grad_norm': 1.496410846710205, 'learning_rate': 6.308724832214766e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 74072/75000 [1:12:01<00:50, 18.28it/s]

{'loss': 0.3078, 'grad_norm': 3.3177497386932373, 'learning_rate': 6.241610738255034e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 74082/75000 [1:12:02<00:48, 18.81it/s]

{'loss': 0.2918, 'grad_norm': 3.30898118019104, 'learning_rate': 6.174496644295303e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 74092/75000 [1:12:02<00:47, 19.17it/s]

{'loss': 0.3456, 'grad_norm': 5.989598274230957, 'learning_rate': 6.10738255033557e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 74102/75000 [1:12:03<00:46, 19.24it/s]

{'loss': 0.3065, 'grad_norm': 4.902098655700684, 'learning_rate': 6.040268456375839e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 74113/75000 [1:12:04<00:45, 19.31it/s]

{'loss': 0.3335, 'grad_norm': 5.1120829582214355, 'learning_rate': 5.973154362416107e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 74123/75000 [1:12:04<00:47, 18.63it/s]

{'loss': 0.2817, 'grad_norm': 3.022468090057373, 'learning_rate': 5.906040268456376e-07, 'epoch': 2.96}


                                                       
 99%|█████████▉| 74132/75000 [1:12:05<00:49, 17.56it/s]

{'loss': 0.3103, 'grad_norm': 7.621555805206299, 'learning_rate': 5.838926174496644e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74141/75000 [1:12:05<00:45, 18.81it/s]

{'loss': 0.2538, 'grad_norm': 1.5043059587478638, 'learning_rate': 5.771812080536913e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74152/75000 [1:12:06<00:47, 18.01it/s]

{'loss': 0.3442, 'grad_norm': 2.813140630722046, 'learning_rate': 5.704697986577181e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74164/75000 [1:12:06<00:41, 20.26it/s]

{'loss': 0.2174, 'grad_norm': 5.169559001922607, 'learning_rate': 5.63758389261745e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74173/75000 [1:12:07<00:41, 19.86it/s]

{'loss': 0.2999, 'grad_norm': 5.117087364196777, 'learning_rate': 5.570469798657718e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74182/75000 [1:12:07<00:43, 18.85it/s]

{'loss': 0.3457, 'grad_norm': 4.582279682159424, 'learning_rate': 5.503355704697987e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74192/75000 [1:12:08<00:45, 17.74it/s]

{'loss': 0.2697, 'grad_norm': 1.626019835472107, 'learning_rate': 5.436241610738255e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74202/75000 [1:12:08<00:41, 19.19it/s]

{'loss': 0.2944, 'grad_norm': 2.6311419010162354, 'learning_rate': 5.369127516778524e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74211/75000 [1:12:09<00:49, 15.93it/s]

{'loss': 0.3833, 'grad_norm': 3.7136175632476807, 'learning_rate': 5.302013422818791e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74224/75000 [1:12:10<00:41, 18.88it/s]

{'loss': 0.3209, 'grad_norm': 0.43877503275871277, 'learning_rate': 5.23489932885906e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74234/75000 [1:12:10<00:38, 19.70it/s]

{'loss': 0.1579, 'grad_norm': 1.3948150873184204, 'learning_rate': 5.167785234899328e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74242/75000 [1:12:10<00:39, 19.09it/s]

{'loss': 0.2932, 'grad_norm': 4.577138900756836, 'learning_rate': 5.100671140939597e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74252/75000 [1:12:11<00:40, 18.52it/s]

{'loss': 0.2535, 'grad_norm': 3.2412877082824707, 'learning_rate': 5.033557046979866e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74262/75000 [1:12:11<00:37, 19.45it/s]

{'loss': 0.2544, 'grad_norm': 4.323253154754639, 'learning_rate': 4.966442953020134e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74272/75000 [1:12:12<00:38, 18.80it/s]

{'loss': 0.38, 'grad_norm': 10.250954627990723, 'learning_rate': 4.899328859060403e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74281/75000 [1:12:13<00:41, 17.42it/s]

{'loss': 0.2406, 'grad_norm': 3.869786262512207, 'learning_rate': 4.832214765100671e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74294/75000 [1:12:13<00:36, 19.60it/s]

{'loss': 0.3609, 'grad_norm': 12.730359077453613, 'learning_rate': 4.7651006711409396e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74302/75000 [1:12:14<00:37, 18.75it/s]

{'loss': 0.3597, 'grad_norm': 1.5954339504241943, 'learning_rate': 4.697986577181208e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74313/75000 [1:12:14<00:38, 17.95it/s]

{'loss': 0.4364, 'grad_norm': 1.9023159742355347, 'learning_rate': 4.630872483221476e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74321/75000 [1:12:15<00:35, 19.15it/s]

{'loss': 0.343, 'grad_norm': 4.501904487609863, 'learning_rate': 4.5637583892617447e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74333/75000 [1:12:15<00:34, 19.18it/s]

{'loss': 0.4949, 'grad_norm': 6.931332588195801, 'learning_rate': 4.496644295302013e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74343/75000 [1:12:16<00:34, 18.99it/s]

{'loss': 0.3402, 'grad_norm': 4.277373313903809, 'learning_rate': 4.429530201342282e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74354/75000 [1:12:16<00:32, 19.98it/s]

{'loss': 0.4118, 'grad_norm': 8.866230010986328, 'learning_rate': 4.3624161073825503e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74363/75000 [1:12:17<00:33, 19.24it/s]

{'loss': 0.2389, 'grad_norm': 2.4751579761505127, 'learning_rate': 4.295302013422819e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74374/75000 [1:12:17<00:32, 19.47it/s]

{'loss': 0.2545, 'grad_norm': 1.086326003074646, 'learning_rate': 4.228187919463087e-07, 'epoch': 2.97}


                                                       
 99%|█████████▉| 74382/75000 [1:12:18<00:32, 19.13it/s]

{'loss': 0.2838, 'grad_norm': 5.265031337738037, 'learning_rate': 4.1610738255033553e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74394/75000 [1:12:18<00:31, 19.30it/s]

{'loss': 0.2233, 'grad_norm': 5.396859169006348, 'learning_rate': 4.093959731543624e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74402/75000 [1:12:19<00:31, 19.08it/s]

{'loss': 0.3007, 'grad_norm': 2.2516393661499023, 'learning_rate': 4.0268456375838935e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74412/75000 [1:12:19<00:31, 18.73it/s]

{'loss': 0.2619, 'grad_norm': 3.73057222366333, 'learning_rate': 3.9597315436241615e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74424/75000 [1:12:20<00:29, 19.36it/s]

{'loss': 0.3339, 'grad_norm': 1.2163323163986206, 'learning_rate': 3.8926174496644295e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74431/75000 [1:12:20<00:31, 18.18it/s]

{'loss': 0.3816, 'grad_norm': 0.6025750637054443, 'learning_rate': 3.825503355704698e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74442/75000 [1:12:21<00:31, 17.86it/s]

{'loss': 0.2372, 'grad_norm': 1.0065279006958008, 'learning_rate': 3.7583892617449665e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74453/75000 [1:12:22<00:29, 18.36it/s]

{'loss': 0.2499, 'grad_norm': 1.4972527027130127, 'learning_rate': 3.691275167785235e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74462/75000 [1:12:22<00:30, 17.56it/s]

{'loss': 0.2506, 'grad_norm': 5.228364944458008, 'learning_rate': 3.6241610738255036e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74474/75000 [1:12:23<00:26, 19.90it/s]

{'loss': 0.315, 'grad_norm': 5.034282684326172, 'learning_rate': 3.557046979865772e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74482/75000 [1:12:23<00:30, 17.23it/s]

{'loss': 0.2228, 'grad_norm': 3.698774576187134, 'learning_rate': 3.48993288590604e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74494/75000 [1:12:24<00:25, 19.77it/s]

{'loss': 0.2529, 'grad_norm': 3.1976842880249023, 'learning_rate': 3.4228187919463087e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74500/75000 [1:12:24<00:25, 19.39it/s]

{'loss': 0.1969, 'grad_norm': 10.773015022277832, 'learning_rate': 3.355704697986577e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74512/75000 [1:12:26<00:36, 13.22it/s]

{'loss': 0.2289, 'grad_norm': 4.344738006591797, 'learning_rate': 3.2885906040268457e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74523/75000 [1:12:26<00:28, 16.87it/s]

{'loss': 0.2243, 'grad_norm': 2.4851162433624268, 'learning_rate': 3.221476510067114e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74534/75000 [1:12:27<00:24, 18.95it/s]

{'loss': 0.2738, 'grad_norm': 0.9777911305427551, 'learning_rate': 3.154362416107383e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74540/75000 [1:12:27<00:23, 19.99it/s]

{'loss': 0.315, 'grad_norm': 6.998344421386719, 'learning_rate': 3.0872483221476513e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74553/75000 [1:12:28<00:24, 18.43it/s]

{'loss': 0.4996, 'grad_norm': 1.2436248064041138, 'learning_rate': 3.0201342281879193e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74564/75000 [1:12:28<00:21, 20.05it/s]

{'loss': 0.2048, 'grad_norm': 0.7085216641426086, 'learning_rate': 2.953020134228188e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74573/75000 [1:12:29<00:21, 19.55it/s]

{'loss': 0.2509, 'grad_norm': 2.1985251903533936, 'learning_rate': 2.8859060402684564e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74581/75000 [1:12:29<00:22, 18.85it/s]

{'loss': 0.3155, 'grad_norm': 2.509775161743164, 'learning_rate': 2.818791946308725e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74594/75000 [1:12:30<00:19, 20.37it/s]

{'loss': 0.2755, 'grad_norm': 7.519730091094971, 'learning_rate': 2.7516778523489934e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74602/75000 [1:12:30<00:20, 19.73it/s]

{'loss': 0.4527, 'grad_norm': 6.806537628173828, 'learning_rate': 2.684563758389262e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74613/75000 [1:12:31<00:20, 18.71it/s]

{'loss': 0.232, 'grad_norm': 2.457597494125366, 'learning_rate': 2.61744966442953e-07, 'epoch': 2.98}


                                                       
 99%|█████████▉| 74622/75000 [1:12:31<00:19, 19.28it/s]

{'loss': 0.295, 'grad_norm': 6.1075592041015625, 'learning_rate': 2.5503355704697985e-07, 'epoch': 2.98}


                                                       
100%|█████████▉| 74633/75000 [1:12:32<00:19, 19.31it/s]

{'loss': 0.2965, 'grad_norm': 6.724436283111572, 'learning_rate': 2.483221476510067e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74644/75000 [1:12:32<00:17, 20.22it/s]

{'loss': 0.2315, 'grad_norm': 1.9572404623031616, 'learning_rate': 2.4161073825503355e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74652/75000 [1:12:33<00:18, 18.66it/s]

{'loss': 0.1798, 'grad_norm': 4.465916633605957, 'learning_rate': 2.348993288590604e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74663/75000 [1:12:33<00:17, 19.06it/s]

{'loss': 0.2214, 'grad_norm': 2.751582384109497, 'learning_rate': 2.2818791946308723e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74673/75000 [1:12:34<00:17, 19.20it/s]

{'loss': 0.2648, 'grad_norm': 8.950849533081055, 'learning_rate': 2.214765100671141e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74682/75000 [1:12:34<00:17, 18.64it/s]

{'loss': 0.2112, 'grad_norm': 1.2714101076126099, 'learning_rate': 2.1476510067114094e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74694/75000 [1:12:35<00:14, 20.66it/s]

{'loss': 0.3203, 'grad_norm': 4.541447162628174, 'learning_rate': 2.0805369127516777e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74703/75000 [1:12:35<00:14, 19.91it/s]

{'loss': 0.2808, 'grad_norm': 4.726925849914551, 'learning_rate': 2.0134228187919467e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74712/75000 [1:12:36<00:14, 19.74it/s]

{'loss': 0.274, 'grad_norm': 5.729242324829102, 'learning_rate': 1.9463087248322147e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74723/75000 [1:12:36<00:14, 18.50it/s]

{'loss': 0.3306, 'grad_norm': 7.181975841522217, 'learning_rate': 1.8791946308724833e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74732/75000 [1:12:37<00:13, 19.15it/s]

{'loss': 0.3923, 'grad_norm': 4.903260707855225, 'learning_rate': 1.8120805369127518e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74744/75000 [1:12:38<00:12, 20.11it/s]

{'loss': 0.2284, 'grad_norm': 3.845578193664551, 'learning_rate': 1.74496644295302e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74753/75000 [1:12:38<00:12, 20.11it/s]

{'loss': 0.4183, 'grad_norm': 2.5618226528167725, 'learning_rate': 1.6778523489932886e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74763/75000 [1:12:39<00:12, 18.70it/s]

{'loss': 0.2543, 'grad_norm': 6.2857160568237305, 'learning_rate': 1.610738255033557e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74773/75000 [1:12:39<00:12, 18.64it/s]

{'loss': 0.2013, 'grad_norm': 0.9721109867095947, 'learning_rate': 1.5436241610738257e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74783/75000 [1:12:40<00:11, 18.29it/s]

{'loss': 0.3326, 'grad_norm': 2.2187023162841797, 'learning_rate': 1.476510067114094e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74793/75000 [1:12:40<00:11, 18.40it/s]

{'loss': 0.1436, 'grad_norm': 2.7232754230499268, 'learning_rate': 1.4093959731543624e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74801/75000 [1:12:41<00:10, 18.93it/s]

{'loss': 0.334, 'grad_norm': 4.789008140563965, 'learning_rate': 1.342281879194631e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74813/75000 [1:12:41<00:09, 19.18it/s]

{'loss': 0.3026, 'grad_norm': 3.583294153213501, 'learning_rate': 1.2751677852348992e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74824/75000 [1:12:42<00:08, 19.63it/s]

{'loss': 0.297, 'grad_norm': 1.6510982513427734, 'learning_rate': 1.2080536912751678e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74832/75000 [1:12:42<00:08, 19.29it/s]

{'loss': 0.3526, 'grad_norm': 4.645780563354492, 'learning_rate': 1.1409395973154362e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74844/75000 [1:12:43<00:07, 19.97it/s]

{'loss': 0.3369, 'grad_norm': 3.815861225128174, 'learning_rate': 1.0738255033557047e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74852/75000 [1:12:43<00:07, 19.28it/s]

{'loss': 0.305, 'grad_norm': 8.206404685974121, 'learning_rate': 1.0067114093959734e-07, 'epoch': 2.99}


                                                       
100%|█████████▉| 74863/75000 [1:12:44<00:06, 19.73it/s]

{'loss': 0.2648, 'grad_norm': 1.9230061769485474, 'learning_rate': 9.395973154362416e-08, 'epoch': 2.99}


                                                       
100%|█████████▉| 74873/75000 [1:12:44<00:06, 18.75it/s]

{'loss': 0.3247, 'grad_norm': 25.80202865600586, 'learning_rate': 8.7248322147651e-08, 'epoch': 2.99}


                                                       
100%|█████████▉| 74882/75000 [1:12:45<00:06, 19.39it/s]

{'loss': 0.2579, 'grad_norm': 2.632056951522827, 'learning_rate': 8.053691275167786e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74893/75000 [1:12:45<00:05, 19.03it/s]

{'loss': 0.3771, 'grad_norm': 2.8268086910247803, 'learning_rate': 7.38255033557047e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74902/75000 [1:12:46<00:04, 19.79it/s]

{'loss': 0.2685, 'grad_norm': 2.928577184677124, 'learning_rate': 6.711409395973155e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74913/75000 [1:12:46<00:04, 19.81it/s]

{'loss': 0.4135, 'grad_norm': 1.9287526607513428, 'learning_rate': 6.040268456375839e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74923/75000 [1:12:47<00:04, 18.37it/s]

{'loss': 0.3581, 'grad_norm': 1.8351519107818604, 'learning_rate': 5.3691275167785235e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74932/75000 [1:12:47<00:03, 19.52it/s]

{'loss': 0.1801, 'grad_norm': 1.8484008312225342, 'learning_rate': 4.697986577181208e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74944/75000 [1:12:48<00:02, 20.20it/s]

{'loss': 0.3769, 'grad_norm': 6.470461845397949, 'learning_rate': 4.026845637583893e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74952/75000 [1:12:48<00:02, 20.10it/s]

{'loss': 0.241, 'grad_norm': 4.2762322425842285, 'learning_rate': 3.3557046979865774e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74960/75000 [1:12:49<00:01, 20.17it/s]

{'loss': 0.2249, 'grad_norm': 5.503958225250244, 'learning_rate': 2.6845637583892618e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74974/75000 [1:12:49<00:01, 20.38it/s]

{'loss': 0.3607, 'grad_norm': 3.901423215866089, 'learning_rate': 2.0134228187919464e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74983/75000 [1:12:50<00:00, 20.24it/s]

{'loss': 0.287, 'grad_norm': 2.0506160259246826, 'learning_rate': 1.3422818791946309e-08, 'epoch': 3.0}


                                                       
100%|█████████▉| 74992/75000 [1:12:50<00:00, 19.38it/s]

{'loss': 0.3462, 'grad_norm': 2.9673728942871094, 'learning_rate': 6.711409395973154e-09, 'epoch': 3.0}


                                                       
100%|██████████| 75000/75000 [1:12:51<00:00, 13.50it/s]

{'loss': 0.2709, 'grad_norm': 1.4303580522537231, 'learning_rate': 0.0, 'epoch': 3.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 75000/75000 [1:13:07<00:00, 13.50it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

{'eval_loss': 0.32544657588005066, 'eval_runtime': 66.3771, 'eval_samples_per_second': 1506.543, 'eval_steps_per_second': 23.547, 'epoch': 3.0}
{'train_runtime': 4438.7153, 'train_samples_per_second': 270.348, 'train_steps_per_second': 16.897, 'train_loss': 0.3269071028693517, 'epoch': 3.0}





TrainOutput(global_step=75000, training_loss=0.3269071028693517, metrics={'train_runtime': 4438.7153, 'train_samples_per_second': 270.348, 'train_steps_per_second': 16.897, 'total_flos': 4301987328000000.0, 'train_loss': 0.3269071028693517, 'epoch': 3.0})

In [None]:
import torch
# Report whether PyTorch can see a CUDA device. Should print True when a GPU is available.
cuda_available = torch.cuda.is_available()
print(cuda_available)

# Only query the device name when CUDA is usable: get_device_name(0) raises on
# CPU-only machines, which would otherwise abort a Restart & Run All.
if cuda_available:
    print(torch.cuda.get_device_name(0))  # GPU name, e.g. 'NVIDIA GeForce RTX 2060'
else:
    print("No CUDA device detected.")


In [14]:

# Evaluate the trainer's model on test_dataset; the result is a dict of
# metrics (loss, runtime, throughput, epoch) that is echoed below.
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print(eval_results)


100%|██████████| 1563/1563 [00:57<00:00, 27.36it/s]

{'eval_loss': 0.32544657588005066, 'eval_runtime': 57.1679, 'eval_samples_per_second': 1749.233, 'eval_steps_per_second': 27.341, 'epoch': 3.0}





In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Run the model over test_dataset and take the highest-scoring class per sample.
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=-1)

# Precision / recall / F1 are averaged over classes, weighted by class support;
# the fourth return value (support) is not needed here.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, preds, average='weighted'
)
accuracy = accuracy_score(test_labels, preds)

# Print the summary, one metric per line.
for report_line in (
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
):
    print(report_line)


100%|██████████| 1563/1563 [01:00<00:00, 26.03it/s]


Accuracy: 0.90118
Precision: 0.8832453423991299
Recall: 0.90118
F1-score: 0.8861208356918333


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Human-readable class names, indexed by class id. This mirrors the
# id -> label mapping used by the inference cells of this notebook
# (0: Negatif, 1: Netral, 2: Positif).
# NOTE(review): confirm this matches the label-encoding step used for training.
class_names = ['Negatif', 'Netral', 'Positif']
class_ids = list(range(len(class_names)))

# Passing labels= pins the matrix to one row/column per class in a fixed order,
# even if some class never appears in test_labels or preds.
cm = confusion_matrix(test_labels, preds, labels=class_ids)

# Explicit figure/axes so the labels and title attach to this plot only.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion matrix')
plt.show()


In [11]:

# Write the model and its tokenizer into the same directory so both can be
# reloaded together with from_pretrained().
save_dir = "./saved_model"
model.save_pretrained(save_dir)

# Last expression of the cell: the tuple of written tokenizer file paths is displayed.
tokenizer.save_pretrained(save_dir)


('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Reload the tokenizer and the classifier from the directory written by the
# save cell.
saved_model_dir = "./saved_model"
loaded_tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)


In [13]:

import torch  # local import keeps this cell runnable on its own

# Tokenize one sample sentence and return PyTorch tensors.
inputs = loaded_tokenizer("Contoh teks untuk prediksi", return_tensors="pt")

# Inference only: disable autograd so no computation graph is recorded
# for the forward pass.
with torch.no_grad():
    outputs = loaded_model(**inputs)

# Index of the highest logit = predicted class id.
predictions = outputs.logits.argmax(-1)
print(predictions)


tensor([2])


In [17]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Restore tokenizer and model from the directory written by the save cell.
model_path = "./saved_model"
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move the model to the selected device. Left as the final expression, so the
# cell echoes the module's repr (the layer listing) as its output.
loaded_model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-1

In [18]:

# Classify a single review sentence with the reloaded model.
text = "Layanan pelanggan sangat buruk, saya tidak akan membeli lagi."

# NOTE(review): the raw string is tokenized as-is. If the model was trained on
# text cleaned by `preprocess_text` (defined earlier in this notebook), the same
# cleaning may need to be applied here - confirm against the training cells.

# Tokenize into PyTorch tensors, truncating anything longer than 128 tokens.
inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

# The input tensors must live on the same device as the model.
inputs = {key: val.to(device) for key, val in inputs.items()}

# Inference only: disable autograd so no computation graph is built for the
# forward pass.
with torch.no_grad():
    outputs = loaded_model(**inputs)
predictions = torch.argmax(outputs.logits, dim=-1)

# Class id -> sentiment name; also reused by the batch-prediction cell below.
label_dict = {0: 'Negatif', 1: 'Netral', 2: 'Positif'}
predicted_label = label_dict[predictions.item()]

print(f"Teks: {text}")
print(f"Prediksi Sentimen: {predicted_label}")


Teks: Layanan pelanggan sangat buruk, saya tidak akan membeli lagi.
Prediksi Sentimen: Negatif


In [19]:

# Classify several review sentences in one batch with the reloaded model.
texts = [
    "Layanan pelanggan sangat buruk, saya tidak akan membeli lagi.",
    "Produk ini sangat bagus, saya sangat puas dengan pembeliannya.",
    "Pengiriman cukup lama tapi barangnya sesuai deskripsi."
]

# padding=True pads the shorter sentences so the batch forms one rectangular
# tensor; sequences longer than 128 tokens are truncated.
inputs = loaded_tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)

# The input tensors must live on the same device as the model.
inputs = {key: val.to(device) for key, val in inputs.items()}

# Inference only: disable autograd so no computation graph is built for the
# forward pass.
with torch.no_grad():
    outputs = loaded_model(**inputs)
predictions = torch.argmax(outputs.logits, dim=-1)

# One .tolist() call converts all class ids at once instead of calling .item()
# per element. `label_dict` comes from the single-text prediction cell.
predicted_labels = [label_dict[class_id] for class_id in predictions.tolist()]

for text, label in zip(texts, predicted_labels):
    print(f"Teks: {text}")
    print(f"Prediksi Sentimen: {label}")


Teks: Layanan pelanggan sangat buruk, saya tidak akan membeli lagi.
Prediksi Sentimen: Negatif
Teks: Produk ini sangat bagus, saya sangat puas dengan pembeliannya.
Prediksi Sentimen: Positif
Teks: Pengiriman cukup lama tapi barangnya sesuai deskripsi.
Prediksi Sentimen: Positif
