In [8]:
!python -m pip install huggingface_hub
!python -m pip install datasets




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


This notebook was run with Python 3.11.9 on an RTX 4070 GPU.

In [9]:
import time

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sentence_transformers import models, SentenceTransformer
from setfit import SetFitModel, SetFitTrainer, Trainer, TrainingArguments
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from transformers import BertConfig, AutoTokenizer, AutoModel, BertModel

Get the dataset from the Hugging Face Hub.

In [10]:
# Label names for each language, keyed by language.
labels = {
    'java': [
        'summary', 'Ownership', 'Expand', 'usage',
        'Pointer', 'deprecation', 'rational',
    ],
    'python': [
        'Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary',
    ],
    'pharo': [
        'Keyimplementationpoints', 'Example', 'Responsibilities',
        'Classreferences', 'Intent', 'Keymessages', 'Collaborators',
    ],
}
# The language list follows the insertion order of `labels`: java, python, pharo.
langs = list(labels)

# Fetch every split from the Hub and have the columns come back as torch tensors.
ds = load_dataset('NLBSE/nlbse25-code-comment-classification').with_format("torch")
ds

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

First, the configuration of the Java model:

In [11]:
# Configuration values for the Java model's BERT encoder. The dict is passed
# as `config_args` when the transformer module is built.
conf_java = {
    "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
    "architectures": ["BertModel"],
    # Dropout settings
    "attention_probs_dropout_prob": 0.1,
    "classifier_dropout": None,
    "gradient_checkpointing": False,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    # Layer sizes
    "hidden_size": 384,
    "initializer_range": 0.02,
    "intermediate_size": 1536,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "bert",
    "num_attention_heads": 12,
    # A single encoder layer is requested here.
    "num_hidden_layers": 1,
    "pad_token_id": 0,
    "position_embedding_type": "absolute",
    "transformers_version": "4.45.2",
    "type_vocab_size": 2,
    "use_cache": True,
    "vocab_size": 30522,
}

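The transformer is loaded from the Hugging Face Hub with the `conf_java` configuration arguments applied. Because `conf_java` sets `num_hidden_layers` to 1, only the first encoder layer of the pretrained checkpoint is kept (see the warning about unused weights in the output below). The not-yet-fine-tuned model is then constructed from the transformer and a pooling layer using the SentenceTransformers (SBERT) library.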

In [12]:
# Build the transformer module from the Hub checkpoint, with the values in
# `conf_java` applied to its configuration and inputs capped at 256 tokens.
transformer = models.Transformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    max_seq_length=256,
    config_args=conf_java,
)

# Pool token embeddings with mean pooling only; every other pooling mode is off.
pooling = models.Pooling(
    transformer.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_sqrt_len_tokens=False,
    pooling_mode_weightedmean_tokens=False,
    pooling_mode_lasttoken=False,
    include_prompt=True,
)

# Chain the two modules and write the result to disk; the training cell
# loads the model back from this directory.
modelMy = SentenceTransformer(modules=[transformer, pooling])
modelMy.save_pretrained('./models/aight-l6-java')

Some weights of the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 were not used when initializing BertModel: ['encoder.layer.1.attention.output.LayerNorm.bias', 'encoder.layer.1.attention.output.LayerNorm.weight', 'encoder.layer.1.attention.output.dense.bias', 'encoder.layer.1.attention.output.dense.weight', 'encoder.layer.1.attention.self.key.bias', 'encoder.layer.1.attention.self.key.weight', 'encoder.layer.1.attention.self.query.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.layer.1.attention.self.value.bias', 'encoder.layer.1.attention.self.value.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.attention.output.LayerNorm.bias', 'encoder.layer.2.attention.output.LayerNorm.weight', 'encoder.layer.2.attention.output.dense.bias', 'encod

The model saved above is loaded as a SetFitModel, and a trainer is set up using the Java training split. The trainer then fine-tunes the model, and the trained model is saved.

In [13]:
# Load the saved sentence-transformer body as a multi-output SetFit model.
# Fall back to the CPU when no CUDA device is available so the cell does not
# crash on a machine without a GPU.
model = SetFitModel.from_pretrained(
    "./models/aight-l6-java",
    multi_target_strategy="multi-output",
    device="cuda" if torch.cuda.is_available() else "cpu",
    normalize_embeddings=False,
)

# `SetFitTrainer` is the deprecated pre-1.0 wrapper; `Trainer` with
# `TrainingArguments` is its replacement.
# num_iterations=20 is set explicitly to keep the pair generation of the
# original run: 7614 training rows * 20 iterations * 2 = 304560 unique pairs,
# which is the count reported in the training log.
training_args = TrainingArguments(
    batch_size=32,
    num_epochs=5,
    num_iterations=20,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['java_train'],
    # The dataset's "combo" column is used as the input text and "labels" as the target.
    column_mapping={"combo": "text", "labels": "label"},
)
trainer.train()

# Persist the fine-tuned model (body and classification head).
trainer.model.save_pretrained('./models/aight-l6-java-trained')

model_head.pkl not found in D:\Documents\!Colleg\CS440-Final\Finalized\models\aight-l6-java, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Applying column mapping to the training dataset
***** Running training *****
  Num unique pairs = 304560
  Batch size = 32
  Num epochs = 5
                                                    
 13%|█▎        | 6278/47590 [1:23:43<22:13, 30.97it/s]

{'embedding_loss': 0.3037, 'grad_norm': 0.5002561211585999, 'learning_rate': 4.2025635637739025e-09, 'epoch': 0.0}


                                                      
 13%|█▎        | 6278/47590 [1:23:44<22:13, 30.97it/s]

{'embedding_loss': 0.2901, 'grad_norm': 0.6191548705101013, 'learning_rate': 2.1012817818869512e-07, 'epoch': 0.01}


                                                      
 13%|█▎        | 6278/47590 [1:23:45<22:13, 30.97it/s]

{'embedding_loss': 0.2996, 'grad_norm': 0.5127046704292297, 'learning_rate': 4.2025635637739023e-07, 'epoch': 0.01}


                                                      
 13%|█▎        | 6278/47590 [1:23:47<22:13, 30.97it/s]

{'embedding_loss': 0.2933, 'grad_norm': 0.37886038422584534, 'learning_rate': 6.303845345660854e-07, 'epoch': 0.02}


                                                      
 13%|█▎        | 6278/47590 [1:23:48<22:13, 30.97it/s]

{'embedding_loss': 0.2783, 'grad_norm': 0.33544307947158813, 'learning_rate': 8.405127127547805e-07, 'epoch': 0.02}


                                                      
 13%|█▎        | 6278/47590 [1:23:50<22:13, 30.97it/s]

{'embedding_loss': 0.278, 'grad_norm': 0.4353111982345581, 'learning_rate': 1.0506408909434757e-06, 'epoch': 0.03}


                                                      
 13%|█▎        | 6278/47590 [1:23:52<22:13, 30.97it/s]

{'embedding_loss': 0.2819, 'grad_norm': 0.6062488555908203, 'learning_rate': 1.2607690691321708e-06, 'epoch': 0.03}


                                                      
 13%|█▎        | 6278/47590 [1:23:54<22:13, 30.97it/s]

{'embedding_loss': 0.2741, 'grad_norm': 0.6235789060592651, 'learning_rate': 1.4708972473208657e-06, 'epoch': 0.04}


                                                      
 13%|█▎        | 6278/47590 [1:23:56<22:13, 30.97it/s]

{'embedding_loss': 0.2735, 'grad_norm': 0.4548020660877228, 'learning_rate': 1.681025425509561e-06, 'epoch': 0.04}


                                                      
 13%|█▎        | 6278/47590 [1:23:57<22:13, 30.97it/s]


{'embedding_loss': 0.2592, 'grad_norm': 0.29167044162750244, 'learning_rate': 1.8911536036982562e-06, 'epoch': 0.05}


                                                      
 13%|█▎        | 6278/47590 [1:23:59<22:13, 30.97it/s]

{'embedding_loss': 0.2561, 'grad_norm': 0.623943030834198, 'learning_rate': 2.1012817818869515e-06, 'epoch': 0.05}



[A
                                                      
 13%|█▎        | 6278/47590 [1:24:02<22:13, 30.97it/s]

{'embedding_loss': 0.2559, 'grad_norm': 0.29195091128349304, 'learning_rate': 2.311409960075646e-06, 'epoch': 0.06}


                                                      
 13%|█▎        | 6278/47590 [1:24:03<22:13, 30.97it/s]

{'embedding_loss': 0.2561, 'grad_norm': 0.6475197672843933, 'learning_rate': 2.5215381382643416e-06, 'epoch': 0.06}


                                                      
 13%|█▎        | 6278/47590 [1:24:05<22:13, 30.97it/s]

{'embedding_loss': 0.2521, 'grad_norm': 0.5326696634292603, 'learning_rate': 2.7316663164530367e-06, 'epoch': 0.07}


                                                      
 13%|█▎        | 6278/47590 [1:24:07<22:13, 30.97it/s]

{'embedding_loss': 0.2438, 'grad_norm': 0.7715303301811218, 'learning_rate': 2.9417944946417313e-06, 'epoch': 0.07}


                                                      
 13%|█▎        | 6278/47590 [1:24:10<22:13, 30.97it/s]

{'embedding_loss': 0.2406, 'grad_norm': 0.41885364055633545, 'learning_rate': 3.1519226728304268e-06, 'epoch': 0.08}


                                                      
 13%|█▎        | 6278/47590 [1:24:11<22:13, 30.97it/s]

{'embedding_loss': 0.2396, 'grad_norm': 0.3721943497657776, 'learning_rate': 3.362050851019122e-06, 'epoch': 0.08}


                                                      
 13%|█▎        | 6278/47590 [1:24:12<22:13, 30.97it/s]

{'embedding_loss': 0.2382, 'grad_norm': 0.36906877160072327, 'learning_rate': 3.572179029207817e-06, 'epoch': 0.09}


                                                      
 13%|█▎        | 6278/47590 [1:24:14<22:13, 30.97it/s]

{'embedding_loss': 0.2335, 'grad_norm': 0.40643587708473206, 'learning_rate': 3.7823072073965124e-06, 'epoch': 0.09}


                                                      
 13%|█▎        | 6278/47590 [1:24:16<22:13, 30.97it/s]

{'embedding_loss': 0.232, 'grad_norm': 0.603917121887207, 'learning_rate': 3.9924353855852075e-06, 'epoch': 0.1}


                                                      
 13%|█▎        | 6278/47590 [1:24:18<22:13, 30.97it/s]

{'embedding_loss': 0.2268, 'grad_norm': 0.5950212478637695, 'learning_rate': 4.202563563773903e-06, 'epoch': 0.11}


                                                      
 13%|█▎        | 6278/47590 [1:24:20<22:13, 30.97it/s]

{'embedding_loss': 0.2251, 'grad_norm': 0.5197702646255493, 'learning_rate': 4.412691741962598e-06, 'epoch': 0.11}


                                                      
 13%|█▎        | 6278/47590 [1:24:21<22:13, 30.97it/s]

{'embedding_loss': 0.2212, 'grad_norm': 0.5553271770477295, 'learning_rate': 4.622819920151292e-06, 'epoch': 0.12}


                                                      
 13%|█▎        | 6278/47590 [1:24:23<22:13, 30.97it/s]


{'embedding_loss': 0.2133, 'grad_norm': 0.42745086550712585, 'learning_rate': 4.832948098339988e-06, 'epoch': 0.12}


                                                      A
 13%|█▎        | 6278/47590 [1:24:24<22:13, 30.97it/s]


{'embedding_loss': 0.2122, 'grad_norm': 0.534555971622467, 'learning_rate': 5.043076276528683e-06, 'epoch': 0.13}


                                                      A
 13%|█▎        | 6278/47590 [1:24:26<22:13, 30.97it/s]

{'embedding_loss': 0.2017, 'grad_norm': 0.5527015328407288, 'learning_rate': 5.253204454717378e-06, 'epoch': 0.13}


                                                      
 13%|█▎        | 6278/47590 [1:24:27<22:13, 30.97it/s]

{'embedding_loss': 0.1936, 'grad_norm': 0.42286959290504456, 'learning_rate': 5.463332632906073e-06, 'epoch': 0.14}


                                                      
 13%|█▎        | 6278/47590 [1:24:29<22:13, 30.97it/s]

{'embedding_loss': 0.1857, 'grad_norm': 0.5148914456367493, 'learning_rate': 5.673460811094769e-06, 'epoch': 0.14}


                                                      
 13%|█▎        | 6278/47590 [1:24:30<22:13, 30.97it/s]

{'embedding_loss': 0.1817, 'grad_norm': 0.45169535279273987, 'learning_rate': 5.883588989283463e-06, 'epoch': 0.15}


                                                      
 13%|█▎        | 6278/47590 [1:24:32<22:13, 30.97it/s]

{'embedding_loss': 0.1868, 'grad_norm': 0.5714364051818848, 'learning_rate': 6.093717167472158e-06, 'epoch': 0.15}


                                                      
 13%|█▎        | 6278/47590 [1:24:34<22:13, 30.97it/s]

{'embedding_loss': 0.1753, 'grad_norm': 0.4850311577320099, 'learning_rate': 6.3038453456608536e-06, 'epoch': 0.16}


                                                      
 13%|█▎        | 6278/47590 [1:24:37<22:13, 30.97it/s]

{'embedding_loss': 0.174, 'grad_norm': 0.46280279755592346, 'learning_rate': 6.513973523849548e-06, 'epoch': 0.16}


                                                      
 13%|█▎        | 6278/47590 [1:24:38<22:13, 30.97it/s]

{'embedding_loss': 0.1685, 'grad_norm': 0.5710284113883972, 'learning_rate': 6.724101702038244e-06, 'epoch': 0.17}


                                                      
 13%|█▎        | 6278/47590 [1:24:40<22:13, 30.97it/s]

{'embedding_loss': 0.1624, 'grad_norm': 0.5174773931503296, 'learning_rate': 6.934229880226939e-06, 'epoch': 0.17}


                                                      
 13%|█▎        | 6278/47590 [1:24:41<22:13, 30.97it/s]


{'embedding_loss': 0.1651, 'grad_norm': 0.4868793189525604, 'learning_rate': 7.144358058415634e-06, 'epoch': 0.18}


                                                      A
 13%|█▎        | 6278/47590 [1:24:43<22:13, 30.97it/s]

{'embedding_loss': 0.1639, 'grad_norm': 0.4228067994117737, 'learning_rate': 7.354486236604329e-06, 'epoch': 0.18}


                                                      
 13%|█▎        | 6278/47590 [1:24:45<22:13, 30.97it/s]

{'embedding_loss': 0.1559, 'grad_norm': 0.6124609112739563, 'learning_rate': 7.564614414793025e-06, 'epoch': 0.19}


                                                      
 13%|█▎        | 6278/47590 [1:24:47<22:13, 30.97it/s]

{'embedding_loss': 0.1512, 'grad_norm': 0.709162175655365, 'learning_rate': 7.77474259298172e-06, 'epoch': 0.19}


                                                      
 13%|█▎        | 6278/47590 [1:24:48<22:13, 30.97it/s]

{'embedding_loss': 0.1553, 'grad_norm': 0.6623627543449402, 'learning_rate': 7.984870771170415e-06, 'epoch': 0.2}


                                                      
 13%|█▎        | 6278/47590 [1:24:49<22:13, 30.97it/s]

{'embedding_loss': 0.1591, 'grad_norm': 0.39696452021598816, 'learning_rate': 8.19499894935911e-06, 'epoch': 0.2}


                                                      
 13%|█▎        | 6278/47590 [1:24:51<22:13, 30.97it/s]

{'embedding_loss': 0.1568, 'grad_norm': 0.7934786081314087, 'learning_rate': 8.405127127547806e-06, 'epoch': 0.21}


                                                      
 13%|█▎        | 6278/47590 [1:24:53<22:13, 30.97it/s]

{'embedding_loss': 0.1502, 'grad_norm': 0.7327717542648315, 'learning_rate': 8.615255305736499e-06, 'epoch': 0.22}


                                                      
 13%|█▎        | 6278/47590 [1:24:55<22:13, 30.97it/s]

{'embedding_loss': 0.144, 'grad_norm': 0.6587015986442566, 'learning_rate': 8.825383483925195e-06, 'epoch': 0.22}


                                                      
 13%|█▎        | 6278/47590 [1:24:56<22:13, 30.97it/s]

{'embedding_loss': 0.1395, 'grad_norm': 0.6440337896347046, 'learning_rate': 9.03551166211389e-06, 'epoch': 0.23}


                                                      
 13%|█▎        | 6278/47590 [1:24:57<22:13, 30.97it/s]

{'embedding_loss': 0.1396, 'grad_norm': 0.5510646104812622, 'learning_rate': 9.245639840302584e-06, 'epoch': 0.23}


                                                      
 13%|█▎        | 6278/47590 [1:24:59<22:13, 30.97it/s]

{'embedding_loss': 0.1409, 'grad_norm': 0.5211188197135925, 'learning_rate': 9.45576801849128e-06, 'epoch': 0.24}


                                                      
 13%|█▎        | 6278/47590 [1:25:00<22:13, 30.97it/s]

{'embedding_loss': 0.1335, 'grad_norm': 0.34945619106292725, 'learning_rate': 9.665896196679975e-06, 'epoch': 0.24}


                                                      
 13%|█▎        | 6278/47590 [1:25:01<22:13, 30.97it/s]

{'embedding_loss': 0.1391, 'grad_norm': 0.5653074979782104, 'learning_rate': 9.87602437486867e-06, 'epoch': 0.25}


                                                      
 13%|█▎        | 6278/47590 [1:25:02<22:13, 30.97it/s]

{'embedding_loss': 0.1329, 'grad_norm': 0.40453317761421204, 'learning_rate': 1.0086152553057366e-05, 'epoch': 0.25}


                                                      
 13%|█▎        | 6278/47590 [1:25:04<22:13, 30.97it/s]

{'embedding_loss': 0.1302, 'grad_norm': 0.39863330125808716, 'learning_rate': 1.029628073124606e-05, 'epoch': 0.26}


                                                      
 13%|█▎        | 6278/47590 [1:25:06<22:13, 30.97it/s]

{'embedding_loss': 0.1312, 'grad_norm': 0.5545403957366943, 'learning_rate': 1.0506408909434756e-05, 'epoch': 0.26}


                                                      
 13%|█▎        | 6278/47590 [1:25:09<22:13, 30.97it/s]

{'embedding_loss': 0.1342, 'grad_norm': 0.6885448694229126, 'learning_rate': 1.071653708762345e-05, 'epoch': 0.27}


                                                      
 13%|█▎        | 6278/47590 [1:25:10<22:13, 30.97it/s]

{'embedding_loss': 0.1297, 'grad_norm': 0.5909608006477356, 'learning_rate': 1.0926665265812147e-05, 'epoch': 0.27}


                                                      
 13%|█▎        | 6278/47590 [1:25:11<22:13, 30.97it/s]

{'embedding_loss': 0.126, 'grad_norm': 0.6280509233474731, 'learning_rate': 1.1136793444000841e-05, 'epoch': 0.28}


                                                      
 13%|█▎        | 6278/47590 [1:25:13<22:13, 30.97it/s]

{'embedding_loss': 0.119, 'grad_norm': 0.5263268947601318, 'learning_rate': 1.1346921622189538e-05, 'epoch': 0.28}


                                                      
 13%|█▎        | 6278/47590 [1:25:14<22:13, 30.97it/s]

{'embedding_loss': 0.1142, 'grad_norm': 0.73863685131073, 'learning_rate': 1.1557049800378232e-05, 'epoch': 0.29}


                                                      
 13%|█▎        | 6278/47590 [1:25:16<22:13, 30.97it/s]


{'embedding_loss': 0.1197, 'grad_norm': 0.5581226348876953, 'learning_rate': 1.1767177978566925e-05, 'epoch': 0.29}


                                                      A
 13%|█▎        | 6278/47590 [1:25:17<22:13, 30.97it/s]

{'embedding_loss': 0.1154, 'grad_norm': 0.5242444276809692, 'learning_rate': 1.1977306156755622e-05, 'epoch': 0.3}


                                                      
 13%|█▎        | 6278/47590 [1:25:18<22:13, 30.97it/s]

{'embedding_loss': 0.1152, 'grad_norm': 0.4973171353340149, 'learning_rate': 1.2187434334944316e-05, 'epoch': 0.3}


                                                      
 13%|█▎        | 6278/47590 [1:25:19<22:13, 30.97it/s]

{'embedding_loss': 0.1131, 'grad_norm': 0.9339942932128906, 'learning_rate': 1.2397562513133013e-05, 'epoch': 0.31}


                                                      
 13%|█▎        | 6278/47590 [1:25:20<22:13, 30.97it/s]

{'embedding_loss': 0.1127, 'grad_norm': 0.9451705813407898, 'learning_rate': 1.2607690691321707e-05, 'epoch': 0.32}


                                                      
 13%|█▎        | 6278/47590 [1:25:22<22:13, 30.97it/s]

{'embedding_loss': 0.1055, 'grad_norm': 0.5672793984413147, 'learning_rate': 1.2817818869510403e-05, 'epoch': 0.32}


                                                      
 13%|█▎        | 6278/47590 [1:25:23<22:13, 30.97it/s]

{'embedding_loss': 0.1009, 'grad_norm': 0.5601515769958496, 'learning_rate': 1.3027947047699096e-05, 'epoch': 0.33}


                                                      
 13%|█▎        | 6278/47590 [1:25:25<22:13, 30.97it/s]

{'embedding_loss': 0.1092, 'grad_norm': 0.5413628220558167, 'learning_rate': 1.3238075225887793e-05, 'epoch': 0.33}


                                                      
 13%|█▎        | 6278/47590 [1:25:26<22:13, 30.97it/s]

{'embedding_loss': 0.1066, 'grad_norm': 0.8837852478027344, 'learning_rate': 1.3448203404076487e-05, 'epoch': 0.34}


                                                      
 13%|█▎        | 6278/47590 [1:25:27<22:13, 30.97it/s]

{'embedding_loss': 0.1092, 'grad_norm': 0.7934530973434448, 'learning_rate': 1.3658331582265184e-05, 'epoch': 0.34}


                                                      
 13%|█▎        | 6278/47590 [1:25:28<22:13, 30.97it/s]

{'embedding_loss': 0.0974, 'grad_norm': 0.672260046005249, 'learning_rate': 1.3868459760453878e-05, 'epoch': 0.35}


                                                      
 13%|█▎        | 6278/47590 [1:25:30<22:13, 30.97it/s]

{'embedding_loss': 0.1033, 'grad_norm': 1.036268711090088, 'learning_rate': 1.4078587938642575e-05, 'epoch': 0.35}


                                                      
 13%|█▎        | 6278/47590 [1:25:31<22:13, 30.97it/s]

{'embedding_loss': 0.0963, 'grad_norm': 1.040311336517334, 'learning_rate': 1.4288716116831268e-05, 'epoch': 0.36}


                                                      
 13%|█▎        | 6278/47590 [1:25:33<22:13, 30.97it/s]

{'embedding_loss': 0.0993, 'grad_norm': 1.4056055545806885, 'learning_rate': 1.4498844295019962e-05, 'epoch': 0.36}


                                                      
 13%|█▎        | 6278/47590 [1:25:34<22:13, 30.97it/s]

{'embedding_loss': 0.0953, 'grad_norm': 0.7247875332832336, 'learning_rate': 1.4708972473208659e-05, 'epoch': 0.37}


                                                      
 13%|█▎        | 6278/47590 [1:25:36<22:13, 30.97it/s]

{'embedding_loss': 0.085, 'grad_norm': 0.571494996547699, 'learning_rate': 1.4919100651397353e-05, 'epoch': 0.37}


                                                      
 13%|█▎        | 6278/47590 [1:25:39<22:13, 30.97it/s]

{'embedding_loss': 0.0874, 'grad_norm': 0.695378839969635, 'learning_rate': 1.512922882958605e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 6278/47590 [1:25:41<22:13, 30.97it/s]

{'embedding_loss': 0.0854, 'grad_norm': 0.42506352066993713, 'learning_rate': 1.5339357007774746e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 6278/47590 [1:25:42<22:13, 30.97it/s]


{'embedding_loss': 0.0869, 'grad_norm': 0.8085518479347229, 'learning_rate': 1.554948518596344e-05, 'epoch': 0.39}


                                                      A
 13%|█▎        | 6278/47590 [1:25:44<22:13, 30.97it/s]

{'embedding_loss': 0.0892, 'grad_norm': 1.071110725402832, 'learning_rate': 1.5759613364152132e-05, 'epoch': 0.39}


                                                      
 13%|█▎        | 6278/47590 [1:25:46<22:13, 30.97it/s]

{'embedding_loss': 0.0866, 'grad_norm': 0.721011757850647, 'learning_rate': 1.596974154234083e-05, 'epoch': 0.4}


                                                      
 13%|█▎        | 6278/47590 [1:25:47<22:13, 30.97it/s]

{'embedding_loss': 0.0809, 'grad_norm': 0.3903386890888214, 'learning_rate': 1.6179869720529524e-05, 'epoch': 0.4}


                                                      
 13%|█▎        | 6278/47590 [1:25:49<22:13, 30.97it/s]

{'embedding_loss': 0.0756, 'grad_norm': 0.5949761867523193, 'learning_rate': 1.638999789871822e-05, 'epoch': 0.41}


                                                      
 13%|█▎        | 6278/47590 [1:25:51<22:13, 30.97it/s]

{'embedding_loss': 0.0829, 'grad_norm': 0.45854178071022034, 'learning_rate': 1.6600126076906914e-05, 'epoch': 0.42}


                                                      
 13%|█▎        | 6278/47590 [1:25:52<22:13, 30.97it/s]

{'embedding_loss': 0.0811, 'grad_norm': 0.8817585110664368, 'learning_rate': 1.6810254255095612e-05, 'epoch': 0.42}


                                                      
 13%|█▎        | 6278/47590 [1:25:54<22:13, 30.97it/s]

{'embedding_loss': 0.0797, 'grad_norm': 1.0666781663894653, 'learning_rate': 1.7020382433284303e-05, 'epoch': 0.43}


                                                      
 13%|█▎        | 6278/47590 [1:25:56<22:13, 30.97it/s]

{'embedding_loss': 0.0779, 'grad_norm': 0.4489434063434601, 'learning_rate': 1.7230510611472998e-05, 'epoch': 0.43}


                                                      
 13%|█▎        | 6278/47590 [1:25:58<22:13, 30.97it/s]

{'embedding_loss': 0.0793, 'grad_norm': 0.875266432762146, 'learning_rate': 1.7440638789661696e-05, 'epoch': 0.44}


                                                      
 13%|█▎        | 6278/47590 [1:26:00<22:13, 30.97it/s]

{'embedding_loss': 0.0767, 'grad_norm': 1.0392909049987793, 'learning_rate': 1.765076696785039e-05, 'epoch': 0.44}


                                                      
 13%|█▎        | 6278/47590 [1:26:01<22:13, 30.97it/s]

{'embedding_loss': 0.0711, 'grad_norm': 0.8939388990402222, 'learning_rate': 1.7860895146039085e-05, 'epoch': 0.45}


                                                      
 13%|█▎        | 6278/47590 [1:26:03<22:13, 30.97it/s]

{'embedding_loss': 0.0795, 'grad_norm': 0.4772610664367676, 'learning_rate': 1.807102332422778e-05, 'epoch': 0.45}


                                                      
 13%|█▎        | 6278/47590 [1:26:05<22:13, 30.97it/s]

{'embedding_loss': 0.0719, 'grad_norm': 1.3950223922729492, 'learning_rate': 1.8281151502416478e-05, 'epoch': 0.46}


                                                      
 13%|█▎        | 6278/47590 [1:26:07<22:13, 30.97it/s]
  9%|▉         | 4407/47590 [02:24<20:42, 34.75it/s]

{'embedding_loss': 0.0706, 'grad_norm': 0.539482593536377, 'learning_rate': 1.849127968060517e-05, 'epoch': 0.46}


                                                      
 13%|█▎        | 6278/47590 [1:26:08<22:13, 30.97it/s]

{'embedding_loss': 0.0741, 'grad_norm': 0.7396148443222046, 'learning_rate': 1.8701407858793864e-05, 'epoch': 0.47}


                                                      
 13%|█▎        | 6278/47590 [1:26:11<22:13, 30.97it/s]

{'embedding_loss': 0.0711, 'grad_norm': 0.7519268989562988, 'learning_rate': 1.891153603698256e-05, 'epoch': 0.47}


                                                      
 13%|█▎        | 6278/47590 [1:26:15<22:13, 30.97it/s]

{'embedding_loss': 0.0735, 'grad_norm': 0.6769930124282837, 'learning_rate': 1.9121664215171256e-05, 'epoch': 0.48}


                                                      
 13%|█▎        | 6278/47590 [1:26:18<22:13, 30.97it/s]

{'embedding_loss': 0.0646, 'grad_norm': 0.7387675642967224, 'learning_rate': 1.933179239335995e-05, 'epoch': 0.48}


                                                      
 13%|█▎        | 6278/47590 [1:26:18<22:13, 30.97it/s]

{'embedding_loss': 0.0615, 'grad_norm': 0.5231037139892578, 'learning_rate': 1.9541920571548645e-05, 'epoch': 0.49}


                                                      
 13%|█▎        | 6278/47590 [1:26:19<22:13, 30.97it/s]

{'embedding_loss': 0.0708, 'grad_norm': 1.4668623208999634, 'learning_rate': 1.975204874973734e-05, 'epoch': 0.49}


                                                      
 13%|█▎        | 6278/47590 [1:26:20<22:13, 30.97it/s]


{'embedding_loss': 0.0664, 'grad_norm': 0.587925374507904, 'learning_rate': 1.9962176927926035e-05, 'epoch': 0.5}


                                                      A
 13%|█▎        | 6278/47590 [1:26:21<22:13, 30.97it/s]

{'embedding_loss': 0.0603, 'grad_norm': 1.2775300741195679, 'learning_rate': 1.9980854988209478e-05, 'epoch': 0.5}


                                                      
 13%|█▎        | 6278/47590 [1:26:22<22:13, 30.97it/s]

{'embedding_loss': 0.0646, 'grad_norm': 2.2588255405426025, 'learning_rate': 1.9957507412855177e-05, 'epoch': 0.51}


                                                      
 13%|█▎        | 6278/47590 [1:26:23<22:13, 30.97it/s]


{'embedding_loss': 0.0681, 'grad_norm': 0.8488881587982178, 'learning_rate': 1.9934159837500876e-05, 'epoch': 0.51}


                                                      A
 13%|█▎        | 6278/47590 [1:26:24<22:13, 30.97it/s]


{'embedding_loss': 0.0609, 'grad_norm': 2.0850675106048584, 'learning_rate': 1.9910812262146575e-05, 'epoch': 0.52}


                                                      A
 13%|█▎        | 6278/47590 [1:26:25<22:13, 30.97it/s]

{'embedding_loss': 0.0567, 'grad_norm': 0.8967294692993164, 'learning_rate': 1.9887464686792278e-05, 'epoch': 0.53}


                                                      
 13%|█▎        | 6278/47590 [1:26:27<22:13, 30.97it/s]

{'embedding_loss': 0.0528, 'grad_norm': 0.8809287548065186, 'learning_rate': 1.986411711143798e-05, 'epoch': 0.53}


                                                      
 13%|█▎        | 6278/47590 [1:26:28<22:13, 30.97it/s]

{'embedding_loss': 0.0616, 'grad_norm': 0.7314808964729309, 'learning_rate': 1.984076953608368e-05, 'epoch': 0.54}


                                                      
 13%|█▎        | 6278/47590 [1:26:29<22:13, 30.97it/s]

{'embedding_loss': 0.0592, 'grad_norm': 1.9224321842193604, 'learning_rate': 1.981742196072938e-05, 'epoch': 0.54}


                                                      
 13%|█▎        | 6278/47590 [1:26:30<22:13, 30.97it/s]


{'embedding_loss': 0.0615, 'grad_norm': 0.4305185079574585, 'learning_rate': 1.979407438537508e-05, 'epoch': 0.55}


                                                      A
 13%|█▎        | 6278/47590 [1:26:31<22:13, 30.97it/s]

{'embedding_loss': 0.0587, 'grad_norm': 0.5594715476036072, 'learning_rate': 1.977072681002078e-05, 'epoch': 0.55}


                                                      
 13%|█▎        | 6278/47590 [1:26:31<22:13, 30.97it/s]

{'embedding_loss': 0.053, 'grad_norm': 0.9020963907241821, 'learning_rate': 1.9747379234666483e-05, 'epoch': 0.56}


                                                      
 13%|█▎        | 6278/47590 [1:26:32<22:13, 30.97it/s]

{'embedding_loss': 0.0545, 'grad_norm': 0.6413028836250305, 'learning_rate': 1.9724031659312182e-05, 'epoch': 0.56}


                                                      
 13%|█▎        | 6278/47590 [1:26:33<22:13, 30.97it/s]


{'embedding_loss': 0.0533, 'grad_norm': 0.9810370802879333, 'learning_rate': 1.970068408395788e-05, 'epoch': 0.57}


                                                      A
 13%|█▎        | 6278/47590 [1:26:34<22:13, 30.97it/s]

{'embedding_loss': 0.0549, 'grad_norm': 0.677661657333374, 'learning_rate': 1.9677336508603583e-05, 'epoch': 0.57}


                                                      
 13%|█▎        | 6278/47590 [1:26:35<22:13, 30.97it/s]

{'embedding_loss': 0.0536, 'grad_norm': 1.3425509929656982, 'learning_rate': 1.9653988933249282e-05, 'epoch': 0.58}


                                                      
 13%|█▎        | 6278/47590 [1:26:37<22:13, 30.97it/s]

{'embedding_loss': 0.0562, 'grad_norm': 0.2873605787754059, 'learning_rate': 1.9630641357894985e-05, 'epoch': 0.58}


                                                      
 13%|█▎        | 6278/47590 [1:26:38<22:13, 30.97it/s]

{'embedding_loss': 0.048, 'grad_norm': 1.0415518283843994, 'learning_rate': 1.9607293782540684e-05, 'epoch': 0.59}


                                                      
 13%|█▎        | 6278/47590 [1:26:39<22:13, 30.97it/s]


{'embedding_loss': 0.0453, 'grad_norm': 1.6674891710281372, 'learning_rate': 1.9583946207186386e-05, 'epoch': 0.59}


                                                      A
 13%|█▎        | 6278/47590 [1:26:39<22:13, 30.97it/s]

{'embedding_loss': 0.0525, 'grad_norm': 0.9084897041320801, 'learning_rate': 1.9560598631832086e-05, 'epoch': 0.6}


                                                      
 13%|█▎        | 6278/47590 [1:26:40<22:13, 30.97it/s]

{'embedding_loss': 0.0563, 'grad_norm': 0.8908786177635193, 'learning_rate': 1.9537251056477785e-05, 'epoch': 0.6}


                                                      
 13%|█▎        | 6278/47590 [1:26:41<22:13, 30.97it/s]

{'embedding_loss': 0.0478, 'grad_norm': 0.7901282906532288, 'learning_rate': 1.9513903481123487e-05, 'epoch': 0.61}


                                                      
 13%|█▎        | 6278/47590 [1:26:42<22:13, 30.97it/s]

{'embedding_loss': 0.0501, 'grad_norm': 0.3087144196033478, 'learning_rate': 1.949055590576919e-05, 'epoch': 0.61}


                                                      
 13%|█▎        | 6278/47590 [1:26:43<22:13, 30.97it/s]

{'embedding_loss': 0.0524, 'grad_norm': 0.19396322965621948, 'learning_rate': 1.946720833041489e-05, 'epoch': 0.62}


                                                      
 13%|█▎        | 6278/47590 [1:26:44<22:13, 30.97it/s]


{'embedding_loss': 0.0491, 'grad_norm': 1.0877801179885864, 'learning_rate': 1.9443860755060588e-05, 'epoch': 0.63}


                                                      A
 13%|█▎        | 6278/47590 [1:26:45<22:13, 30.97it/s]

{'embedding_loss': 0.0423, 'grad_norm': 0.5253562331199646, 'learning_rate': 1.9420513179706287e-05, 'epoch': 0.63}


                                                      
 13%|█▎        | 6278/47590 [1:26:47<22:13, 30.97it/s]

{'embedding_loss': 0.0529, 'grad_norm': 0.6630753874778748, 'learning_rate': 1.939716560435199e-05, 'epoch': 0.64}


                                                      
 13%|█▎        | 6278/47590 [1:26:47<22:13, 30.97it/s]

{'embedding_loss': 0.0476, 'grad_norm': 0.7517562508583069, 'learning_rate': 1.9373818028997692e-05, 'epoch': 0.64}


                                                      
 13%|█▎        | 6278/47590 [1:26:48<22:13, 30.97it/s]

{'embedding_loss': 0.0447, 'grad_norm': 0.41553840041160583, 'learning_rate': 1.935047045364339e-05, 'epoch': 0.65}


                                                      
 13%|█▎        | 6278/47590 [1:26:49<22:13, 30.97it/s]

{'embedding_loss': 0.0461, 'grad_norm': 0.5216786861419678, 'learning_rate': 1.932712287828909e-05, 'epoch': 0.65}


                                                      
 13%|█▎        | 6278/47590 [1:26:50<22:13, 30.97it/s]

{'embedding_loss': 0.0509, 'grad_norm': 0.6343998908996582, 'learning_rate': 1.930377530293479e-05, 'epoch': 0.66}


                                                      
 13%|█▎        | 6278/47590 [1:26:51<22:13, 30.97it/s]

{'embedding_loss': 0.0439, 'grad_norm': 0.5733584761619568, 'learning_rate': 1.928042772758049e-05, 'epoch': 0.66}


                                                      
 13%|█▎        | 6278/47590 [1:26:52<22:13, 30.97it/s]

{'embedding_loss': 0.0461, 'grad_norm': 1.138632893562317, 'learning_rate': 1.9257080152226194e-05, 'epoch': 0.67}


                                                      
 13%|█▎        | 6278/47590 [1:26:53<22:13, 30.97it/s]

{'embedding_loss': 0.0396, 'grad_norm': 0.6067684292793274, 'learning_rate': 1.9233732576871893e-05, 'epoch': 0.67}


                                                      
 13%|█▎        | 6278/47590 [1:26:54<22:13, 30.97it/s]

{'embedding_loss': 0.0475, 'grad_norm': 0.6808825731277466, 'learning_rate': 1.9210385001517592e-05, 'epoch': 0.68}


                                                      
 13%|█▎        | 6278/47590 [1:26:55<22:13, 30.97it/s]

{'embedding_loss': 0.0519, 'grad_norm': 0.8119617104530334, 'learning_rate': 1.9187037426163295e-05, 'epoch': 0.68}


                                                      
 13%|█▎        | 6278/47590 [1:26:56<22:13, 30.97it/s]


{'embedding_loss': 0.0374, 'grad_norm': 0.3527377247810364, 'learning_rate': 1.9163689850808994e-05, 'epoch': 0.69}


                                                      A
 13%|█▎        | 6278/47590 [1:26:57<22:13, 30.97it/s]


{'embedding_loss': 0.0393, 'grad_norm': 1.089968204498291, 'learning_rate': 1.9140342275454696e-05, 'epoch': 0.69}


                                                      A
 13%|█▎        | 6278/47590 [1:26:58<22:13, 30.97it/s]

{'embedding_loss': 0.0531, 'grad_norm': 0.35611438751220703, 'learning_rate': 1.9116994700100395e-05, 'epoch': 0.7}


                                                      
 13%|█▎        | 6278/47590 [1:26:59<22:13, 30.97it/s]

{'embedding_loss': 0.0481, 'grad_norm': 1.1000829935073853, 'learning_rate': 1.9093647124746098e-05, 'epoch': 0.7}


                                                      
 13%|█▎        | 6278/47590 [1:27:00<22:13, 30.97it/s]

{'embedding_loss': 0.0441, 'grad_norm': 1.1461442708969116, 'learning_rate': 1.9070299549391797e-05, 'epoch': 0.71}


                                                      
 13%|█▎        | 6278/47590 [1:27:01<22:13, 30.97it/s]

{'embedding_loss': 0.0406, 'grad_norm': 0.39321768283843994, 'learning_rate': 1.9046951974037496e-05, 'epoch': 0.71}


                                                      
 13%|█▎        | 6278/47590 [1:27:02<22:13, 30.97it/s]

{'embedding_loss': 0.0378, 'grad_norm': 0.5621172189712524, 'learning_rate': 1.90236043986832e-05, 'epoch': 0.72}


                                                      
 13%|█▎        | 6278/47590 [1:27:03<22:13, 30.97it/s]

{'embedding_loss': 0.0429, 'grad_norm': 0.5407271981239319, 'learning_rate': 1.90002568233289e-05, 'epoch': 0.72}


                                                      
 13%|█▎        | 6278/47590 [1:27:04<22:13, 30.97it/s]

{'embedding_loss': 0.0361, 'grad_norm': 0.2493465542793274, 'learning_rate': 1.89769092479746e-05, 'epoch': 0.73}


                                                      
 13%|█▎        | 6278/47590 [1:27:05<22:13, 30.97it/s]

{'embedding_loss': 0.0441, 'grad_norm': 0.2096489518880844, 'learning_rate': 1.89535616726203e-05, 'epoch': 0.74}


                                                      
 13%|█▎        | 6278/47590 [1:27:06<22:13, 30.97it/s]

{'embedding_loss': 0.042, 'grad_norm': 0.7838289141654968, 'learning_rate': 1.8930214097265998e-05, 'epoch': 0.74}


                                                      
 13%|█▎        | 6278/47590 [1:27:07<22:13, 30.97it/s]

{'embedding_loss': 0.0367, 'grad_norm': 0.3044523298740387, 'learning_rate': 1.89068665219117e-05, 'epoch': 0.75}


                                                      
 13%|█▎        | 6278/47590 [1:27:08<22:13, 30.97it/s]

{'embedding_loss': 0.0351, 'grad_norm': 0.3580510914325714, 'learning_rate': 1.8883518946557403e-05, 'epoch': 0.75}


                                                      
 13%|█▎        | 6278/47590 [1:27:09<22:13, 30.97it/s]

{'embedding_loss': 0.037, 'grad_norm': 0.7494357228279114, 'learning_rate': 1.8860171371203102e-05, 'epoch': 0.76}


                                                      
 13%|█▎        | 6278/47590 [1:27:10<22:13, 30.97it/s]

{'embedding_loss': 0.0344, 'grad_norm': 1.1004847288131714, 'learning_rate': 1.88368237958488e-05, 'epoch': 0.76}


                                                      
 13%|█▎        | 6278/47590 [1:27:11<22:13, 30.97it/s]

{'embedding_loss': 0.0399, 'grad_norm': 0.6630773544311523, 'learning_rate': 1.8813476220494504e-05, 'epoch': 0.77}


                                                      
 13%|█▎        | 6278/47590 [1:27:12<22:13, 30.97it/s]


{'embedding_loss': 0.0407, 'grad_norm': 2.582382917404175, 'learning_rate': 1.8790128645140203e-05, 'epoch': 0.77}


                                                      A
 13%|█▎        | 6278/47590 [1:27:13<22:13, 30.97it/s]

{'embedding_loss': 0.0409, 'grad_norm': 0.48476505279541016, 'learning_rate': 1.8766781069785905e-05, 'epoch': 0.78}


                                                      
 13%|█▎        | 6278/47590 [1:27:14<22:13, 30.97it/s]

{'embedding_loss': 0.0383, 'grad_norm': 0.6850413084030151, 'learning_rate': 1.8743433494431605e-05, 'epoch': 0.78}


                                                      
 13%|█▎        | 6278/47590 [1:27:15<22:13, 30.97it/s]

{'embedding_loss': 0.0351, 'grad_norm': 1.7837673425674438, 'learning_rate': 1.8720085919077304e-05, 'epoch': 0.79}


                                                      
 13%|█▎        | 6278/47590 [1:27:16<22:13, 30.97it/s]

{'embedding_loss': 0.0354, 'grad_norm': 1.3839002847671509, 'learning_rate': 1.8696738343723006e-05, 'epoch': 0.79}


                                                      
 13%|█▎        | 6278/47590 [1:27:17<22:13, 30.97it/s]

{'embedding_loss': 0.0289, 'grad_norm': 0.49771979451179504, 'learning_rate': 1.8673390768368705e-05, 'epoch': 0.8}


                                                      
 13%|█▎        | 6278/47590 [1:27:18<22:13, 30.97it/s]

{'embedding_loss': 0.0323, 'grad_norm': 0.573362410068512, 'learning_rate': 1.8650043193014408e-05, 'epoch': 0.8}


                                                      
 13%|█▎        | 6278/47590 [1:27:19<22:13, 30.97it/s]

{'embedding_loss': 0.0339, 'grad_norm': 0.7813534140586853, 'learning_rate': 1.8626695617660107e-05, 'epoch': 0.81}


                                                      
 13%|█▎        | 6278/47590 [1:27:20<22:13, 30.97it/s]

{'embedding_loss': 0.0373, 'grad_norm': 0.8510344624519348, 'learning_rate': 1.860334804230581e-05, 'epoch': 0.81}


                                                      
 13%|█▎        | 6278/47590 [1:27:21<22:13, 30.97it/s]


{'embedding_loss': 0.0352, 'grad_norm': 0.6237248182296753, 'learning_rate': 1.8580000466951508e-05, 'epoch': 0.82}


                                                      A
 13%|█▎        | 6278/47590 [1:27:22<22:13, 30.97it/s]

{'embedding_loss': 0.0334, 'grad_norm': 0.9513611197471619, 'learning_rate': 1.8556652891597207e-05, 'epoch': 0.82}


                                                      
 13%|█▎        | 6278/47590 [1:27:23<22:13, 30.97it/s]

{'embedding_loss': 0.0328, 'grad_norm': 0.9402440786361694, 'learning_rate': 1.853330531624291e-05, 'epoch': 0.83}


                                                      
 13%|█▎        | 6278/47590 [1:27:24<22:13, 30.97it/s]

{'embedding_loss': 0.0377, 'grad_norm': 0.41092777252197266, 'learning_rate': 1.8509957740888612e-05, 'epoch': 0.84}


                                                      
 13%|█▎        | 6278/47590 [1:27:25<22:13, 30.97it/s]

{'embedding_loss': 0.033, 'grad_norm': 0.5365091562271118, 'learning_rate': 1.848661016553431e-05, 'epoch': 0.84}


                                                      
 13%|█▎        | 6278/47590 [1:27:26<22:13, 30.97it/s]

{'embedding_loss': 0.0336, 'grad_norm': 0.5882782936096191, 'learning_rate': 1.846326259018001e-05, 'epoch': 0.85}


                                                      
 13%|█▎        | 6278/47590 [1:27:27<22:13, 30.97it/s]

{'embedding_loss': 0.0306, 'grad_norm': 0.6914575695991516, 'learning_rate': 1.843991501482571e-05, 'epoch': 0.85}


                                                      
 13%|█▎        | 6278/47590 [1:27:28<22:13, 30.97it/s]

{'embedding_loss': 0.0288, 'grad_norm': 0.3127874732017517, 'learning_rate': 1.8416567439471412e-05, 'epoch': 0.86}


                                                      
 13%|█▎        | 6278/47590 [1:27:29<22:13, 30.97it/s]

{'embedding_loss': 0.0318, 'grad_norm': 1.6261310577392578, 'learning_rate': 1.8393219864117115e-05, 'epoch': 0.86}


                                                      
 13%|█▎        | 6278/47590 [1:27:30<22:13, 30.97it/s]

{'embedding_loss': 0.0342, 'grad_norm': 0.6483672857284546, 'learning_rate': 1.8369872288762814e-05, 'epoch': 0.87}


                                                      
 13%|█▎        | 6278/47590 [1:27:31<22:13, 30.97it/s]

{'embedding_loss': 0.0313, 'grad_norm': 0.7291240692138672, 'learning_rate': 1.8346524713408513e-05, 'epoch': 0.87}


                                                      
 13%|█▎        | 6278/47590 [1:27:32<22:13, 30.97it/s]

{'embedding_loss': 0.0267, 'grad_norm': 1.3067632913589478, 'learning_rate': 1.8323177138054215e-05, 'epoch': 0.88}


                                                      
 13%|█▎        | 6278/47590 [1:27:33<22:13, 30.97it/s]


{'embedding_loss': 0.027, 'grad_norm': 1.1127575635910034, 'learning_rate': 1.8299829562699914e-05, 'epoch': 0.88}


                                                      A
 13%|█▎        | 6278/47590 [1:27:34<22:13, 30.97it/s]


{'embedding_loss': 0.0282, 'grad_norm': 0.5179041624069214, 'learning_rate': 1.8276481987345617e-05, 'epoch': 0.89}


                                                      A
 13%|█▎        | 6278/47590 [1:27:35<22:13, 30.97it/s]

{'embedding_loss': 0.0295, 'grad_norm': 0.330742746591568, 'learning_rate': 1.8253134411991316e-05, 'epoch': 0.89}


                                                      
 13%|█▎        | 6278/47590 [1:27:36<22:13, 30.97it/s]

{'embedding_loss': 0.033, 'grad_norm': 0.8801612257957458, 'learning_rate': 1.8229786836637015e-05, 'epoch': 0.9}


                                                      
 13%|█▎        | 6278/47590 [1:27:37<22:13, 30.97it/s]

{'embedding_loss': 0.0278, 'grad_norm': 0.3385790288448334, 'learning_rate': 1.8206439261282717e-05, 'epoch': 0.9}


                                                      
 13%|█▎        | 6278/47590 [1:27:38<22:13, 30.97it/s]


{'embedding_loss': 0.0311, 'grad_norm': 0.4285293519496918, 'learning_rate': 1.8183091685928417e-05, 'epoch': 0.91}


                                                      A
 13%|█▎        | 6278/47590 [1:27:39<22:13, 30.97it/s]

{'embedding_loss': 0.0259, 'grad_norm': 0.6068978905677795, 'learning_rate': 1.815974411057412e-05, 'epoch': 0.91}


                                                      
 13%|█▎        | 6278/47590 [1:27:40<22:13, 30.97it/s]

{'embedding_loss': 0.0303, 'grad_norm': 0.19855396449565887, 'learning_rate': 1.8136396535219818e-05, 'epoch': 0.92}


                                                      
 13%|█▎        | 6278/47590 [1:27:41<22:13, 30.97it/s]


{'embedding_loss': 0.0297, 'grad_norm': 0.3384450376033783, 'learning_rate': 1.811304895986552e-05, 'epoch': 0.92}


                                                      A
 13%|█▎        | 6278/47590 [1:27:42<22:13, 30.97it/s]

{'embedding_loss': 0.0277, 'grad_norm': 0.29398587346076965, 'learning_rate': 1.808970138451122e-05, 'epoch': 0.93}


                                                      
 13%|█▎        | 6278/47590 [1:27:43<22:13, 30.97it/s]

{'embedding_loss': 0.0254, 'grad_norm': 0.6742027997970581, 'learning_rate': 1.806635380915692e-05, 'epoch': 0.94}


                                                      
 13%|█▎        | 6278/47590 [1:27:44<22:13, 30.97it/s]

{'embedding_loss': 0.0285, 'grad_norm': 0.43716180324554443, 'learning_rate': 1.804300623380262e-05, 'epoch': 0.94}


                                                      
 13%|█▎        | 6278/47590 [1:27:45<22:13, 30.97it/s]

{'embedding_loss': 0.0259, 'grad_norm': 0.5182799100875854, 'learning_rate': 1.8019658658448324e-05, 'epoch': 0.95}


                                                      
 13%|█▎        | 6278/47590 [1:27:47<22:13, 30.97it/s]

{'embedding_loss': 0.0278, 'grad_norm': 0.33997124433517456, 'learning_rate': 1.7996311083094023e-05, 'epoch': 0.95}


                                                      
 13%|█▎        | 6278/47590 [1:27:47<22:13, 30.97it/s]

{'embedding_loss': 0.0235, 'grad_norm': 0.6172327399253845, 'learning_rate': 1.7972963507739722e-05, 'epoch': 0.96}


                                                      
 13%|█▎        | 6278/47590 [1:27:48<22:13, 30.97it/s]

{'embedding_loss': 0.0233, 'grad_norm': 0.23563915491104126, 'learning_rate': 1.794961593238542e-05, 'epoch': 0.96}


                                                      
 13%|█▎        | 6278/47590 [1:27:49<22:13, 30.97it/s]

{'embedding_loss': 0.0256, 'grad_norm': 0.36888957023620605, 'learning_rate': 1.7926268357031123e-05, 'epoch': 0.97}


                                                      
 13%|█▎        | 6278/47590 [1:27:50<22:13, 30.97it/s]

{'embedding_loss': 0.0315, 'grad_norm': 0.7324580550193787, 'learning_rate': 1.7902920781676826e-05, 'epoch': 0.97}


                                                      
 13%|█▎        | 6278/47590 [1:27:51<22:13, 30.97it/s]

{'embedding_loss': 0.0284, 'grad_norm': 0.2714293301105499, 'learning_rate': 1.7879573206322525e-05, 'epoch': 0.98}


                                                      
 13%|█▎        | 6278/47590 [1:27:52<22:13, 30.97it/s]

{'embedding_loss': 0.0259, 'grad_norm': 0.41003140807151794, 'learning_rate': 1.7856225630968224e-05, 'epoch': 0.98}


                                                      
 13%|█▎        | 6278/47590 [1:27:53<22:13, 30.97it/s]

{'embedding_loss': 0.0259, 'grad_norm': 1.1662607192993164, 'learning_rate': 1.7832878055613927e-05, 'epoch': 0.99}


                                                      
 13%|█▎        | 6278/47590 [1:27:54<22:13, 30.97it/s]

{'embedding_loss': 0.0209, 'grad_norm': 0.6496630311012268, 'learning_rate': 1.7809530480259626e-05, 'epoch': 0.99}


                                                      
 13%|█▎        | 6278/47590 [1:27:55<22:13, 30.97it/s]

{'embedding_loss': 0.0244, 'grad_norm': 0.7934753894805908, 'learning_rate': 1.7786182904905328e-05, 'epoch': 1.0}


                                                      
 13%|█▎        | 6278/47590 [1:27:57<22:13, 30.97it/s]

{'embedding_loss': 0.0224, 'grad_norm': 0.8514091968536377, 'learning_rate': 1.7762835329551027e-05, 'epoch': 1.0}


                                                      
 13%|█▎        | 6278/47590 [1:27:58<22:13, 30.97it/s]

{'embedding_loss': 0.0232, 'grad_norm': 2.1877388954162598, 'learning_rate': 1.773948775419673e-05, 'epoch': 1.01}


                                                      
 13%|█▎        | 6278/47590 [1:27:59<22:13, 30.97it/s]

{'embedding_loss': 0.0254, 'grad_norm': 0.7929381728172302, 'learning_rate': 1.771614017884243e-05, 'epoch': 1.01}


                                                      
 13%|█▎        | 6278/47590 [1:28:00<22:13, 30.97it/s]

{'embedding_loss': 0.0203, 'grad_norm': 0.7231431007385254, 'learning_rate': 1.7692792603488128e-05, 'epoch': 1.02}


                                                      
 13%|█▎        | 6278/47590 [1:28:01<22:13, 30.97it/s]

{'embedding_loss': 0.0256, 'grad_norm': 0.8332352042198181, 'learning_rate': 1.766944502813383e-05, 'epoch': 1.02}


                                                      
 13%|█▎        | 6278/47590 [1:28:02<22:13, 30.97it/s]

{'embedding_loss': 0.0197, 'grad_norm': 0.6087793111801147, 'learning_rate': 1.764609745277953e-05, 'epoch': 1.03}


                                                      
 13%|█▎        | 6278/47590 [1:28:03<22:13, 30.97it/s]

{'embedding_loss': 0.0264, 'grad_norm': 0.4345208406448364, 'learning_rate': 1.7622749877425232e-05, 'epoch': 1.03}


                                                      
 13%|█▎        | 6278/47590 [1:28:04<22:13, 30.97it/s]

{'embedding_loss': 0.0219, 'grad_norm': 0.46598300337791443, 'learning_rate': 1.759940230207093e-05, 'epoch': 1.04}


                                                      
 13%|█▎        | 6278/47590 [1:28:05<22:13, 30.97it/s]


{'embedding_loss': 0.0206, 'grad_norm': 0.4432854652404785, 'learning_rate': 1.757605472671663e-05, 'epoch': 1.05}


                                                      A
 13%|█▎        | 6278/47590 [1:28:06<22:13, 30.97it/s]

{'embedding_loss': 0.0231, 'grad_norm': 0.4398891031742096, 'learning_rate': 1.7552707151362333e-05, 'epoch': 1.05}


                                                      
 13%|█▎        | 6278/47590 [1:28:07<22:13, 30.97it/s]

{'embedding_loss': 0.0233, 'grad_norm': 0.8515586256980896, 'learning_rate': 1.7529359576008035e-05, 'epoch': 1.06}


                                                      
 13%|█▎        | 6278/47590 [1:28:08<22:13, 30.97it/s]

{'embedding_loss': 0.0232, 'grad_norm': 0.1675146520137787, 'learning_rate': 1.7506012000653734e-05, 'epoch': 1.06}


                                                      
 13%|█▎        | 6278/47590 [1:28:09<22:13, 30.97it/s]

{'embedding_loss': 0.022, 'grad_norm': 0.6526290774345398, 'learning_rate': 1.7482664425299433e-05, 'epoch': 1.07}


                                                      
 13%|█▎        | 6278/47590 [1:28:10<22:13, 30.97it/s]

{'embedding_loss': 0.0232, 'grad_norm': 0.06717667728662491, 'learning_rate': 1.7459316849945132e-05, 'epoch': 1.07}


                                                      
 13%|█▎        | 6278/47590 [1:28:11<22:13, 30.97it/s]

{'embedding_loss': 0.0188, 'grad_norm': 0.6727942824363708, 'learning_rate': 1.7435969274590835e-05, 'epoch': 1.08}


                                                      
 13%|█▎        | 6278/47590 [1:28:12<22:13, 30.97it/s]

{'embedding_loss': 0.0207, 'grad_norm': 0.4162188768386841, 'learning_rate': 1.7412621699236537e-05, 'epoch': 1.08}


                                                      
 13%|█▎        | 6278/47590 [1:28:13<22:13, 30.97it/s]

{'embedding_loss': 0.0213, 'grad_norm': 0.5214107036590576, 'learning_rate': 1.7389274123882236e-05, 'epoch': 1.09}


                                                      
 13%|█▎        | 6278/47590 [1:28:14<22:13, 30.97it/s]

{'embedding_loss': 0.0184, 'grad_norm': 0.24057777225971222, 'learning_rate': 1.7365926548527936e-05, 'epoch': 1.09}


                                                      
 13%|█▎        | 6278/47590 [1:28:15<22:13, 30.97it/s]

{'embedding_loss': 0.0186, 'grad_norm': 0.24613533914089203, 'learning_rate': 1.7342578973173638e-05, 'epoch': 1.1}


                                                      
 13%|█▎        | 6278/47590 [1:28:16<22:13, 30.97it/s]

{'embedding_loss': 0.0198, 'grad_norm': 0.424304723739624, 'learning_rate': 1.7319231397819337e-05, 'epoch': 1.1}


                                                      
 13%|█▎        | 6278/47590 [1:28:17<22:13, 30.97it/s]

{'embedding_loss': 0.0179, 'grad_norm': 0.282648503780365, 'learning_rate': 1.729588382246504e-05, 'epoch': 1.11}


                                                      
 13%|█▎        | 6278/47590 [1:28:18<22:13, 30.97it/s]

{'embedding_loss': 0.0202, 'grad_norm': 0.4828825294971466, 'learning_rate': 1.727253624711074e-05, 'epoch': 1.11}


                                                      
 13%|█▎        | 6278/47590 [1:28:19<22:13, 30.97it/s]

{'embedding_loss': 0.0258, 'grad_norm': 0.3792170286178589, 'learning_rate': 1.724918867175644e-05, 'epoch': 1.12}


                                                      
 13%|█▎        | 6278/47590 [1:28:20<22:13, 30.97it/s]

{'embedding_loss': 0.017, 'grad_norm': 1.064779281616211, 'learning_rate': 1.722584109640214e-05, 'epoch': 1.12}


                                                      
 13%|█▎        | 6278/47590 [1:28:21<22:13, 30.97it/s]

{'embedding_loss': 0.0198, 'grad_norm': 0.36746981739997864, 'learning_rate': 1.720249352104784e-05, 'epoch': 1.13}


                                                      
 13%|█▎        | 6278/47590 [1:28:22<22:13, 30.97it/s]

{'embedding_loss': 0.02, 'grad_norm': 0.321006178855896, 'learning_rate': 1.7179145945693542e-05, 'epoch': 1.13}


                                                      
 13%|█▎        | 6278/47590 [1:28:23<22:13, 30.97it/s]

{'embedding_loss': 0.0194, 'grad_norm': 0.38031890988349915, 'learning_rate': 1.7155798370339244e-05, 'epoch': 1.14}


                                                      
 13%|█▎        | 6278/47590 [1:28:24<22:13, 30.97it/s]

{'embedding_loss': 0.0209, 'grad_norm': 0.5362487435340881, 'learning_rate': 1.7132450794984943e-05, 'epoch': 1.15}


                                                      
 13%|█▎        | 6278/47590 [1:28:25<22:13, 30.97it/s]

{'embedding_loss': 0.0189, 'grad_norm': 0.6902497410774231, 'learning_rate': 1.7109103219630642e-05, 'epoch': 1.15}


                                                      
 13%|█▎        | 6278/47590 [1:28:26<22:13, 30.97it/s]

{'embedding_loss': 0.0189, 'grad_norm': 0.7379510402679443, 'learning_rate': 1.708575564427634e-05, 'epoch': 1.16}


                                                      
 13%|█▎        | 6278/47590 [1:28:27<22:13, 30.97it/s]

{'embedding_loss': 0.016, 'grad_norm': 0.6508194804191589, 'learning_rate': 1.7062408068922044e-05, 'epoch': 1.16}


                                                      
 13%|█▎        | 6278/47590 [1:28:28<22:13, 30.97it/s]

{'embedding_loss': 0.0205, 'grad_norm': 0.6244479417800903, 'learning_rate': 1.7039060493567746e-05, 'epoch': 1.17}


                                                      
 13%|█▎        | 6278/47590 [1:28:29<22:13, 30.97it/s]

{'embedding_loss': 0.0161, 'grad_norm': 0.5734381675720215, 'learning_rate': 1.7015712918213446e-05, 'epoch': 1.17}


                                                      
 13%|█▎        | 6278/47590 [1:28:30<22:13, 30.97it/s]

{'embedding_loss': 0.0169, 'grad_norm': 0.43745729327201843, 'learning_rate': 1.6992365342859145e-05, 'epoch': 1.18}


                                                      
 13%|█▎        | 6278/47590 [1:28:31<22:13, 30.97it/s]

{'embedding_loss': 0.0149, 'grad_norm': 0.44551756978034973, 'learning_rate': 1.6969017767504844e-05, 'epoch': 1.18}


                                                      
 13%|█▎        | 6278/47590 [1:28:32<22:13, 30.97it/s]

{'embedding_loss': 0.0187, 'grad_norm': 0.3340948820114136, 'learning_rate': 1.6945670192150546e-05, 'epoch': 1.19}


                                                      
 13%|█▎        | 6278/47590 [1:28:33<22:13, 30.97it/s]

{'embedding_loss': 0.0139, 'grad_norm': 0.4681302607059479, 'learning_rate': 1.692232261679625e-05, 'epoch': 1.19}


                                                      
 13%|█▎        | 6278/47590 [1:28:34<22:13, 30.97it/s]


{'embedding_loss': 0.0158, 'grad_norm': 0.45961084961891174, 'learning_rate': 1.6898975041441948e-05, 'epoch': 1.2}


                                                      [A
 13%|█▎        | 6278/47590 [1:28:35<22:13, 30.97it/s]

{'embedding_loss': 0.0181, 'grad_norm': 0.3718689978122711, 'learning_rate': 1.6875627466087647e-05, 'epoch': 1.2}


                                                      
 13%|█▎        | 6278/47590 [1:28:36<22:13, 30.97it/s]

{'embedding_loss': 0.018, 'grad_norm': 1.1821606159210205, 'learning_rate': 1.685227989073335e-05, 'epoch': 1.21}


                                                      
 13%|█▎        | 6278/47590 [1:28:37<22:13, 30.97it/s]

{'embedding_loss': 0.0167, 'grad_norm': 0.29288119077682495, 'learning_rate': 1.682893231537905e-05, 'epoch': 1.21}


                                                      
 13%|█▎        | 6278/47590 [1:28:38<22:13, 30.97it/s]

{'embedding_loss': 0.0186, 'grad_norm': 0.82960444688797, 'learning_rate': 1.680558474002475e-05, 'epoch': 1.22}


                                                      
 13%|█▎        | 6278/47590 [1:28:39<22:13, 30.97it/s]

{'embedding_loss': 0.0175, 'grad_norm': 1.2275663614273071, 'learning_rate': 1.678223716467045e-05, 'epoch': 1.22}


                                                      
 13%|█▎        | 6278/47590 [1:28:40<22:13, 30.97it/s]

{'embedding_loss': 0.0142, 'grad_norm': 0.20085227489471436, 'learning_rate': 1.6758889589316153e-05, 'epoch': 1.23}


                                                      
 13%|█▎        | 6278/47590 [1:28:41<22:13, 30.97it/s]

{'embedding_loss': 0.0173, 'grad_norm': 0.29874691367149353, 'learning_rate': 1.673554201396185e-05, 'epoch': 1.23}


                                                      
 13%|█▎        | 6278/47590 [1:28:42<22:13, 30.97it/s]

{'embedding_loss': 0.0167, 'grad_norm': 1.1548652648925781, 'learning_rate': 1.671219443860755e-05, 'epoch': 1.24}


                                                      
 13%|█▎        | 6278/47590 [1:28:43<22:13, 30.97it/s]


{'embedding_loss': 0.0163, 'grad_norm': 0.5105785727500916, 'learning_rate': 1.6688846863253253e-05, 'epoch': 1.25}


                                                      [A
 13%|█▎        | 6278/47590 [1:28:44<22:13, 30.97it/s]

{'embedding_loss': 0.0196, 'grad_norm': 0.3451199531555176, 'learning_rate': 1.6665499287898956e-05, 'epoch': 1.25}


                                                      
 13%|█▎        | 6278/47590 [1:28:45<22:13, 30.97it/s]

{'embedding_loss': 0.0165, 'grad_norm': 0.41678452491760254, 'learning_rate': 1.6642151712544655e-05, 'epoch': 1.26}


                                                      
 13%|█▎        | 6278/47590 [1:28:46<22:13, 30.97it/s]

{'embedding_loss': 0.0181, 'grad_norm': 0.15427452325820923, 'learning_rate': 1.6618804137190354e-05, 'epoch': 1.26}


                                                      
 13%|█▎        | 6278/47590 [1:28:47<22:13, 30.97it/s]


{'embedding_loss': 0.012, 'grad_norm': 0.8241094946861267, 'learning_rate': 1.6595456561836053e-05, 'epoch': 1.27}


                                                      [A
 13%|█▎        | 6278/47590 [1:28:48<22:13, 30.97it/s]

{'embedding_loss': 0.0147, 'grad_norm': 0.43469148874282837, 'learning_rate': 1.6572108986481755e-05, 'epoch': 1.27}


                                                      
 13%|█▎        | 6278/47590 [1:28:49<22:13, 30.97it/s]

{'embedding_loss': 0.0177, 'grad_norm': 0.8051238656044006, 'learning_rate': 1.6548761411127458e-05, 'epoch': 1.28}


                                                      
 13%|█▎        | 6278/47590 [1:28:50<22:13, 30.97it/s]

{'embedding_loss': 0.0134, 'grad_norm': 0.2584275007247925, 'learning_rate': 1.6525413835773157e-05, 'epoch': 1.28}


                                                      
 13%|█▎        | 6278/47590 [1:28:51<22:13, 30.97it/s]

{'embedding_loss': 0.0163, 'grad_norm': 0.5618520975112915, 'learning_rate': 1.6502066260418856e-05, 'epoch': 1.29}


                                                      
 13%|█▎        | 6278/47590 [1:28:52<22:13, 30.97it/s]

{'embedding_loss': 0.0156, 'grad_norm': 0.18639393150806427, 'learning_rate': 1.6478718685064555e-05, 'epoch': 1.29}


                                                      
 13%|█▎        | 6278/47590 [1:28:53<22:13, 30.97it/s]


{'embedding_loss': 0.018, 'grad_norm': 0.18235436081886292, 'learning_rate': 1.6455371109710258e-05, 'epoch': 1.3}


                                                      [A
 13%|█▎        | 6278/47590 [1:28:54<22:13, 30.97it/s]

{'embedding_loss': 0.0154, 'grad_norm': 0.761307418346405, 'learning_rate': 1.643202353435596e-05, 'epoch': 1.3}


                                                      
 13%|█▎        | 6278/47590 [1:28:55<22:13, 30.97it/s]

{'embedding_loss': 0.0157, 'grad_norm': 0.4993670880794525, 'learning_rate': 1.640867595900166e-05, 'epoch': 1.31}


                                                      
 13%|█▎        | 6278/47590 [1:28:56<22:13, 30.97it/s]

{'embedding_loss': 0.0155, 'grad_norm': 0.5193374752998352, 'learning_rate': 1.6385328383647358e-05, 'epoch': 1.31}


                                                      
 13%|█▎        | 6278/47590 [1:28:57<22:13, 30.97it/s]

{'embedding_loss': 0.0173, 'grad_norm': 0.20826567709445953, 'learning_rate': 1.636198080829306e-05, 'epoch': 1.32}


                                                      
 13%|█▎        | 6278/47590 [1:28:58<22:13, 30.97it/s]


{'embedding_loss': 0.0142, 'grad_norm': 0.8319751024246216, 'learning_rate': 1.633863323293876e-05, 'epoch': 1.32}


                                                      [A
 13%|█▎        | 6278/47590 [1:28:59<22:13, 30.97it/s]

{'embedding_loss': 0.0147, 'grad_norm': 0.5286663770675659, 'learning_rate': 1.6315285657584462e-05, 'epoch': 1.33}


                                                      
 13%|█▎        | 6278/47590 [1:29:00<22:13, 30.97it/s]


{'embedding_loss': 0.015, 'grad_norm': 0.2007530927658081, 'learning_rate': 1.629193808223016e-05, 'epoch': 1.33}


                                                      [A
 13%|█▎        | 6278/47590 [1:29:01<22:13, 30.97it/s]

{'embedding_loss': 0.0136, 'grad_norm': 0.29106006026268005, 'learning_rate': 1.6268590506875864e-05, 'epoch': 1.34}


                                                      
 13%|█▎        | 6278/47590 [1:29:02<22:13, 30.97it/s]

{'embedding_loss': 0.017, 'grad_norm': 0.22442848980426788, 'learning_rate': 1.6245242931521563e-05, 'epoch': 1.34}


                                                      
 13%|█▎        | 6278/47590 [1:29:03<22:13, 30.97it/s]

{'embedding_loss': 0.0171, 'grad_norm': 0.32368558645248413, 'learning_rate': 1.6221895356167262e-05, 'epoch': 1.35}


                                                      
 13%|█▎        | 6278/47590 [1:29:04<22:13, 30.97it/s]

{'embedding_loss': 0.0159, 'grad_norm': 0.7330107688903809, 'learning_rate': 1.6198547780812965e-05, 'epoch': 1.36}


                                                      
 13%|█▎        | 6278/47590 [1:29:05<22:13, 30.97it/s]

{'embedding_loss': 0.0145, 'grad_norm': 0.5420488119125366, 'learning_rate': 1.6175200205458667e-05, 'epoch': 1.36}


                                                      
 13%|█▎        | 6278/47590 [1:29:06<22:13, 30.97it/s]

{'embedding_loss': 0.0188, 'grad_norm': 0.5345972776412964, 'learning_rate': 1.6151852630104366e-05, 'epoch': 1.37}


                                                      
 13%|█▎        | 6278/47590 [1:29:07<22:13, 30.97it/s]

{'embedding_loss': 0.0172, 'grad_norm': 0.23287253081798553, 'learning_rate': 1.6128505054750065e-05, 'epoch': 1.37}


                                                      
 13%|█▎        | 6278/47590 [1:29:08<22:13, 30.97it/s]


{'embedding_loss': 0.0173, 'grad_norm': 0.4110642671585083, 'learning_rate': 1.6105157479395764e-05, 'epoch': 1.38}


                                                      [A
 13%|█▎        | 6278/47590 [1:29:09<22:13, 30.97it/s]

{'embedding_loss': 0.0146, 'grad_norm': 0.3190246820449829, 'learning_rate': 1.6081809904041467e-05, 'epoch': 1.38}


                                                      
 13%|█▎        | 6278/47590 [1:29:10<22:13, 30.97it/s]

{'embedding_loss': 0.0135, 'grad_norm': 0.34809979796409607, 'learning_rate': 1.605846232868717e-05, 'epoch': 1.39}


                                                      
 13%|█▎        | 6278/47590 [1:29:11<22:13, 30.97it/s]

{'embedding_loss': 0.0161, 'grad_norm': 0.336721271276474, 'learning_rate': 1.603511475333287e-05, 'epoch': 1.39}


                                                      
 13%|█▎        | 6278/47590 [1:29:12<22:13, 30.97it/s]

{'embedding_loss': 0.0155, 'grad_norm': 0.706636905670166, 'learning_rate': 1.6011767177978567e-05, 'epoch': 1.4}


                                                      
 13%|█▎        | 6278/47590 [1:29:13<22:13, 30.97it/s]

{'embedding_loss': 0.014, 'grad_norm': 0.3835996091365814, 'learning_rate': 1.5988419602624267e-05, 'epoch': 1.4}


                                                      
 13%|█▎        | 6278/47590 [1:29:14<22:13, 30.97it/s]

{'embedding_loss': 0.0152, 'grad_norm': 1.029335856437683, 'learning_rate': 1.596507202726997e-05, 'epoch': 1.41}


                                                      
 13%|█▎        | 6278/47590 [1:29:15<22:13, 30.97it/s]


{'embedding_loss': 0.0143, 'grad_norm': 0.36942559480667114, 'learning_rate': 1.594172445191567e-05, 'epoch': 1.41}


                                                      [A
 13%|█▎        | 6278/47590 [1:29:16<22:13, 30.97it/s]

{'embedding_loss': 0.0136, 'grad_norm': 0.2944096326828003, 'learning_rate': 1.591837687656137e-05, 'epoch': 1.42}


                                                      
 13%|█▎        | 6278/47590 [1:29:17<22:13, 30.97it/s]

{'embedding_loss': 0.0137, 'grad_norm': 0.326850563287735, 'learning_rate': 1.589502930120707e-05, 'epoch': 1.42}


                                                      
 13%|█▎        | 6278/47590 [1:29:18<22:13, 30.97it/s]

{'embedding_loss': 0.0129, 'grad_norm': 0.532719075679779, 'learning_rate': 1.5871681725852772e-05, 'epoch': 1.43}


                                                      
 13%|█▎        | 6278/47590 [1:29:19<22:13, 30.97it/s]

{'embedding_loss': 0.013, 'grad_norm': 0.2984119951725006, 'learning_rate': 1.584833415049847e-05, 'epoch': 1.43}


                                                      
 13%|█▎        | 6278/47590 [1:29:20<22:13, 30.97it/s]

{'embedding_loss': 0.0125, 'grad_norm': 0.24075277149677277, 'learning_rate': 1.5824986575144174e-05, 'epoch': 1.44}


                                                      
 13%|█▎        | 6278/47590 [1:29:21<22:13, 30.97it/s]


{'embedding_loss': 0.013, 'grad_norm': 0.42263320088386536, 'learning_rate': 1.5801638999789873e-05, 'epoch': 1.44}


                                                      [A
 13%|█▎        | 6278/47590 [1:29:22<22:13, 30.97it/s]

{'embedding_loss': 0.0122, 'grad_norm': 0.6827753186225891, 'learning_rate': 1.5778291424435575e-05, 'epoch': 1.45}


                                                      
 13%|█▎        | 6278/47590 [1:29:23<22:13, 30.97it/s]

{'embedding_loss': 0.0146, 'grad_norm': 0.4763692617416382, 'learning_rate': 1.5754943849081274e-05, 'epoch': 1.46}


                                                      
 13%|█▎        | 6278/47590 [1:29:24<22:13, 30.97it/s]

{'embedding_loss': 0.0148, 'grad_norm': 1.008854866027832, 'learning_rate': 1.5731596273726973e-05, 'epoch': 1.46}


                                                      
 13%|█▎        | 6278/47590 [1:29:25<22:13, 30.97it/s]

{'embedding_loss': 0.0104, 'grad_norm': 0.37564554810523987, 'learning_rate': 1.5708248698372676e-05, 'epoch': 1.47}


                                                      
 13%|█▎        | 6278/47590 [1:29:26<22:13, 30.97it/s]

{'embedding_loss': 0.0166, 'grad_norm': 0.41024303436279297, 'learning_rate': 1.568490112301838e-05, 'epoch': 1.47}


                                                      
 13%|█▎        | 6278/47590 [1:29:27<22:13, 30.97it/s]

{'embedding_loss': 0.012, 'grad_norm': 0.4738312363624573, 'learning_rate': 1.5661553547664077e-05, 'epoch': 1.48}


                                                      
 13%|█▎        | 6278/47590 [1:29:28<22:13, 30.97it/s]

{'embedding_loss': 0.0133, 'grad_norm': 0.21423079073429108, 'learning_rate': 1.5638205972309777e-05, 'epoch': 1.48}


                                                      
 13%|█▎        | 6278/47590 [1:29:29<22:13, 30.97it/s]

{'embedding_loss': 0.0136, 'grad_norm': 0.36278441548347473, 'learning_rate': 1.5614858396955476e-05, 'epoch': 1.49}


                                                      
 13%|█▎        | 6278/47590 [1:29:30<22:13, 30.97it/s]

{'embedding_loss': 0.0087, 'grad_norm': 0.4801228940486908, 'learning_rate': 1.5591510821601178e-05, 'epoch': 1.49}


                                                      
 13%|█▎        | 6278/47590 [1:29:31<22:13, 30.97it/s]

{'embedding_loss': 0.016, 'grad_norm': 0.4325827658176422, 'learning_rate': 1.556816324624688e-05, 'epoch': 1.5}


                                                      
 13%|█▎        | 6278/47590 [1:29:32<22:13, 30.97it/s]

{'embedding_loss': 0.0126, 'grad_norm': 0.30203506350517273, 'learning_rate': 1.554481567089258e-05, 'epoch': 1.5}


                                                      
 13%|█▎        | 6278/47590 [1:29:33<22:13, 30.97it/s]

{'embedding_loss': 0.0126, 'grad_norm': 0.38648325204849243, 'learning_rate': 1.552146809553828e-05, 'epoch': 1.51}


                                                      
 13%|█▎        | 6278/47590 [1:29:34<22:13, 30.97it/s]

{'embedding_loss': 0.0105, 'grad_norm': 0.33425459265708923, 'learning_rate': 1.5498120520183978e-05, 'epoch': 1.51}


                                                      
 13%|█▎        | 6278/47590 [1:29:35<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.597265362739563, 'learning_rate': 1.547477294482968e-05, 'epoch': 1.52}


                                                      
 13%|█▎        | 6278/47590 [1:29:36<22:13, 30.97it/s]

{'embedding_loss': 0.0142, 'grad_norm': 1.0364235639572144, 'learning_rate': 1.5451425369475383e-05, 'epoch': 1.52}


                                                      
 13%|█▎        | 6278/47590 [1:29:37<22:13, 30.97it/s]

{'embedding_loss': 0.0106, 'grad_norm': 0.4052525460720062, 'learning_rate': 1.5428077794121082e-05, 'epoch': 1.53}


                                                      
 13%|█▎        | 6278/47590 [1:29:38<22:13, 30.97it/s]

{'embedding_loss': 0.013, 'grad_norm': 0.2462298721075058, 'learning_rate': 1.540473021876678e-05, 'epoch': 1.53}


                                                      
 13%|█▎        | 6278/47590 [1:29:39<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.7984991669654846, 'learning_rate': 1.5381382643412484e-05, 'epoch': 1.54}


                                                      
 13%|█▎        | 6278/47590 [1:29:40<22:13, 30.97it/s]

{'embedding_loss': 0.0124, 'grad_norm': 0.2147626429796219, 'learning_rate': 1.5358035068058183e-05, 'epoch': 1.54}


                                                      
 13%|█▎        | 6278/47590 [1:29:41<22:13, 30.97it/s]

{'embedding_loss': 0.0103, 'grad_norm': 1.152117371559143, 'learning_rate': 1.5334687492703885e-05, 'epoch': 1.55}


                                                      
 13%|█▎        | 6278/47590 [1:29:42<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.49981313943862915, 'learning_rate': 1.5311339917349584e-05, 'epoch': 1.55}


                                                      
 13%|█▎        | 6278/47590 [1:29:43<22:13, 30.97it/s]


{'embedding_loss': 0.0111, 'grad_norm': 0.7076317667961121, 'learning_rate': 1.5287992341995287e-05, 'epoch': 1.56}


                                                      [A
 13%|█▎        | 6278/47590 [1:29:44<22:13, 30.97it/s]

{'embedding_loss': 0.0124, 'grad_norm': 0.8208915591239929, 'learning_rate': 1.5264644766640986e-05, 'epoch': 1.57}


                                                      
 13%|█▎        | 6278/47590 [1:29:45<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.23541900515556335, 'learning_rate': 1.5241297191286687e-05, 'epoch': 1.57}


                                                      
 13%|█▎        | 6278/47590 [1:29:45<22:13, 30.97it/s]

{'embedding_loss': 0.0129, 'grad_norm': 0.2489764392375946, 'learning_rate': 1.5217949615932386e-05, 'epoch': 1.58}


                                                      
 13%|█▎        | 6278/47590 [1:29:47<22:13, 30.97it/s]

{'embedding_loss': 0.0113, 'grad_norm': 0.6429381966590881, 'learning_rate': 1.5194602040578088e-05, 'epoch': 1.58}


                                                      
 13%|█▎        | 6278/47590 [1:29:48<22:13, 30.97it/s]


{'embedding_loss': 0.0109, 'grad_norm': 0.3244408667087555, 'learning_rate': 1.5171254465223789e-05, 'epoch': 1.59}


                                                      [A
 13%|█▎        | 6278/47590 [1:29:49<22:13, 30.97it/s]

{'embedding_loss': 0.0106, 'grad_norm': 0.35072022676467896, 'learning_rate': 1.5147906889869488e-05, 'epoch': 1.59}


                                                      
 13%|█▎        | 6278/47590 [1:29:50<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.6116422414779663, 'learning_rate': 1.5124559314515189e-05, 'epoch': 1.6}


                                                      
 13%|█▎        | 6278/47590 [1:29:51<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.302019864320755, 'learning_rate': 1.510121173916089e-05, 'epoch': 1.6}


                                                      
 13%|█▎        | 6278/47590 [1:29:52<22:13, 30.97it/s]

{'embedding_loss': 0.0122, 'grad_norm': 0.29711803793907166, 'learning_rate': 1.507786416380659e-05, 'epoch': 1.61}


                                                      
 13%|█▎        | 6278/47590 [1:29:53<22:13, 30.97it/s]

{'embedding_loss': 0.0129, 'grad_norm': 0.26900044083595276, 'learning_rate': 1.5054516588452291e-05, 'epoch': 1.61}


                                                      
 13%|█▎        | 6278/47590 [1:29:54<22:13, 30.97it/s]


{'embedding_loss': 0.0116, 'grad_norm': 0.6774003505706787, 'learning_rate': 1.503116901309799e-05, 'epoch': 1.62}


                                                      [A
 13%|█▎        | 6278/47590 [1:29:54<22:13, 30.97it/s]


{'embedding_loss': 0.0149, 'grad_norm': 0.49902600049972534, 'learning_rate': 1.5007821437743693e-05, 'epoch': 1.62}


                                                      [A
 13%|█▎        | 6278/47590 [1:29:55<22:13, 30.97it/s]

{'embedding_loss': 0.0126, 'grad_norm': 0.34163898229599, 'learning_rate': 1.4984473862389393e-05, 'epoch': 1.63}


                                                      
 13%|█▎        | 6278/47590 [1:29:57<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.15747229754924774, 'learning_rate': 1.4961126287035093e-05, 'epoch': 1.63}


                                                      
 13%|█▎        | 6278/47590 [1:29:58<22:13, 30.97it/s]

{'embedding_loss': 0.0105, 'grad_norm': 0.2889285981655121, 'learning_rate': 1.4937778711680793e-05, 'epoch': 1.64}


                                                      
 13%|█▎        | 6278/47590 [1:29:59<22:13, 30.97it/s]

{'embedding_loss': 0.0134, 'grad_norm': 0.5062015652656555, 'learning_rate': 1.4914431136326492e-05, 'epoch': 1.64}


                                                      
 13%|█▎        | 6278/47590 [1:30:00<22:13, 30.97it/s]

{'embedding_loss': 0.0115, 'grad_norm': 0.16255253553390503, 'learning_rate': 1.4891083560972195e-05, 'epoch': 1.65}


                                                      
 13%|█▎        | 6278/47590 [1:30:01<22:13, 30.97it/s]

{'embedding_loss': 0.0108, 'grad_norm': 0.3017946779727936, 'learning_rate': 1.4867735985617896e-05, 'epoch': 1.65}


                                                      
 13%|█▎        | 6278/47590 [1:30:02<22:13, 30.97it/s]


{'embedding_loss': 0.011, 'grad_norm': 0.16397887468338013, 'learning_rate': 1.4844388410263595e-05, 'epoch': 1.66}


                                                      [A
 13%|█▎        | 6278/47590 [1:30:03<22:13, 30.97it/s]

{'embedding_loss': 0.0118, 'grad_norm': 0.5087998509407043, 'learning_rate': 1.4821040834909296e-05, 'epoch': 1.67}


                                                      
 13%|█▎        | 6278/47590 [1:30:04<22:13, 30.97it/s]

{'embedding_loss': 0.0113, 'grad_norm': 0.1941063404083252, 'learning_rate': 1.4797693259554996e-05, 'epoch': 1.67}


                                                      
 13%|█▎        | 6278/47590 [1:30:05<22:13, 30.97it/s]

{'embedding_loss': 0.0097, 'grad_norm': 0.34090131521224976, 'learning_rate': 1.4774345684200697e-05, 'epoch': 1.68}


                                                      
 13%|█▎        | 6278/47590 [1:30:05<22:13, 30.97it/s]

{'embedding_loss': 0.0112, 'grad_norm': 0.8247016668319702, 'learning_rate': 1.4750998108846398e-05, 'epoch': 1.68}


                                                      
 13%|█▎        | 6278/47590 [1:30:07<22:13, 30.97it/s]

{'embedding_loss': 0.0099, 'grad_norm': 1.5354862213134766, 'learning_rate': 1.4727650533492097e-05, 'epoch': 1.69}


                                                      
 13%|█▎        | 6278/47590 [1:30:08<22:13, 30.97it/s]

{'embedding_loss': 0.0098, 'grad_norm': 0.33543258905410767, 'learning_rate': 1.47043029581378e-05, 'epoch': 1.69}


                                                      
 13%|█▎        | 6278/47590 [1:30:09<22:13, 30.97it/s]


{'embedding_loss': 0.0105, 'grad_norm': 0.27672359347343445, 'learning_rate': 1.46809553827835e-05, 'epoch': 1.7}


                                                      [A
 13%|█▎        | 6278/47590 [1:30:10<22:13, 30.97it/s]

{'embedding_loss': 0.0131, 'grad_norm': 0.3048015236854553, 'learning_rate': 1.46576078074292e-05, 'epoch': 1.7}


                                                      
 13%|█▎        | 6278/47590 [1:30:11<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.14099730551242828, 'learning_rate': 1.46342602320749e-05, 'epoch': 1.71}


                                                      
 13%|█▎        | 6278/47590 [1:30:12<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.11101043224334717, 'learning_rate': 1.4610912656720601e-05, 'epoch': 1.71}


                                                      
 13%|█▎        | 6278/47590 [1:30:13<22:13, 30.97it/s]


{'embedding_loss': 0.0084, 'grad_norm': 0.9129751920700073, 'learning_rate': 1.4587565081366302e-05, 'epoch': 1.72}


                                                      [A
 13%|█▎        | 6278/47590 [1:30:14<22:13, 30.97it/s]

{'embedding_loss': 0.0132, 'grad_norm': 0.14357399940490723, 'learning_rate': 1.4564217506012002e-05, 'epoch': 1.72}


                                                      
 13%|█▎        | 6278/47590 [1:30:15<22:13, 30.97it/s]

{'embedding_loss': 0.0113, 'grad_norm': 0.35426029562950134, 'learning_rate': 1.4540869930657702e-05, 'epoch': 1.73}


                                                      
 13%|█▎        | 6278/47590 [1:30:16<22:13, 30.97it/s]

{'embedding_loss': 0.0087, 'grad_norm': 0.19925174117088318, 'learning_rate': 1.4517522355303404e-05, 'epoch': 1.73}


                                                      
 13%|█▎        | 6278/47590 [1:30:17<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.49921172857284546, 'learning_rate': 1.4494174779949103e-05, 'epoch': 1.74}


                                                      
 13%|█▎        | 6278/47590 [1:30:18<22:13, 30.97it/s]

{'embedding_loss': 0.0134, 'grad_norm': 0.700186014175415, 'learning_rate': 1.4470827204594804e-05, 'epoch': 1.74}


                                                      
 13%|█▎        | 6278/47590 [1:30:19<22:13, 30.97it/s]


{'embedding_loss': 0.0094, 'grad_norm': 1.1186963319778442, 'learning_rate': 1.4447479629240505e-05, 'epoch': 1.75}


                                                      [A
 13%|█▎        | 6278/47590 [1:30:20<22:13, 30.97it/s]

{'embedding_loss': 0.009, 'grad_norm': 0.19154515862464905, 'learning_rate': 1.4424132053886205e-05, 'epoch': 1.75}


                                                      
 13%|█▎        | 6278/47590 [1:30:21<22:13, 30.97it/s]

{'embedding_loss': 0.0144, 'grad_norm': 0.5009361505508423, 'learning_rate': 1.4400784478531906e-05, 'epoch': 1.76}


                                                      
 13%|█▎        | 6278/47590 [1:30:22<22:13, 30.97it/s]

{'embedding_loss': 0.0089, 'grad_norm': 0.37663209438323975, 'learning_rate': 1.4377436903177607e-05, 'epoch': 1.77}


                                                      
 13%|█▎        | 6278/47590 [1:30:23<22:13, 30.97it/s]

{'embedding_loss': 0.0105, 'grad_norm': 0.20627713203430176, 'learning_rate': 1.4354089327823306e-05, 'epoch': 1.77}


                                                      
 13%|█▎        | 6278/47590 [1:30:24<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.7352363467216492, 'learning_rate': 1.4330741752469007e-05, 'epoch': 1.78}


                                                      
 13%|█▎        | 6278/47590 [1:30:25<22:13, 30.97it/s]

{'embedding_loss': 0.0113, 'grad_norm': 0.11419668793678284, 'learning_rate': 1.4307394177114708e-05, 'epoch': 1.78}


                                                      
 13%|█▎        | 6278/47590 [1:30:26<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.43224528431892395, 'learning_rate': 1.4284046601760408e-05, 'epoch': 1.79}


                                                      
 13%|█▎        | 6278/47590 [1:30:27<22:13, 30.97it/s]

{'embedding_loss': 0.0102, 'grad_norm': 0.6685012578964233, 'learning_rate': 1.426069902640611e-05, 'epoch': 1.79}


                                                      
 13%|█▎        | 6278/47590 [1:30:28<22:13, 30.97it/s]


{'embedding_loss': 0.0108, 'grad_norm': 0.6587611436843872, 'learning_rate': 1.4237351451051808e-05, 'epoch': 1.8}


                                                      [A
 13%|█▎        | 6278/47590 [1:30:29<22:13, 30.97it/s]


{'embedding_loss': 0.009, 'grad_norm': 1.0089632272720337, 'learning_rate': 1.421400387569751e-05, 'epoch': 1.8}


                                                      [A
 13%|█▎        | 6278/47590 [1:30:30<22:13, 30.97it/s]

{'embedding_loss': 0.012, 'grad_norm': 0.47520673274993896, 'learning_rate': 1.419065630034321e-05, 'epoch': 1.81}


                                                      
 13%|█▎        | 6278/47590 [1:30:31<22:13, 30.97it/s]

{'embedding_loss': 0.0106, 'grad_norm': 0.2508830726146698, 'learning_rate': 1.416730872498891e-05, 'epoch': 1.81}


                                                      
 13%|█▎        | 6278/47590 [1:30:32<22:13, 30.97it/s]

{'embedding_loss': 0.0088, 'grad_norm': 0.10318933427333832, 'learning_rate': 1.4143961149634611e-05, 'epoch': 1.82}


                                                      
 13%|█▎        | 6278/47590 [1:30:33<22:13, 30.97it/s]

{'embedding_loss': 0.0125, 'grad_norm': 0.44414204359054565, 'learning_rate': 1.4120613574280312e-05, 'epoch': 1.82}


                                                      
 13%|█▎        | 6278/47590 [1:30:34<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.36311015486717224, 'learning_rate': 1.4097265998926013e-05, 'epoch': 1.83}


                                                      
 13%|█▎        | 6278/47590 [1:30:35<22:13, 30.97it/s]

{'embedding_loss': 0.0117, 'grad_norm': 0.14724360406398773, 'learning_rate': 1.4073918423571714e-05, 'epoch': 1.83}


                                                      
 13%|█▎        | 6278/47590 [1:30:36<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 0.15522532165050507, 'learning_rate': 1.4050570848217413e-05, 'epoch': 1.84}


                                                      
 13%|█▎        | 6278/47590 [1:30:37<22:13, 30.97it/s]

{'embedding_loss': 0.0078, 'grad_norm': 0.12591104209423065, 'learning_rate': 1.4027223272863115e-05, 'epoch': 1.84}


                                                      
 13%|█▎        | 6278/47590 [1:30:38<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.46671295166015625, 'learning_rate': 1.4003875697508815e-05, 'epoch': 1.85}


                                                      
 13%|█▎        | 6278/47590 [1:30:39<22:13, 30.97it/s]

{'embedding_loss': 0.0099, 'grad_norm': 0.36652928590774536, 'learning_rate': 1.3980528122154515e-05, 'epoch': 1.85}


                                                      
 13%|█▎        | 6278/47590 [1:30:40<22:13, 30.97it/s]

{'embedding_loss': 0.0073, 'grad_norm': 0.18129776418209076, 'learning_rate': 1.3957180546800216e-05, 'epoch': 1.86}


                                                      
 13%|█▎        | 6278/47590 [1:30:41<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 1.244836449623108, 'learning_rate': 1.3933832971445917e-05, 'epoch': 1.86}


                                                      
 13%|█▎        | 6278/47590 [1:30:42<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.22189317643642426, 'learning_rate': 1.3910485396091618e-05, 'epoch': 1.87}


                                                      
 13%|█▎        | 6278/47590 [1:30:43<22:13, 30.97it/s]

{'embedding_loss': 0.0115, 'grad_norm': 0.5856582522392273, 'learning_rate': 1.3887137820737317e-05, 'epoch': 1.88}


                                                      
 13%|█▎        | 6278/47590 [1:30:44<22:13, 30.97it/s]


{'embedding_loss': 0.0119, 'grad_norm': 0.22804635763168335, 'learning_rate': 1.3863790245383018e-05, 'epoch': 1.88}


                                                      [A
 13%|█▎        | 6278/47590 [1:30:45<22:13, 30.97it/s]

{'embedding_loss': 0.0116, 'grad_norm': 0.38909676671028137, 'learning_rate': 1.3840442670028718e-05, 'epoch': 1.89}


                                                      
 13%|█▎        | 6278/47590 [1:30:46<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.7584103941917419, 'learning_rate': 1.3817095094674419e-05, 'epoch': 1.89}


                                                      
 13%|█▎        | 6278/47590 [1:30:48<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.20538246631622314, 'learning_rate': 1.379374751932012e-05, 'epoch': 1.9}


                                                      
 13%|█▎        | 6278/47590 [1:30:49<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.21346105635166168, 'learning_rate': 1.377039994396582e-05, 'epoch': 1.9}


                                                      
 13%|█▎        | 6278/47590 [1:30:50<22:13, 30.97it/s]

{'embedding_loss': 0.0095, 'grad_norm': 0.12126076221466064, 'learning_rate': 1.374705236861152e-05, 'epoch': 1.91}


                                                      
 13%|█▎        | 6278/47590 [1:30:51<22:13, 30.97it/s]

{'embedding_loss': 0.0113, 'grad_norm': 0.202759250998497, 'learning_rate': 1.3723704793257222e-05, 'epoch': 1.91}


                                                      
 13%|█▎        | 6278/47590 [1:30:52<22:13, 30.97it/s]

{'embedding_loss': 0.0108, 'grad_norm': 0.2834916412830353, 'learning_rate': 1.3700357217902921e-05, 'epoch': 1.92}


                                                      
 13%|█▎        | 6278/47590 [1:30:53<22:13, 30.97it/s]

{'embedding_loss': 0.0105, 'grad_norm': 0.6621257662773132, 'learning_rate': 1.3677009642548622e-05, 'epoch': 1.92}


                                                      
 13%|█▎        | 6278/47590 [1:30:54<22:13, 30.97it/s]

{'embedding_loss': 0.0107, 'grad_norm': 0.9776448607444763, 'learning_rate': 1.3653662067194323e-05, 'epoch': 1.93}


                                                      
 13%|█▎        | 6278/47590 [1:30:55<22:13, 30.97it/s]

{'embedding_loss': 0.0116, 'grad_norm': 0.24096880853176117, 'learning_rate': 1.3630314491840024e-05, 'epoch': 1.93}


                                                      
 13%|█▎        | 6278/47590 [1:30:56<22:13, 30.97it/s]


{'embedding_loss': 0.0093, 'grad_norm': 0.38548263907432556, 'learning_rate': 1.3606966916485724e-05, 'epoch': 1.94}


                                                      [A
 13%|█▎        | 6278/47590 [1:30:57<22:13, 30.97it/s]

{'embedding_loss': 0.0086, 'grad_norm': 0.4374406039714813, 'learning_rate': 1.3583619341131424e-05, 'epoch': 1.94}


                                                      
 13%|█▎        | 6278/47590 [1:30:58<22:13, 30.97it/s]

{'embedding_loss': 0.01, 'grad_norm': 0.588022768497467, 'learning_rate': 1.3560271765777124e-05, 'epoch': 1.95}


                                                      
 13%|█▎        | 6278/47590 [1:30:59<22:13, 30.97it/s]

{'embedding_loss': 0.0087, 'grad_norm': 0.33761146664619446, 'learning_rate': 1.3536924190422827e-05, 'epoch': 1.95}


                                                      
 13%|█▎        | 6278/47590 [1:31:00<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 0.6281702518463135, 'learning_rate': 1.3513576615068526e-05, 'epoch': 1.96}


                                                      
 13%|█▎        | 6278/47590 [1:31:01<22:13, 30.97it/s]

{'embedding_loss': 0.0094, 'grad_norm': 0.06965374201536179, 'learning_rate': 1.3490229039714227e-05, 'epoch': 1.96}


                                                      
 13%|█▎        | 6278/47590 [1:31:02<22:13, 30.97it/s]

{'embedding_loss': 0.0107, 'grad_norm': 0.03910152241587639, 'learning_rate': 1.3466881464359926e-05, 'epoch': 1.97}


                                                      
 13%|█▎        | 6278/47590 [1:31:03<22:13, 30.97it/s]

{'embedding_loss': 0.0096, 'grad_norm': 0.056554291397333145, 'learning_rate': 1.3443533889005628e-05, 'epoch': 1.98}


                                                      
 13%|█▎        | 6278/47590 [1:31:04<22:13, 30.97it/s]

{'embedding_loss': 0.0084, 'grad_norm': 0.3955976068973541, 'learning_rate': 1.3420186313651329e-05, 'epoch': 1.98}


                                                      
 13%|█▎        | 6278/47590 [1:31:05<22:13, 30.97it/s]

{'embedding_loss': 0.0093, 'grad_norm': 0.0647207573056221, 'learning_rate': 1.3396838738297028e-05, 'epoch': 1.99}


                                                      
 13%|█▎        | 6278/47590 [1:31:06<22:13, 30.97it/s]

{'embedding_loss': 0.0088, 'grad_norm': 0.6293537020683289, 'learning_rate': 1.3373491162942729e-05, 'epoch': 1.99}


                                                      
 13%|█▎        | 6278/47590 [1:31:07<22:13, 30.97it/s]

{'embedding_loss': 0.0104, 'grad_norm': 0.09112020581960678, 'learning_rate': 1.3350143587588431e-05, 'epoch': 2.0}


                                                      
 13%|█▎        | 6278/47590 [1:31:09<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.07473891973495483, 'learning_rate': 1.332679601223413e-05, 'epoch': 2.0}


                                                      
 13%|█▎        | 6278/47590 [1:31:10<22:13, 30.97it/s]

{'embedding_loss': 0.0089, 'grad_norm': 0.5710964798927307, 'learning_rate': 1.3303448436879831e-05, 'epoch': 2.01}


                                                      
 13%|█▎        | 6278/47590 [1:31:11<22:13, 30.97it/s]

{'embedding_loss': 0.0084, 'grad_norm': 0.625309407711029, 'learning_rate': 1.328010086152553e-05, 'epoch': 2.01}


                                                      
 13%|█▎        | 6278/47590 [1:31:12<22:13, 30.97it/s]

{'embedding_loss': 0.0103, 'grad_norm': 0.13311301171779633, 'learning_rate': 1.3256753286171231e-05, 'epoch': 2.02}


                                                      
 13%|█▎        | 6278/47590 [1:31:13<22:13, 30.97it/s]

{'embedding_loss': 0.0069, 'grad_norm': 0.10462882369756699, 'learning_rate': 1.3233405710816934e-05, 'epoch': 2.02}


                                                      
 13%|█▎        | 6278/47590 [1:31:14<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.645035445690155, 'learning_rate': 1.3210058135462633e-05, 'epoch': 2.03}


                                                      
 13%|█▎        | 6278/47590 [1:31:15<22:13, 30.97it/s]


{'embedding_loss': 0.0084, 'grad_norm': 0.39564311504364014, 'learning_rate': 1.3186710560108333e-05, 'epoch': 2.03}


                                                      [A
 13%|█▎        | 6278/47590 [1:31:16<22:13, 30.97it/s]

{'embedding_loss': 0.0105, 'grad_norm': 0.723076581954956, 'learning_rate': 1.3163362984754033e-05, 'epoch': 2.04}


                                                      
 13%|█▎        | 6278/47590 [1:31:17<22:13, 30.97it/s]

{'embedding_loss': 0.0089, 'grad_norm': 0.1434125453233719, 'learning_rate': 1.3140015409399735e-05, 'epoch': 2.04}


                                                      
 13%|█▎        | 6278/47590 [1:31:18<22:13, 30.97it/s]

{'embedding_loss': 0.0089, 'grad_norm': 0.34633156657218933, 'learning_rate': 1.3116667834045436e-05, 'epoch': 2.05}


                                                      
 13%|█▎        | 6278/47590 [1:31:20<22:13, 30.97it/s]


{'embedding_loss': 0.0092, 'grad_norm': 0.2620013654232025, 'learning_rate': 1.3093320258691135e-05, 'epoch': 2.05}


                                                      [A
 13%|█▎        | 6278/47590 [1:31:21<22:13, 30.97it/s]

{'embedding_loss': 0.0067, 'grad_norm': 0.12507270276546478, 'learning_rate': 1.3069972683336836e-05, 'epoch': 2.06}


                                                      
 13%|█▎        | 6278/47590 [1:31:22<22:13, 30.97it/s]

{'embedding_loss': 0.0095, 'grad_norm': 0.5207775235176086, 'learning_rate': 1.3046625107982538e-05, 'epoch': 2.06}


                                                      
 13%|█▎        | 6278/47590 [1:31:23<22:13, 30.97it/s]

{'embedding_loss': 0.0112, 'grad_norm': 0.19971925020217896, 'learning_rate': 1.3023277532628237e-05, 'epoch': 2.07}


                                                      
 13%|█▎        | 6278/47590 [1:31:23<22:13, 30.97it/s]

{'embedding_loss': 0.0075, 'grad_norm': 0.13507197797298431, 'learning_rate': 1.2999929957273938e-05, 'epoch': 2.08}


                                                      
 13%|█▎        | 6278/47590 [1:31:24<22:13, 30.97it/s]

{'embedding_loss': 0.0091, 'grad_norm': 0.42125868797302246, 'learning_rate': 1.2976582381919637e-05, 'epoch': 2.08}


                                                      
 13%|█▎        | 6278/47590 [1:31:25<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.3322136700153351, 'learning_rate': 1.295323480656534e-05, 'epoch': 2.09}


                                                      
 13%|█▎        | 6278/47590 [1:31:26<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.07873092591762543, 'learning_rate': 1.292988723121104e-05, 'epoch': 2.09}


                                                      
 13%|█▎        | 6278/47590 [1:31:27<22:13, 30.97it/s]

{'embedding_loss': 0.0092, 'grad_norm': 0.3713534474372864, 'learning_rate': 1.290653965585674e-05, 'epoch': 2.1}


                                                      
 13%|█▎        | 6278/47590 [1:31:28<22:13, 30.97it/s]

{'embedding_loss': 0.0106, 'grad_norm': 0.26578861474990845, 'learning_rate': 1.288319208050244e-05, 'epoch': 2.1}


                                                      
 13%|█▎        | 6278/47590 [1:31:30<22:13, 30.97it/s]


{'embedding_loss': 0.0075, 'grad_norm': 0.08814579248428345, 'learning_rate': 1.2859844505148143e-05, 'epoch': 2.11}


                                                      [A
 13%|█▎        | 6278/47590 [1:31:31<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.06445600092411041, 'learning_rate': 1.2836496929793842e-05, 'epoch': 2.11}


                                                      
 13%|█▎        | 6278/47590 [1:31:32<22:13, 30.97it/s]


{'embedding_loss': 0.0073, 'grad_norm': 1.0479310750961304, 'learning_rate': 1.2813149354439543e-05, 'epoch': 2.12}


                                                      [A
 13%|█▎        | 6278/47590 [1:31:33<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 0.13176552951335907, 'learning_rate': 1.2789801779085242e-05, 'epoch': 2.12}


                                                      
 13%|█▎        | 6278/47590 [1:31:34<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 0.07977303117513657, 'learning_rate': 1.2766454203730944e-05, 'epoch': 2.13}


                                                      
 13%|█▎        | 6278/47590 [1:31:35<22:13, 30.97it/s]


{'embedding_loss': 0.0114, 'grad_norm': 0.1645066738128662, 'learning_rate': 1.2743106628376645e-05, 'epoch': 2.13}


                                                      [A
 13%|█▎        | 6278/47590 [1:31:36<22:13, 30.97it/s]

{'embedding_loss': 0.0093, 'grad_norm': 0.17071260511875153, 'learning_rate': 1.2719759053022344e-05, 'epoch': 2.14}


                                                      
 13%|█▎        | 6278/47590 [1:31:37<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.39886829257011414, 'learning_rate': 1.2696411477668045e-05, 'epoch': 2.14}


                                                      
 13%|█▎        | 6278/47590 [1:31:38<22:13, 30.97it/s]

{'embedding_loss': 0.0092, 'grad_norm': 0.5607883930206299, 'learning_rate': 1.2673063902313744e-05, 'epoch': 2.15}


                                                      
 13%|█▎        | 6278/47590 [1:31:38<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.10196885466575623, 'learning_rate': 1.2649716326959446e-05, 'epoch': 2.15}


                                                      
 13%|█▎        | 6278/47590 [1:31:40<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.5591272115707397, 'learning_rate': 1.2626368751605147e-05, 'epoch': 2.16}


                                                      
 13%|█▎        | 6278/47590 [1:31:41<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.3155018985271454, 'learning_rate': 1.2603021176250846e-05, 'epoch': 2.16}


                                                      
 13%|█▎        | 6278/47590 [1:31:42<22:13, 30.97it/s]


{'embedding_loss': 0.006, 'grad_norm': 0.6941565275192261, 'learning_rate': 1.2579673600896547e-05, 'epoch': 2.17}


                                                      [A
 13%|█▎        | 6278/47590 [1:31:43<22:13, 30.97it/s]

{'embedding_loss': 0.0093, 'grad_norm': 0.15057645738124847, 'learning_rate': 1.255632602554225e-05, 'epoch': 2.17}


                                                      
 13%|█▎        | 6278/47590 [1:31:44<22:13, 30.97it/s]


{'embedding_loss': 0.0091, 'grad_norm': 0.13044528663158417, 'learning_rate': 1.2532978450187949e-05, 'epoch': 2.18}


                                                      [A
 13%|█▎        | 6278/47590 [1:31:45<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.7268871068954468, 'learning_rate': 1.250963087483365e-05, 'epoch': 2.19}


                                                      
 13%|█▎        | 6278/47590 [1:31:46<22:13, 30.97it/s]

{'embedding_loss': 0.0099, 'grad_norm': 0.5977255702018738, 'learning_rate': 1.2486283299479349e-05, 'epoch': 2.19}


                                                      
 13%|█▎        | 6278/47590 [1:31:47<22:13, 30.97it/s]

{'embedding_loss': 0.0073, 'grad_norm': 0.8390213847160339, 'learning_rate': 1.2462935724125051e-05, 'epoch': 2.2}


                                                      
 13%|█▎        | 6278/47590 [1:31:47<22:13, 30.97it/s]

{'embedding_loss': 0.0096, 'grad_norm': 0.7141976952552795, 'learning_rate': 1.2439588148770752e-05, 'epoch': 2.2}


                                                      
 13%|█▎        | 6278/47590 [1:31:48<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.3715837299823761, 'learning_rate': 1.2416240573416451e-05, 'epoch': 2.21}


                                                      
 13%|█▎        | 6278/47590 [1:31:50<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.1074926033616066, 'learning_rate': 1.2392892998062152e-05, 'epoch': 2.21}


                                                      
 13%|█▎        | 6278/47590 [1:31:51<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.17373786866664886, 'learning_rate': 1.2369545422707854e-05, 'epoch': 2.22}


                                                      
 13%|█▎        | 6278/47590 [1:31:52<22:13, 30.97it/s]

{'embedding_loss': 0.0068, 'grad_norm': 0.3343265950679779, 'learning_rate': 1.2346197847353553e-05, 'epoch': 2.22}


                                                      
 13%|█▎        | 6278/47590 [1:31:53<22:13, 30.97it/s]

{'embedding_loss': 0.0098, 'grad_norm': 0.14496822655200958, 'learning_rate': 1.2322850271999254e-05, 'epoch': 2.23}


                                                      
 13%|█▎        | 6278/47590 [1:31:54<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 0.05743272975087166, 'learning_rate': 1.2299502696644953e-05, 'epoch': 2.23}


                                                      
 13%|█▎        | 6278/47590 [1:31:55<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.15692520141601562, 'learning_rate': 1.2276155121290656e-05, 'epoch': 2.24}


                                                      
 13%|█▎        | 6278/47590 [1:31:56<22:13, 30.97it/s]


{'embedding_loss': 0.0101, 'grad_norm': 0.46174731850624084, 'learning_rate': 1.2252807545936356e-05, 'epoch': 2.24}


                                                      [A
 13%|█▎        | 6278/47590 [1:31:57<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.09330878406763077, 'learning_rate': 1.2229459970582055e-05, 'epoch': 2.25}


                                                      
 13%|█▎        | 6278/47590 [1:31:57<22:13, 30.97it/s]

{'embedding_loss': 0.0099, 'grad_norm': 0.6719987988471985, 'learning_rate': 1.2206112395227756e-05, 'epoch': 2.25}


                                                      
 13%|█▎        | 6278/47590 [1:31:58<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.10494262725114822, 'learning_rate': 1.2182764819873455e-05, 'epoch': 2.26}


                                                      
 13%|█▎        | 6278/47590 [1:32:00<22:13, 30.97it/s]

{'embedding_loss': 0.0087, 'grad_norm': 0.5609235167503357, 'learning_rate': 1.2159417244519158e-05, 'epoch': 2.26}


                                                      
 13%|█▎        | 6278/47590 [1:32:01<22:13, 30.97it/s]

{'embedding_loss': 0.0079, 'grad_norm': 0.06003369390964508, 'learning_rate': 1.2136069669164859e-05, 'epoch': 2.27}


                                                      
 13%|█▎        | 6278/47590 [1:32:02<22:13, 30.97it/s]

{'embedding_loss': 0.0091, 'grad_norm': 0.0547468326985836, 'learning_rate': 1.2112722093810558e-05, 'epoch': 2.27}


                                                      
 13%|█▎        | 6278/47590 [1:32:03<22:13, 30.97it/s]


{'embedding_loss': 0.009, 'grad_norm': 0.3662881851196289, 'learning_rate': 1.2089374518456258e-05, 'epoch': 2.28}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:03<22:13, 30.97it/s]

{'embedding_loss': 0.0079, 'grad_norm': 0.23852583765983582, 'learning_rate': 1.2066026943101961e-05, 'epoch': 2.29}


                                                      
 13%|█▎        | 6278/47590 [1:32:04<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.2353706806898117, 'learning_rate': 1.204267936774766e-05, 'epoch': 2.29}


                                                      
 13%|█▎        | 6278/47590 [1:32:05<22:13, 30.97it/s]

{'embedding_loss': 0.0091, 'grad_norm': 0.2930249869823456, 'learning_rate': 1.201933179239336e-05, 'epoch': 2.3}


                                                      
 13%|█▎        | 6278/47590 [1:32:06<22:13, 30.97it/s]

{'embedding_loss': 0.0078, 'grad_norm': 0.6276586055755615, 'learning_rate': 1.199598421703906e-05, 'epoch': 2.3}


                                                      
 13%|█▎        | 6278/47590 [1:32:07<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.036735519766807556, 'learning_rate': 1.1972636641684762e-05, 'epoch': 2.31}


                                                      
 13%|█▎        | 6278/47590 [1:32:08<22:13, 30.97it/s]

{'embedding_loss': 0.0066, 'grad_norm': 0.1609647125005722, 'learning_rate': 1.1949289066330463e-05, 'epoch': 2.31}


                                                      
 13%|█▎        | 6278/47590 [1:32:10<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.08443620055913925, 'learning_rate': 1.1925941490976162e-05, 'epoch': 2.32}


                                                      
 13%|█▎        | 6278/47590 [1:32:11<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.08361512422561646, 'learning_rate': 1.1902593915621863e-05, 'epoch': 2.32}


                                                      
 13%|█▎        | 6278/47590 [1:32:11<22:13, 30.97it/s]

{'embedding_loss': 0.0102, 'grad_norm': 0.6803467869758606, 'learning_rate': 1.1879246340267566e-05, 'epoch': 2.33}


                                                      
 13%|█▎        | 6278/47590 [1:32:12<22:13, 30.97it/s]


{'embedding_loss': 0.006, 'grad_norm': 0.038790151476860046, 'learning_rate': 1.1855898764913265e-05, 'epoch': 2.33}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:13<22:13, 30.97it/s]


{'embedding_loss': 0.0084, 'grad_norm': 0.16833704710006714, 'learning_rate': 1.1832551189558965e-05, 'epoch': 2.34}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:14<22:13, 30.97it/s]

{'embedding_loss': 0.0093, 'grad_norm': 0.2400016188621521, 'learning_rate': 1.1809203614204664e-05, 'epoch': 2.34}


                                                      
 13%|█▎        | 6278/47590 [1:32:15<22:13, 30.97it/s]

{'embedding_loss': 0.0097, 'grad_norm': 0.06850192695856094, 'learning_rate': 1.1785856038850367e-05, 'epoch': 2.35}


                                                      
 13%|█▎        | 6278/47590 [1:32:16<22:13, 30.97it/s]

{'embedding_loss': 0.0076, 'grad_norm': 0.46785226464271545, 'learning_rate': 1.1762508463496068e-05, 'epoch': 2.35}


                                                      
 13%|█▎        | 6278/47590 [1:32:17<22:13, 30.97it/s]


{'embedding_loss': 0.0078, 'grad_norm': 0.31644895672798157, 'learning_rate': 1.1739160888141767e-05, 'epoch': 2.36}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:18<22:13, 30.97it/s]

{'embedding_loss': 0.0093, 'grad_norm': 0.6464974880218506, 'learning_rate': 1.1715813312787468e-05, 'epoch': 2.36}


                                                      
 13%|█▎        | 6278/47590 [1:32:20<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.06853054463863373, 'learning_rate': 1.169246573743317e-05, 'epoch': 2.37}


                                                      
 13%|█▎        | 6278/47590 [1:32:21<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.1685846596956253, 'learning_rate': 1.166911816207887e-05, 'epoch': 2.37}


                                                      
 13%|█▎        | 6278/47590 [1:32:22<22:13, 30.97it/s]

{'embedding_loss': 0.0066, 'grad_norm': 0.16633589565753937, 'learning_rate': 1.164577058672457e-05, 'epoch': 2.38}


                                                      
 13%|█▎        | 6278/47590 [1:32:23<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.7357817888259888, 'learning_rate': 1.1622423011370269e-05, 'epoch': 2.38}


                                                      
 13%|█▎        | 6278/47590 [1:32:24<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.2851371467113495, 'learning_rate': 1.159907543601597e-05, 'epoch': 2.39}


                                                      
 13%|█▎        | 6278/47590 [1:32:24<22:13, 30.97it/s]

{'embedding_loss': 0.0073, 'grad_norm': 0.0963108241558075, 'learning_rate': 1.1575727860661672e-05, 'epoch': 2.4}


                                                      
 13%|█▎        | 6278/47590 [1:32:25<22:13, 30.97it/s]


{'embedding_loss': 0.0058, 'grad_norm': 0.036502279341220856, 'learning_rate': 1.1552380285307371e-05, 'epoch': 2.4}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:26<22:13, 30.97it/s]

{'embedding_loss': 0.0098, 'grad_norm': 0.6282883882522583, 'learning_rate': 1.1529032709953072e-05, 'epoch': 2.41}


                                                      
 13%|█▎        | 6278/47590 [1:32:27<22:13, 30.97it/s]

{'embedding_loss': 0.0098, 'grad_norm': 0.16926199197769165, 'learning_rate': 1.1505685134598771e-05, 'epoch': 2.41}


                                                      
 13%|█▎        | 6278/47590 [1:32:28<22:13, 30.97it/s]

{'embedding_loss': 0.0099, 'grad_norm': 0.309984028339386, 'learning_rate': 1.1482337559244474e-05, 'epoch': 2.42}


                                                      
 13%|█▎        | 6278/47590 [1:32:30<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.0694134458899498, 'learning_rate': 1.1458989983890175e-05, 'epoch': 2.42}


                                                      
 13%|█▎        | 6278/47590 [1:32:31<22:13, 30.97it/s]


{'embedding_loss': 0.0075, 'grad_norm': 0.12730318307876587, 'learning_rate': 1.1435642408535874e-05, 'epoch': 2.43}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:32<22:13, 30.97it/s]


{'embedding_loss': 0.0081, 'grad_norm': 0.1905733346939087, 'learning_rate': 1.1412294833181574e-05, 'epoch': 2.43}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:33<22:13, 30.97it/s]

{'embedding_loss': 0.0073, 'grad_norm': 0.40198415517807007, 'learning_rate': 1.1388947257827277e-05, 'epoch': 2.44}


                                                      
 13%|█▎        | 6278/47590 [1:32:34<22:13, 30.97it/s]

{'embedding_loss': 0.0062, 'grad_norm': 0.5101521611213684, 'learning_rate': 1.1365599682472976e-05, 'epoch': 2.44}


                                                      
 13%|█▎        | 6278/47590 [1:32:35<22:13, 30.97it/s]

{'embedding_loss': 0.0084, 'grad_norm': 0.1866040825843811, 'learning_rate': 1.1342252107118677e-05, 'epoch': 2.45}


                                                      
 13%|█▎        | 6278/47590 [1:32:36<22:13, 30.97it/s]


{'embedding_loss': 0.0061, 'grad_norm': 0.28858762979507446, 'learning_rate': 1.1318904531764376e-05, 'epoch': 2.45}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:37<22:13, 30.97it/s]

{'embedding_loss': 0.0087, 'grad_norm': 0.18949265778064728, 'learning_rate': 1.1295556956410078e-05, 'epoch': 2.46}


                                                      
 13%|█▎        | 6278/47590 [1:32:38<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.5384265780448914, 'learning_rate': 1.1272209381055779e-05, 'epoch': 2.46}


                                                      
 13%|█▎        | 6278/47590 [1:32:39<22:13, 30.97it/s]

{'embedding_loss': 0.01, 'grad_norm': 0.05686647444963455, 'learning_rate': 1.1248861805701478e-05, 'epoch': 2.47}


                                                      
 13%|█▎        | 6278/47590 [1:32:41<22:13, 30.97it/s]

{'embedding_loss': 0.0076, 'grad_norm': 0.07140794396400452, 'learning_rate': 1.1225514230347179e-05, 'epoch': 2.47}


                                                      
 13%|█▎        | 6278/47590 [1:32:42<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.3562232255935669, 'learning_rate': 1.1202166654992881e-05, 'epoch': 2.48}


                                                      
 13%|█▎        | 6278/47590 [1:32:43<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.05435891076922417, 'learning_rate': 1.117881907963858e-05, 'epoch': 2.48}


                                                      
 13%|█▎        | 6278/47590 [1:32:44<22:13, 30.97it/s]

{'embedding_loss': 0.0078, 'grad_norm': 0.1448059231042862, 'learning_rate': 1.1155471504284281e-05, 'epoch': 2.49}


                                                      
 13%|█▎        | 6278/47590 [1:32:45<22:13, 30.97it/s]

{'embedding_loss': 0.0084, 'grad_norm': 0.19102735817432404, 'learning_rate': 1.113212392892998e-05, 'epoch': 2.5}


                                                      
 13%|█▎        | 6278/47590 [1:32:46<22:13, 30.97it/s]


{'embedding_loss': 0.009, 'grad_norm': 0.2708945572376251, 'learning_rate': 1.1108776353575683e-05, 'epoch': 2.5}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:47<22:13, 30.97it/s]

{'embedding_loss': 0.0079, 'grad_norm': 0.38059142231941223, 'learning_rate': 1.1085428778221384e-05, 'epoch': 2.51}


                                                      
 13%|█▎        | 6278/47590 [1:32:48<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.05889437347650528, 'learning_rate': 1.1062081202867083e-05, 'epoch': 2.51}


                                                      
 13%|█▎        | 6278/47590 [1:32:49<22:13, 30.97it/s]

{'embedding_loss': 0.0088, 'grad_norm': 0.7116144895553589, 'learning_rate': 1.1038733627512784e-05, 'epoch': 2.52}


                                                      
 13%|█▎        | 6278/47590 [1:32:50<22:13, 30.97it/s]

{'embedding_loss': 0.0091, 'grad_norm': 0.1214769035577774, 'learning_rate': 1.1015386052158483e-05, 'epoch': 2.52}


                                                      
 13%|█▎        | 6278/47590 [1:32:51<22:13, 30.97it/s]

{'embedding_loss': 0.0078, 'grad_norm': 0.14425449073314667, 'learning_rate': 1.0992038476804185e-05, 'epoch': 2.53}


                                                      
 13%|█▎        | 6278/47590 [1:32:52<22:13, 30.97it/s]


{'embedding_loss': 0.0076, 'grad_norm': 0.1243816465139389, 'learning_rate': 1.0968690901449886e-05, 'epoch': 2.53}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:53<22:13, 30.97it/s]


{'embedding_loss': 0.0066, 'grad_norm': 0.14414700865745544, 'learning_rate': 1.0945343326095585e-05, 'epoch': 2.54}


                                                      [A
 13%|█▎        | 6278/47590 [1:32:54<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.14688877761363983, 'learning_rate': 1.0921995750741286e-05, 'epoch': 2.54}


                                                      
 13%|█▎        | 6278/47590 [1:32:55<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.11527732759714127, 'learning_rate': 1.0898648175386988e-05, 'epoch': 2.55}


                                                      
 13%|█▎        | 6278/47590 [1:32:56<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.20508627593517303, 'learning_rate': 1.0875300600032687e-05, 'epoch': 2.55}


                                                      
 13%|█▎        | 6278/47590 [1:32:57<22:13, 30.97it/s]

{'embedding_loss': 0.0077, 'grad_norm': 0.3948826789855957, 'learning_rate': 1.0851953024678388e-05, 'epoch': 2.56}


                                                      
 13%|█▎        | 6278/47590 [1:32:58<22:13, 30.97it/s]

{'embedding_loss': 0.0069, 'grad_norm': 0.04617685824632645, 'learning_rate': 1.0828605449324087e-05, 'epoch': 2.56}


                                                      
 13%|█▎        | 6278/47590 [1:32:59<22:13, 30.97it/s]

{'embedding_loss': 0.0068, 'grad_norm': 0.4955039322376251, 'learning_rate': 1.080525787396979e-05, 'epoch': 2.57}


                                                      
 13%|█▎        | 6278/47590 [1:33:00<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.4583583474159241, 'learning_rate': 1.078191029861549e-05, 'epoch': 2.57}


                                                      
 13%|█▎        | 6278/47590 [1:33:01<22:13, 30.97it/s]

{'embedding_loss': 0.0085, 'grad_norm': 0.21264775097370148, 'learning_rate': 1.075856272326119e-05, 'epoch': 2.58}


                                                      
 13%|█▎        | 6278/47590 [1:33:02<22:13, 30.97it/s]

{'embedding_loss': 0.0095, 'grad_norm': 0.3738883435726166, 'learning_rate': 1.073521514790689e-05, 'epoch': 2.58}


                                                      
 13%|█▎        | 6278/47590 [1:33:03<22:13, 30.97it/s]

{'embedding_loss': 0.0066, 'grad_norm': 0.09619759768247604, 'learning_rate': 1.0711867572552593e-05, 'epoch': 2.59}


                                                      
 13%|█▎        | 6278/47590 [1:33:04<22:13, 30.97it/s]


{'embedding_loss': 0.0062, 'grad_norm': 0.2183266282081604, 'learning_rate': 1.0688519997198292e-05, 'epoch': 2.6}


                                                      [A
 13%|█▎        | 6278/47590 [1:33:05<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.5013205409049988, 'learning_rate': 1.0665172421843993e-05, 'epoch': 2.6}


                                                      
 13%|█▎        | 6278/47590 [1:33:06<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.03347420319914818, 'learning_rate': 1.0641824846489692e-05, 'epoch': 2.61}


                                                      
 13%|█▎        | 6278/47590 [1:33:07<22:13, 30.97it/s]


{'embedding_loss': 0.0078, 'grad_norm': 0.19079802930355072, 'learning_rate': 1.0618477271135394e-05, 'epoch': 2.61}


                                                      [A
 13%|█▎        | 6278/47590 [1:33:08<22:13, 30.97it/s]

{'embedding_loss': 0.0079, 'grad_norm': 0.26003241539001465, 'learning_rate': 1.0595129695781095e-05, 'epoch': 2.62}


                                                      
 13%|█▎        | 6278/47590 [1:33:09<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 0.13930447399616241, 'learning_rate': 1.0571782120426794e-05, 'epoch': 2.62}


                                                      
 13%|█▎        | 6278/47590 [1:33:10<22:13, 30.97it/s]

{'embedding_loss': 0.0083, 'grad_norm': 0.24130889773368835, 'learning_rate': 1.0548434545072495e-05, 'epoch': 2.63}


                                                      
 13%|█▎        | 6278/47590 [1:33:11<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.48822078108787537, 'learning_rate': 1.0525086969718194e-05, 'epoch': 2.63}


                                                      
 13%|█▎        | 6278/47590 [1:33:12<22:13, 30.97it/s]


{'embedding_loss': 0.006, 'grad_norm': 0.22658401727676392, 'learning_rate': 1.0501739394363897e-05, 'epoch': 2.64}


                                                      [A
 13%|█▎        | 6278/47590 [1:33:13<22:13, 30.97it/s]

{'embedding_loss': 0.0093, 'grad_norm': 0.08540920913219452, 'learning_rate': 1.0478391819009597e-05, 'epoch': 2.64}


                                                      
 13%|█▎        | 6278/47590 [1:33:14<22:13, 30.97it/s]

{'embedding_loss': 0.0083, 'grad_norm': 0.09418836236000061, 'learning_rate': 1.0455044243655296e-05, 'epoch': 2.65}


                                                      
 13%|█▎        | 6278/47590 [1:33:15<22:13, 30.97it/s]

{'embedding_loss': 0.0076, 'grad_norm': 0.10950678586959839, 'learning_rate': 1.0431696668300997e-05, 'epoch': 2.65}


                                                      
 13%|█▎        | 6278/47590 [1:33:16<22:13, 30.97it/s]

{'embedding_loss': 0.0091, 'grad_norm': 0.3434213101863861, 'learning_rate': 1.04083490929467e-05, 'epoch': 2.66}


                                                      
 13%|█▎        | 6278/47590 [1:33:17<22:13, 30.97it/s]

{'embedding_loss': 0.0068, 'grad_norm': 0.2020271271467209, 'learning_rate': 1.0385001517592399e-05, 'epoch': 2.66}


                                                      
 13%|█▎        | 6278/47590 [1:33:18<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 1.0949684381484985, 'learning_rate': 1.03616539422381e-05, 'epoch': 2.67}


                                                      
 13%|█▎        | 6278/47590 [1:33:19<22:13, 30.97it/s]


{'embedding_loss': 0.0076, 'grad_norm': 0.37692028284072876, 'learning_rate': 1.0338306366883799e-05, 'epoch': 2.67}


                                                      [A
 13%|█▎        | 6278/47590 [1:33:20<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.17130860686302185, 'learning_rate': 1.0314958791529501e-05, 'epoch': 2.68}


                                                      
 13%|█▎        | 6278/47590 [1:33:21<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.1917208731174469, 'learning_rate': 1.0291611216175202e-05, 'epoch': 2.68}


                                                      
 13%|█▎        | 6278/47590 [1:33:22<22:13, 30.97it/s]

{'embedding_loss': 0.0077, 'grad_norm': 0.06638646125793457, 'learning_rate': 1.0268263640820901e-05, 'epoch': 2.69}


                                                      
 13%|█▎        | 6278/47590 [1:33:23<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.16640788316726685, 'learning_rate': 1.0244916065466602e-05, 'epoch': 2.69}


                                                      
 13%|█▎        | 6278/47590 [1:33:24<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.05371525138616562, 'learning_rate': 1.0221568490112304e-05, 'epoch': 2.7}


                                                      
 13%|█▎        | 6278/47590 [1:33:25<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.5501933097839355, 'learning_rate': 1.0198220914758003e-05, 'epoch': 2.71}


                                                      
 13%|█▎        | 6278/47590 [1:33:26<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.06264945864677429, 'learning_rate': 1.0174873339403704e-05, 'epoch': 2.71}


                                                      
 13%|█▎        | 6278/47590 [1:33:27<22:13, 30.97it/s]

{'embedding_loss': 0.0041, 'grad_norm': 0.14302337169647217, 'learning_rate': 1.0151525764049403e-05, 'epoch': 2.72}


                                                      
 13%|█▎        | 6278/47590 [1:33:28<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.6173353791236877, 'learning_rate': 1.0128178188695106e-05, 'epoch': 2.72}


                                                      
 13%|█▎        | 6278/47590 [1:33:29<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.04145195335149765, 'learning_rate': 1.0104830613340806e-05, 'epoch': 2.73}


                                                      
 13%|█▎        | 6278/47590 [1:33:30<22:13, 30.97it/s]

{'embedding_loss': 0.0067, 'grad_norm': 0.24766424298286438, 'learning_rate': 1.0081483037986506e-05, 'epoch': 2.73}


                                                      
 13%|█▎        | 6278/47590 [1:33:31<22:13, 30.97it/s]


{'embedding_loss': 0.0055, 'grad_norm': 0.45639362931251526, 'learning_rate': 1.0058135462632206e-05, 'epoch': 2.74}


                                                      [A
 13%|█▎        | 6278/47590 [1:33:32<22:13, 30.97it/s]

{'embedding_loss': 0.0068, 'grad_norm': 0.09844937175512314, 'learning_rate': 1.0034787887277909e-05, 'epoch': 2.74}


                                                      
 13%|█▎        | 6278/47590 [1:33:33<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.05177716538310051, 'learning_rate': 1.0011440311923608e-05, 'epoch': 2.75}


                                                      
 13%|█▎        | 6278/47590 [1:33:34<22:13, 30.97it/s]


{'embedding_loss': 0.0083, 'grad_norm': 0.252584308385849, 'learning_rate': 9.988092736569309e-06, 'epoch': 2.75}


                                                      [A
 13%|█▎        | 6278/47590 [1:33:35<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.09188835322856903, 'learning_rate': 9.964745161215008e-06, 'epoch': 2.76}


                                                      
 13%|█▎        | 6278/47590 [1:33:36<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.12594768404960632, 'learning_rate': 9.941397585860709e-06, 'epoch': 2.76}


                                                      
 13%|█▎        | 6278/47590 [1:33:37<22:13, 30.97it/s]

{'embedding_loss': 0.0067, 'grad_norm': 0.8404479026794434, 'learning_rate': 9.91805001050641e-06, 'epoch': 2.77}


                                                      
 13%|█▎        | 6278/47590 [1:33:38<22:13, 30.97it/s]

{'embedding_loss': 0.0088, 'grad_norm': 0.10406196862459183, 'learning_rate': 9.89470243515211e-06, 'epoch': 2.77}


                                                      
 13%|█▎        | 6278/47590 [1:33:39<22:13, 30.97it/s]

{'embedding_loss': 0.0089, 'grad_norm': 0.09337922930717468, 'learning_rate': 9.871354859797811e-06, 'epoch': 2.78}


                                                      
 13%|█▎        | 6278/47590 [1:33:40<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.40855154395103455, 'learning_rate': 9.848007284443512e-06, 'epoch': 2.78}


                                                      
 13%|█▎        | 6278/47590 [1:33:42<22:13, 30.97it/s]

{'embedding_loss': 0.0075, 'grad_norm': 0.20036767423152924, 'learning_rate': 9.82465970908921e-06, 'epoch': 2.79}


                                                      
 13%|█▎        | 6278/47590 [1:33:43<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.23939792811870575, 'learning_rate': 9.801312133734913e-06, 'epoch': 2.79}


                                                      
 13%|█▎        | 6278/47590 [1:33:44<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.35375314950942993, 'learning_rate': 9.777964558380612e-06, 'epoch': 2.8}


                                                      
 13%|█▎        | 6278/47590 [1:33:44<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.510127604007721, 'learning_rate': 9.754616983026313e-06, 'epoch': 2.81}


                                                      
 13%|█▎        | 6278/47590 [1:33:45<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.05921976640820503, 'learning_rate': 9.731269407672014e-06, 'epoch': 2.81}


                                                      
 13%|█▎        | 6278/47590 [1:33:46<22:13, 30.97it/s]


{'embedding_loss': 0.0067, 'grad_norm': 0.2110278159379959, 'learning_rate': 9.707921832317715e-06, 'epoch': 2.82}


                                                      [A
 13%|█▎        | 6278/47590 [1:33:47<22:13, 30.97it/s]

{'embedding_loss': 0.0079, 'grad_norm': 0.06796673685312271, 'learning_rate': 9.684574256963415e-06, 'epoch': 2.82}


                                                      
 13%|█▎        | 6278/47590 [1:33:48<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.3090689480304718, 'learning_rate': 9.661226681609116e-06, 'epoch': 2.83}


                                                      
 13%|█▎        | 6278/47590 [1:33:49<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.10088330507278442, 'learning_rate': 9.637879106254815e-06, 'epoch': 2.83}


                                                      
 13%|█▎        | 6278/47590 [1:33:50<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.5218889713287354, 'learning_rate': 9.614531530900518e-06, 'epoch': 2.84}


                                                      
 13%|█▎        | 6278/47590 [1:33:52<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.32201725244522095, 'learning_rate': 9.591183955546217e-06, 'epoch': 2.84}


                                                      
 13%|█▎        | 6278/47590 [1:33:53<22:13, 30.97it/s]

{'embedding_loss': 0.0075, 'grad_norm': 0.2809394299983978, 'learning_rate': 9.567836380191918e-06, 'epoch': 2.85}


                                                      
 13%|█▎        | 6278/47590 [1:33:54<22:13, 30.97it/s]

{'embedding_loss': 0.0084, 'grad_norm': 0.3078750669956207, 'learning_rate': 9.544488804837618e-06, 'epoch': 2.85}


                                                      
 13%|█▎        | 6278/47590 [1:33:55<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.13123159110546112, 'learning_rate': 9.52114122948332e-06, 'epoch': 2.86}


                                                      
 13%|█▎        | 6278/47590 [1:33:56<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.054072681814432144, 'learning_rate': 9.49779365412902e-06, 'epoch': 2.86}


                                                      
 13%|█▎        | 6278/47590 [1:33:57<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.6730029582977295, 'learning_rate': 9.47444607877472e-06, 'epoch': 2.87}


                                                      
 13%|█▎        | 6278/47590 [1:33:58<22:13, 30.97it/s]

{'embedding_loss': 0.0062, 'grad_norm': 0.02137969620525837, 'learning_rate': 9.45109850342042e-06, 'epoch': 2.87}


                                                      
 13%|█▎        | 6278/47590 [1:33:58<22:13, 30.97it/s]

{'embedding_loss': 0.0068, 'grad_norm': 0.17647340893745422, 'learning_rate': 9.42775092806612e-06, 'epoch': 2.88}


                                                      
 13%|█▎        | 6278/47590 [1:33:59<22:13, 30.97it/s]

{'embedding_loss': 0.0091, 'grad_norm': 0.13303637504577637, 'learning_rate': 9.404403352711821e-06, 'epoch': 2.88}


                                                      
 13%|█▎        | 6278/47590 [1:34:00<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.17450697720050812, 'learning_rate': 9.381055777357522e-06, 'epoch': 2.89}


                                                      
 13%|█▎        | 6278/47590 [1:34:02<22:13, 30.97it/s]

{'embedding_loss': 0.0091, 'grad_norm': 0.12445362657308578, 'learning_rate': 9.357708202003223e-06, 'epoch': 2.89}


                                                      
 13%|█▎        | 6278/47590 [1:34:03<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.1387508064508438, 'learning_rate': 9.334360626648922e-06, 'epoch': 2.9}


                                                      
 13%|█▎        | 6278/47590 [1:34:04<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.39080479741096497, 'learning_rate': 9.311013051294625e-06, 'epoch': 2.91}


                                                      
 13%|█▎        | 6278/47590 [1:34:05<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.40082305669784546, 'learning_rate': 9.287665475940324e-06, 'epoch': 2.91}


                                                      
 13%|█▎        | 6278/47590 [1:34:06<22:13, 30.97it/s]

{'embedding_loss': 0.0043, 'grad_norm': 0.06828544288873672, 'learning_rate': 9.264317900586025e-06, 'epoch': 2.92}


                                                      
 13%|█▎        | 6278/47590 [1:34:07<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.03539152443408966, 'learning_rate': 9.240970325231725e-06, 'epoch': 2.92}


                                                      
 13%|█▎        | 6278/47590 [1:34:08<22:13, 30.97it/s]


{'embedding_loss': 0.0066, 'grad_norm': 0.2122724950313568, 'learning_rate': 9.217622749877426e-06, 'epoch': 2.93}


                                                      [A
 13%|█▎        | 6278/47590 [1:34:08<22:13, 30.97it/s]


{'embedding_loss': 0.0052, 'grad_norm': 0.15962862968444824, 'learning_rate': 9.194275174523127e-06, 'epoch': 2.93}


                                                      [A
 13%|█▎        | 6278/47590 [1:34:09<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.11116471886634827, 'learning_rate': 9.170927599168828e-06, 'epoch': 2.94}


                                                      
 13%|█▎        | 6278/47590 [1:34:10<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.034810904413461685, 'learning_rate': 9.147580023814527e-06, 'epoch': 2.94}


                                                      
 13%|█▎        | 6278/47590 [1:34:12<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.04344943165779114, 'learning_rate': 9.12423244846023e-06, 'epoch': 2.95}


                                                      
 13%|█▎        | 6278/47590 [1:34:13<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.1013740822672844, 'learning_rate': 9.100884873105928e-06, 'epoch': 2.95}


                                                      
 13%|█▎        | 6278/47590 [1:34:14<22:13, 30.97it/s]


{'embedding_loss': 0.0068, 'grad_norm': 0.08611948788166046, 'learning_rate': 9.077537297751629e-06, 'epoch': 2.96}


                                                      [A
 13%|█▎        | 6278/47590 [1:34:15<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.7458412051200867, 'learning_rate': 9.05418972239733e-06, 'epoch': 2.96}


                                                      
 13%|█▎        | 6278/47590 [1:34:16<22:13, 30.97it/s]

{'embedding_loss': 0.0062, 'grad_norm': 0.34717556834220886, 'learning_rate': 9.03084214704303e-06, 'epoch': 2.97}


                                                      
 13%|█▎        | 6278/47590 [1:34:17<22:13, 30.97it/s]

{'embedding_loss': 0.0075, 'grad_norm': 1.0739109516143799, 'learning_rate': 9.007494571688731e-06, 'epoch': 2.97}


                                                      
 13%|█▎        | 6278/47590 [1:34:18<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.13735714554786682, 'learning_rate': 8.984146996334432e-06, 'epoch': 2.98}


                                                      
 13%|█▎        | 6278/47590 [1:34:19<22:13, 30.97it/s]

{'embedding_loss': 0.0052, 'grad_norm': 0.166911780834198, 'learning_rate': 8.960799420980131e-06, 'epoch': 2.98}


                                                      
 13%|█▎        | 6278/47590 [1:34:20<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.7408866882324219, 'learning_rate': 8.937451845625834e-06, 'epoch': 2.99}


                                                      
 13%|█▎        | 6278/47590 [1:34:20<22:13, 30.97it/s]

{'embedding_loss': 0.0066, 'grad_norm': 0.2261762171983719, 'learning_rate': 8.914104270271533e-06, 'epoch': 2.99}


                                                      
 13%|█▎        | 6278/47590 [1:34:22<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.2941518723964691, 'learning_rate': 8.890756694917234e-06, 'epoch': 3.0}


                                                      
 13%|█▎        | 6278/47590 [1:34:23<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.07688495516777039, 'learning_rate': 8.867409119562934e-06, 'epoch': 3.0}


                                                      
 13%|█▎        | 6278/47590 [1:34:24<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.3440288007259369, 'learning_rate': 8.844061544208634e-06, 'epoch': 3.01}


                                                      
 13%|█▎        | 6278/47590 [1:34:25<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.6951867341995239, 'learning_rate': 8.820713968854336e-06, 'epoch': 3.02}


                                                      
 13%|█▎        | 6278/47590 [1:34:26<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.14487400650978088, 'learning_rate': 8.797366393500035e-06, 'epoch': 3.02}


                                                      
 13%|█▎        | 6278/47590 [1:34:27<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.2525278925895691, 'learning_rate': 8.774018818145736e-06, 'epoch': 3.03}


                                                      
 13%|█▎        | 6278/47590 [1:34:28<22:13, 30.97it/s]

{'embedding_loss': 0.0076, 'grad_norm': 0.3639892637729645, 'learning_rate': 8.750671242791437e-06, 'epoch': 3.03}


                                                      
 13%|█▎        | 6278/47590 [1:34:29<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.6844564080238342, 'learning_rate': 8.727323667437137e-06, 'epoch': 3.04}


                                                      
 13%|█▎        | 6278/47590 [1:34:30<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 0.3788096308708191, 'learning_rate': 8.703976092082838e-06, 'epoch': 3.04}


                                                      
 13%|█▎        | 6278/47590 [1:34:31<22:13, 30.97it/s]

{'embedding_loss': 0.0075, 'grad_norm': 0.05411002039909363, 'learning_rate': 8.680628516728539e-06, 'epoch': 3.05}


                                                      
 13%|█▎        | 6278/47590 [1:34:32<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.26987332105636597, 'learning_rate': 8.657280941374238e-06, 'epoch': 3.05}


                                                      
 13%|█▎        | 6278/47590 [1:34:33<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.3913179337978363, 'learning_rate': 8.63393336601994e-06, 'epoch': 3.06}


                                                      
 13%|█▎        | 6278/47590 [1:34:34<22:13, 30.97it/s]


{'embedding_loss': 0.0049, 'grad_norm': 0.036376334726810455, 'learning_rate': 8.61058579066564e-06, 'epoch': 3.06}


                                                      [A
 13%|█▎        | 6278/47590 [1:34:35<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 1.1568182706832886, 'learning_rate': 8.58723821531134e-06, 'epoch': 3.07}


                                                      
 13%|█▎        | 6278/47590 [1:34:36<22:13, 30.97it/s]

{'embedding_loss': 0.0045, 'grad_norm': 0.08872390538454056, 'learning_rate': 8.563890639957041e-06, 'epoch': 3.07}


                                                      
 13%|█▎        | 6278/47590 [1:34:37<22:13, 30.97it/s]

{'embedding_loss': 0.0048, 'grad_norm': 0.18636023998260498, 'learning_rate': 8.540543064602742e-06, 'epoch': 3.08}


                                                      
 13%|█▎        | 6278/47590 [1:34:38<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.23333680629730225, 'learning_rate': 8.517195489248443e-06, 'epoch': 3.08}


                                                      
 13%|█▎        | 6278/47590 [1:34:39<22:13, 30.97it/s]


{'embedding_loss': 0.0042, 'grad_norm': 0.053767476230859756, 'learning_rate': 8.493847913894144e-06, 'epoch': 3.09}


                                                      [A
 13%|█▎        | 6278/47590 [1:34:40<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.08658803254365921, 'learning_rate': 8.470500338539843e-06, 'epoch': 3.09}


                                                      
 13%|█▎        | 6278/47590 [1:34:41<22:13, 30.97it/s]

{'embedding_loss': 0.0062, 'grad_norm': 0.14194385707378387, 'learning_rate': 8.447152763185545e-06, 'epoch': 3.1}


                                                      
 13%|█▎        | 6278/47590 [1:34:42<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.5265982151031494, 'learning_rate': 8.423805187831244e-06, 'epoch': 3.1}


                                                      
 13%|█▎        | 6278/47590 [1:34:43<22:13, 30.97it/s]

{'embedding_loss': 0.0079, 'grad_norm': 0.07545400410890579, 'learning_rate': 8.400457612476945e-06, 'epoch': 3.11}


                                                      
 13%|█▎        | 6278/47590 [1:34:44<22:13, 30.97it/s]

{'embedding_loss': 0.0062, 'grad_norm': 0.20020659267902374, 'learning_rate': 8.377110037122646e-06, 'epoch': 3.12}


                                                      
 13%|█▎        | 6278/47590 [1:34:45<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.030226945877075195, 'learning_rate': 8.353762461768345e-06, 'epoch': 3.12}


                                                      
 13%|█▎        | 6278/47590 [1:34:46<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.09481018036603928, 'learning_rate': 8.330414886414047e-06, 'epoch': 3.13}


                                                      
 13%|█▎        | 6278/47590 [1:34:47<22:13, 30.97it/s]

{'embedding_loss': 0.0068, 'grad_norm': 0.3431136906147003, 'learning_rate': 8.307067311059746e-06, 'epoch': 3.13}


                                                      
 13%|█▎        | 6278/47590 [1:34:48<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.520711362361908, 'learning_rate': 8.283719735705447e-06, 'epoch': 3.14}


                                                      
 13%|█▎        | 6278/47590 [1:34:49<22:13, 30.97it/s]

{'embedding_loss': 0.0076, 'grad_norm': 0.6397967338562012, 'learning_rate': 8.260372160351148e-06, 'epoch': 3.14}


                                                      
 13%|█▎        | 6278/47590 [1:34:50<22:13, 30.97it/s]


{'embedding_loss': 0.0057, 'grad_norm': 0.1834477037191391, 'learning_rate': 8.237024584996849e-06, 'epoch': 3.15}


                                                      [A
 13%|█▎        | 6278/47590 [1:34:51<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.4924345314502716, 'learning_rate': 8.21367700964255e-06, 'epoch': 3.15}


                                                      
 13%|█▎        | 6278/47590 [1:34:52<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.3259661793708801, 'learning_rate': 8.19032943428825e-06, 'epoch': 3.16}


                                                      
 13%|█▎        | 6278/47590 [1:34:53<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.37990251183509827, 'learning_rate': 8.16698185893395e-06, 'epoch': 3.16}


                                                      
 13%|█▎        | 6278/47590 [1:34:54<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.044434066861867905, 'learning_rate': 8.143634283579652e-06, 'epoch': 3.17}


                                                      
 13%|█▎        | 6278/47590 [1:34:55<22:13, 30.97it/s]

{'embedding_loss': 0.0066, 'grad_norm': 0.0828540250658989, 'learning_rate': 8.120286708225351e-06, 'epoch': 3.17}


                                                      
 13%|█▎        | 6278/47590 [1:34:56<22:13, 30.97it/s]


{'embedding_loss': 0.009, 'grad_norm': 0.5187591910362244, 'learning_rate': 8.096939132871052e-06, 'epoch': 3.18}


                                                      [A
 13%|█▎        | 6278/47590 [1:34:57<22:13, 30.97it/s]


{'embedding_loss': 0.0059, 'grad_norm': 0.05004100874066353, 'learning_rate': 8.073591557516753e-06, 'epoch': 3.18}


                                                      [A
 13%|█▎        | 6278/47590 [1:34:58<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.03667263314127922, 'learning_rate': 8.050243982162453e-06, 'epoch': 3.19}


                                                      
 13%|█▎        | 6278/47590 [1:34:59<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.43149709701538086, 'learning_rate': 8.026896406808154e-06, 'epoch': 3.19}


                                                      
 13%|█▎        | 6278/47590 [1:35:00<22:13, 30.97it/s]

{'embedding_loss': 0.0081, 'grad_norm': 0.14374925196170807, 'learning_rate': 8.003548831453855e-06, 'epoch': 3.2}


                                                      
 13%|█▎        | 6278/47590 [1:35:01<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.4606078267097473, 'learning_rate': 7.980201256099554e-06, 'epoch': 3.2}


                                                      
 13%|█▎        | 6278/47590 [1:35:02<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 0.06582442671060562, 'learning_rate': 7.956853680745255e-06, 'epoch': 3.21}


                                                      
 13%|█▎        | 6278/47590 [1:35:03<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.1137225329875946, 'learning_rate': 7.933506105390956e-06, 'epoch': 3.21}


                                                      
 13%|█▎        | 6278/47590 [1:35:04<22:13, 30.97it/s]

{'embedding_loss': 0.0047, 'grad_norm': 0.11867868900299072, 'learning_rate': 7.910158530036656e-06, 'epoch': 3.22}


                                                      
 13%|█▎        | 6278/47590 [1:35:05<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.24441766738891602, 'learning_rate': 7.886810954682357e-06, 'epoch': 3.23}


                                                      
 13%|█▎        | 6278/47590 [1:35:06<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.05849367752671242, 'learning_rate': 7.863463379328058e-06, 'epoch': 3.23}


                                                      
 13%|█▎        | 6278/47590 [1:35:07<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.030273662880063057, 'learning_rate': 7.840115803973759e-06, 'epoch': 3.24}


                                                      
 13%|█▎        | 6278/47590 [1:35:08<22:13, 30.97it/s]

{'embedding_loss': 0.0062, 'grad_norm': 0.03134797886013985, 'learning_rate': 7.81676822861946e-06, 'epoch': 3.24}


                                                      
 13%|█▎        | 6278/47590 [1:35:09<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.4732360243797302, 'learning_rate': 7.793420653265159e-06, 'epoch': 3.25}


                                                      
 13%|█▎        | 6278/47590 [1:35:10<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.1685507744550705, 'learning_rate': 7.77007307791086e-06, 'epoch': 3.25}


                                                      
 13%|█▎        | 6278/47590 [1:35:11<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 0.022057779133319855, 'learning_rate': 7.74672550255656e-06, 'epoch': 3.26}


                                                      
 13%|█▎        | 6278/47590 [1:35:12<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.4156160056591034, 'learning_rate': 7.723377927202261e-06, 'epoch': 3.26}


                                                      
 13%|█▎        | 6278/47590 [1:35:13<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.11277890205383301, 'learning_rate': 7.700030351847962e-06, 'epoch': 3.27}


                                                      
 13%|█▎        | 6278/47590 [1:35:14<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.40298303961753845, 'learning_rate': 7.676682776493661e-06, 'epoch': 3.27}


                                                      
 13%|█▎        | 6278/47590 [1:35:15<22:13, 30.97it/s]

{'embedding_loss': 0.0076, 'grad_norm': 0.06801033765077591, 'learning_rate': 7.653335201139362e-06, 'epoch': 3.28}


                                                      
 13%|█▎        | 6278/47590 [1:35:16<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.09667794406414032, 'learning_rate': 7.629987625785062e-06, 'epoch': 3.28}


                                                      
 13%|█▎        | 6278/47590 [1:35:17<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.09473224729299545, 'learning_rate': 7.606640050430763e-06, 'epoch': 3.29}


                                                      
 13%|█▎        | 6278/47590 [1:35:18<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.02886875905096531, 'learning_rate': 7.583292475076464e-06, 'epoch': 3.29}


                                                      
 13%|█▎        | 6278/47590 [1:35:19<22:13, 30.97it/s]

{'embedding_loss': 0.0037, 'grad_norm': 0.09311670064926147, 'learning_rate': 7.559944899722165e-06, 'epoch': 3.3}


                                                      
 13%|█▎        | 6278/47590 [1:35:20<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.7405386567115784, 'learning_rate': 7.536597324367865e-06, 'epoch': 3.3}


                                                      
 13%|█▎        | 6278/47590 [1:35:21<22:13, 30.97it/s]

{'embedding_loss': 0.004, 'grad_norm': 0.048752181231975555, 'learning_rate': 7.5132497490135655e-06, 'epoch': 3.31}


                                                      
 13%|█▎        | 6278/47590 [1:35:22<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.1166851595044136, 'learning_rate': 7.4899021736592654e-06, 'epoch': 3.31}


                                                      
 13%|█▎        | 6278/47590 [1:35:23<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.2924557328224182, 'learning_rate': 7.466554598304967e-06, 'epoch': 3.32}


                                                      
 13%|█▎        | 6278/47590 [1:35:24<22:13, 30.97it/s]

{'embedding_loss': 0.0052, 'grad_norm': 0.5415814518928528, 'learning_rate': 7.443207022950667e-06, 'epoch': 3.33}


                                                      
 13%|█▎        | 6278/47590 [1:35:25<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.20061710476875305, 'learning_rate': 7.419859447596368e-06, 'epoch': 3.33}


                                                      
 13%|█▎        | 6278/47590 [1:35:26<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.07267638295888901, 'learning_rate': 7.396511872242068e-06, 'epoch': 3.34}


                                                      
 13%|█▎        | 6278/47590 [1:35:27<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.03883262723684311, 'learning_rate': 7.373164296887769e-06, 'epoch': 3.34}


                                                      
 13%|█▎        | 6278/47590 [1:35:28<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.05427103117108345, 'learning_rate': 7.349816721533469e-06, 'epoch': 3.35}


                                                      
 13%|█▎        | 6278/47590 [1:35:29<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.0954415500164032, 'learning_rate': 7.32646914617917e-06, 'epoch': 3.35}


                                                      
 13%|█▎        | 6278/47590 [1:35:30<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.06970337778329849, 'learning_rate': 7.30312157082487e-06, 'epoch': 3.36}


                                                      
 13%|█▎        | 6278/47590 [1:35:30<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.1598052680492401, 'learning_rate': 7.279773995470572e-06, 'epoch': 3.36}


                                                      
 13%|█▎        | 6278/47590 [1:35:32<22:13, 30.97it/s]


{'embedding_loss': 0.0064, 'grad_norm': 0.06600577384233475, 'learning_rate': 7.256426420116272e-06, 'epoch': 3.37}


                                                      [A
 13%|█▎        | 6278/47590 [1:35:33<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.1854476034641266, 'learning_rate': 7.2330788447619715e-06, 'epoch': 3.37}


                                                      
 13%|█▎        | 6278/47590 [1:35:34<22:13, 30.97it/s]

{'embedding_loss': 0.004, 'grad_norm': 0.5755729079246521, 'learning_rate': 7.209731269407672e-06, 'epoch': 3.38}


                                                      
 13%|█▎        | 6278/47590 [1:35:35<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.11130575835704803, 'learning_rate': 7.186383694053372e-06, 'epoch': 3.38}


                                                      
 13%|█▎        | 6278/47590 [1:35:36<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.23322129249572754, 'learning_rate': 7.163036118699074e-06, 'epoch': 3.39}


                                                      
 13%|█▎        | 6278/47590 [1:35:37<22:13, 30.97it/s]

{'embedding_loss': 0.0052, 'grad_norm': 0.2453417032957077, 'learning_rate': 7.139688543344774e-06, 'epoch': 3.39}


                                                      
 13%|█▎        | 6278/47590 [1:35:38<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.07802638411521912, 'learning_rate': 7.116340967990475e-06, 'epoch': 3.4}


                                                      
 13%|█▎        | 6278/47590 [1:35:38<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.2177332043647766, 'learning_rate': 7.0929933926361745e-06, 'epoch': 3.4}


                                                      
 13%|█▎        | 6278/47590 [1:35:39<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.014055141247808933, 'learning_rate': 7.069645817281876e-06, 'epoch': 3.41}


                                                      
 13%|█▎        | 6278/47590 [1:35:40<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.04756404459476471, 'learning_rate': 7.046298241927576e-06, 'epoch': 3.41}


                                                      
 13%|█▎        | 6278/47590 [1:35:42<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.06936103105545044, 'learning_rate': 7.022950666573277e-06, 'epoch': 3.42}


                                                      
 13%|█▎        | 6278/47590 [1:35:43<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.460277259349823, 'learning_rate': 6.999603091218977e-06, 'epoch': 3.43}


                                                      
 13%|█▎        | 6278/47590 [1:35:44<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.03895333781838417, 'learning_rate': 6.9762555158646785e-06, 'epoch': 3.43}


                                                      
 13%|█▎        | 6278/47590 [1:35:45<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.3637443482875824, 'learning_rate': 6.952907940510378e-06, 'epoch': 3.44}


                                                      
 13%|█▎        | 6278/47590 [1:35:46<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.09138081222772598, 'learning_rate': 6.929560365156079e-06, 'epoch': 3.44}


                                                      
 13%|█▎        | 6278/47590 [1:35:47<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.1684264987707138, 'learning_rate': 6.906212789801779e-06, 'epoch': 3.45}


                                                      
 13%|█▎        | 6278/47590 [1:35:48<22:13, 30.97it/s]

{'embedding_loss': 0.0067, 'grad_norm': 0.9088876247406006, 'learning_rate': 6.882865214447481e-06, 'epoch': 3.45}


                                                      
 13%|█▎        | 6278/47590 [1:35:49<22:13, 30.97it/s]


{'embedding_loss': 0.0068, 'grad_norm': 0.2572309374809265, 'learning_rate': 6.859517639093181e-06, 'epoch': 3.46}


                                                      [A
 13%|█▎        | 6278/47590 [1:35:50<22:13, 30.97it/s]

{'embedding_loss': 0.0087, 'grad_norm': 0.40500688552856445, 'learning_rate': 6.8361700637388815e-06, 'epoch': 3.46}


                                                      
 13%|█▎        | 6278/47590 [1:35:51<22:13, 30.97it/s]

{'embedding_loss': 0.0082, 'grad_norm': 0.03128489479422569, 'learning_rate': 6.812822488384581e-06, 'epoch': 3.47}


                                                      
 13%|█▎        | 6278/47590 [1:35:52<22:13, 30.97it/s]

{'embedding_loss': 0.0068, 'grad_norm': 0.4791947901248932, 'learning_rate': 6.789474913030283e-06, 'epoch': 3.47}


                                                      
 13%|█▎        | 6278/47590 [1:35:53<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.10669615119695663, 'learning_rate': 6.766127337675983e-06, 'epoch': 3.48}


                                                      
 13%|█▎        | 6278/47590 [1:35:54<22:13, 30.97it/s]

{'embedding_loss': 0.0067, 'grad_norm': 0.09611828625202179, 'learning_rate': 6.742779762321684e-06, 'epoch': 3.48}


                                                      
 13%|█▎        | 6278/47590 [1:35:55<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.4863217771053314, 'learning_rate': 6.719432186967384e-06, 'epoch': 3.49}


                                                      
 13%|█▎        | 6278/47590 [1:35:56<22:13, 30.97it/s]

{'embedding_loss': 0.0046, 'grad_norm': 0.047297462821006775, 'learning_rate': 6.696084611613084e-06, 'epoch': 3.49}


                                                      
 13%|█▎        | 6278/47590 [1:35:57<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.08287189900875092, 'learning_rate': 6.672737036258785e-06, 'epoch': 3.5}


                                                      
 13%|█▎        | 6278/47590 [1:35:58<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.062240734696388245, 'learning_rate': 6.649389460904485e-06, 'epoch': 3.5}


                                                      
 13%|█▎        | 6278/47590 [1:35:59<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.09771569818258286, 'learning_rate': 6.626041885550186e-06, 'epoch': 3.51}


                                                      
 13%|█▎        | 6278/47590 [1:36:00<22:13, 30.97it/s]


{'embedding_loss': 0.0054, 'grad_norm': 0.3738134205341339, 'learning_rate': 6.602694310195886e-06, 'epoch': 3.51}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:01<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.39560022950172424, 'learning_rate': 6.5793467348415875e-06, 'epoch': 3.52}


                                                      
 13%|█▎        | 6278/47590 [1:36:03<22:13, 30.97it/s]

{'embedding_loss': 0.0076, 'grad_norm': 0.01985279656946659, 'learning_rate': 6.5559991594872875e-06, 'epoch': 3.52}


                                                      
 13%|█▎        | 6278/47590 [1:36:04<22:13, 30.97it/s]

{'embedding_loss': 0.0079, 'grad_norm': 0.547642171382904, 'learning_rate': 6.532651584132988e-06, 'epoch': 3.53}


                                                      
 13%|█▎        | 6278/47590 [1:36:05<22:13, 30.97it/s]


{'embedding_loss': 0.0076, 'grad_norm': 0.4991397559642792, 'learning_rate': 6.509304008778688e-06, 'epoch': 3.54}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:06<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.3799939751625061, 'learning_rate': 6.48595643342439e-06, 'epoch': 3.54}


                                                      
 13%|█▎        | 6278/47590 [1:36:06<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.6922802925109863, 'learning_rate': 6.46260885807009e-06, 'epoch': 3.55}


                                                      
 13%|█▎        | 6278/47590 [1:36:07<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.4330369532108307, 'learning_rate': 6.4392612827157906e-06, 'epoch': 3.55}


                                                      
 13%|█▎        | 6278/47590 [1:36:08<22:13, 30.97it/s]

{'embedding_loss': 0.0047, 'grad_norm': 0.015712009742856026, 'learning_rate': 6.4159137073614905e-06, 'epoch': 3.56}


                                                      
 13%|█▎        | 6278/47590 [1:36:09<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.868460476398468, 'learning_rate': 6.392566132007192e-06, 'epoch': 3.56}


                                                      
 13%|█▎        | 6278/47590 [1:36:10<22:13, 30.97it/s]


{'embedding_loss': 0.0071, 'grad_norm': 0.04215283691883087, 'learning_rate': 6.369218556652892e-06, 'epoch': 3.57}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:11<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.07856225222349167, 'learning_rate': 6.345870981298593e-06, 'epoch': 3.57}


                                                      
 13%|█▎        | 6278/47590 [1:36:13<22:13, 30.97it/s]

{'embedding_loss': 0.0062, 'grad_norm': 0.1313955932855606, 'learning_rate': 6.322523405944293e-06, 'epoch': 3.58}


                                                      
 13%|█▎        | 6278/47590 [1:36:14<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.03061976097524166, 'learning_rate': 6.299175830589994e-06, 'epoch': 3.58}


                                                      
 13%|█▎        | 6278/47590 [1:36:15<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.3155727684497833, 'learning_rate': 6.275828255235694e-06, 'epoch': 3.59}


                                                      
 13%|█▎        | 6278/47590 [1:36:16<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.16379554569721222, 'learning_rate': 6.252480679881395e-06, 'epoch': 3.59}


                                                      
 13%|█▎        | 6278/47590 [1:36:16<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.429899662733078, 'learning_rate': 6.229133104527095e-06, 'epoch': 3.6}


                                                      
 13%|█▎        | 6278/47590 [1:36:17<22:13, 30.97it/s]

{'embedding_loss': 0.0067, 'grad_norm': 1.7436078786849976, 'learning_rate': 6.205785529172797e-06, 'epoch': 3.6}


                                                      
 13%|█▎        | 6278/47590 [1:36:18<22:13, 30.97it/s]


{'embedding_loss': 0.0054, 'grad_norm': 0.5093243718147278, 'learning_rate': 6.182437953818497e-06, 'epoch': 3.61}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:19<22:13, 30.97it/s]


{'embedding_loss': 0.0051, 'grad_norm': 0.4025042951107025, 'learning_rate': 6.159090378464197e-06, 'epoch': 3.61}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:20<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.045107852667570114, 'learning_rate': 6.135742803109897e-06, 'epoch': 3.62}


                                                      
 13%|█▎        | 6278/47590 [1:36:21<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.10833147913217545, 'learning_rate': 6.112395227755597e-06, 'epoch': 3.62}


                                                      
 13%|█▎        | 6278/47590 [1:36:23<22:13, 30.97it/s]


{'embedding_loss': 0.004, 'grad_norm': 0.2115406095981598, 'learning_rate': 6.089047652401299e-06, 'epoch': 3.63}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:24<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.1713034063577652, 'learning_rate': 6.065700077046999e-06, 'epoch': 3.64}


                                                      
 13%|█▎        | 6278/47590 [1:36:25<22:13, 30.97it/s]

{'embedding_loss': 0.0046, 'grad_norm': 0.0960591658949852, 'learning_rate': 6.0423525016927e-06, 'epoch': 3.64}


                                                      
 13%|█▎        | 6278/47590 [1:36:25<22:13, 30.97it/s]


{'embedding_loss': 0.0072, 'grad_norm': 0.06211984530091286, 'learning_rate': 6.0190049263384e-06, 'epoch': 3.65}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:26<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.3757646977901459, 'learning_rate': 5.995657350984101e-06, 'epoch': 3.65}


                                                      
 13%|█▎        | 6278/47590 [1:36:27<22:13, 30.97it/s]


{'embedding_loss': 0.0064, 'grad_norm': 0.16724516451358795, 'learning_rate': 5.972309775629801e-06, 'epoch': 3.66}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:28<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.06846214085817337, 'learning_rate': 5.948962200275502e-06, 'epoch': 3.66}


                                                      
 13%|█▎        | 6278/47590 [1:36:29<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.030353814363479614, 'learning_rate': 5.925614624921202e-06, 'epoch': 3.67}


                                                      
 13%|█▎        | 6278/47590 [1:36:30<22:13, 30.97it/s]


{'embedding_loss': 0.0085, 'grad_norm': 0.7607101202011108, 'learning_rate': 5.9022670495669035e-06, 'epoch': 3.67}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:31<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.02422792837023735, 'learning_rate': 5.8789194742126034e-06, 'epoch': 3.68}


                                                      
 13%|█▎        | 6278/47590 [1:36:33<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.625941812992096, 'learning_rate': 5.855571898858304e-06, 'epoch': 3.68}


                                                      
 13%|█▎        | 6278/47590 [1:36:34<22:13, 30.97it/s]


{'embedding_loss': 0.0059, 'grad_norm': 0.042763032019138336, 'learning_rate': 5.832224323504004e-06, 'epoch': 3.69}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:35<22:13, 30.97it/s]


{'embedding_loss': 0.0059, 'grad_norm': 0.07299928367137909, 'learning_rate': 5.808876748149706e-06, 'epoch': 3.69}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:35<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.032718803733587265, 'learning_rate': 5.785529172795406e-06, 'epoch': 3.7}


                                                      
 13%|█▎        | 6278/47590 [1:36:36<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.10299038887023926, 'learning_rate': 5.7621815974411065e-06, 'epoch': 3.7}


                                                      
 13%|█▎        | 6278/47590 [1:36:37<22:13, 30.97it/s]

{'embedding_loss': 0.0085, 'grad_norm': 0.4215483069419861, 'learning_rate': 5.7388340220868064e-06, 'epoch': 3.71}


                                                      
 13%|█▎        | 6278/47590 [1:36:38<22:13, 30.97it/s]

{'embedding_loss': 0.0047, 'grad_norm': 0.01519619207829237, 'learning_rate': 5.715486446732508e-06, 'epoch': 3.71}


                                                      
 13%|█▎        | 6278/47590 [1:36:39<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.020260212942957878, 'learning_rate': 5.692138871378208e-06, 'epoch': 3.72}


                                                      
 13%|█▎        | 6278/47590 [1:36:40<22:13, 30.97it/s]


{'embedding_loss': 0.0053, 'grad_norm': 0.13212203979492188, 'learning_rate': 5.668791296023909e-06, 'epoch': 3.72}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:41<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.07553692907094955, 'learning_rate': 5.645443720669609e-06, 'epoch': 3.73}


                                                      
 13%|█▎        | 6278/47590 [1:36:43<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.5992501974105835, 'learning_rate': 5.62209614531531e-06, 'epoch': 3.74}


                                                      
 13%|█▎        | 6278/47590 [1:36:44<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.10007690638303757, 'learning_rate': 5.59874856996101e-06, 'epoch': 3.74}


                                                      
 13%|█▎        | 6278/47590 [1:36:45<22:13, 30.97it/s]


{'embedding_loss': 0.0057, 'grad_norm': 0.3717533349990845, 'learning_rate': 5.57540099460671e-06, 'epoch': 3.75}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:46<22:13, 30.97it/s]


{'embedding_loss': 0.0064, 'grad_norm': 0.11330375075340271, 'learning_rate': 5.552053419252411e-06, 'epoch': 3.75}


                                                      [A
 13%|█▎        | 6278/47590 [1:36:47<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.09698715060949326, 'learning_rate': 5.528705843898111e-06, 'epoch': 3.76}


                                                      
 13%|█▎        | 6278/47590 [1:36:48<22:13, 30.97it/s]

{'embedding_loss': 0.0039, 'grad_norm': 0.49749821424484253, 'learning_rate': 5.505358268543813e-06, 'epoch': 3.76}


                                                      
 13%|█▎        | 6278/47590 [1:36:48<22:13, 30.97it/s]

{'embedding_loss': 0.0068, 'grad_norm': 0.03379429131746292, 'learning_rate': 5.4820106931895125e-06, 'epoch': 3.77}


                                                      
 13%|█▎        | 6278/47590 [1:36:49<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.039839863777160645, 'learning_rate': 5.458663117835213e-06, 'epoch': 3.77}


                                                      
 13%|█▎        | 6278/47590 [1:36:50<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.2598828971385956, 'learning_rate': 5.435315542480913e-06, 'epoch': 3.78}


                                                      
 13%|█▎        | 6278/47590 [1:36:51<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.31846413016319275, 'learning_rate': 5.411967967126615e-06, 'epoch': 3.78}


                                                      
 13%|█▎        | 6278/47590 [1:36:53<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 0.1271020770072937, 'learning_rate': 5.388620391772315e-06, 'epoch': 3.79}


                                                      
 13%|█▎        | 6278/47590 [1:36:54<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 0.049874983727931976, 'learning_rate': 5.365272816418016e-06, 'epoch': 3.79}


                                                      
 13%|█▎        | 6278/47590 [1:36:55<22:13, 30.97it/s]

{'embedding_loss': 0.0076, 'grad_norm': 0.34637215733528137, 'learning_rate': 5.3419252410637155e-06, 'epoch': 3.8}


                                                      
 13%|█▎        | 6278/47590 [1:36:56<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.023761825636029243, 'learning_rate': 5.318577665709417e-06, 'epoch': 3.8}


                                                      
 13%|█▎        | 6278/47590 [1:36:57<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.3078667223453522, 'learning_rate': 5.295230090355117e-06, 'epoch': 3.81}


                                                      
 13%|█▎        | 6278/47590 [1:36:57<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.9557293057441711, 'learning_rate': 5.271882515000818e-06, 'epoch': 3.81}


                                                      
 13%|█▎        | 6278/47590 [1:36:58<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.06497137993574142, 'learning_rate': 5.248534939646518e-06, 'epoch': 3.82}


                                                      
 13%|█▎        | 6278/47590 [1:36:59<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.10570545494556427, 'learning_rate': 5.2251873642922195e-06, 'epoch': 3.82}


                                                      
 13%|█▎        | 6278/47590 [1:37:00<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.24439917504787445, 'learning_rate': 5.201839788937919e-06, 'epoch': 3.83}


                                                      
 13%|█▎        | 6278/47590 [1:37:01<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.20525433123111725, 'learning_rate': 5.17849221358362e-06, 'epoch': 3.83}


                                                      
 13%|█▎        | 6278/47590 [1:37:03<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.12347619235515594, 'learning_rate': 5.15514463822932e-06, 'epoch': 3.84}


                                                      
 13%|█▎        | 6278/47590 [1:37:04<22:13, 30.97it/s]


{'embedding_loss': 0.0052, 'grad_norm': 0.04073398932814598, 'learning_rate': 5.131797062875022e-06, 'epoch': 3.85}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:05<22:13, 30.97it/s]


{'embedding_loss': 0.006, 'grad_norm': 0.10670147836208344, 'learning_rate': 5.108449487520722e-06, 'epoch': 3.85}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:06<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.5436508059501648, 'learning_rate': 5.0851019121664225e-06, 'epoch': 3.86}


                                                      
 13%|█▎        | 6278/47590 [1:37:07<22:13, 30.97it/s]

{'embedding_loss': 0.0066, 'grad_norm': 0.07219459861516953, 'learning_rate': 5.061754336812122e-06, 'epoch': 3.86}


                                                      
 13%|█▎        | 6278/47590 [1:37:07<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.6545196175575256, 'learning_rate': 5.038406761457822e-06, 'epoch': 3.87}


                                                      
 13%|█▎        | 6278/47590 [1:37:08<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.2800610065460205, 'learning_rate': 5.015059186103524e-06, 'epoch': 3.87}


                                                      
 13%|█▎        | 6278/47590 [1:37:09<22:13, 30.97it/s]

{'embedding_loss': 0.0032, 'grad_norm': 0.05743920058012009, 'learning_rate': 4.991711610749224e-06, 'epoch': 3.88}


                                                      
 13%|█▎        | 6278/47590 [1:37:10<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.014357009902596474, 'learning_rate': 4.968364035394925e-06, 'epoch': 3.88}


                                                      
 13%|█▎        | 6278/47590 [1:37:11<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.03579767793416977, 'learning_rate': 4.9450164600406255e-06, 'epoch': 3.89}


                                                      
 13%|█▎        | 6278/47590 [1:37:13<22:13, 30.97it/s]

{'embedding_loss': 0.0069, 'grad_norm': 0.027122821658849716, 'learning_rate': 4.921668884686326e-06, 'epoch': 3.89}


                                                      
 13%|█▎        | 6278/47590 [1:37:14<22:13, 30.97it/s]

{'embedding_loss': 0.0043, 'grad_norm': 0.2568538188934326, 'learning_rate': 4.898321309332026e-06, 'epoch': 3.9}


                                                      
 13%|█▎        | 6278/47590 [1:37:15<22:13, 30.97it/s]

{'embedding_loss': 0.0034, 'grad_norm': 0.02702757716178894, 'learning_rate': 4.874973733977727e-06, 'epoch': 3.9}


                                                      
 13%|█▎        | 6278/47590 [1:37:16<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.06962113082408905, 'learning_rate': 4.851626158623428e-06, 'epoch': 3.91}


                                                      
 13%|█▎        | 6278/47590 [1:37:17<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.3249835669994354, 'learning_rate': 4.8282785832691285e-06, 'epoch': 3.91}


                                                      
 13%|█▎        | 6278/47590 [1:37:18<22:13, 30.97it/s]

{'embedding_loss': 0.0052, 'grad_norm': 0.03210154175758362, 'learning_rate': 4.8049310079148285e-06, 'epoch': 3.92}


                                                      
 13%|█▎        | 6278/47590 [1:37:19<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.051601167768239975, 'learning_rate': 4.781583432560528e-06, 'epoch': 3.92}


                                                      
 13%|█▎        | 6278/47590 [1:37:20<22:13, 30.97it/s]


{'embedding_loss': 0.0045, 'grad_norm': 0.034547969698905945, 'learning_rate': 4.758235857206229e-06, 'epoch': 3.93}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:20<22:13, 30.97it/s]


{'embedding_loss': 0.0067, 'grad_norm': 0.03481694683432579, 'learning_rate': 4.73488828185193e-06, 'epoch': 3.93}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:21<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.05353108420968056, 'learning_rate': 4.711540706497631e-06, 'epoch': 3.94}


                                                      
 13%|█▎        | 6278/47590 [1:37:23<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 0.4713110625743866, 'learning_rate': 4.688193131143331e-06, 'epoch': 3.95}


                                                      
 13%|█▎        | 6278/47590 [1:37:24<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.3032165765762329, 'learning_rate': 4.6648455557890315e-06, 'epoch': 3.95}


                                                      
 13%|█▎        | 6278/47590 [1:37:25<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.05369077995419502, 'learning_rate': 4.641497980434732e-06, 'epoch': 3.96}


                                                      
 13%|█▎        | 6278/47590 [1:37:26<22:13, 30.97it/s]

{'embedding_loss': 0.0048, 'grad_norm': 0.021743979305028915, 'learning_rate': 4.618150405080433e-06, 'epoch': 3.96}


                                                      
 13%|█▎        | 6278/47590 [1:37:27<22:13, 30.97it/s]


{'embedding_loss': 0.0062, 'grad_norm': 0.1291073113679886, 'learning_rate': 4.594802829726133e-06, 'epoch': 3.97}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:28<22:13, 30.97it/s]


{'embedding_loss': 0.0052, 'grad_norm': 0.15547841787338257, 'learning_rate': 4.571455254371834e-06, 'epoch': 3.97}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:29<22:13, 30.97it/s]


{'embedding_loss': 0.0063, 'grad_norm': 0.407630980014801, 'learning_rate': 4.5481076790175346e-06, 'epoch': 3.98}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:30<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.07284515351057053, 'learning_rate': 4.5247601036632345e-06, 'epoch': 3.98}


                                                      
 13%|█▎        | 6278/47590 [1:37:30<22:13, 30.97it/s]


{'embedding_loss': 0.0063, 'grad_norm': 0.9356083869934082, 'learning_rate': 4.501412528308935e-06, 'epoch': 3.99}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:31<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.09041082113981247, 'learning_rate': 4.478064952954636e-06, 'epoch': 3.99}


                                                      
 13%|█▎        | 6278/47590 [1:37:33<22:13, 30.97it/s]

{'embedding_loss': 0.0073, 'grad_norm': 0.06352806091308594, 'learning_rate': 4.454717377600337e-06, 'epoch': 4.0}


                                                      
 13%|█▎        | 6278/47590 [1:37:35<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.4251873791217804, 'learning_rate': 4.431369802246037e-06, 'epoch': 4.0}


                                                      
 13%|█▎        | 6278/47590 [1:37:35<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.06304967403411865, 'learning_rate': 4.4080222268917376e-06, 'epoch': 4.01}


                                                      
 13%|█▎        | 6278/47590 [1:37:36<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.2367090880870819, 'learning_rate': 4.384674651537438e-06, 'epoch': 4.01}


                                                      
 13%|█▎        | 6278/47590 [1:37:37<22:13, 30.97it/s]


{'embedding_loss': 0.0054, 'grad_norm': 0.35200345516204834, 'learning_rate': 4.361327076183139e-06, 'epoch': 4.02}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:38<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.03856664150953293, 'learning_rate': 4.337979500828839e-06, 'epoch': 4.02}


                                                      
 13%|█▎        | 6278/47590 [1:37:39<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.4829850494861603, 'learning_rate': 4.31463192547454e-06, 'epoch': 4.03}


                                                      
 13%|█▎        | 6278/47590 [1:37:40<22:13, 30.97it/s]

{'embedding_loss': 0.0041, 'grad_norm': 0.23506401479244232, 'learning_rate': 4.291284350120241e-06, 'epoch': 4.03}


                                                      
 13%|█▎        | 6278/47590 [1:37:41<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.10905390977859497, 'learning_rate': 4.2679367747659414e-06, 'epoch': 4.04}


                                                      
 13%|█▎        | 6278/47590 [1:37:42<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.04605971649289131, 'learning_rate': 4.244589199411641e-06, 'epoch': 4.04}


                                                      
 13%|█▎        | 6278/47590 [1:37:44<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 0.062565378844738, 'learning_rate': 4.221241624057341e-06, 'epoch': 4.05}


                                                      
 13%|█▎        | 6278/47590 [1:37:45<22:13, 30.97it/s]


{'embedding_loss': 0.0059, 'grad_norm': 0.3684215843677521, 'learning_rate': 4.197894048703042e-06, 'epoch': 4.06}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:45<22:13, 30.97it/s]

{'embedding_loss': 0.0045, 'grad_norm': 0.05615558475255966, 'learning_rate': 4.174546473348743e-06, 'epoch': 4.06}


                                                      
 13%|█▎        | 6278/47590 [1:37:46<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.06038255989551544, 'learning_rate': 4.151198897994444e-06, 'epoch': 4.07}


                                                      
 13%|█▎        | 6278/47590 [1:37:47<22:13, 30.97it/s]

{'embedding_loss': 0.004, 'grad_norm': 0.05321686714887619, 'learning_rate': 4.127851322640144e-06, 'epoch': 4.07}


                                                      
 13%|█▎        | 6278/47590 [1:37:48<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.054625239223241806, 'learning_rate': 4.104503747285844e-06, 'epoch': 4.08}


                                                      
 13%|█▎        | 6278/47590 [1:37:49<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.17673465609550476, 'learning_rate': 4.081156171931545e-06, 'epoch': 4.08}


                                                      
 13%|█▎        | 6278/47590 [1:37:50<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.3365211486816406, 'learning_rate': 4.057808596577246e-06, 'epoch': 4.09}


                                                      
 13%|█▎        | 6278/47590 [1:37:51<22:13, 30.97it/s]

{'embedding_loss': 0.0046, 'grad_norm': 0.23649701476097107, 'learning_rate': 4.034461021222946e-06, 'epoch': 4.09}


                                                      
 13%|█▎        | 6278/47590 [1:37:52<22:13, 30.97it/s]

{'embedding_loss': 0.0045, 'grad_norm': 0.04537303373217583, 'learning_rate': 4.011113445868647e-06, 'epoch': 4.1}


                                                      
 13%|█▎        | 6278/47590 [1:37:53<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.055761177092790604, 'learning_rate': 3.9877658705143474e-06, 'epoch': 4.1}


                                                      
 13%|█▎        | 6278/47590 [1:37:54<22:13, 30.97it/s]

{'embedding_loss': 0.0075, 'grad_norm': 0.09450129419565201, 'learning_rate': 3.964418295160048e-06, 'epoch': 4.11}


                                                      
 13%|█▎        | 6278/47590 [1:37:55<22:13, 30.97it/s]


{'embedding_loss': 0.005, 'grad_norm': 0.412418931722641, 'learning_rate': 3.941070719805748e-06, 'epoch': 4.11}


                                                      [A
 13%|█▎        | 6278/47590 [1:37:56<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.4345877170562744, 'learning_rate': 3.917723144451449e-06, 'epoch': 4.12}


                                                      
 13%|█▎        | 6278/47590 [1:37:57<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.4979916512966156, 'learning_rate': 3.89437556909715e-06, 'epoch': 4.12}


                                                      
 13%|█▎        | 6278/47590 [1:37:58<22:13, 30.97it/s]

{'embedding_loss': 0.0046, 'grad_norm': 0.19598224759101868, 'learning_rate': 3.8710279937428505e-06, 'epoch': 4.13}


                                                      
 13%|█▎        | 6278/47590 [1:37:59<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.08476688712835312, 'learning_rate': 3.8476804183885505e-06, 'epoch': 4.13}


                                                      
 13%|█▎        | 6278/47590 [1:38:00<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.46988946199417114, 'learning_rate': 3.824332843034251e-06, 'epoch': 4.14}


                                                      
 13%|█▎        | 6278/47590 [1:38:01<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.4430726170539856, 'learning_rate': 3.800985267679952e-06, 'epoch': 4.14}


                                                      
 13%|█▎        | 6278/47590 [1:38:02<22:13, 30.97it/s]

{'embedding_loss': 0.0047, 'grad_norm': 0.3391924798488617, 'learning_rate': 3.7776376923256524e-06, 'epoch': 4.15}


                                                      
 13%|█▎        | 6278/47590 [1:38:03<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 0.138862743973732, 'learning_rate': 3.754290116971353e-06, 'epoch': 4.16}


                                                      
 13%|█▎        | 6278/47590 [1:38:04<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.3255714178085327, 'learning_rate': 3.7309425416170535e-06, 'epoch': 4.16}


                                                      
 13%|█▎        | 6278/47590 [1:38:05<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.023743348196148872, 'learning_rate': 3.7075949662627543e-06, 'epoch': 4.17}


                                                      
 13%|█▎        | 6278/47590 [1:38:06<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.29566019773483276, 'learning_rate': 3.6842473909084547e-06, 'epoch': 4.17}


                                                      
 13%|█▎        | 6278/47590 [1:38:07<22:13, 30.97it/s]

{'embedding_loss': 0.004, 'grad_norm': 0.27721649408340454, 'learning_rate': 3.6608998155541546e-06, 'epoch': 4.18}


                                                      
 13%|█▎        | 6278/47590 [1:38:08<22:13, 30.97it/s]


{'embedding_loss': 0.0049, 'grad_norm': 0.13380949199199677, 'learning_rate': 3.6375522401998554e-06, 'epoch': 4.18}


                                                      [A
 13%|█▎        | 6278/47590 [1:38:09<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.0375586673617363, 'learning_rate': 3.6142046648455558e-06, 'epoch': 4.19}


                                                      
 13%|█▎        | 6278/47590 [1:38:10<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.27764707803726196, 'learning_rate': 3.5908570894912565e-06, 'epoch': 4.19}


                                                      
 13%|█▎        | 6278/47590 [1:38:11<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.366292268037796, 'learning_rate': 3.567509514136957e-06, 'epoch': 4.2}


                                                      
 13%|█▎        | 6278/47590 [1:38:12<22:13, 30.97it/s]

{'embedding_loss': 0.0061, 'grad_norm': 0.26490455865859985, 'learning_rate': 3.5441619387826577e-06, 'epoch': 4.2}


                                                      
 13%|█▎        | 6278/47590 [1:38:13<22:13, 30.97it/s]

{'embedding_loss': 0.0037, 'grad_norm': 0.5146277546882629, 'learning_rate': 3.520814363428358e-06, 'epoch': 4.21}


                                                      
 13%|█▎        | 6278/47590 [1:38:14<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.15085247159004211, 'learning_rate': 3.497466788074059e-06, 'epoch': 4.21}


                                                      
 13%|█▎        | 6278/47590 [1:38:15<22:13, 30.97it/s]

{'embedding_loss': 0.0045, 'grad_norm': 0.8300791382789612, 'learning_rate': 3.474119212719759e-06, 'epoch': 4.22}


                                                      
 13%|█▎        | 6278/47590 [1:38:16<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.2636779844760895, 'learning_rate': 3.45077163736546e-06, 'epoch': 4.22}


                                                      
 13%|█▎        | 6278/47590 [1:38:17<22:13, 30.97it/s]

{'embedding_loss': 0.0045, 'grad_norm': 0.26773548126220703, 'learning_rate': 3.4274240620111603e-06, 'epoch': 4.23}


                                                      
 13%|█▎        | 6278/47590 [1:38:18<22:13, 30.97it/s]


{'embedding_loss': 0.006, 'grad_norm': 0.24158918857574463, 'learning_rate': 3.404076486656861e-06, 'epoch': 4.23}


                                                      [A
 13%|█▎        | 6278/47590 [1:38:19<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.21397638320922852, 'learning_rate': 3.3807289113025615e-06, 'epoch': 4.24}


                                                      
 13%|█▎        | 6278/47590 [1:38:20<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 0.49227622151374817, 'learning_rate': 3.3573813359482623e-06, 'epoch': 4.24}


                                                      
 13%|█▎        | 6278/47590 [1:38:21<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.08530177921056747, 'learning_rate': 3.3340337605939626e-06, 'epoch': 4.25}


                                                      
 13%|█▎        | 6278/47590 [1:38:22<22:13, 30.97it/s]

{'embedding_loss': 0.0068, 'grad_norm': 0.04518191143870354, 'learning_rate': 3.3106861852396634e-06, 'epoch': 4.26}


                                                      
 13%|█▎        | 6278/47590 [1:38:23<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.2610683739185333, 'learning_rate': 3.2873386098853638e-06, 'epoch': 4.26}


                                                      
 13%|█▎        | 6278/47590 [1:38:24<22:13, 30.97it/s]


{'embedding_loss': 0.0075, 'grad_norm': 0.5831518173217773, 'learning_rate': 3.2639910345310646e-06, 'epoch': 4.27}


                                                      [A
 13%|█▎        | 6278/47590 [1:38:25<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.2074371874332428, 'learning_rate': 3.240643459176765e-06, 'epoch': 4.27}


                                                      
 13%|█▎        | 6278/47590 [1:38:26<22:13, 30.97it/s]

{'embedding_loss': 0.0045, 'grad_norm': 0.6899134516716003, 'learning_rate': 3.2172958838224657e-06, 'epoch': 4.28}


                                                      
 13%|█▎        | 6278/47590 [1:38:27<22:13, 30.97it/s]

{'embedding_loss': 0.0048, 'grad_norm': 0.013604879379272461, 'learning_rate': 3.193948308468166e-06, 'epoch': 4.28}


                                                      
 13%|█▎        | 6278/47590 [1:38:28<22:13, 30.97it/s]

{'embedding_loss': 0.0036, 'grad_norm': 0.024396462365984917, 'learning_rate': 3.1706007331138664e-06, 'epoch': 4.29}


                                                      
 13%|█▎        | 6278/47590 [1:38:29<22:13, 30.97it/s]

{'embedding_loss': 0.0046, 'grad_norm': 0.19219465553760529, 'learning_rate': 3.147253157759567e-06, 'epoch': 4.29}


                                                      
 13%|█▎        | 6278/47590 [1:38:30<22:13, 30.97it/s]

{'embedding_loss': 0.0062, 'grad_norm': 0.040527183562517166, 'learning_rate': 3.123905582405267e-06, 'epoch': 4.3}


                                                      
 13%|█▎        | 6278/47590 [1:38:31<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.856840968132019, 'learning_rate': 3.100558007050968e-06, 'epoch': 4.3}


                                                      
 13%|█▎        | 6278/47590 [1:38:32<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.4706674814224243, 'learning_rate': 3.0772104316966683e-06, 'epoch': 4.31}


                                                      
 13%|█▎        | 6278/47590 [1:38:33<22:13, 30.97it/s]

{'embedding_loss': 0.0052, 'grad_norm': 0.6993419528007507, 'learning_rate': 3.053862856342369e-06, 'epoch': 4.31}


                                                      
 13%|█▎        | 6278/47590 [1:38:34<22:13, 30.97it/s]

{'embedding_loss': 0.004, 'grad_norm': 0.21393871307373047, 'learning_rate': 3.0305152809880694e-06, 'epoch': 4.32}


                                                      
 13%|█▎        | 6278/47590 [1:38:35<22:13, 30.97it/s]

{'embedding_loss': 0.0079, 'grad_norm': 0.219061478972435, 'learning_rate': 3.00716770563377e-06, 'epoch': 4.32}


                                                      
 13%|█▎        | 6278/47590 [1:38:36<22:13, 30.97it/s]

{'embedding_loss': 0.0078, 'grad_norm': 0.7788129448890686, 'learning_rate': 2.9838201302794706e-06, 'epoch': 4.33}


                                                      
 13%|█▎        | 6278/47590 [1:38:37<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.14256587624549866, 'learning_rate': 2.9604725549251714e-06, 'epoch': 4.33}


                                                      
 13%|█▎        | 6278/47590 [1:38:38<22:13, 30.97it/s]

{'embedding_loss': 0.0047, 'grad_norm': 0.1586703509092331, 'learning_rate': 2.9371249795708717e-06, 'epoch': 4.34}


                                                      
 13%|█▎        | 6278/47590 [1:38:39<22:13, 30.97it/s]

{'embedding_loss': 0.0039, 'grad_norm': 0.14771981537342072, 'learning_rate': 2.9137774042165725e-06, 'epoch': 4.34}


                                                      
 13%|█▎        | 6278/47590 [1:38:40<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.4519554078578949, 'learning_rate': 2.890429828862273e-06, 'epoch': 4.35}


                                                      
 13%|█▎        | 6278/47590 [1:38:41<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.5528957843780518, 'learning_rate': 2.8670822535079732e-06, 'epoch': 4.35}


                                                      
 13%|█▎        | 6278/47590 [1:38:41<22:13, 30.97it/s]

{'embedding_loss': 0.0038, 'grad_norm': 0.05712858960032463, 'learning_rate': 2.843734678153674e-06, 'epoch': 4.36}


                                                      
 13%|█▎        | 6278/47590 [1:38:43<22:13, 30.97it/s]

{'embedding_loss': 0.004, 'grad_norm': 0.24490994215011597, 'learning_rate': 2.8203871027993744e-06, 'epoch': 4.37}


                                                      
 13%|█▎        | 6278/47590 [1:38:44<22:13, 30.97it/s]

{'embedding_loss': 0.0043, 'grad_norm': 0.04366672784090042, 'learning_rate': 2.797039527445075e-06, 'epoch': 4.37}


                                                      
 13%|█▎        | 6278/47590 [1:38:45<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.0509813092648983, 'learning_rate': 2.7736919520907755e-06, 'epoch': 4.38}


                                                      
 13%|█▎        | 6278/47590 [1:38:46<22:13, 30.97it/s]


{'embedding_loss': 0.0067, 'grad_norm': 0.42699894309043884, 'learning_rate': 2.7503443767364763e-06, 'epoch': 4.38}


                                                      [A
 13%|█▎        | 6278/47590 [1:38:47<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.4472990930080414, 'learning_rate': 2.7269968013821766e-06, 'epoch': 4.39}


                                                      
 13%|█▎        | 6278/47590 [1:38:48<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.6735175251960754, 'learning_rate': 2.7036492260278774e-06, 'epoch': 4.39}


                                                      
 13%|█▎        | 6278/47590 [1:38:49<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.2672518789768219, 'learning_rate': 2.680301650673578e-06, 'epoch': 4.4}


                                                      
 13%|█▎        | 6278/47590 [1:38:50<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.03517285734415054, 'learning_rate': 2.6569540753192786e-06, 'epoch': 4.4}


                                                      
 13%|█▎        | 6278/47590 [1:38:50<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.04304743930697441, 'learning_rate': 2.633606499964979e-06, 'epoch': 4.41}


                                                      
 13%|█▎        | 6278/47590 [1:38:51<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.05827949568629265, 'learning_rate': 2.6102589246106797e-06, 'epoch': 4.41}


                                                      
 13%|█▎        | 6278/47590 [1:38:53<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.05018986016511917, 'learning_rate': 2.58691134925638e-06, 'epoch': 4.42}


                                                      
 13%|█▎        | 6278/47590 [1:38:54<22:13, 30.97it/s]


{'embedding_loss': 0.0057, 'grad_norm': 0.3086594045162201, 'learning_rate': 2.56356377390208e-06, 'epoch': 4.42}


                                                      [A
 13%|█▎        | 6278/47590 [1:38:55<22:13, 30.97it/s]


{'embedding_loss': 0.004, 'grad_norm': 0.5504021644592285, 'learning_rate': 2.540216198547781e-06, 'epoch': 4.43}


                                                      [A
 13%|█▎        | 6278/47590 [1:38:56<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.026333831250667572, 'learning_rate': 2.516868623193481e-06, 'epoch': 4.43}


                                                      
 13%|█▎        | 6278/47590 [1:38:57<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.012944690883159637, 'learning_rate': 2.4935210478391824e-06, 'epoch': 4.44}


                                                      
 13%|█▎        | 6278/47590 [1:38:58<22:13, 30.97it/s]

{'embedding_loss': 0.0043, 'grad_norm': 0.17348183691501617, 'learning_rate': 2.4701734724848827e-06, 'epoch': 4.44}


                                                      
 13%|█▎        | 6278/47590 [1:38:59<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.13211624324321747, 'learning_rate': 2.4468258971305835e-06, 'epoch': 4.45}


                                                      
 13%|█▎        | 6278/47590 [1:39:00<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.04160122200846672, 'learning_rate': 2.4234783217762835e-06, 'epoch': 4.45}


                                                      
 13%|█▎        | 6278/47590 [1:39:00<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.5308601260185242, 'learning_rate': 2.4001307464219842e-06, 'epoch': 4.46}


                                                      
 13%|█▎        | 6278/47590 [1:39:01<22:13, 30.97it/s]

{'embedding_loss': 0.0074, 'grad_norm': 0.5062286853790283, 'learning_rate': 2.3767831710676846e-06, 'epoch': 4.47}


                                                      
 13%|█▎        | 6278/47590 [1:39:03<22:13, 30.97it/s]

{'embedding_loss': 0.007, 'grad_norm': 0.5649215579032898, 'learning_rate': 2.3534355957133854e-06, 'epoch': 4.47}


                                                      
 13%|█▎        | 6278/47590 [1:39:04<22:13, 30.97it/s]

{'embedding_loss': 0.0052, 'grad_norm': 0.05855868011713028, 'learning_rate': 2.3300880203590857e-06, 'epoch': 4.48}


                                                      
 13%|█▎        | 6278/47590 [1:39:05<22:13, 30.97it/s]
 90%|████████▉ | 42659/47590 [15:22<01:35, 51.41it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.02087949588894844, 'learning_rate': 2.3067404450047865e-06, 'epoch': 4.48}


                                                      
 13%|█▎        | 6278/47590 [1:39:06<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.5132005214691162, 'learning_rate': 2.283392869650487e-06, 'epoch': 4.49}


                                                      
 13%|█▎        | 6278/47590 [1:39:07<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.16257207095623016, 'learning_rate': 2.2600452942961877e-06, 'epoch': 4.49}


                                                      
 13%|█▎        | 6278/47590 [1:39:08<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.20197968184947968, 'learning_rate': 2.236697718941888e-06, 'epoch': 4.5}


                                                      
 13%|█▎        | 6278/47590 [1:39:08<22:13, 30.97it/s]

{'embedding_loss': 0.0052, 'grad_norm': 0.18561697006225586, 'learning_rate': 2.213350143587589e-06, 'epoch': 4.5}


                                                      
 13%|█▎        | 6278/47590 [1:39:09<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.1935243159532547, 'learning_rate': 2.190002568233289e-06, 'epoch': 4.51}


                                                      
 13%|█▎        | 6278/47590 [1:39:10<22:13, 30.97it/s]

{'embedding_loss': 0.0035, 'grad_norm': 0.41951513290405273, 'learning_rate': 2.1666549928789895e-06, 'epoch': 4.51}


                                                      
 13%|█▎        | 6278/47590 [1:39:11<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.006597854662686586, 'learning_rate': 2.1433074175246903e-06, 'epoch': 4.52}


                                                      
 13%|█▎        | 6278/47590 [1:39:13<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.10080874711275101, 'learning_rate': 2.1199598421703907e-06, 'epoch': 4.52}


                                                      
 13%|█▎        | 6278/47590 [1:39:14<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.09646184742450714, 'learning_rate': 2.0966122668160915e-06, 'epoch': 4.53}


                                                      
 13%|█▎        | 6278/47590 [1:39:15<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.23876367509365082, 'learning_rate': 2.073264691461792e-06, 'epoch': 4.53}


                                                      
 13%|█▎        | 6278/47590 [1:39:16<22:13, 30.97it/s]

{'embedding_loss': 0.0035, 'grad_norm': 0.03758222609758377, 'learning_rate': 2.049917116107492e-06, 'epoch': 4.54}


                                                      
 13%|█▎        | 6278/47590 [1:39:16<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.054888125509023666, 'learning_rate': 2.026569540753193e-06, 'epoch': 4.54}


                                                      
 13%|█▎        | 6278/47590 [1:39:17<22:13, 30.97it/s]

{'embedding_loss': 0.0043, 'grad_norm': 0.3456238806247711, 'learning_rate': 2.0032219653988933e-06, 'epoch': 4.55}


                                                      
 13%|█▎        | 6278/47590 [1:39:18<22:13, 30.97it/s]


{'embedding_loss': 0.0053, 'grad_norm': 0.09766547381877899, 'learning_rate': 1.979874390044594e-06, 'epoch': 4.55}


                                                      [A
 13%|█▎        | 6278/47590 [1:39:19<22:13, 30.97it/s]

{'embedding_loss': 0.0041, 'grad_norm': 0.0438050702214241, 'learning_rate': 1.9565268146902945e-06, 'epoch': 4.56}


                                                      
 13%|█▎        | 6278/47590 [1:39:20<22:13, 30.97it/s]

{'embedding_loss': 0.0036, 'grad_norm': 0.04091496765613556, 'learning_rate': 1.9331792393359953e-06, 'epoch': 4.57}


                                                      
 13%|█▎        | 6278/47590 [1:39:21<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.2572529911994934, 'learning_rate': 1.9098316639816956e-06, 'epoch': 4.57}


                                                      
 13%|█▎        | 6278/47590 [1:39:23<22:13, 30.97it/s]


{'embedding_loss': 0.0033, 'grad_norm': 0.027091730386018753, 'learning_rate': 1.886484088627396e-06, 'epoch': 4.58}


                                                      [A
 13%|█▎        | 6278/47590 [1:39:24<22:13, 30.97it/s]


{'embedding_loss': 0.0047, 'grad_norm': 0.0502091646194458, 'learning_rate': 1.8631365132730965e-06, 'epoch': 4.58}


                                                      [A
 13%|█▎        | 6278/47590 [1:39:24<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.05293036252260208, 'learning_rate': 1.8397889379187971e-06, 'epoch': 4.59}


                                                      
 13%|█▎        | 6278/47590 [1:39:25<22:13, 30.97it/s]

{'embedding_loss': 0.0052, 'grad_norm': 0.04248960316181183, 'learning_rate': 1.8164413625644977e-06, 'epoch': 4.59}


                                                      
 13%|█▎        | 6278/47590 [1:39:26<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.041822582483291626, 'learning_rate': 1.7930937872101983e-06, 'epoch': 4.6}


                                                      
 13%|█▎        | 6278/47590 [1:39:27<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.13942968845367432, 'learning_rate': 1.7697462118558988e-06, 'epoch': 4.6}


                                                      
 13%|█▎        | 6278/47590 [1:39:28<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.5729119181632996, 'learning_rate': 1.7463986365015994e-06, 'epoch': 4.61}


                                                      
 13%|█▎        | 6278/47590 [1:39:29<22:13, 30.97it/s]

{'embedding_loss': 0.004, 'grad_norm': 0.654589056968689, 'learning_rate': 1.7230510611473e-06, 'epoch': 4.61}


                                                      
 13%|█▎        | 6278/47590 [1:39:30<22:13, 30.97it/s]

{'embedding_loss': 0.0062, 'grad_norm': 0.3586931824684143, 'learning_rate': 1.6997034857930006e-06, 'epoch': 4.62}


                                                      
 13%|█▎        | 6278/47590 [1:39:31<22:13, 30.97it/s]

{'embedding_loss': 0.0067, 'grad_norm': 0.6541916131973267, 'learning_rate': 1.6763559104387011e-06, 'epoch': 4.62}


                                                      
 13%|█▎        | 6278/47590 [1:39:33<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.5396046042442322, 'learning_rate': 1.6530083350844017e-06, 'epoch': 4.63}


                                                      
 13%|█▎        | 6278/47590 [1:39:34<22:13, 30.97it/s]

{'embedding_loss': 0.0048, 'grad_norm': 0.032736390829086304, 'learning_rate': 1.6296607597301023e-06, 'epoch': 4.63}


                                                      
 13%|█▎        | 6278/47590 [1:39:35<22:13, 30.97it/s]

{'embedding_loss': 0.0041, 'grad_norm': 0.15212424099445343, 'learning_rate': 1.6063131843758026e-06, 'epoch': 4.64}


                                                      
 13%|█▎        | 6278/47590 [1:39:36<22:13, 30.97it/s]

{'embedding_loss': 0.0048, 'grad_norm': 0.43495526909828186, 'learning_rate': 1.5829656090215032e-06, 'epoch': 4.64}


                                                      
 13%|█▎        | 6278/47590 [1:39:37<22:13, 30.97it/s]

{'embedding_loss': 0.0043, 'grad_norm': 0.14486166834831238, 'learning_rate': 1.5596180336672038e-06, 'epoch': 4.65}


                                                      
 13%|█▎        | 6278/47590 [1:39:38<22:13, 30.97it/s]


{'embedding_loss': 0.0053, 'grad_norm': 0.0980958566069603, 'learning_rate': 1.5362704583129043e-06, 'epoch': 4.65}


                                                      [A
 13%|█▎        | 6278/47590 [1:39:39<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.044210035353899, 'learning_rate': 1.512922882958605e-06, 'epoch': 4.66}


                                                      
 13%|█▎        | 6278/47590 [1:39:40<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.5686902403831482, 'learning_rate': 1.4895753076043055e-06, 'epoch': 4.66}


                                                      
 13%|█▎        | 6278/47590 [1:39:40<22:13, 30.97it/s]

{'embedding_loss': 0.0064, 'grad_norm': 0.42413488030433655, 'learning_rate': 1.466227732250006e-06, 'epoch': 4.67}


                                                      
 13%|█▎        | 6278/47590 [1:39:41<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.1779061257839203, 'learning_rate': 1.4428801568957066e-06, 'epoch': 4.68}


                                                      
 13%|█▎        | 6278/47590 [1:39:43<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.24552986025810242, 'learning_rate': 1.4195325815414072e-06, 'epoch': 4.68}


                                                      
 13%|█▎        | 6278/47590 [1:39:44<22:13, 30.97it/s]

{'embedding_loss': 0.0034, 'grad_norm': 0.17639115452766418, 'learning_rate': 1.3961850061871076e-06, 'epoch': 4.69}


                                                      
 13%|█▎        | 6278/47590 [1:39:45<22:13, 30.97it/s]

{'embedding_loss': 0.0034, 'grad_norm': 0.012445870786905289, 'learning_rate': 1.3728374308328081e-06, 'epoch': 4.69}


                                                      
 13%|█▎        | 6278/47590 [1:39:46<22:13, 30.97it/s]

{'embedding_loss': 0.004, 'grad_norm': 0.4689768850803375, 'learning_rate': 1.3494898554785087e-06, 'epoch': 4.7}


                                                      
 13%|█▎        | 6278/47590 [1:39:47<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.07367775589227676, 'learning_rate': 1.326142280124209e-06, 'epoch': 4.7}


                                                      
 13%|█▎        | 6278/47590 [1:39:48<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.41999366879463196, 'learning_rate': 1.3027947047699096e-06, 'epoch': 4.71}


                                                      
 13%|█▎        | 6278/47590 [1:39:49<22:13, 30.97it/s]

{'embedding_loss': 0.0039, 'grad_norm': 0.07844354957342148, 'learning_rate': 1.2794471294156102e-06, 'epoch': 4.71}


                                                      
 13%|█▎        | 6278/47590 [1:39:49<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.155018612742424, 'learning_rate': 1.2560995540613108e-06, 'epoch': 4.72}


                                                      
 13%|█▎        | 6278/47590 [1:39:50<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 0.11090657114982605, 'learning_rate': 1.2327519787070114e-06, 'epoch': 4.72}


                                                      
 13%|█▎        | 6278/47590 [1:39:51<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.01627301424741745, 'learning_rate': 1.209404403352712e-06, 'epoch': 4.73}


                                                      
 13%|█▎        | 6278/47590 [1:39:53<22:13, 30.97it/s]

{'embedding_loss': 0.0043, 'grad_norm': 0.08876108378171921, 'learning_rate': 1.1860568279984125e-06, 'epoch': 4.73}


                                                      
 13%|█▎        | 6278/47590 [1:39:54<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.09685059636831284, 'learning_rate': 1.162709252644113e-06, 'epoch': 4.74}


                                                      
 13%|█▎        | 6278/47590 [1:39:55<22:13, 30.97it/s]

{'embedding_loss': 0.006, 'grad_norm': 0.01879091002047062, 'learning_rate': 1.1393616772898134e-06, 'epoch': 4.74}


                                                      
 13%|█▎        | 6278/47590 [1:39:56<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.42697256803512573, 'learning_rate': 1.116014101935514e-06, 'epoch': 4.75}


                                                      
 13%|█▎        | 6278/47590 [1:39:57<22:13, 30.97it/s]

{'embedding_loss': 0.0057, 'grad_norm': 0.05313856527209282, 'learning_rate': 1.0926665265812146e-06, 'epoch': 4.75}


                                                      
 13%|█▎        | 6278/47590 [1:39:58<22:13, 30.97it/s]

{'embedding_loss': 0.0067, 'grad_norm': 0.5505443811416626, 'learning_rate': 1.0693189512269152e-06, 'epoch': 4.76}


                                                      
 13%|█▎        | 6278/47590 [1:39:58<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.07338081300258636, 'learning_rate': 1.0459713758726157e-06, 'epoch': 4.76}


                                                      
 13%|█▎        | 6278/47590 [1:39:59<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.023606644943356514, 'learning_rate': 1.0226238005183163e-06, 'epoch': 4.77}


                                                      
 13%|█▎        | 6278/47590 [1:40:00<22:13, 30.97it/s]

{'embedding_loss': 0.0058, 'grad_norm': 0.3919816017150879, 'learning_rate': 9.992762251640167e-07, 'epoch': 4.78}


                                                      
 13%|█▎        | 6278/47590 [1:40:01<22:13, 30.97it/s]

{'embedding_loss': 0.0075, 'grad_norm': 0.3764684200286865, 'learning_rate': 9.759286498097172e-07, 'epoch': 4.78}


                                                      
 13%|█▎        | 6278/47590 [1:40:03<22:13, 30.97it/s]

{'embedding_loss': 0.0046, 'grad_norm': 0.06333301961421967, 'learning_rate': 9.525810744554179e-07, 'epoch': 4.79}


                                                      
 13%|█▎        | 6278/47590 [1:40:04<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.6126773953437805, 'learning_rate': 9.292334991011185e-07, 'epoch': 4.79}


                                                      
 13%|█▎        | 6278/47590 [1:40:05<22:13, 30.97it/s]

{'embedding_loss': 0.0042, 'grad_norm': 0.025679761543869972, 'learning_rate': 9.058859237468189e-07, 'epoch': 4.8}


                                                      
 13%|█▎        | 6278/47590 [1:40:06<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.080882228910923, 'learning_rate': 8.825383483925195e-07, 'epoch': 4.8}


                                                      
 13%|█▎        | 6278/47590 [1:40:07<22:13, 30.97it/s]
 96%|█████████▌| 45759/47590 [16:24<00:38, 47.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.042156703770160675, 'learning_rate': 8.5919077303822e-07, 'epoch': 4.81}


                                                      
 13%|█▎        | 6278/47590 [1:40:08<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.07829222083091736, 'learning_rate': 8.358431976839206e-07, 'epoch': 4.81}


                                                      
 13%|█▎        | 6278/47590 [1:40:09<22:13, 30.97it/s]

{'embedding_loss': 0.0035, 'grad_norm': 0.22878116369247437, 'learning_rate': 8.124956223296211e-07, 'epoch': 4.82}


                                                      
 13%|█▎        | 6278/47590 [1:40:10<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.13666744530200958, 'learning_rate': 7.891480469753217e-07, 'epoch': 4.82}


                                                      
 13%|█▎        | 6278/47590 [1:40:11<22:13, 30.97it/s]


{'embedding_loss': 0.0062, 'grad_norm': 0.11873524636030197, 'learning_rate': 7.658004716210223e-07, 'epoch': 4.83}


                                                      [A
 13%|█▎        | 6278/47590 [1:40:12<22:13, 30.97it/s]

{'embedding_loss': 0.0032, 'grad_norm': 0.10567905008792877, 'learning_rate': 7.424528962667228e-07, 'epoch': 4.83}


                                                      
 13%|█▎        | 6278/47590 [1:40:13<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.05001528188586235, 'learning_rate': 7.191053209124232e-07, 'epoch': 4.84}


                                                      
 13%|█▎        | 6278/47590 [1:40:14<22:13, 30.97it/s]

{'embedding_loss': 0.0046, 'grad_norm': 0.03487604856491089, 'learning_rate': 6.957577455581238e-07, 'epoch': 4.84}


                                                      
 13%|█▎        | 6278/47590 [1:40:15<22:13, 30.97it/s]

{'embedding_loss': 0.0034, 'grad_norm': 0.07545559108257294, 'learning_rate': 6.724101702038243e-07, 'epoch': 4.85}


                                                      
 13%|█▎        | 6278/47590 [1:40:16<22:13, 30.97it/s]

{'embedding_loss': 0.0039, 'grad_norm': 0.05418768525123596, 'learning_rate': 6.490625948495249e-07, 'epoch': 4.85}


                                                      
 13%|█▎        | 6278/47590 [1:40:17<22:13, 30.97it/s]

{'embedding_loss': 0.0071, 'grad_norm': 0.05160598084330559, 'learning_rate': 6.257150194952255e-07, 'epoch': 4.86}


                                                      
 13%|█▎        | 6278/47590 [1:40:18<22:13, 30.97it/s]

{'embedding_loss': 0.0067, 'grad_norm': 0.19660413265228271, 'learning_rate': 6.02367444140926e-07, 'epoch': 4.86}


                                                      
 13%|█▎        | 6278/47590 [1:40:19<22:13, 30.97it/s]


{'embedding_loss': 0.0045, 'grad_norm': 0.6150403618812561, 'learning_rate': 5.790198687866265e-07, 'epoch': 4.87}


                                                      [A
 13%|█▎        | 6278/47590 [1:40:20<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.3132697641849518, 'learning_rate': 5.556722934323271e-07, 'epoch': 4.87}


                                                      
 13%|█▎        | 6278/47590 [1:40:20<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.38352254033088684, 'learning_rate': 5.323247180780277e-07, 'epoch': 4.88}


                                                      
 13%|█▎        | 6278/47590 [1:40:21<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.1564895510673523, 'learning_rate': 5.089771427237282e-07, 'epoch': 4.89}


                                                      
 13%|█▎        | 6278/47590 [1:40:23<22:13, 30.97it/s]

{'embedding_loss': 0.0073, 'grad_norm': 0.4728254973888397, 'learning_rate': 4.856295673694287e-07, 'epoch': 4.89}


                                                      
 13%|█▎        | 6278/47590 [1:40:24<22:13, 30.97it/s]


{'embedding_loss': 0.0048, 'grad_norm': 0.25087469816207886, 'learning_rate': 4.6228199201512923e-07, 'epoch': 4.9}


                                                      [A
 13%|█▎        | 6278/47590 [1:40:25<22:13, 30.97it/s]


{'embedding_loss': 0.0047, 'grad_norm': 0.09895983338356018, 'learning_rate': 4.389344166608298e-07, 'epoch': 4.9}


                                                      [A
 13%|█▎        | 6278/47590 [1:40:26<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.03475150093436241, 'learning_rate': 4.155868413065303e-07, 'epoch': 4.91}


                                                      
 13%|█▎        | 6278/47590 [1:40:27<22:13, 30.97it/s]

{'embedding_loss': 0.0052, 'grad_norm': 0.8466743230819702, 'learning_rate': 3.922392659522309e-07, 'epoch': 4.91}


                                                      
 13%|█▎        | 6278/47590 [1:40:28<22:13, 30.97it/s]

{'embedding_loss': 0.0059, 'grad_norm': 0.06055743247270584, 'learning_rate': 3.6889169059793147e-07, 'epoch': 4.92}


                                                      
 13%|█▎        | 6278/47590 [1:40:29<22:13, 30.97it/s]

{'embedding_loss': 0.0055, 'grad_norm': 0.02670016884803772, 'learning_rate': 3.4554411524363194e-07, 'epoch': 4.92}


                                                      
 13%|█▎        | 6278/47590 [1:40:29<22:13, 30.97it/s]

{'embedding_loss': 0.0053, 'grad_norm': 0.14551885426044464, 'learning_rate': 3.221965398893325e-07, 'epoch': 4.93}


                                                      
 13%|█▎        | 6278/47590 [1:40:30<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.5096489787101746, 'learning_rate': 2.98848964535033e-07, 'epoch': 4.93}


                                                      
 13%|█▎        | 6278/47590 [1:40:31<22:13, 30.97it/s]

{'embedding_loss': 0.005, 'grad_norm': 0.06209811568260193, 'learning_rate': 2.755013891807336e-07, 'epoch': 4.94}


                                                      
 13%|█▎        | 6278/47590 [1:40:33<22:13, 30.97it/s]

{'embedding_loss': 0.0049, 'grad_norm': 0.039487335830926895, 'learning_rate': 2.521538138264341e-07, 'epoch': 4.94}


                                                      
 13%|█▎        | 6278/47590 [1:40:34<22:13, 30.97it/s]

{'embedding_loss': 0.0063, 'grad_norm': 0.016179606318473816, 'learning_rate': 2.2880623847213466e-07, 'epoch': 4.95}


                                                      
 13%|█▎        | 6278/47590 [1:40:35<22:13, 30.97it/s]

{'embedding_loss': 0.0039, 'grad_norm': 0.0526658333837986, 'learning_rate': 2.0545866311783524e-07, 'epoch': 4.95}


                                                      
 13%|█▎        | 6278/47590 [1:40:36<22:13, 30.97it/s]

{'embedding_loss': 0.0056, 'grad_norm': 0.7154487371444702, 'learning_rate': 1.8211108776353578e-07, 'epoch': 4.96}


                                                      
 13%|█▎        | 6278/47590 [1:40:37<22:13, 30.97it/s]

{'embedding_loss': 0.004, 'grad_norm': 0.01894097775220871, 'learning_rate': 1.587635124092363e-07, 'epoch': 4.96}


                                                      
 13%|█▎        | 6278/47590 [1:40:37<22:13, 30.97it/s]

{'embedding_loss': 0.0044, 'grad_norm': 0.17298904061317444, 'learning_rate': 1.3541593705493685e-07, 'epoch': 4.97}


                                                      
 13%|█▎        | 6278/47590 [1:40:38<22:13, 30.97it/s]

{'embedding_loss': 0.0072, 'grad_norm': 0.08867412805557251, 'learning_rate': 1.1206836170063739e-07, 'epoch': 4.97}


                                                      
 13%|█▎        | 6278/47590 [1:40:39<22:13, 30.97it/s]

{'embedding_loss': 0.0043, 'grad_norm': 0.12454869598150253, 'learning_rate': 8.872078634633795e-08, 'epoch': 4.98}


                                                      
 13%|█▎        | 6278/47590 [1:40:40<22:13, 30.97it/s]

{'embedding_loss': 0.0065, 'grad_norm': 0.2937626540660858, 'learning_rate': 6.537321099203848e-08, 'epoch': 4.99}


                                                      
 13%|█▎        | 6278/47590 [1:40:41<22:13, 30.97it/s]

{'embedding_loss': 0.0051, 'grad_norm': 0.06167925149202347, 'learning_rate': 4.202563563773902e-08, 'epoch': 4.99}


                                                      
 13%|█▎        | 6278/47590 [1:40:43<22:13, 30.97it/s]

{'embedding_loss': 0.0054, 'grad_norm': 0.8440632224082947, 'learning_rate': 1.8678060283439564e-08, 'epoch': 5.0}


                                                      
100%|██████████| 47590/47590 [17:01<00:00, 46.57it/s]]


{'train_runtime': 1021.8673, 'train_samples_per_second': 1490.213, 'train_steps_per_second': 46.572, 'train_loss': 0.025664358941615444, 'epoch': 5.0}


Next, define the configuration for the Python model.

In [14]:
# BERT-style configuration for the Python-comment model. These values are
# handed to models.Transformer as ``config_args``; note that the encoder
# depth (num_hidden_layers) is set to 2.
conf_python = dict(
    _name_or_path="nreimers/MiniLM-L6-H384-uncased",
    architectures=["BertModel"],
    attention_probs_dropout_prob=0.1,
    classifier_dropout=None,
    gradient_checkpointing=False,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=384,
    initializer_range=0.02,
    intermediate_size=1536,
    layer_norm_eps=1e-12,
    max_position_embeddings=512,
    model_type="bert",
    num_attention_heads=12,
    num_hidden_layers=2,
    pad_token_id=0,
    position_embedding_type="absolute",
    transformers_version="4.45.2",
    type_vocab_size=2,
    use_cache=True,
    vocab_size=30522,
)

The transformer is loaded from the Hugging Face Hub with the `conf_python` configuration arguments applied. The model is then built from the transformer and a pooling layer using the SentenceTransformer (SBERT) library.

In [15]:
# Load the MiniLM checkpoint with the conf_python overrides applied.
# conf_python sets num_hidden_layers to 2, and the load message in the cell
# output reports the checkpoint's encoder.layer.2+ weights as unused.
transformer = models.Transformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    max_seq_length=256,
    config_args=conf_python,
)

# Pool token embeddings into a single sentence vector: only mean pooling is
# switched on, every other pooling mode is explicitly disabled.
word_embedding_dim = transformer.get_word_embedding_dimension()
pooling = models.Pooling(
    word_embedding_dim,
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_sqrt_len_tokens=False,
    pooling_mode_weightedmean_tokens=False,
    pooling_mode_lasttoken=False,
    include_prompt=True,
)

# Chain the two modules into a SentenceTransformer and persist it to disk so
# the training cell can load it from this path.
modelMy = SentenceTransformer(modules=[transformer, pooling])
modelMy.save_pretrained('./models/aight-l6-python')

Some weights of the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 were not used when initializing BertModel: ['encoder.layer.2.attention.output.LayerNorm.bias', 'encoder.layer.2.attention.output.LayerNorm.weight', 'encoder.layer.2.attention.output.dense.bias', 'encoder.layer.2.attention.output.dense.weight', 'encoder.layer.2.attention.self.key.bias', 'encoder.layer.2.attention.self.key.weight', 'encoder.layer.2.attention.self.query.bias', 'encoder.layer.2.attention.self.query.weight', 'encoder.layer.2.attention.self.value.bias', 'encoder.layer.2.attention.self.value.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.attention.output.LayerNorm.bias', 'encoder.layer.3.attention.output.LayerNorm.weight', 'encoder.layer.3.attention.output.dense.bias', 'encod

The saved model, which has not yet been fine-tuned, is loaded as a SetFitModel, and a trainer is set up on the Python training split. The trainer then fine-tunes the model, and the trained model is saved.

In [16]:
# Load the sentence-transformer saved above as a SetFit model. No
# classification head was saved with it, so SetFit initialises one with random
# weights (see the "model_head.pkl not found" message in the cell output).
model = SetFitModel.from_pretrained(
    "./models/aight-l6-python",
    multi_target_strategy="multi-output",
    # Prefer the GPU, but fall back to the CPU so the cell still runs on
    # machines without CUDA instead of failing at load time.
    device="cuda" if torch.cuda.is_available() else "cpu",
    normalize_embeddings=False,
)

# Train on the Python training split; the dataset's "combo" column is used as
# the input text and "labels" as the target.
# NOTE(review): the cell output shows a warning raised at this call;
# SetFitTrainer appears to be deprecated in recent setfit releases in favour
# of Trainer + TrainingArguments. Confirm before migrating, since the
# replacement's defaults may differ.
trainer = SetFitTrainer(
    model=model,
    train_dataset=ds['python_train'],
    column_mapping={"combo": "text", "labels": "label"},
    num_epochs=5,
    batch_size=32,
)
trainer.train()

# Persist the fine-tuned model (body + head) for later evaluation.
trainer.model.save_pretrained('./models/aight-l6-python-trained')

model_head.pkl not found in D:\Documents\!Colleg\CS440-Final\Finalized\models\aight-l6-python, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Applying column mapping to the training dataset
***** Running training *****
  Num unique pairs = 75360
  Batch size = 32
  Num epochs = 5
                                                      
 13%|█▎        | 6278/47590 [1:41:29<22:13, 30.97it/s]

{'embedding_loss': 0.2599, 'grad_norm': 0.39053067564964294, 'learning_rate': 1.6977928692699493e-08, 'epoch': 0.0}


                                                      
 13%|█▎        | 6278/47590 [1:41:30<22:13, 30.97it/s]

{'embedding_loss': 0.2605, 'grad_norm': 0.47217968106269836, 'learning_rate': 8.488964346349746e-07, 'epoch': 0.02}


                                                      
 13%|█▎        | 6278/47590 [1:41:31<22:13, 30.97it/s]

{'embedding_loss': 0.2712, 'grad_norm': 0.42901062965393066, 'learning_rate': 1.6977928692699491e-06, 'epoch': 0.04}


                                                      
 13%|█▎        | 6278/47590 [1:41:32<22:13, 30.97it/s]

{'embedding_loss': 0.266, 'grad_norm': 0.4445774257183075, 'learning_rate': 2.546689303904924e-06, 'epoch': 0.06}


                                                      
 13%|█▎        | 6278/47590 [1:41:33<22:13, 30.97it/s]


{'embedding_loss': 0.2584, 'grad_norm': 0.33240556716918945, 'learning_rate': 3.3955857385398982e-06, 'epoch': 0.08}


                                                      
 13%|█▎        | 6278/47590 [1:41:34<22:13, 30.97it/s]

{'embedding_loss': 0.2496, 'grad_norm': 0.39813464879989624, 'learning_rate': 4.244482173174873e-06, 'epoch': 0.11}


                                                      
 13%|█▎        | 6278/47590 [1:41:34<22:13, 30.97it/s]

{'embedding_loss': 0.2541, 'grad_norm': 0.5638915300369263, 'learning_rate': 5.093378607809848e-06, 'epoch': 0.13}


                                                      
 13%|█▎        | 6278/47590 [1:41:35<22:13, 30.97it/s]

{'embedding_loss': 0.2525, 'grad_norm': 0.5340790748596191, 'learning_rate': 5.942275042444822e-06, 'epoch': 0.15}


                                                      
 13%|█▎        | 6278/47590 [1:41:36<22:13, 30.97it/s]

{'embedding_loss': 0.2504, 'grad_norm': 0.3911206126213074, 'learning_rate': 6.7911714770797965e-06, 'epoch': 0.17}


                                                      
 13%|█▎        | 6278/47590 [1:41:37<22:13, 30.97it/s]

{'embedding_loss': 0.2486, 'grad_norm': 0.4231305718421936, 'learning_rate': 7.640067911714771e-06, 'epoch': 0.19}


                                                      
 13%|█▎        | 6278/47590 [1:41:38<22:13, 30.97it/s]

{'embedding_loss': 0.2464, 'grad_norm': 0.6085556149482727, 'learning_rate': 8.488964346349745e-06, 'epoch': 0.21}



[A
                                                      
 13%|█▎        | 6278/47590 [1:41:40<22:13, 30.97it/s]


{'embedding_loss': 0.2485, 'grad_norm': 0.43917202949523926, 'learning_rate': 9.337860780984721e-06, 'epoch': 0.23}


                                                      
 13%|█▎        | 6278/47590 [1:41:41<22:13, 30.97it/s]

{'embedding_loss': 0.2451, 'grad_norm': 0.4409492313861847, 'learning_rate': 1.0186757215619695e-05, 'epoch': 0.25}


                                                      
 13%|█▎        | 6278/47590 [1:41:42<22:13, 30.97it/s]

{'embedding_loss': 0.2428, 'grad_norm': 0.47376739978790283, 'learning_rate': 1.103565365025467e-05, 'epoch': 0.28}


                                                      
 13%|█▎        | 6278/47590 [1:41:42<22:13, 30.97it/s]

{'embedding_loss': 0.2408, 'grad_norm': 0.44009828567504883, 'learning_rate': 1.1884550084889643e-05, 'epoch': 0.3}


                                                      
 13%|█▎        | 6278/47590 [1:41:43<22:13, 30.97it/s]


{'embedding_loss': 0.242, 'grad_norm': 0.48125219345092773, 'learning_rate': 1.2733446519524619e-05, 'epoch': 0.32}


                                                      
 13%|█▎        | 6278/47590 [1:41:44<22:13, 30.97it/s]

{'embedding_loss': 0.2356, 'grad_norm': 0.5811839699745178, 'learning_rate': 1.3582342954159593e-05, 'epoch': 0.34}


                                                      
 13%|█▎        | 6278/47590 [1:41:45<22:13, 30.97it/s]

{'embedding_loss': 0.2356, 'grad_norm': 0.49925845861434937, 'learning_rate': 1.4431239388794569e-05, 'epoch': 0.36}


                                                      
 13%|█▎        | 6278/47590 [1:41:46<22:13, 30.97it/s]

{'embedding_loss': 0.2325, 'grad_norm': 0.6507218480110168, 'learning_rate': 1.5280135823429543e-05, 'epoch': 0.38}


                                                      
 13%|█▎        | 6278/47590 [1:41:47<22:13, 30.97it/s]

{'embedding_loss': 0.2289, 'grad_norm': 0.5087506175041199, 'learning_rate': 1.6129032258064517e-05, 'epoch': 0.4}


                                                      
 13%|█▎        | 6278/47590 [1:41:47<22:13, 30.97it/s]

{'embedding_loss': 0.218, 'grad_norm': 0.823246419429779, 'learning_rate': 1.697792869269949e-05, 'epoch': 0.42}


                                                      
 13%|█▎        | 6278/47590 [1:41:49<22:13, 30.97it/s]


{'embedding_loss': 0.2151, 'grad_norm': 0.811379611492157, 'learning_rate': 1.7826825127334465e-05, 'epoch': 0.45}


                                                      A
 13%|█▎        | 6278/47590 [1:41:50<22:13, 30.97it/s]

{'embedding_loss': 0.2006, 'grad_norm': 0.9522343873977661, 'learning_rate': 1.8675721561969442e-05, 'epoch': 0.47}


                                                      
 13%|█▎        | 6278/47590 [1:41:51<22:13, 30.97it/s]


{'embedding_loss': 0.1929, 'grad_norm': 0.9298590421676636, 'learning_rate': 1.9524617996604416e-05, 'epoch': 0.49}


                                                      A
 13%|█▎        | 6278/47590 [1:41:52<22:13, 30.97it/s]

{'embedding_loss': 0.1864, 'grad_norm': 1.0321351289749146, 'learning_rate': 1.9958478814758893e-05, 'epoch': 0.51}


                                                      
 13%|█▎        | 6278/47590 [1:41:53<22:13, 30.97it/s]

{'embedding_loss': 0.1872, 'grad_norm': 0.826076328754425, 'learning_rate': 1.986411248466547e-05, 'epoch': 0.53}


                                                      
 13%|█▎        | 6278/47590 [1:41:53<22:13, 30.97it/s]

{'embedding_loss': 0.172, 'grad_norm': 1.212126612663269, 'learning_rate': 1.9769746154572048e-05, 'epoch': 0.55}


                                                      
 13%|█▎        | 6278/47590 [1:41:54<22:13, 30.97it/s]

{'embedding_loss': 0.1595, 'grad_norm': 0.7458695769309998, 'learning_rate': 1.967537982447863e-05, 'epoch': 0.57}


                                                      
 13%|█▎        | 6278/47590 [1:41:55<22:13, 30.97it/s]

{'embedding_loss': 0.16, 'grad_norm': 1.746485710144043, 'learning_rate': 1.9581013494385206e-05, 'epoch': 0.59}


                                                      
 13%|█▎        | 6278/47590 [1:41:56<22:13, 30.97it/s]

{'embedding_loss': 0.1577, 'grad_norm': 1.0362039804458618, 'learning_rate': 1.9486647164291784e-05, 'epoch': 0.62}


                                                      
 13%|█▎        | 6278/47590 [1:41:57<22:13, 30.97it/s]

{'embedding_loss': 0.1557, 'grad_norm': 1.1585294008255005, 'learning_rate': 1.939228083419836e-05, 'epoch': 0.64}


                                                      
 13%|█▎        | 6278/47590 [1:41:58<22:13, 30.97it/s]

{'embedding_loss': 0.1429, 'grad_norm': 0.9821093678474426, 'learning_rate': 1.929791450410494e-05, 'epoch': 0.66}


                                                      
 13%|█▎        | 6278/47590 [1:41:59<22:13, 30.97it/s]


{'embedding_loss': 0.1446, 'grad_norm': 1.039841651916504, 'learning_rate': 1.9203548174011516e-05, 'epoch': 0.68}


                                                      A
 13%|█▎        | 6278/47590 [1:42:00<22:13, 30.97it/s]

{'embedding_loss': 0.1445, 'grad_norm': 0.9739137887954712, 'learning_rate': 1.9109181843918093e-05, 'epoch': 0.7}


                                                      
 13%|█▎        | 6278/47590 [1:42:01<22:13, 30.97it/s]

{'embedding_loss': 0.1264, 'grad_norm': 1.3591004610061646, 'learning_rate': 1.901481551382467e-05, 'epoch': 0.72}


                                                      
 13%|█▎        | 6278/47590 [1:42:02<22:13, 30.97it/s]

{'embedding_loss': 0.1214, 'grad_norm': 1.160138487815857, 'learning_rate': 1.8920449183731248e-05, 'epoch': 0.74}


                                                      
 13%|█▎        | 6278/47590 [1:42:03<22:13, 30.97it/s]

{'embedding_loss': 0.126, 'grad_norm': 1.5432324409484863, 'learning_rate': 1.8826082853637822e-05, 'epoch': 0.76}


                                                      
 13%|█▎        | 6278/47590 [1:42:04<22:13, 30.97it/s]

{'embedding_loss': 0.1192, 'grad_norm': 1.4203063249588013, 'learning_rate': 1.87317165235444e-05, 'epoch': 0.79}


                                                      
 13%|█▎        | 6278/47590 [1:42:04<22:13, 30.97it/s]


{'embedding_loss': 0.1148, 'grad_norm': 2.61142635345459, 'learning_rate': 1.8637350193450977e-05, 'epoch': 0.81}


                                                      A
 13%|█▎        | 6278/47590 [1:42:05<22:13, 30.97it/s]

{'embedding_loss': 0.1153, 'grad_norm': 1.146870732307434, 'learning_rate': 1.8542983863357554e-05, 'epoch': 0.83}


                                                      
 13%|█▎        | 6278/47590 [1:42:07<22:13, 30.97it/s]

{'embedding_loss': 0.1106, 'grad_norm': 2.0022037029266357, 'learning_rate': 1.8448617533264132e-05, 'epoch': 0.85}


                                                      
 13%|█▎        | 6278/47590 [1:42:09<22:13, 30.97it/s]

{'embedding_loss': 0.1098, 'grad_norm': 1.4927603006362915, 'learning_rate': 1.835425120317071e-05, 'epoch': 0.87}


                                                      
 13%|█▎        | 6278/47590 [1:42:11<22:13, 30.97it/s]

{'embedding_loss': 0.1085, 'grad_norm': 1.5916166305541992, 'learning_rate': 1.8259884873077287e-05, 'epoch': 0.89}


                                                      
 13%|█▎        | 6278/47590 [1:42:14<22:13, 30.97it/s]

{'embedding_loss': 0.1102, 'grad_norm': 1.3477866649627686, 'learning_rate': 1.8165518542983864e-05, 'epoch': 0.91}


                                                      
 13%|█▎        | 6278/47590 [1:42:16<22:13, 30.97it/s]

{'embedding_loss': 0.1025, 'grad_norm': 1.5078800916671753, 'learning_rate': 1.807115221289044e-05, 'epoch': 0.93}


                                                      
 13%|█▎        | 6278/47590 [1:42:17<22:13, 30.97it/s]

{'embedding_loss': 0.1009, 'grad_norm': 1.4973108768463135, 'learning_rate': 1.797678588279702e-05, 'epoch': 0.96}


                                                      
 13%|█▎        | 6278/47590 [1:42:18<22:13, 30.97it/s]

{'embedding_loss': 0.0978, 'grad_norm': 0.7777794599533081, 'learning_rate': 1.7882419552703596e-05, 'epoch': 0.98}


                                                      
 13%|█▎        | 6278/47590 [1:42:19<22:13, 30.97it/s]


{'embedding_loss': 0.0935, 'grad_norm': 0.9650525450706482, 'learning_rate': 1.7788053222610174e-05, 'epoch': 1.0}


                                                      A
 13%|█▎        | 6278/47590 [1:42:20<22:13, 30.97it/s]

{'embedding_loss': 0.0883, 'grad_norm': 1.1192431449890137, 'learning_rate': 1.769368689251675e-05, 'epoch': 1.02}


                                                      
 13%|█▎        | 6278/47590 [1:42:21<22:13, 30.97it/s]

{'embedding_loss': 0.0894, 'grad_norm': 1.7276209592819214, 'learning_rate': 1.759932056242333e-05, 'epoch': 1.04}


                                                      
 13%|█▎        | 6278/47590 [1:42:22<22:13, 30.97it/s]

{'embedding_loss': 0.0842, 'grad_norm': 1.1933916807174683, 'learning_rate': 1.7504954232329906e-05, 'epoch': 1.06}


                                                      
 13%|█▎        | 6278/47590 [1:42:24<22:13, 30.97it/s]

{'embedding_loss': 0.0892, 'grad_norm': 1.2040925025939941, 'learning_rate': 1.7410587902236483e-05, 'epoch': 1.08}


                                                      
 13%|█▎        | 6278/47590 [1:42:25<22:13, 30.97it/s]

{'embedding_loss': 0.0833, 'grad_norm': 1.5934720039367676, 'learning_rate': 1.731622157214306e-05, 'epoch': 1.1}


                                                      
 13%|█▎        | 6278/47590 [1:42:26<22:13, 30.97it/s]

{'embedding_loss': 0.0871, 'grad_norm': 1.1956161260604858, 'learning_rate': 1.7221855242049638e-05, 'epoch': 1.13}


                                                      
 13%|█▎        | 6278/47590 [1:42:27<22:13, 30.97it/s]


{'embedding_loss': 0.0873, 'grad_norm': 2.4105141162872314, 'learning_rate': 1.7127488911956216e-05, 'epoch': 1.15}


                                                      A
 13%|█▎        | 6278/47590 [1:42:29<22:13, 30.97it/s]

{'embedding_loss': 0.0877, 'grad_norm': 1.554222583770752, 'learning_rate': 1.7033122581862793e-05, 'epoch': 1.17}


                                                      
 13%|█▎        | 6278/47590 [1:42:30<22:13, 30.97it/s]

{'embedding_loss': 0.0822, 'grad_norm': 1.0311168432235718, 'learning_rate': 1.693875625176937e-05, 'epoch': 1.19}


                                                      
 13%|█▎        | 6278/47590 [1:42:31<22:13, 30.97it/s]

{'embedding_loss': 0.0805, 'grad_norm': 1.5694512128829956, 'learning_rate': 1.6844389921675948e-05, 'epoch': 1.21}


                                                      
 13%|█▎        | 6278/47590 [1:42:32<22:13, 30.97it/s]

{'embedding_loss': 0.0731, 'grad_norm': 1.624383568763733, 'learning_rate': 1.6750023591582525e-05, 'epoch': 1.23}


                                                      
 13%|█▎        | 6278/47590 [1:42:34<22:13, 30.97it/s]

{'embedding_loss': 0.0776, 'grad_norm': 1.0167440176010132, 'learning_rate': 1.6655657261489103e-05, 'epoch': 1.25}


                                                      
 13%|█▎        | 6278/47590 [1:42:35<22:13, 30.97it/s]

{'embedding_loss': 0.0739, 'grad_norm': 1.165687918663025, 'learning_rate': 1.656129093139568e-05, 'epoch': 1.27}


                                                      
 13%|█▎        | 6278/47590 [1:42:37<22:13, 30.97it/s]

{'embedding_loss': 0.079, 'grad_norm': 1.3095195293426514, 'learning_rate': 1.6466924601302257e-05, 'epoch': 1.3}


                                                      
 13%|█▎        | 6278/47590 [1:42:39<22:13, 30.97it/s]

{'embedding_loss': 0.0722, 'grad_norm': 1.6775633096694946, 'learning_rate': 1.6372558271208835e-05, 'epoch': 1.32}


                                                      
 13%|█▎        | 6278/47590 [1:42:41<22:13, 30.97it/s]

{'embedding_loss': 0.0776, 'grad_norm': 0.49943849444389343, 'learning_rate': 1.6278191941115412e-05, 'epoch': 1.34}


                                                      
 13%|█▎        | 6278/47590 [1:42:43<22:13, 30.97it/s]

{'embedding_loss': 0.0748, 'grad_norm': 1.1457141637802124, 'learning_rate': 1.618382561102199e-05, 'epoch': 1.36}


                                                      
 13%|█▎        | 6278/47590 [1:42:44<22:13, 30.97it/s]


{'embedding_loss': 0.0714, 'grad_norm': 1.2526897192001343, 'learning_rate': 1.6089459280928567e-05, 'epoch': 1.38}


                                                      A
 13%|█▎        | 6278/47590 [1:42:46<22:13, 30.97it/s]

{'embedding_loss': 0.0727, 'grad_norm': 1.77902090549469, 'learning_rate': 1.599509295083514e-05, 'epoch': 1.4}


                                                      
 13%|█▎        | 6278/47590 [1:42:47<22:13, 30.97it/s]

{'embedding_loss': 0.0704, 'grad_norm': 1.0541859865188599, 'learning_rate': 1.590072662074172e-05, 'epoch': 1.42}


                                                      
 13%|█▎        | 6278/47590 [1:42:49<22:13, 30.97it/s]

{'embedding_loss': 0.0672, 'grad_norm': 0.8980148434638977, 'learning_rate': 1.5806360290648296e-05, 'epoch': 1.44}


                                                      
 13%|█▎        | 6278/47590 [1:42:51<22:13, 30.97it/s]

{'embedding_loss': 0.0692, 'grad_norm': 1.3063253164291382, 'learning_rate': 1.5711993960554873e-05, 'epoch': 1.46}


                                                      
 13%|█▎        | 6278/47590 [1:42:52<22:13, 30.97it/s]

{'embedding_loss': 0.0719, 'grad_norm': 1.5628786087036133, 'learning_rate': 1.561762763046145e-05, 'epoch': 1.49}


                                                      
 13%|█▎        | 6278/47590 [1:42:55<22:13, 30.97it/s]

{'embedding_loss': 0.0654, 'grad_norm': 1.3986934423446655, 'learning_rate': 1.552326130036803e-05, 'epoch': 1.51}


                                                      
 13%|█▎        | 6278/47590 [1:42:57<22:13, 30.97it/s]


{'embedding_loss': 0.0635, 'grad_norm': 1.5078580379486084, 'learning_rate': 1.542889497027461e-05, 'epoch': 1.53}


                                                      A
 13%|█▎        | 6278/47590 [1:42:59<22:13, 30.97it/s]

{'embedding_loss': 0.0677, 'grad_norm': 0.9491287469863892, 'learning_rate': 1.5334528640181186e-05, 'epoch': 1.55}


                                                      
 13%|█▎        | 6278/47590 [1:43:00<22:13, 30.97it/s]

{'embedding_loss': 0.0564, 'grad_norm': 1.6838582754135132, 'learning_rate': 1.5240162310087762e-05, 'epoch': 1.57}


                                                      
 13%|█▎        | 6278/47590 [1:43:02<22:13, 30.97it/s]

{'embedding_loss': 0.064, 'grad_norm': 1.6300220489501953, 'learning_rate': 1.514579597999434e-05, 'epoch': 1.59}


                                                      
 13%|█▎        | 6278/47590 [1:43:04<22:13, 30.97it/s]

{'embedding_loss': 0.0614, 'grad_norm': 0.9638175964355469, 'learning_rate': 1.5051429649900917e-05, 'epoch': 1.61}


                                                      
 13%|█▎        | 6278/47590 [1:43:06<22:13, 30.97it/s]


{'embedding_loss': 0.0595, 'grad_norm': 0.4593752324581146, 'learning_rate': 1.4957063319807494e-05, 'epoch': 1.63}


                                                      A
 13%|█▎        | 6278/47590 [1:43:08<22:13, 30.97it/s]

{'embedding_loss': 0.0619, 'grad_norm': 0.7311236262321472, 'learning_rate': 1.4862696989714072e-05, 'epoch': 1.66}


                                                      
 13%|█▎        | 6278/47590 [1:43:09<22:13, 30.97it/s]

{'embedding_loss': 0.0606, 'grad_norm': 0.9747270941734314, 'learning_rate': 1.4768330659620649e-05, 'epoch': 1.68}


                                                      
 13%|█▎        | 6278/47590 [1:43:10<22:13, 30.97it/s]

{'embedding_loss': 0.0559, 'grad_norm': 1.19496488571167, 'learning_rate': 1.4673964329527226e-05, 'epoch': 1.7}


                                                      
 13%|█▎        | 6278/47590 [1:43:12<22:13, 30.97it/s]


{'embedding_loss': 0.0554, 'grad_norm': 1.236689567565918, 'learning_rate': 1.4579597999433804e-05, 'epoch': 1.72}


                                                      A
 13%|█▎        | 6278/47590 [1:43:13<22:13, 30.97it/s]

{'embedding_loss': 0.0593, 'grad_norm': 1.7251988649368286, 'learning_rate': 1.4485231669340381e-05, 'epoch': 1.74}


                                                      
 13%|█▎        | 6278/47590 [1:43:14<22:13, 30.97it/s]

{'embedding_loss': 0.053, 'grad_norm': 1.2222778797149658, 'learning_rate': 1.4390865339246957e-05, 'epoch': 1.76}


                                                      
 13%|█▎        | 6278/47590 [1:43:15<22:13, 30.97it/s]

{'embedding_loss': 0.0602, 'grad_norm': 0.8434850573539734, 'learning_rate': 1.4296499009153534e-05, 'epoch': 1.78}


                                                      
 13%|█▎        | 6278/47590 [1:43:17<22:13, 30.97it/s]

{'embedding_loss': 0.0591, 'grad_norm': 1.5330034494400024, 'learning_rate': 1.4202132679060112e-05, 'epoch': 1.8}


                                                      
 13%|█▎        | 6278/47590 [1:43:18<22:13, 30.97it/s]

{'embedding_loss': 0.0549, 'grad_norm': 0.9224586486816406, 'learning_rate': 1.410776634896669e-05, 'epoch': 1.83}


                                                      
 13%|█▎        | 6278/47590 [1:43:19<22:13, 30.97it/s]

{'embedding_loss': 0.0536, 'grad_norm': 1.169957160949707, 'learning_rate': 1.4013400018873267e-05, 'epoch': 1.85}


                                                      
 13%|█▎        | 6278/47590 [1:43:20<22:13, 30.97it/s]

{'embedding_loss': 0.0561, 'grad_norm': 1.0908294916152954, 'learning_rate': 1.3919033688779844e-05, 'epoch': 1.87}


                                                      
 13%|█▎        | 6278/47590 [1:43:21<22:13, 30.97it/s]


{'embedding_loss': 0.0547, 'grad_norm': 2.2266271114349365, 'learning_rate': 1.3824667358686421e-05, 'epoch': 1.89}


                                                      A
 13%|█▎        | 6278/47590 [1:43:22<22:13, 30.97it/s]

{'embedding_loss': 0.0538, 'grad_norm': 0.681567370891571, 'learning_rate': 1.3730301028592999e-05, 'epoch': 1.91}


                                                      
 13%|█▎        | 6278/47590 [1:43:24<22:13, 30.97it/s]

{'embedding_loss': 0.0552, 'grad_norm': 1.7120333909988403, 'learning_rate': 1.3635934698499576e-05, 'epoch': 1.93}


                                                      
 13%|█▎        | 6278/47590 [1:43:25<22:13, 30.97it/s]

{'embedding_loss': 0.0462, 'grad_norm': 1.9791457653045654, 'learning_rate': 1.3541568368406152e-05, 'epoch': 1.95}


                                                      
 13%|█▎        | 6278/47590 [1:43:26<22:13, 30.97it/s]


{'embedding_loss': 0.0431, 'grad_norm': 1.6466846466064453, 'learning_rate': 1.3447202038312733e-05, 'epoch': 1.97}


                                                      A
 13%|█▎        | 6278/47590 [1:43:28<22:13, 30.97it/s]


{'embedding_loss': 0.0506, 'grad_norm': 1.0583937168121338, 'learning_rate': 1.335283570821931e-05, 'epoch': 2.0}


                                                      A
 13%|█▎        | 6278/47590 [1:43:29<22:13, 30.97it/s]


{'embedding_loss': 0.0486, 'grad_norm': 1.4488005638122559, 'learning_rate': 1.3258469378125886e-05, 'epoch': 2.02}


                                                      A
 13%|█▎        | 6278/47590 [1:43:30<22:13, 30.97it/s]

{'embedding_loss': 0.0448, 'grad_norm': 1.3286322355270386, 'learning_rate': 1.3164103048032463e-05, 'epoch': 2.04}


                                                      
 13%|█▎        | 6278/47590 [1:43:32<22:13, 30.97it/s]


{'embedding_loss': 0.047, 'grad_norm': 1.1561821699142456, 'learning_rate': 1.306973671793904e-05, 'epoch': 2.06}


                                                      A
 13%|█▎        | 6278/47590 [1:43:35<22:13, 30.97it/s]

{'embedding_loss': 0.0428, 'grad_norm': 1.1890009641647339, 'learning_rate': 1.2975370387845618e-05, 'epoch': 2.08}


                                                      
 13%|█▎        | 6278/47590 [1:43:38<22:13, 30.97it/s]


{'embedding_loss': 0.0443, 'grad_norm': 2.4920670986175537, 'learning_rate': 1.2881004057752196e-05, 'epoch': 2.1}


                                                      A
 13%|█▎        | 6278/47590 [1:43:39<22:13, 30.97it/s]

{'embedding_loss': 0.0443, 'grad_norm': 0.5676487684249878, 'learning_rate': 1.2786637727658773e-05, 'epoch': 2.12}


                                                      
 13%|█▎        | 6278/47590 [1:43:42<22:13, 30.97it/s]

{'embedding_loss': 0.0427, 'grad_norm': 1.3954823017120361, 'learning_rate': 1.269227139756535e-05, 'epoch': 2.14}


                                                      
 13%|█▎        | 6278/47590 [1:43:47<22:13, 30.97it/s]

{'embedding_loss': 0.0458, 'grad_norm': 0.7764974236488342, 'learning_rate': 1.2597905067471928e-05, 'epoch': 2.17}


                                                      
 13%|█▎        | 6278/47590 [1:43:53<22:13, 30.97it/s]

{'embedding_loss': 0.0422, 'grad_norm': 0.2696070075035095, 'learning_rate': 1.2503538737378505e-05, 'epoch': 2.19}


                                                      
 13%|█▎        | 6278/47590 [1:43:56<22:13, 30.97it/s]

{'embedding_loss': 0.0444, 'grad_norm': 1.1065552234649658, 'learning_rate': 1.2409172407285081e-05, 'epoch': 2.21}


                                                      
 13%|█▎        | 6278/47590 [1:44:06<22:13, 30.97it/s]


{'embedding_loss': 0.0416, 'grad_norm': 0.39030539989471436, 'learning_rate': 1.2314806077191658e-05, 'epoch': 2.23}


                                                      A
 13%|█▎        | 6278/47590 [1:44:27<22:13, 30.97it/s]

{'embedding_loss': 0.0442, 'grad_norm': 1.4544107913970947, 'learning_rate': 1.2220439747098236e-05, 'epoch': 2.25}


                                                      
 13%|█▎        | 6278/47590 [1:44:54<22:13, 30.97it/s]

{'embedding_loss': 0.0391, 'grad_norm': 1.7017053365707397, 'learning_rate': 1.2126073417004813e-05, 'epoch': 2.27}


                                                      
 13%|█▎        | 6278/47590 [1:45:04<22:13, 30.97it/s]

{'embedding_loss': 0.0449, 'grad_norm': 1.556042194366455, 'learning_rate': 1.203170708691139e-05, 'epoch': 2.29}


                                                      
 13%|█▎        | 6278/47590 [1:45:13<22:13, 30.97it/s]

{'embedding_loss': 0.041, 'grad_norm': 1.3575490713119507, 'learning_rate': 1.1937340756817968e-05, 'epoch': 2.31}


                                                      
 13%|█▎        | 6278/47590 [1:45:20<22:13, 30.97it/s]

{'embedding_loss': 0.0396, 'grad_norm': 0.7825729250907898, 'learning_rate': 1.1842974426724545e-05, 'epoch': 2.34}


                                                      
 13%|█▎        | 6278/47590 [1:45:22<22:13, 30.97it/s]

{'embedding_loss': 0.0363, 'grad_norm': 1.1615163087844849, 'learning_rate': 1.1748608096631123e-05, 'epoch': 2.36}


                                                      
 13%|█▎        | 6278/47590 [1:45:44<22:13, 30.97it/s]

{'embedding_loss': 0.0438, 'grad_norm': 1.300649881362915, 'learning_rate': 1.16542417665377e-05, 'epoch': 2.38}


                                                      
 13%|█▎        | 6278/47590 [1:45:54<22:13, 30.97it/s]

{'embedding_loss': 0.0395, 'grad_norm': 0.6668403744697571, 'learning_rate': 1.1559875436444276e-05, 'epoch': 2.4}


                                                      
 13%|█▎        | 6278/47590 [1:45:57<22:13, 30.97it/s]

{'embedding_loss': 0.0377, 'grad_norm': 0.4945701062679291, 'learning_rate': 1.1465509106350853e-05, 'epoch': 2.42}


                                                      
 13%|█▎        | 6278/47590 [1:46:23<22:13, 30.97it/s]

{'embedding_loss': 0.0389, 'grad_norm': 1.6008461713790894, 'learning_rate': 1.1371142776257434e-05, 'epoch': 2.44}


                                                      
 13%|█▎        | 6278/47590 [1:46:43<22:13, 30.97it/s]

{'embedding_loss': 0.0374, 'grad_norm': 0.5373523235321045, 'learning_rate': 1.127677644616401e-05, 'epoch': 2.46}


                                                      
 13%|█▎        | 6278/47590 [1:46:54<22:13, 30.97it/s]

{'embedding_loss': 0.0366, 'grad_norm': 0.7065171003341675, 'learning_rate': 1.1182410116070587e-05, 'epoch': 2.48}


                                                      
 13%|█▎        | 6278/47590 [1:46:58<22:13, 30.97it/s]

{'embedding_loss': 0.0375, 'grad_norm': 0.8941874504089355, 'learning_rate': 1.1088043785977165e-05, 'epoch': 2.51}


                                                      
 13%|█▎        | 6278/47590 [1:47:09<22:13, 30.97it/s]

{'embedding_loss': 0.04, 'grad_norm': 1.1114424467086792, 'learning_rate': 1.0993677455883742e-05, 'epoch': 2.53}


                                                      
 13%|█▎        | 6278/47590 [1:47:16<22:13, 30.97it/s]

{'embedding_loss': 0.0377, 'grad_norm': 1.2407290935516357, 'learning_rate': 1.089931112579032e-05, 'epoch': 2.55}


                                                      
 13%|█▎        | 6278/47590 [1:47:24<22:13, 30.97it/s]

{'embedding_loss': 0.0382, 'grad_norm': 1.6338927745819092, 'learning_rate': 1.0804944795696897e-05, 'epoch': 2.57}


                                                      
 13%|█▎        | 6278/47590 [1:47:28<22:13, 30.97it/s]

{'embedding_loss': 0.0319, 'grad_norm': 1.0304394960403442, 'learning_rate': 1.0710578465603474e-05, 'epoch': 2.59}


                                                      
 13%|█▎        | 6278/47590 [1:47:35<22:13, 30.97it/s]

{'embedding_loss': 0.0361, 'grad_norm': 0.5261619091033936, 'learning_rate': 1.0616212135510052e-05, 'epoch': 2.61}


                                                      
 13%|█▎        | 6278/47590 [1:47:51<22:13, 30.97it/s]

{'embedding_loss': 0.0341, 'grad_norm': 1.6666195392608643, 'learning_rate': 1.0521845805416629e-05, 'epoch': 2.63}


                                                      
 13%|█▎        | 6278/47590 [1:48:03<22:13, 30.97it/s]

{'embedding_loss': 0.0317, 'grad_norm': 0.8397555351257324, 'learning_rate': 1.0427479475323205e-05, 'epoch': 2.65}


                                                      
 13%|█▎        | 6278/47590 [1:48:09<22:13, 30.97it/s]

{'embedding_loss': 0.0334, 'grad_norm': 1.3571332693099976, 'learning_rate': 1.0333113145229782e-05, 'epoch': 2.68}


                                                      
 13%|█▎        | 6278/47590 [1:48:15<22:13, 30.97it/s]


{'embedding_loss': 0.034, 'grad_norm': 1.5749050378799438, 'learning_rate': 1.023874681513636e-05, 'epoch': 2.7}


                                                      A
 13%|█▎        | 6278/47590 [1:48:17<22:13, 30.97it/s]

{'embedding_loss': 0.033, 'grad_norm': 1.1221414804458618, 'learning_rate': 1.0144380485042937e-05, 'epoch': 2.72}


                                                      
 13%|█▎        | 6278/47590 [1:48:19<22:13, 30.97it/s]

{'embedding_loss': 0.0365, 'grad_norm': 1.1867215633392334, 'learning_rate': 1.0050014154949514e-05, 'epoch': 2.74}


                                                      
 13%|█▎        | 6278/47590 [1:48:21<22:13, 30.97it/s]

{'embedding_loss': 0.0345, 'grad_norm': 0.6357465982437134, 'learning_rate': 9.955647824856092e-06, 'epoch': 2.76}


                                                      
 13%|█▎        | 6278/47590 [1:48:37<22:13, 30.97it/s]


{'embedding_loss': 0.0279, 'grad_norm': 0.741698682308197, 'learning_rate': 9.86128149476267e-06, 'epoch': 2.78}


                                                      A
 13%|█▎        | 6278/47590 [1:48:54<22:13, 30.97it/s]


{'embedding_loss': 0.0331, 'grad_norm': 1.449977159500122, 'learning_rate': 9.766915164669247e-06, 'epoch': 2.8}


                                                      A
 13%|█▎        | 6278/47590 [1:48:55<22:13, 30.97it/s]

{'embedding_loss': 0.0323, 'grad_norm': 1.4040957689285278, 'learning_rate': 9.672548834575824e-06, 'epoch': 2.82}


                                                      
 13%|█▎        | 6278/47590 [1:49:00<22:13, 30.97it/s]

{'embedding_loss': 0.032, 'grad_norm': 0.9004467725753784, 'learning_rate': 9.578182504482401e-06, 'epoch': 2.85}


                                                      
 13%|█▎        | 6278/47590 [1:49:09<22:13, 30.97it/s]


{'embedding_loss': 0.0304, 'grad_norm': 0.6537604928016663, 'learning_rate': 9.483816174388979e-06, 'epoch': 2.87}


                                                      A
 13%|█▎        | 6278/47590 [1:49:20<22:13, 30.97it/s]

{'embedding_loss': 0.0343, 'grad_norm': 0.9732991456985474, 'learning_rate': 9.389449844295556e-06, 'epoch': 2.89}


                                                      
 13%|█▎        | 6278/47590 [1:49:32<22:13, 30.97it/s]


{'embedding_loss': 0.0326, 'grad_norm': 1.1898083686828613, 'learning_rate': 9.295083514202134e-06, 'epoch': 2.91}


                                                      A
 13%|█▎        | 6278/47590 [1:49:45<22:13, 30.97it/s]

{'embedding_loss': 0.0339, 'grad_norm': 0.5529609322547913, 'learning_rate': 9.200717184108711e-06, 'epoch': 2.93}


                                                      
 13%|█▎        | 6278/47590 [1:50:00<22:13, 30.97it/s]

{'embedding_loss': 0.0286, 'grad_norm': 0.6332519054412842, 'learning_rate': 9.106350854015289e-06, 'epoch': 2.95}


                                                      
 13%|█▎        | 6278/47590 [1:50:19<22:13, 30.97it/s]

{'embedding_loss': 0.0312, 'grad_norm': 1.019378423690796, 'learning_rate': 9.011984523921864e-06, 'epoch': 2.97}


                                                      
 13%|█▎        | 6278/47590 [1:50:29<22:13, 30.97it/s]

{'embedding_loss': 0.0263, 'grad_norm': 1.367685079574585, 'learning_rate': 8.917618193828442e-06, 'epoch': 2.99}


                                                      
 13%|█▎        | 6278/47590 [1:50:30<22:13, 30.97it/s]

{'embedding_loss': 0.0268, 'grad_norm': 1.0801682472229004, 'learning_rate': 8.823251863735019e-06, 'epoch': 3.01}


                                                      
 13%|█▎        | 6278/47590 [1:50:31<22:13, 30.97it/s]

{'embedding_loss': 0.0293, 'grad_norm': 1.0132293701171875, 'learning_rate': 8.728885533641598e-06, 'epoch': 3.04}


                                                      
 13%|█▎        | 6278/47590 [1:50:41<22:13, 30.97it/s]

{'embedding_loss': 0.0239, 'grad_norm': 0.9687350988388062, 'learning_rate': 8.634519203548176e-06, 'epoch': 3.06}


                                                      
 13%|█▎        | 6278/47590 [1:50:46<22:13, 30.97it/s]


{'embedding_loss': 0.0333, 'grad_norm': 2.0273075103759766, 'learning_rate': 8.540152873454753e-06, 'epoch': 3.08}


                                                      A
 13%|█▎        | 6278/47590 [1:50:48<22:13, 30.97it/s]

{'embedding_loss': 0.0257, 'grad_norm': 1.2760766744613647, 'learning_rate': 8.44578654336133e-06, 'epoch': 3.1}


                                                      
 13%|█▎        | 6278/47590 [1:51:04<22:13, 30.97it/s]

{'embedding_loss': 0.029, 'grad_norm': 2.282566547393799, 'learning_rate': 8.351420213267906e-06, 'epoch': 3.12}


                                                      
 13%|█▎        | 6278/47590 [1:51:09<22:13, 30.97it/s]

{'embedding_loss': 0.0299, 'grad_norm': 0.9110583066940308, 'learning_rate': 8.257053883174483e-06, 'epoch': 3.14}


                                                      
 13%|█▎        | 6278/47590 [1:51:15<22:13, 30.97it/s]

{'embedding_loss': 0.0297, 'grad_norm': 0.5143120884895325, 'learning_rate': 8.162687553081061e-06, 'epoch': 3.16}


                                                      
 13%|█▎        | 6278/47590 [1:51:21<22:13, 30.97it/s]

{'embedding_loss': 0.0262, 'grad_norm': 1.035187840461731, 'learning_rate': 8.068321222987638e-06, 'epoch': 3.18}


                                                      
 13%|█▎        | 6278/47590 [1:51:25<22:13, 30.97it/s]


{'embedding_loss': 0.0248, 'grad_norm': 0.5258978605270386, 'learning_rate': 7.973954892894216e-06, 'epoch': 3.21}


                                                      A
 13%|█▎        | 6278/47590 [1:51:28<22:13, 30.97it/s]

{'embedding_loss': 0.0276, 'grad_norm': 1.4755195379257202, 'learning_rate': 7.879588562800793e-06, 'epoch': 3.23}


                                                      
 13%|█▎        | 6278/47590 [1:51:31<22:13, 30.97it/s]

{'embedding_loss': 0.0317, 'grad_norm': 1.5312126874923706, 'learning_rate': 7.78522223270737e-06, 'epoch': 3.25}


                                                      
 13%|█▎        | 6278/47590 [1:51:34<22:13, 30.97it/s]

{'embedding_loss': 0.0284, 'grad_norm': 0.9904761910438538, 'learning_rate': 7.690855902613948e-06, 'epoch': 3.27}


                                                      
 13%|█▎        | 6278/47590 [1:51:39<22:13, 30.97it/s]

{'embedding_loss': 0.0308, 'grad_norm': 1.4790172576904297, 'learning_rate': 7.596489572520525e-06, 'epoch': 3.29}


                                                      
 13%|█▎        | 6278/47590 [1:51:42<22:13, 30.97it/s]

{'embedding_loss': 0.0258, 'grad_norm': 0.4970121383666992, 'learning_rate': 7.502123242427103e-06, 'epoch': 3.31}


                                                      
 13%|█▎        | 6278/47590 [1:51:46<22:13, 30.97it/s]

{'embedding_loss': 0.0298, 'grad_norm': 1.3068530559539795, 'learning_rate': 7.40775691233368e-06, 'epoch': 3.33}


                                                      
 13%|█▎        | 6278/47590 [1:51:55<22:13, 30.97it/s]

{'embedding_loss': 0.0292, 'grad_norm': 1.3096270561218262, 'learning_rate': 7.313390582240258e-06, 'epoch': 3.35}


                                                      
 13%|█▎        | 6278/47590 [1:52:02<22:13, 30.97it/s]


{'embedding_loss': 0.026, 'grad_norm': 0.670796275138855, 'learning_rate': 7.219024252146834e-06, 'epoch': 3.38}


                                                      A
 13%|█▎        | 6278/47590 [1:52:10<22:13, 30.97it/s]

{'embedding_loss': 0.0267, 'grad_norm': 0.9042291045188904, 'learning_rate': 7.1246579220534116e-06, 'epoch': 3.4}


                                                      
 13%|█▎        | 6278/47590 [1:52:26<22:13, 30.97it/s]


{'embedding_loss': 0.0252, 'grad_norm': 1.1705950498580933, 'learning_rate': 7.030291591959989e-06, 'epoch': 3.42}


                                                      A
 13%|█▎        | 6278/47590 [1:52:29<22:13, 30.97it/s]

{'embedding_loss': 0.0309, 'grad_norm': 1.4333529472351074, 'learning_rate': 6.935925261866566e-06, 'epoch': 3.44}


                                                      
 13%|█▎        | 6278/47590 [1:52:35<22:13, 30.97it/s]

{'embedding_loss': 0.027, 'grad_norm': 0.8610638976097107, 'learning_rate': 6.841558931773144e-06, 'epoch': 3.46}


                                                      
 13%|█▎        | 6278/47590 [1:52:45<22:13, 30.97it/s]


{'embedding_loss': 0.0238, 'grad_norm': 0.45485198497772217, 'learning_rate': 6.74719260167972e-06, 'epoch': 3.48}


                                                      A
 13%|█▎        | 6278/47590 [1:52:47<22:13, 30.97it/s]

{'embedding_loss': 0.0257, 'grad_norm': 1.4293509721755981, 'learning_rate': 6.652826271586299e-06, 'epoch': 3.5}


                                                      
 13%|█▎        | 6278/47590 [1:53:04<22:13, 30.97it/s]

{'embedding_loss': 0.0265, 'grad_norm': 0.6885696053504944, 'learning_rate': 6.558459941492876e-06, 'epoch': 3.52}


                                                      
 13%|█▎        | 6278/47590 [1:53:30<22:13, 30.97it/s]

{'embedding_loss': 0.0275, 'grad_norm': 1.1376315355300903, 'learning_rate': 6.464093611399453e-06, 'epoch': 3.55}


                                                      
 13%|█▎        | 6278/47590 [1:53:33<22:13, 30.97it/s]

{'embedding_loss': 0.0235, 'grad_norm': 0.33575424551963806, 'learning_rate': 6.369727281306031e-06, 'epoch': 3.57}


                                                      
 13%|█▎        | 6278/47590 [1:53:47<22:13, 30.97it/s]

{'embedding_loss': 0.0273, 'grad_norm': 1.2009556293487549, 'learning_rate': 6.275360951212608e-06, 'epoch': 3.59}


                                                      
 13%|█▎        | 6278/47590 [1:53:54<22:13, 30.97it/s]

{'embedding_loss': 0.0293, 'grad_norm': 0.8246714472770691, 'learning_rate': 6.180994621119185e-06, 'epoch': 3.61}


                                                      
 13%|█▎        | 6278/47590 [1:53:56<22:13, 30.97it/s]

{'embedding_loss': 0.0279, 'grad_norm': 0.7460234761238098, 'learning_rate': 6.086628291025762e-06, 'epoch': 3.63}


                                                      
 13%|█▎        | 6278/47590 [1:54:21<22:13, 30.97it/s]

{'embedding_loss': 0.0237, 'grad_norm': 1.0604910850524902, 'learning_rate': 5.99226196093234e-06, 'epoch': 3.65}


                                                      
 13%|█▎        | 6278/47590 [1:54:32<22:13, 30.97it/s]

{'embedding_loss': 0.0226, 'grad_norm': 0.9844294786453247, 'learning_rate': 5.897895630838917e-06, 'epoch': 3.67}


                                                      
 13%|█▎        | 6278/47590 [1:54:41<22:13, 30.97it/s]

{'embedding_loss': 0.0237, 'grad_norm': 1.1348180770874023, 'learning_rate': 5.8035293007454944e-06, 'epoch': 3.69}


                                                      
 13%|█▎        | 6278/47590 [1:54:44<22:13, 30.97it/s]

{'embedding_loss': 0.0305, 'grad_norm': 0.6535425782203674, 'learning_rate': 5.709162970652071e-06, 'epoch': 3.72}


                                                      
 13%|█▎        | 6278/47590 [1:54:48<22:13, 30.97it/s]

{'embedding_loss': 0.0263, 'grad_norm': 1.165201187133789, 'learning_rate': 5.614796640558649e-06, 'epoch': 3.74}


                                                      
 13%|█▎        | 6278/47590 [1:54:56<22:13, 30.97it/s]

{'embedding_loss': 0.0252, 'grad_norm': 0.7617387175559998, 'learning_rate': 5.520430310465227e-06, 'epoch': 3.76}


                                                      
 13%|█▎        | 6278/47590 [1:55:01<22:13, 30.97it/s]

{'embedding_loss': 0.025, 'grad_norm': 0.7203279137611389, 'learning_rate': 5.426063980371804e-06, 'epoch': 3.78}


                                                      
 13%|█▎        | 6278/47590 [1:55:10<22:13, 30.97it/s]

{'embedding_loss': 0.0313, 'grad_norm': 1.5890648365020752, 'learning_rate': 5.3316976502783815e-06, 'epoch': 3.8}


                                                      
 13%|█▎        | 6278/47590 [1:55:18<22:13, 30.97it/s]

{'embedding_loss': 0.0255, 'grad_norm': 0.7725067138671875, 'learning_rate': 5.237331320184959e-06, 'epoch': 3.82}


                                                      
 13%|█▎        | 6278/47590 [1:55:20<22:13, 30.97it/s]

{'embedding_loss': 0.0249, 'grad_norm': 0.4764646589756012, 'learning_rate': 5.1429649900915354e-06, 'epoch': 3.84}


                                                      
 13%|█▎        | 6278/47590 [1:55:22<22:13, 30.97it/s]

{'embedding_loss': 0.0251, 'grad_norm': 0.9301730990409851, 'learning_rate': 5.048598659998113e-06, 'epoch': 3.86}


                                                      
 13%|█▎        | 6278/47590 [1:55:24<22:13, 30.97it/s]

{'embedding_loss': 0.0236, 'grad_norm': 0.3094031810760498, 'learning_rate': 4.95423232990469e-06, 'epoch': 3.89}


                                                      
 13%|█▎        | 6278/47590 [1:55:26<22:13, 30.97it/s]

{'embedding_loss': 0.0249, 'grad_norm': 0.8136033415794373, 'learning_rate': 4.859865999811268e-06, 'epoch': 3.91}


                                                      
 13%|█▎        | 6278/47590 [1:55:27<22:13, 30.97it/s]

{'embedding_loss': 0.0237, 'grad_norm': 0.7238600254058838, 'learning_rate': 4.765499669717845e-06, 'epoch': 3.93}


                                                      
 13%|█▎        | 6278/47590 [1:55:29<22:13, 30.97it/s]

{'embedding_loss': 0.0253, 'grad_norm': 1.5254108905792236, 'learning_rate': 4.6711333396244225e-06, 'epoch': 3.95}


                                                      
 13%|█▎        | 6278/47590 [1:55:31<22:13, 30.97it/s]

{'embedding_loss': 0.0242, 'grad_norm': 0.5687781572341919, 'learning_rate': 4.576767009531e-06, 'epoch': 3.97}


                                                      
 13%|█▎        | 6278/47590 [1:55:33<22:13, 30.97it/s]

{'embedding_loss': 0.0262, 'grad_norm': 1.1824694871902466, 'learning_rate': 4.4824006794375765e-06, 'epoch': 3.99}


                                                      
 13%|█▎        | 6278/47590 [1:55:34<22:13, 30.97it/s]

{'embedding_loss': 0.0257, 'grad_norm': 1.2700999975204468, 'learning_rate': 4.388034349344155e-06, 'epoch': 4.01}


                                                      
 13%|█▎        | 6278/47590 [1:55:36<22:13, 30.97it/s]

{'embedding_loss': 0.0238, 'grad_norm': 0.7191048860549927, 'learning_rate': 4.293668019250732e-06, 'epoch': 4.03}


                                                      
 13%|█▎        | 6278/47590 [1:55:39<22:13, 30.97it/s]

{'embedding_loss': 0.0257, 'grad_norm': 1.0701770782470703, 'learning_rate': 4.199301689157309e-06, 'epoch': 4.06}


                                                      
 13%|█▎        | 6278/47590 [1:55:47<22:13, 30.97it/s]

{'embedding_loss': 0.0269, 'grad_norm': 1.018934726715088, 'learning_rate': 4.104935359063886e-06, 'epoch': 4.08}


                                                      
 13%|█▎        | 6278/47590 [1:55:50<22:13, 30.97it/s]

{'embedding_loss': 0.0245, 'grad_norm': 0.7228513956069946, 'learning_rate': 4.0105690289704635e-06, 'epoch': 4.1}


                                                      
 13%|█▎        | 6278/47590 [1:55:52<22:13, 30.97it/s]


{'embedding_loss': 0.0249, 'grad_norm': 0.9952530264854431, 'learning_rate': 3.916202698877041e-06, 'epoch': 4.12}


                                                      A
 13%|█▎        | 6278/47590 [1:55:55<22:13, 30.97it/s]

{'embedding_loss': 0.0247, 'grad_norm': 0.46122416853904724, 'learning_rate': 3.821836368783618e-06, 'epoch': 4.14}


                                                      
 13%|█▎        | 6278/47590 [1:55:58<22:13, 30.97it/s]

{'embedding_loss': 0.0224, 'grad_norm': 1.7280951738357544, 'learning_rate': 3.7274700386901957e-06, 'epoch': 4.16}


                                                      
 13%|█▎        | 6278/47590 [1:56:01<22:13, 30.97it/s]

{'embedding_loss': 0.0227, 'grad_norm': 0.6087608933448792, 'learning_rate': 3.633103708596773e-06, 'epoch': 4.18}


                                                      
 13%|█▎        | 6278/47590 [1:56:04<22:13, 30.97it/s]

{'embedding_loss': 0.0228, 'grad_norm': 0.6204730868339539, 'learning_rate': 3.53873737850335e-06, 'epoch': 4.2}


                                                      
 13%|█▎        | 6278/47590 [1:56:06<22:13, 30.97it/s]

{'embedding_loss': 0.0227, 'grad_norm': 0.8507755398750305, 'learning_rate': 3.4443710484099275e-06, 'epoch': 4.23}


                                                      
 13%|█▎        | 6278/47590 [1:56:09<22:13, 30.97it/s]

{'embedding_loss': 0.0255, 'grad_norm': 1.1432485580444336, 'learning_rate': 3.3500047183165054e-06, 'epoch': 4.25}


                                                      
 13%|█▎        | 6278/47590 [1:56:13<22:13, 30.97it/s]

{'embedding_loss': 0.0226, 'grad_norm': 1.3471412658691406, 'learning_rate': 3.2556383882230824e-06, 'epoch': 4.27}


                                                      
 13%|█▎        | 6278/47590 [1:56:18<22:13, 30.97it/s]

{'embedding_loss': 0.0223, 'grad_norm': 0.868338406085968, 'learning_rate': 3.1612720581296598e-06, 'epoch': 4.29}


                                                      
 13%|█▎        | 6278/47590 [1:56:27<22:13, 30.97it/s]


{'embedding_loss': 0.0246, 'grad_norm': 1.0077555179595947, 'learning_rate': 3.0669057280362368e-06, 'epoch': 4.31}


                                                      [A
 13%|█▎        | 6278/47590 [1:56:31<22:13, 30.97it/s]


{'embedding_loss': 0.022, 'grad_norm': 0.7568044662475586, 'learning_rate': 2.972539397942814e-06, 'epoch': 4.33}


                                                      [A
 13%|█▎        | 6278/47590 [1:56:34<22:13, 30.97it/s]

{'embedding_loss': 0.021, 'grad_norm': 0.5700446963310242, 'learning_rate': 2.878173067849391e-06, 'epoch': 4.35}


                                                      
 13%|█▎        | 6278/47590 [1:56:37<22:13, 30.97it/s]


{'embedding_loss': 0.021, 'grad_norm': 0.6676397919654846, 'learning_rate': 2.783806737755969e-06, 'epoch': 4.37}


                                                      [A
 13%|█▎        | 6278/47590 [1:56:42<22:13, 30.97it/s]

{'embedding_loss': 0.0229, 'grad_norm': 1.6179136037826538, 'learning_rate': 2.6894404076625464e-06, 'epoch': 4.39}


                                                      
 13%|█▎        | 6278/47590 [1:56:49<22:13, 30.97it/s]

{'embedding_loss': 0.0226, 'grad_norm': 1.2768737077713013, 'learning_rate': 2.5950740775691234e-06, 'epoch': 4.42}


                                                      
 13%|█▎        | 6278/47590 [1:56:56<22:13, 30.97it/s]

{'embedding_loss': 0.0249, 'grad_norm': 0.6238510012626648, 'learning_rate': 2.5007077474757008e-06, 'epoch': 4.44}


                                                      
 13%|█▎        | 6278/47590 [1:57:01<22:13, 30.97it/s]

{'embedding_loss': 0.026, 'grad_norm': 1.2358232736587524, 'learning_rate': 2.406341417382278e-06, 'epoch': 4.46}


                                                      
 13%|█▎        | 6278/47590 [1:57:35<22:13, 30.97it/s]

{'embedding_loss': 0.0227, 'grad_norm': 0.9479451179504395, 'learning_rate': 2.3119750872888556e-06, 'epoch': 4.48}


                                                      
 13%|█▎        | 6278/47590 [1:58:04<22:13, 30.97it/s]

{'embedding_loss': 0.0251, 'grad_norm': 0.8497714996337891, 'learning_rate': 2.2176087571954326e-06, 'epoch': 4.5}


                                                      
 13%|█▎        | 6278/47590 [1:58:07<22:13, 30.97it/s]

{'embedding_loss': 0.0226, 'grad_norm': 0.7367107272148132, 'learning_rate': 2.1232424271020104e-06, 'epoch': 4.52}


                                                      
 13%|█▎        | 6278/47590 [1:58:20<22:13, 30.97it/s]

{'embedding_loss': 0.0237, 'grad_norm': 0.6676647067070007, 'learning_rate': 2.0288760970085874e-06, 'epoch': 4.54}


                                                      
 13%|█▎        | 6278/47590 [1:58:56<22:13, 30.97it/s]

{'embedding_loss': 0.0218, 'grad_norm': 1.212896704673767, 'learning_rate': 1.934509766915165e-06, 'epoch': 4.56}


                                                      
 13%|█▎        | 6278/47590 [1:59:03<22:13, 30.97it/s]

{'embedding_loss': 0.024, 'grad_norm': 0.7737791538238525, 'learning_rate': 1.8401434368217422e-06, 'epoch': 4.59}


                                                      
 13%|█▎        | 6278/47590 [1:59:17<22:13, 30.97it/s]

{'embedding_loss': 0.0245, 'grad_norm': 0.8364699482917786, 'learning_rate': 1.7457771067283194e-06, 'epoch': 4.61}


                                                      
 13%|█▎        | 6278/47590 [1:59:19<22:13, 30.97it/s]

{'embedding_loss': 0.0247, 'grad_norm': 0.71670001745224, 'learning_rate': 1.6514107766348968e-06, 'epoch': 4.63}


                                                      
 13%|█▎        | 6278/47590 [1:59:40<22:13, 30.97it/s]

{'embedding_loss': 0.0228, 'grad_norm': 1.4287441968917847, 'learning_rate': 1.5570444465414742e-06, 'epoch': 4.65}


                                                      
 13%|█▎        | 6278/47590 [2:00:18<22:13, 30.97it/s]

{'embedding_loss': 0.0182, 'grad_norm': 0.6895911693572998, 'learning_rate': 1.4626781164480514e-06, 'epoch': 4.67}


                                                      
 13%|█▎        | 6278/47590 [2:00:24<22:13, 30.97it/s]

{'embedding_loss': 0.0226, 'grad_norm': 1.2063379287719727, 'learning_rate': 1.3683117863546288e-06, 'epoch': 4.69}


                                                      
 13%|█▎        | 6278/47590 [2:00:39<22:13, 30.97it/s]

{'embedding_loss': 0.0201, 'grad_norm': 0.41265445947647095, 'learning_rate': 1.273945456261206e-06, 'epoch': 4.71}


                                                      
 13%|█▎        | 6278/47590 [2:00:40<22:13, 30.97it/s]


{'embedding_loss': 0.0231, 'grad_norm': 1.0835665464401245, 'learning_rate': 1.1795791261677834e-06, 'epoch': 4.73}


                                                      [A
 13%|█▎        | 6278/47590 [2:00:41<22:13, 30.97it/s]

{'embedding_loss': 0.0201, 'grad_norm': 0.8968410491943359, 'learning_rate': 1.0852127960743609e-06, 'epoch': 4.76}


                                                      
 13%|█▎        | 6278/47590 [2:00:42<22:13, 30.97it/s]

{'embedding_loss': 0.0245, 'grad_norm': 0.7028281688690186, 'learning_rate': 9.90846465980938e-07, 'epoch': 4.78}


                                                      
 13%|█▎        | 6278/47590 [2:00:43<22:13, 30.97it/s]


{'embedding_loss': 0.0217, 'grad_norm': 0.7063049077987671, 'learning_rate': 8.964801358875154e-07, 'epoch': 4.8}


                                                      [A
 13%|█▎        | 6278/47590 [2:00:44<22:13, 30.97it/s]

{'embedding_loss': 0.0232, 'grad_norm': 0.4381004273891449, 'learning_rate': 8.021138057940928e-07, 'epoch': 4.82}


                                                      
 13%|█▎        | 6278/47590 [2:00:44<22:13, 30.97it/s]

{'embedding_loss': 0.0221, 'grad_norm': 0.9597584009170532, 'learning_rate': 7.077474757006702e-07, 'epoch': 4.84}


                                                      
 13%|█▎        | 6278/47590 [2:00:45<22:13, 30.97it/s]


{'embedding_loss': 0.0244, 'grad_norm': 1.546215295791626, 'learning_rate': 6.133811456072474e-07, 'epoch': 4.86}


                                                      [A
 13%|█▎        | 6278/47590 [2:00:46<22:13, 30.97it/s]

{'embedding_loss': 0.0235, 'grad_norm': 0.9262745380401611, 'learning_rate': 5.190148155138247e-07, 'epoch': 4.88}


                                                      
 13%|█▎        | 6278/47590 [2:00:48<22:13, 30.97it/s]

{'embedding_loss': 0.024, 'grad_norm': 1.4539402723312378, 'learning_rate': 4.2464848542040203e-07, 'epoch': 4.9}


                                                      
 13%|█▎        | 6278/47590 [2:01:05<22:13, 30.97it/s]

{'embedding_loss': 0.0237, 'grad_norm': 0.7088818550109863, 'learning_rate': 3.302821553269794e-07, 'epoch': 4.93}


                                                      
 13%|█▎        | 6278/47590 [2:01:24<22:13, 30.97it/s]

{'embedding_loss': 0.0213, 'grad_norm': 0.6205975413322449, 'learning_rate': 2.359158252335567e-07, 'epoch': 4.95}


                                                      
 13%|█▎        | 6278/47590 [2:01:36<22:13, 30.97it/s]

{'embedding_loss': 0.0226, 'grad_norm': 1.2740271091461182, 'learning_rate': 1.4154949514013402e-07, 'epoch': 4.97}


                                                      
 13%|█▎        | 6278/47590 [2:01:40<22:13, 30.97it/s]

{'embedding_loss': 0.0249, 'grad_norm': 0.4734831750392914, 'learning_rate': 4.7183165046711335e-08, 'epoch': 4.99}


                                                      
100%|██████████| 11775/11775 [20:13<00:00,  9.70it/s]]


{'train_runtime': 1213.4355, 'train_samples_per_second': 310.523, 'train_steps_per_second': 9.704, 'train_loss': 0.06802977183821854, 'epoch': 5.0}


The configuration for the pharo model.

In [17]:
# BERT configuration overrides for the Pharo model, written in the same shape as a
# Hugging Face `config.json`. Key order is kept as in the exported config.
conf_pharo = {
    "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
    "architectures": ["BertModel"],
    # Dropout settings
    "attention_probs_dropout_prob": 0.1,
    "classifier_dropout": None,
    "gradient_checkpointing": False,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    # Width of the encoder
    "hidden_size": 384,
    "initializer_range": 0.02,
    "intermediate_size": 1536,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "bert",
    "num_attention_heads": 12,
    # Only 3 encoder layers are kept; when this is applied to a deeper checkpoint,
    # the weights of the remaining layers are reported as unused on load.
    "num_hidden_layers": 3,
    "pad_token_id": 0,
    "position_embedding_type": "absolute",
    "transformers_version": "4.45.2",
    "type_vocab_size": 2,
    "use_cache": True,
    "vocab_size": 30522,
}

The transformer is taken from the Hugging Face Hub, and the `conf_pharo` configuration arguments are applied to it. The model is then constructed from the transformer and a pooling layer using the SentenceTransformer (SBERT) library.

In [18]:
# Transformer body: weights come from the all-MiniLM-L6-v2 checkpoint, with the
# conf_pharo values passed as configuration overrides.
transformer = models.Transformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    max_seq_length=256,
    config_args=conf_pharo,
)

# Pooling layer sized to the transformer's embedding dimension; mean-token pooling
# is the only mode switched on.
pooling = models.Pooling(
    transformer.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_sqrt_len_tokens=False,
    pooling_mode_weightedmean_tokens=False,
    pooling_mode_lasttoken=False,
    include_prompt=True,
)

# Chain both modules into one SentenceTransformer and write the not-yet-fine-tuned
# model to disk so it can be reloaded as a SetFit model.
modelMy = SentenceTransformer(modules=[transformer, pooling])
modelMy.save_pretrained('./models/aight-l6-pharo')

Some weights of the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 were not used when initializing BertModel: ['encoder.layer.3.attention.output.LayerNorm.bias', 'encoder.layer.3.attention.output.LayerNorm.weight', 'encoder.layer.3.attention.output.dense.bias', 'encoder.layer.3.attention.output.dense.weight', 'encoder.layer.3.attention.self.key.bias', 'encoder.layer.3.attention.self.key.weight', 'encoder.layer.3.attention.self.query.bias', 'encoder.layer.3.attention.self.query.weight', 'encoder.layer.3.attention.self.value.bias', 'encoder.layer.3.attention.self.value.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.dense.weight', 'encoder.layer.3.output.LayerNorm.bias', 'encoder.layer.3.output.LayerNorm.weight', 'encoder.layer.3.output.dense.bias', 'encoder.layer.3.output.dense.weight', 'encoder.layer.4.attention.output.LayerNorm.bias', 'encoder.layer.4.attention.output.LayerNorm.weight', 'encoder.layer.4.attention.output.dense.bias', 'encod

The untrained model is loaded as a `SetFitModel`, and a trainer is set up using the Pharo training split of the dataset. The trainer then trains the model, and the trained model is saved.

In [19]:
# Reload the untrained Pharo body saved in the previous cell as a SetFit model.
# That directory has no classification head yet, so SetFit creates a fresh one.
# Use the GPU when one is available, but fall back to CPU instead of failing on a
# hard-coded 'cuda' device when the notebook is run on a machine without CUDA.
model = SetFitModel.from_pretrained(
    "./models/aight-l6-pharo",
    multi_target_strategy="multi-output",
    device='cuda' if torch.cuda.is_available() else 'cpu',
    normalize_embeddings=False,
)

# Train on the Pharo training split. The dataset's "combo" column is used as the
# input text and its "labels" column as the target.
trainer = SetFitTrainer(
    model=model,
    train_dataset=ds['pharo_train'],
    column_mapping={"combo": "text", "labels": "label"},
    num_epochs=10,
    batch_size=32,
)
trainer.train()

# Persist the fine-tuned model next to the untrained one.
trainer.model.save_pretrained('./models/aight-l6-pharo-trained')

model_head.pkl not found in D:\Documents\!Colleg\CS440-Final\Finalized\models\aight-l6-pharo, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
  trainer = SetFitTrainer(
Applying column mapping to the training dataset
***** Running training *****
  Num unique pairs = 51920
  Batch size = 32
  Num epochs = 10
                                                      
 13%|█▎        | 6278/47590 [2:02:01<22:13, 30.97it/s]

{'embedding_loss': 0.2158, 'grad_norm': 0.36696842312812805, 'learning_rate': 1.2322858903265559e-08, 'epoch': 0.0}


                                                      
 13%|█▎        | 6278/47590 [2:02:06<22:13, 30.97it/s]

{'embedding_loss': 0.2602, 'grad_norm': 0.542107105255127, 'learning_rate': 6.161429451632779e-07, 'epoch': 0.03}


                                                      
 13%|█▎        | 6278/47590 [2:02:12<22:13, 30.97it/s]

{'embedding_loss': 0.2659, 'grad_norm': 0.4318813383579254, 'learning_rate': 1.2322858903265559e-06, 'epoch': 0.06}


                                                      
 13%|█▎        | 6278/47590 [2:02:18<22:13, 30.97it/s]

{'embedding_loss': 0.2564, 'grad_norm': 0.4962020516395569, 'learning_rate': 1.848428835489834e-06, 'epoch': 0.09}


                                                      
 13%|█▎        | 6278/47590 [2:02:24<22:13, 30.97it/s]


{'embedding_loss': 0.2536, 'grad_norm': 0.5150508284568787, 'learning_rate': 2.4645717806531117e-06, 'epoch': 0.12}


                                                      
 13%|█▎        | 6278/47590 [2:02:31<22:13, 30.97it/s]

{'embedding_loss': 0.245, 'grad_norm': 0.6129359602928162, 'learning_rate': 3.0807147258163894e-06, 'epoch': 0.15}


                                                      
 13%|█▎        | 6278/47590 [2:02:35<22:13, 30.97it/s]

{'embedding_loss': 0.2371, 'grad_norm': 0.4547139108181, 'learning_rate': 3.696857670979668e-06, 'epoch': 0.18}


                                                      
 13%|█▎        | 6278/47590 [2:02:46<22:13, 30.97it/s]

{'embedding_loss': 0.2381, 'grad_norm': 0.5531184077262878, 'learning_rate': 4.313000616142946e-06, 'epoch': 0.22}


                                                      
 13%|█▎        | 6278/47590 [2:02:49<22:13, 30.97it/s]


{'embedding_loss': 0.2304, 'grad_norm': 0.6411883234977722, 'learning_rate': 4.9291435613062235e-06, 'epoch': 0.25}


                                                      
 13%|█▎        | 6278/47590 [2:03:04<22:13, 30.97it/s]

{'embedding_loss': 0.2241, 'grad_norm': 0.5557150840759277, 'learning_rate': 5.545286506469502e-06, 'epoch': 0.28}


                                                      
 13%|█▎        | 6278/47590 [2:03:17<22:13, 30.97it/s]

{'embedding_loss': 0.2133, 'grad_norm': 0.6630551218986511, 'learning_rate': 6.161429451632779e-06, 'epoch': 0.31}



[A
                                                      
 13%|█▎        | 6278/47590 [2:03:20<22:13, 30.97it/s]

{'embedding_loss': 0.2018, 'grad_norm': 0.620062530040741, 'learning_rate': 6.777572396796057e-06, 'epoch': 0.34}


                                                      
 13%|█▎        | 6278/47590 [2:03:22<22:13, 30.97it/s]

{'embedding_loss': 0.1977, 'grad_norm': 0.6370843052864075, 'learning_rate': 7.393715341959336e-06, 'epoch': 0.37}


                                                      
 13%|█▎        | 6278/47590 [2:03:24<22:13, 30.97it/s]

{'embedding_loss': 0.189, 'grad_norm': 0.7189193964004517, 'learning_rate': 8.009858287122613e-06, 'epoch': 0.4}


                                                      
 13%|█▎        | 6278/47590 [2:03:26<22:13, 30.97it/s]

{'embedding_loss': 0.1875, 'grad_norm': 0.551116943359375, 'learning_rate': 8.626001232285892e-06, 'epoch': 0.43}


                                                      
 13%|█▎        | 6278/47590 [2:03:27<22:13, 30.97it/s]

{'embedding_loss': 0.1767, 'grad_norm': 0.9239833354949951, 'learning_rate': 9.24214417744917e-06, 'epoch': 0.46}


                                                      
 13%|█▎        | 6278/47590 [2:03:28<22:13, 30.97it/s]

{'embedding_loss': 0.1746, 'grad_norm': 0.9781711101531982, 'learning_rate': 9.858287122612447e-06, 'epoch': 0.49}


                                                      
 13%|█▎        | 6278/47590 [2:03:29<22:13, 30.97it/s]

{'embedding_loss': 0.1663, 'grad_norm': 0.7468429207801819, 'learning_rate': 1.0474430067775725e-05, 'epoch': 0.52}


                                                      
 13%|█▎        | 6278/47590 [2:03:31<22:13, 30.97it/s]

{'embedding_loss': 0.147, 'grad_norm': 0.8593127727508545, 'learning_rate': 1.1090573012939004e-05, 'epoch': 0.55}


                                                      
 13%|█▎        | 6278/47590 [2:03:32<22:13, 30.97it/s]

{'embedding_loss': 0.1452, 'grad_norm': 0.8646577596664429, 'learning_rate': 1.1706715958102282e-05, 'epoch': 0.59}


                                                      
 13%|█▎        | 6278/47590 [2:03:33<22:13, 30.97it/s]

{'embedding_loss': 0.1294, 'grad_norm': 0.8741542100906372, 'learning_rate': 1.2322858903265557e-05, 'epoch': 0.62}


                                                      
 13%|█▎        | 6278/47590 [2:03:35<22:13, 30.97it/s]

{'embedding_loss': 0.1229, 'grad_norm': 0.8901481032371521, 'learning_rate': 1.2939001848428836e-05, 'epoch': 0.65}


                                                      
 13%|█▎        | 6278/47590 [2:03:37<22:13, 30.97it/s]

{'embedding_loss': 0.1079, 'grad_norm': 1.0534683465957642, 'learning_rate': 1.3555144793592114e-05, 'epoch': 0.68}


                                                      
 13%|█▎        | 6278/47590 [2:03:39<22:13, 30.97it/s]

{'embedding_loss': 0.1103, 'grad_norm': 0.7423899173736572, 'learning_rate': 1.4171287738755393e-05, 'epoch': 0.71}


                                                      
 13%|█▎        | 6278/47590 [2:03:42<22:13, 30.97it/s]

{'embedding_loss': 0.1055, 'grad_norm': 0.8784887194633484, 'learning_rate': 1.4787430683918671e-05, 'epoch': 0.74}


                                                      
 13%|█▎        | 6278/47590 [2:03:44<22:13, 30.97it/s]

{'embedding_loss': 0.0949, 'grad_norm': 0.9272008538246155, 'learning_rate': 1.540357362908195e-05, 'epoch': 0.77}


                                                      
 13%|█▎        | 6278/47590 [2:04:01<22:13, 30.97it/s]

{'embedding_loss': 0.0903, 'grad_norm': 1.0713862180709839, 'learning_rate': 1.6019716574245227e-05, 'epoch': 0.8}


                                                      
 13%|█▎        | 6278/47590 [2:04:12<22:13, 30.97it/s]

{'embedding_loss': 0.0816, 'grad_norm': 0.8100733757019043, 'learning_rate': 1.6635859519408503e-05, 'epoch': 0.83}


                                                      
 13%|█▎        | 6278/47590 [2:04:18<22:13, 30.97it/s]

{'embedding_loss': 0.0805, 'grad_norm': 1.048203468322754, 'learning_rate': 1.7252002464571783e-05, 'epoch': 0.86}


                                                      
 13%|█▎        | 6278/47590 [2:04:23<22:13, 30.97it/s]

{'embedding_loss': 0.0805, 'grad_norm': 0.9934534430503845, 'learning_rate': 1.786814540973506e-05, 'epoch': 0.89}


                                                      
 13%|█▎        | 6278/47590 [2:04:29<22:13, 30.97it/s]

{'embedding_loss': 0.0702, 'grad_norm': 0.6983770728111267, 'learning_rate': 1.848428835489834e-05, 'epoch': 0.92}


                                                      
 13%|█▎        | 6278/47590 [2:04:36<22:13, 30.97it/s]

{'embedding_loss': 0.0727, 'grad_norm': 0.7049174308776855, 'learning_rate': 1.9100431300061614e-05, 'epoch': 0.96}


                                                      
 13%|█▎        | 6278/47590 [2:04:39<22:13, 30.97it/s]

{'embedding_loss': 0.0718, 'grad_norm': 0.9789791107177734, 'learning_rate': 1.9716574245224894e-05, 'epoch': 0.99}


                                                      
 13%|█▎        | 6278/47590 [2:04:49<22:13, 30.97it/s]

{'embedding_loss': 0.0718, 'grad_norm': 0.6119965314865112, 'learning_rate': 1.9963031423290204e-05, 'epoch': 1.02}


                                                      
 13%|█▎        | 6278/47590 [2:04:54<22:13, 30.97it/s]

{'embedding_loss': 0.0632, 'grad_norm': 1.0110832452774048, 'learning_rate': 1.989457109604984e-05, 'epoch': 1.05}


                                                      
 13%|█▎        | 6278/47590 [2:05:03<22:13, 30.97it/s]

{'embedding_loss': 0.0515, 'grad_norm': 0.6933714151382446, 'learning_rate': 1.9826110768809475e-05, 'epoch': 1.08}


                                                      
 13%|█▎        | 6278/47590 [2:05:07<22:13, 30.97it/s]

{'embedding_loss': 0.0537, 'grad_norm': 0.9046692252159119, 'learning_rate': 1.9757650441569113e-05, 'epoch': 1.11}


                                                      
 13%|█▎        | 6278/47590 [2:05:09<22:13, 30.97it/s]


{'embedding_loss': 0.0553, 'grad_norm': 0.818529486656189, 'learning_rate': 1.968919011432875e-05, 'epoch': 1.14}


                                                      A
 13%|█▎        | 6278/47590 [2:05:11<22:13, 30.97it/s]

{'embedding_loss': 0.0489, 'grad_norm': 0.6418370604515076, 'learning_rate': 1.9620729787088384e-05, 'epoch': 1.17}


                                                      
 13%|█▎        | 6278/47590 [2:05:16<22:13, 30.97it/s]

{'embedding_loss': 0.0487, 'grad_norm': 0.6236354112625122, 'learning_rate': 1.9552269459848018e-05, 'epoch': 1.2}


                                                      
 13%|█▎        | 6278/47590 [2:05:21<22:13, 30.97it/s]

{'embedding_loss': 0.0465, 'grad_norm': 1.0780868530273438, 'learning_rate': 1.9483809132607656e-05, 'epoch': 1.23}


                                                      
 13%|█▎        | 6278/47590 [2:05:28<22:13, 30.97it/s]

{'embedding_loss': 0.0528, 'grad_norm': 0.597696840763092, 'learning_rate': 1.941534880536729e-05, 'epoch': 1.26}


                                                      
 13%|█▎        | 6278/47590 [2:05:39<22:13, 30.97it/s]

{'embedding_loss': 0.0487, 'grad_norm': 0.905238151550293, 'learning_rate': 1.9346888478126927e-05, 'epoch': 1.29}


                                                      
 13%|█▎        | 6278/47590 [2:05:46<22:13, 30.97it/s]


{'embedding_loss': 0.0452, 'grad_norm': 1.0204371213912964, 'learning_rate': 1.9278428150886565e-05, 'epoch': 1.32}


                                                      A
 13%|█▎        | 6278/47590 [2:05:53<22:13, 30.97it/s]

{'embedding_loss': 0.0397, 'grad_norm': 0.7932510375976562, 'learning_rate': 1.92099678236462e-05, 'epoch': 1.36}


                                                      
 13%|█▎        | 6278/47590 [2:05:57<22:13, 30.97it/s]

{'embedding_loss': 0.0401, 'grad_norm': 1.1007273197174072, 'learning_rate': 1.9141507496405833e-05, 'epoch': 1.39}


                                                      
 13%|█▎        | 6278/47590 [2:06:08<22:13, 30.97it/s]

{'embedding_loss': 0.04, 'grad_norm': 0.9722937941551208, 'learning_rate': 1.907304716916547e-05, 'epoch': 1.42}


                                                      
 13%|█▎        | 6278/47590 [2:06:14<22:13, 30.97it/s]

{'embedding_loss': 0.0403, 'grad_norm': 0.7097390294075012, 'learning_rate': 1.9004586841925104e-05, 'epoch': 1.45}


                                                      
 13%|█▎        | 6278/47590 [2:06:21<22:13, 30.97it/s]

{'embedding_loss': 0.0376, 'grad_norm': 0.7947527766227722, 'learning_rate': 1.893612651468474e-05, 'epoch': 1.48}


                                                      
 13%|█▎        | 6278/47590 [2:06:28<22:13, 30.97it/s]

{'embedding_loss': 0.037, 'grad_norm': 0.4305095374584198, 'learning_rate': 1.886766618744438e-05, 'epoch': 1.51}


                                                      
 13%|█▎        | 6278/47590 [2:06:36<22:13, 30.97it/s]

{'embedding_loss': 0.0367, 'grad_norm': 0.8581103086471558, 'learning_rate': 1.8799205860204013e-05, 'epoch': 1.54}


                                                      
 13%|█▎        | 6278/47590 [2:06:41<22:13, 30.97it/s]

{'embedding_loss': 0.0371, 'grad_norm': 0.73052579164505, 'learning_rate': 1.873074553296365e-05, 'epoch': 1.57}


                                                      
 13%|█▎        | 6278/47590 [2:06:44<22:13, 30.97it/s]

{'embedding_loss': 0.0326, 'grad_norm': 0.9820418953895569, 'learning_rate': 1.8662285205723284e-05, 'epoch': 1.6}


                                                      
 13%|█▎        | 6278/47590 [2:06:49<22:13, 30.97it/s]


{'embedding_loss': 0.0396, 'grad_norm': 0.5355201363563538, 'learning_rate': 1.859382487848292e-05, 'epoch': 1.63}


                                                      A
 13%|█▎        | 6278/47590 [2:06:59<22:13, 30.97it/s]

{'embedding_loss': 0.0281, 'grad_norm': 0.43096888065338135, 'learning_rate': 1.8525364551242556e-05, 'epoch': 1.66}


                                                      
 13%|█▎        | 6278/47590 [2:07:07<22:13, 30.97it/s]

{'embedding_loss': 0.0322, 'grad_norm': 0.650177538394928, 'learning_rate': 1.8456904224002193e-05, 'epoch': 1.69}


                                                      
 13%|█▎        | 6278/47590 [2:07:24<22:13, 30.97it/s]


{'embedding_loss': 0.0317, 'grad_norm': 1.025650978088379, 'learning_rate': 1.8388443896761827e-05, 'epoch': 1.73}


                                                      A
 13%|█▎        | 6278/47590 [2:07:28<22:13, 30.97it/s]

{'embedding_loss': 0.0271, 'grad_norm': 0.600642204284668, 'learning_rate': 1.8319983569521465e-05, 'epoch': 1.76}


                                                      
 13%|█▎        | 6278/47590 [2:07:31<22:13, 30.97it/s]

{'embedding_loss': 0.0289, 'grad_norm': 0.753444492816925, 'learning_rate': 1.82515232422811e-05, 'epoch': 1.79}


                                                      
 13%|█▎        | 6278/47590 [2:07:35<22:13, 30.97it/s]

{'embedding_loss': 0.0286, 'grad_norm': 0.8900160193443298, 'learning_rate': 1.8183062915040733e-05, 'epoch': 1.82}


                                                      
 13%|█▎        | 6278/47590 [2:07:41<22:13, 30.97it/s]

{'embedding_loss': 0.0254, 'grad_norm': 0.20430515706539154, 'learning_rate': 1.811460258780037e-05, 'epoch': 1.85}


                                                      
 13%|█▎        | 6278/47590 [2:07:55<22:13, 30.97it/s]

{'embedding_loss': 0.0289, 'grad_norm': 0.6519297361373901, 'learning_rate': 1.8046142260560008e-05, 'epoch': 1.88}


                                                      
 13%|█▎        | 6278/47590 [2:08:08<22:13, 30.97it/s]

{'embedding_loss': 0.0242, 'grad_norm': 0.6836401224136353, 'learning_rate': 1.7977681933319642e-05, 'epoch': 1.91}


                                                      
 13%|█▎        | 6278/47590 [2:08:14<22:13, 30.97it/s]

{'embedding_loss': 0.0306, 'grad_norm': 0.5451343655586243, 'learning_rate': 1.790922160607928e-05, 'epoch': 1.94}


                                                      
 13%|█▎        | 6278/47590 [2:08:22<22:13, 30.97it/s]

{'embedding_loss': 0.0274, 'grad_norm': 0.4981704354286194, 'learning_rate': 1.7840761278838913e-05, 'epoch': 1.97}


                                                      
 13%|█▎        | 6278/47590 [2:08:29<22:13, 30.97it/s]

{'embedding_loss': 0.0244, 'grad_norm': 0.24883750081062317, 'learning_rate': 1.777230095159855e-05, 'epoch': 2.0}


                                                      
 13%|█▎        | 6278/47590 [2:08:35<22:13, 30.97it/s]

{'embedding_loss': 0.0258, 'grad_norm': 0.4780062735080719, 'learning_rate': 1.7703840624358185e-05, 'epoch': 2.03}


                                                      
 13%|█▎        | 6278/47590 [2:08:40<22:13, 30.97it/s]


{'embedding_loss': 0.0257, 'grad_norm': 1.0059510469436646, 'learning_rate': 1.7635380297117822e-05, 'epoch': 2.06}


                                                      A
 13%|█▎        | 6278/47590 [2:08:44<22:13, 30.97it/s]

{'embedding_loss': 0.0196, 'grad_norm': 2.0516555309295654, 'learning_rate': 1.756691996987746e-05, 'epoch': 2.09}


                                                      
 13%|█▎        | 6278/47590 [2:08:49<22:13, 30.97it/s]

{'embedding_loss': 0.022, 'grad_norm': 0.666397213935852, 'learning_rate': 1.7498459642637093e-05, 'epoch': 2.13}


                                                      
 13%|█▎        | 6278/47590 [2:08:56<22:13, 30.97it/s]

{'embedding_loss': 0.0216, 'grad_norm': 0.6434495449066162, 'learning_rate': 1.7429999315396728e-05, 'epoch': 2.16}


                                                      
 13%|█▎        | 6278/47590 [2:09:04<22:13, 30.97it/s]

{'embedding_loss': 0.0284, 'grad_norm': 0.7767024636268616, 'learning_rate': 1.7361538988156365e-05, 'epoch': 2.19}


                                                      
 13%|█▎        | 6278/47590 [2:09:08<22:13, 30.97it/s]

{'embedding_loss': 0.0216, 'grad_norm': 0.3908791244029999, 'learning_rate': 1.7293078660916e-05, 'epoch': 2.22}


                                                      
 13%|█▎        | 6278/47590 [2:09:13<22:13, 30.97it/s]

{'embedding_loss': 0.0212, 'grad_norm': 0.39732563495635986, 'learning_rate': 1.7224618333675636e-05, 'epoch': 2.25}


                                                      
 13%|█▎        | 6278/47590 [2:09:17<22:13, 30.97it/s]

{'embedding_loss': 0.0199, 'grad_norm': 0.7431135177612305, 'learning_rate': 1.7156158006435274e-05, 'epoch': 2.28}


                                                      
 13%|█▎        | 6278/47590 [2:09:22<22:13, 30.97it/s]

{'embedding_loss': 0.0229, 'grad_norm': 0.7111066579818726, 'learning_rate': 1.7087697679194908e-05, 'epoch': 2.31}


                                                      
 13%|█▎        | 6278/47590 [2:09:26<22:13, 30.97it/s]

{'embedding_loss': 0.0208, 'grad_norm': 0.32723480463027954, 'learning_rate': 1.7019237351954542e-05, 'epoch': 2.34}


                                                      
 13%|█▎        | 6278/47590 [2:09:31<22:13, 30.97it/s]

{'embedding_loss': 0.0185, 'grad_norm': 0.3340539336204529, 'learning_rate': 1.695077702471418e-05, 'epoch': 2.37}


                                                      
 13%|█▎        | 6278/47590 [2:09:34<22:13, 30.97it/s]

{'embedding_loss': 0.0214, 'grad_norm': 0.39698418974876404, 'learning_rate': 1.6882316697473817e-05, 'epoch': 2.4}


                                                      
 13%|█▎        | 6278/47590 [2:09:39<22:13, 30.97it/s]

{'embedding_loss': 0.0196, 'grad_norm': 0.7148496508598328, 'learning_rate': 1.681385637023345e-05, 'epoch': 2.43}


                                                      
 13%|█▎        | 6278/47590 [2:09:42<22:13, 30.97it/s]

{'embedding_loss': 0.0226, 'grad_norm': 0.22906719148159027, 'learning_rate': 1.6745396042993088e-05, 'epoch': 2.46}


                                                      
 13%|█▎        | 6278/47590 [2:09:48<22:13, 30.97it/s]

{'embedding_loss': 0.0225, 'grad_norm': 0.5273380279541016, 'learning_rate': 1.6676935715752722e-05, 'epoch': 2.5}


                                                      
 13%|█▎        | 6278/47590 [2:09:53<22:13, 30.97it/s]

{'embedding_loss': 0.0213, 'grad_norm': 0.27876871824264526, 'learning_rate': 1.660847538851236e-05, 'epoch': 2.53}


                                                      
 13%|█▎        | 6278/47590 [2:09:56<22:13, 30.97it/s]

{'embedding_loss': 0.0203, 'grad_norm': 0.2116173505783081, 'learning_rate': 1.6540015061271994e-05, 'epoch': 2.56}


                                                      
 13%|█▎        | 6278/47590 [2:09:59<22:13, 30.97it/s]

{'embedding_loss': 0.02, 'grad_norm': 0.5442264080047607, 'learning_rate': 1.647155473403163e-05, 'epoch': 2.59}


                                                      
 13%|█▎        | 6278/47590 [2:10:03<22:13, 30.97it/s]

{'embedding_loss': 0.0175, 'grad_norm': 0.941173791885376, 'learning_rate': 1.6403094406791265e-05, 'epoch': 2.62}


                                                      
 13%|█▎        | 6278/47590 [2:10:06<22:13, 30.97it/s]

{'embedding_loss': 0.0213, 'grad_norm': 0.4063448905944824, 'learning_rate': 1.6334634079550903e-05, 'epoch': 2.65}


                                                      
 13%|█▎        | 6278/47590 [2:10:08<22:13, 30.97it/s]

{'embedding_loss': 0.0199, 'grad_norm': 0.40588563680648804, 'learning_rate': 1.6266173752310537e-05, 'epoch': 2.68}


                                                      
 13%|█▎        | 6278/47590 [2:10:10<22:13, 30.97it/s]

{'embedding_loss': 0.0254, 'grad_norm': 0.5764793753623962, 'learning_rate': 1.6197713425070174e-05, 'epoch': 2.71}


                                                      
 13%|█▎        | 6278/47590 [2:10:12<22:13, 30.97it/s]

{'embedding_loss': 0.0211, 'grad_norm': 0.5043225288391113, 'learning_rate': 1.6129253097829808e-05, 'epoch': 2.74}


                                                      
 13%|█▎        | 6278/47590 [2:10:14<22:13, 30.97it/s]

{'embedding_loss': 0.0241, 'grad_norm': 0.5784053802490234, 'learning_rate': 1.6060792770589445e-05, 'epoch': 2.77}


                                                      
 13%|█▎        | 6278/47590 [2:10:16<22:13, 30.97it/s]


{'embedding_loss': 0.0206, 'grad_norm': 0.36740779876708984, 'learning_rate': 1.5992332443349083e-05, 'epoch': 2.8}


                                                      A
 13%|█▎        | 6278/47590 [2:10:18<22:13, 30.97it/s]

{'embedding_loss': 0.0194, 'grad_norm': 0.9480167627334595, 'learning_rate': 1.5923872116108717e-05, 'epoch': 2.83}


                                                      
 13%|█▎        | 6278/47590 [2:10:20<22:13, 30.97it/s]

{'embedding_loss': 0.0208, 'grad_norm': 0.36200660467147827, 'learning_rate': 1.585541178886835e-05, 'epoch': 2.87}


                                                      
 13%|█▎        | 6278/47590 [2:10:22<22:13, 30.97it/s]

{'embedding_loss': 0.0193, 'grad_norm': 0.7269970774650574, 'learning_rate': 1.578695146162799e-05, 'epoch': 2.9}


                                                      
 13%|█▎        | 6278/47590 [2:10:24<22:13, 30.97it/s]


{'embedding_loss': 0.0198, 'grad_norm': 0.12460582703351974, 'learning_rate': 1.5718491134387622e-05, 'epoch': 2.93}


                                                      A
 13%|█▎        | 6278/47590 [2:10:27<22:13, 30.97it/s]


{'embedding_loss': 0.0183, 'grad_norm': 0.22543777525424957, 'learning_rate': 1.565003080714726e-05, 'epoch': 2.96}


                                                      A
 13%|█▎        | 6278/47590 [2:10:29<22:13, 30.97it/s]

{'embedding_loss': 0.0204, 'grad_norm': 0.8215951323509216, 'learning_rate': 1.5581570479906897e-05, 'epoch': 2.99}


                                                      
 13%|█▎        | 6278/47590 [2:10:32<22:13, 30.97it/s]

{'embedding_loss': 0.0203, 'grad_norm': 0.28222644329071045, 'learning_rate': 1.551311015266653e-05, 'epoch': 3.02}


                                                      
 13%|█▎        | 6278/47590 [2:10:35<22:13, 30.97it/s]

{'embedding_loss': 0.0177, 'grad_norm': 0.7153576016426086, 'learning_rate': 1.544464982542617e-05, 'epoch': 3.05}


                                                      
 13%|█▎        | 6278/47590 [2:10:37<22:13, 30.97it/s]

{'embedding_loss': 0.0167, 'grad_norm': 0.18261823058128357, 'learning_rate': 1.5376189498185803e-05, 'epoch': 3.08}


                                                      
 13%|█▎        | 6278/47590 [2:10:41<22:13, 30.97it/s]

{'embedding_loss': 0.0184, 'grad_norm': 0.4325411319732666, 'learning_rate': 1.5307729170945437e-05, 'epoch': 3.11}


                                                      
 13%|█▎        | 6278/47590 [2:10:43<22:13, 30.97it/s]

{'embedding_loss': 0.017, 'grad_norm': 0.6060779094696045, 'learning_rate': 1.5239268843705076e-05, 'epoch': 3.14}


                                                      
 13%|█▎        | 6278/47590 [2:10:44<22:13, 30.97it/s]

{'embedding_loss': 0.0181, 'grad_norm': 0.891946017742157, 'learning_rate': 1.517080851646471e-05, 'epoch': 3.17}


                                                      
 13%|█▎        | 6278/47590 [2:10:46<22:13, 30.97it/s]


{'embedding_loss': 0.0213, 'grad_norm': 0.5632987022399902, 'learning_rate': 1.5102348189224346e-05, 'epoch': 3.2}


                                                      A
 13%|█▎        | 6278/47590 [2:10:49<22:13, 30.97it/s]

{'embedding_loss': 0.0176, 'grad_norm': 0.46542733907699585, 'learning_rate': 1.5033887861983983e-05, 'epoch': 3.23}


                                                      
 13%|█▎        | 6278/47590 [2:10:51<22:13, 30.97it/s]

{'embedding_loss': 0.0184, 'grad_norm': 0.34566807746887207, 'learning_rate': 1.4965427534743617e-05, 'epoch': 3.27}


                                                      
 13%|█▎        | 6278/47590 [2:10:53<22:13, 30.97it/s]


{'embedding_loss': 0.0177, 'grad_norm': 0.6259425282478333, 'learning_rate': 1.4896967207503253e-05, 'epoch': 3.3}


                                                      A
 13%|█▎        | 6278/47590 [2:10:55<22:13, 30.97it/s]

{'embedding_loss': 0.0197, 'grad_norm': 0.8621476888656616, 'learning_rate': 1.482850688026289e-05, 'epoch': 3.33}


                                                      
 13%|█▎        | 6278/47590 [2:10:57<22:13, 30.97it/s]

{'embedding_loss': 0.0176, 'grad_norm': 0.558335542678833, 'learning_rate': 1.4760046553022524e-05, 'epoch': 3.36}


                                                      
 13%|█▎        | 6278/47590 [2:10:59<22:13, 30.97it/s]

{'embedding_loss': 0.0198, 'grad_norm': 0.5312120914459229, 'learning_rate': 1.469158622578216e-05, 'epoch': 3.39}


                                                      
 13%|█▎        | 6278/47590 [2:11:02<22:13, 30.97it/s]

{'embedding_loss': 0.0198, 'grad_norm': 0.541739284992218, 'learning_rate': 1.4623125898541797e-05, 'epoch': 3.42}


                                                      
 13%|█▎        | 6278/47590 [2:11:04<22:13, 30.97it/s]

{'embedding_loss': 0.018, 'grad_norm': 0.5411625504493713, 'learning_rate': 1.4554665571301431e-05, 'epoch': 3.45}


                                                      
 13%|█▎        | 6278/47590 [2:11:06<22:13, 30.97it/s]

{'embedding_loss': 0.0194, 'grad_norm': 2.0214767456054688, 'learning_rate': 1.4486205244061067e-05, 'epoch': 3.48}


                                                      
 13%|█▎        | 6278/47590 [2:11:08<22:13, 30.97it/s]

{'embedding_loss': 0.0176, 'grad_norm': 0.09148447960615158, 'learning_rate': 1.4417744916820705e-05, 'epoch': 3.51}


                                                      
 13%|█▎        | 6278/47590 [2:11:10<22:13, 30.97it/s]

{'embedding_loss': 0.0159, 'grad_norm': 0.46904805302619934, 'learning_rate': 1.4349284589580339e-05, 'epoch': 3.54}


                                                      
 13%|█▎        | 6278/47590 [2:11:12<22:13, 30.97it/s]

{'embedding_loss': 0.0164, 'grad_norm': 0.2367984652519226, 'learning_rate': 1.4280824262339976e-05, 'epoch': 3.57}


                                                      
 13%|█▎        | 6278/47590 [2:11:15<22:13, 30.97it/s]


{'embedding_loss': 0.0181, 'grad_norm': 0.5346412062644958, 'learning_rate': 1.4212363935099612e-05, 'epoch': 3.6}


                                                      A
 13%|█▎        | 6278/47590 [2:11:18<22:13, 30.97it/s]

{'embedding_loss': 0.0166, 'grad_norm': 0.5252911448478699, 'learning_rate': 1.4143903607859246e-05, 'epoch': 3.64}


                                                      
 13%|█▎        | 6278/47590 [2:11:20<22:13, 30.97it/s]

{'embedding_loss': 0.0203, 'grad_norm': 0.6556037068367004, 'learning_rate': 1.4075443280618883e-05, 'epoch': 3.67}


                                                      
 13%|█▎        | 6278/47590 [2:11:22<22:13, 30.97it/s]

{'embedding_loss': 0.0189, 'grad_norm': 0.5627568364143372, 'learning_rate': 1.4006982953378519e-05, 'epoch': 3.7}


                                                      
 13%|█▎        | 6278/47590 [2:11:26<22:13, 30.97it/s]

{'embedding_loss': 0.0186, 'grad_norm': 0.8410863280296326, 'learning_rate': 1.3938522626138153e-05, 'epoch': 3.73}


                                                      
 13%|█▎        | 6278/47590 [2:11:28<22:13, 30.97it/s]

{'embedding_loss': 0.0181, 'grad_norm': 0.4106578826904297, 'learning_rate': 1.387006229889779e-05, 'epoch': 3.76}


                                                      
 13%|█▎        | 6278/47590 [2:11:30<22:13, 30.97it/s]

{'embedding_loss': 0.0178, 'grad_norm': 0.44209733605384827, 'learning_rate': 1.3801601971657426e-05, 'epoch': 3.79}


                                                      
 13%|█▎        | 6278/47590 [2:11:33<22:13, 30.97it/s]

{'embedding_loss': 0.0138, 'grad_norm': 0.49109503626823425, 'learning_rate': 1.373314164441706e-05, 'epoch': 3.82}


                                                      
 13%|█▎        | 6278/47590 [2:11:35<22:13, 30.97it/s]


{'embedding_loss': 0.0164, 'grad_norm': 0.47453513741493225, 'learning_rate': 1.3664681317176698e-05, 'epoch': 3.85}


                                                      A
 13%|█▎        | 6278/47590 [2:11:38<22:13, 30.97it/s]

{'embedding_loss': 0.0138, 'grad_norm': 0.1962670087814331, 'learning_rate': 1.3596220989936333e-05, 'epoch': 3.88}


                                                      
 13%|█▎        | 6278/47590 [2:11:40<22:13, 30.97it/s]

{'embedding_loss': 0.0211, 'grad_norm': 0.4170382618904114, 'learning_rate': 1.3527760662695967e-05, 'epoch': 3.91}


                                                      
 13%|█▎        | 6278/47590 [2:11:43<22:13, 30.97it/s]

{'embedding_loss': 0.0191, 'grad_norm': 0.6161903142929077, 'learning_rate': 1.3459300335455605e-05, 'epoch': 3.94}


                                                      
 13%|█▎        | 6278/47590 [2:11:45<22:13, 30.97it/s]

{'embedding_loss': 0.0156, 'grad_norm': 0.5849387049674988, 'learning_rate': 1.339084000821524e-05, 'epoch': 3.97}


                                                      
 13%|█▎        | 6278/47590 [2:11:47<22:13, 30.97it/s]

{'embedding_loss': 0.0183, 'grad_norm': 0.4450565576553345, 'learning_rate': 1.3322379680974874e-05, 'epoch': 4.0}


                                                      
 13%|█▎        | 6278/47590 [2:11:50<22:13, 30.97it/s]

{'embedding_loss': 0.0147, 'grad_norm': 0.7465112805366516, 'learning_rate': 1.3253919353734512e-05, 'epoch': 4.04}


                                                      
 13%|█▎        | 6278/47590 [2:11:53<22:13, 30.97it/s]


{'embedding_loss': 0.0145, 'grad_norm': 0.49402672052383423, 'learning_rate': 1.3185459026494148e-05, 'epoch': 4.07}


                                                      A
 13%|█▎        | 6278/47590 [2:11:55<22:13, 30.97it/s]

{'embedding_loss': 0.0153, 'grad_norm': 0.43405044078826904, 'learning_rate': 1.3116998699253785e-05, 'epoch': 4.1}


                                                      
 13%|█▎        | 6278/47590 [2:11:57<22:13, 30.97it/s]

{'embedding_loss': 0.0159, 'grad_norm': 0.4706093370914459, 'learning_rate': 1.3048538372013419e-05, 'epoch': 4.13}


                                                      
 13%|█▎        | 6278/47590 [2:12:00<22:13, 30.97it/s]

{'embedding_loss': 0.0154, 'grad_norm': 0.5600764751434326, 'learning_rate': 1.2980078044773055e-05, 'epoch': 4.16}


                                                      
 13%|█▎        | 6278/47590 [2:12:01<22:13, 30.97it/s]


{'embedding_loss': 0.0137, 'grad_norm': 0.3554888069629669, 'learning_rate': 1.2911617717532692e-05, 'epoch': 4.19}


                                                      A
 13%|█▎        | 6278/47590 [2:12:03<22:13, 30.97it/s]

{'embedding_loss': 0.0147, 'grad_norm': 0.1356121003627777, 'learning_rate': 1.2843157390292326e-05, 'epoch': 4.22}


                                                      
 13%|█▎        | 6278/47590 [2:12:04<22:13, 30.97it/s]

{'embedding_loss': 0.0179, 'grad_norm': 0.7739037275314331, 'learning_rate': 1.2774697063051962e-05, 'epoch': 4.25}


                                                      
 13%|█▎        | 6278/47590 [2:12:06<22:13, 30.97it/s]

{'embedding_loss': 0.0201, 'grad_norm': 0.4128819406032562, 'learning_rate': 1.27062367358116e-05, 'epoch': 4.28}


                                                      
 13%|█▎        | 6278/47590 [2:12:08<22:13, 30.97it/s]

{'embedding_loss': 0.0182, 'grad_norm': 0.2543526887893677, 'learning_rate': 1.2637776408571233e-05, 'epoch': 4.31}


                                                      
 13%|█▎        | 6278/47590 [2:12:10<22:13, 30.97it/s]

{'embedding_loss': 0.0172, 'grad_norm': 0.33816227316856384, 'learning_rate': 1.2569316081330869e-05, 'epoch': 4.34}


                                                      
 13%|█▎        | 6278/47590 [2:12:13<22:13, 30.97it/s]

{'embedding_loss': 0.0157, 'grad_norm': 0.4018895924091339, 'learning_rate': 1.2500855754090507e-05, 'epoch': 4.37}


                                                      
 13%|█▎        | 6278/47590 [2:12:14<22:13, 30.97it/s]

{'embedding_loss': 0.0204, 'grad_norm': 0.6806554198265076, 'learning_rate': 1.243239542685014e-05, 'epoch': 4.41}


                                                      
 13%|█▎        | 6278/47590 [2:12:18<22:13, 30.97it/s]

{'embedding_loss': 0.0146, 'grad_norm': 0.48180556297302246, 'learning_rate': 1.2363935099609776e-05, 'epoch': 4.44}


                                                      
 13%|█▎        | 6278/47590 [2:12:23<22:13, 30.97it/s]

{'embedding_loss': 0.0131, 'grad_norm': 0.2492644339799881, 'learning_rate': 1.2295474772369414e-05, 'epoch': 4.47}


                                                      
 13%|█▎        | 6278/47590 [2:12:25<22:13, 30.97it/s]

{'embedding_loss': 0.0195, 'grad_norm': 0.5794631838798523, 'learning_rate': 1.2227014445129048e-05, 'epoch': 4.5}


                                                      
 13%|█▎        | 6278/47590 [2:12:27<22:13, 30.97it/s]

{'embedding_loss': 0.0136, 'grad_norm': 0.7512302398681641, 'learning_rate': 1.2158554117888684e-05, 'epoch': 4.53}


                                                      
 13%|█▎        | 6278/47590 [2:12:29<22:13, 30.97it/s]

{'embedding_loss': 0.0157, 'grad_norm': 0.383697509765625, 'learning_rate': 1.2090093790648321e-05, 'epoch': 4.56}


                                                      
 13%|█▎        | 6278/47590 [2:12:33<22:13, 30.97it/s]

{'embedding_loss': 0.0142, 'grad_norm': 0.13575433194637299, 'learning_rate': 1.2021633463407955e-05, 'epoch': 4.59}


                                                      
 13%|█▎        | 6278/47590 [2:12:35<22:13, 30.97it/s]

{'embedding_loss': 0.0156, 'grad_norm': 0.2229558676481247, 'learning_rate': 1.1953173136167592e-05, 'epoch': 4.62}


                                                      
 13%|█▎        | 6278/47590 [2:12:37<22:13, 30.97it/s]


{'embedding_loss': 0.0131, 'grad_norm': 0.3660168945789337, 'learning_rate': 1.1884712808927228e-05, 'epoch': 4.65}


                                                      A
 13%|█▎        | 6278/47590 [2:12:39<22:13, 30.97it/s]

{'embedding_loss': 0.016, 'grad_norm': 0.5860509872436523, 'learning_rate': 1.1816252481686862e-05, 'epoch': 4.68}


                                                      
 13%|█▎        | 6278/47590 [2:12:41<22:13, 30.97it/s]

{'embedding_loss': 0.0145, 'grad_norm': 0.46947818994522095, 'learning_rate': 1.17477921544465e-05, 'epoch': 4.71}


                                                      
 13%|█▎        | 6278/47590 [2:12:42<22:13, 30.97it/s]


{'embedding_loss': 0.0138, 'grad_norm': 0.0782521590590477, 'learning_rate': 1.1679331827206135e-05, 'epoch': 4.74}


                                                      A
 13%|█▎        | 6278/47590 [2:12:44<22:13, 30.97it/s]

{'embedding_loss': 0.0146, 'grad_norm': 0.36725345253944397, 'learning_rate': 1.161087149996577e-05, 'epoch': 4.78}


                                                      
 13%|█▎        | 6278/47590 [2:12:46<22:13, 30.97it/s]

{'embedding_loss': 0.0141, 'grad_norm': 0.0498366542160511, 'learning_rate': 1.1542411172725407e-05, 'epoch': 4.81}


                                                      
 13%|█▎        | 6278/47590 [2:12:47<22:13, 30.97it/s]

{'embedding_loss': 0.0164, 'grad_norm': 0.5792862176895142, 'learning_rate': 1.1473950845485042e-05, 'epoch': 4.84}


                                                      
 13%|█▎        | 6278/47590 [2:12:49<22:13, 30.97it/s]

{'embedding_loss': 0.0138, 'grad_norm': 0.5864415764808655, 'learning_rate': 1.1405490518244678e-05, 'epoch': 4.87}


                                                      
 13%|█▎        | 6278/47590 [2:12:50<22:13, 30.97it/s]

{'embedding_loss': 0.0138, 'grad_norm': 0.4533693492412567, 'learning_rate': 1.1337030191004314e-05, 'epoch': 4.9}


                                                      
 13%|█▎        | 6278/47590 [2:12:52<22:13, 30.97it/s]

{'embedding_loss': 0.0155, 'grad_norm': 0.36798641085624695, 'learning_rate': 1.126856986376395e-05, 'epoch': 4.93}


                                                      
 13%|█▎        | 6278/47590 [2:12:55<22:13, 30.97it/s]

{'embedding_loss': 0.015, 'grad_norm': 0.5287282466888428, 'learning_rate': 1.1200109536523585e-05, 'epoch': 4.96}


                                                      
 13%|█▎        | 6278/47590 [2:12:56<22:13, 30.97it/s]

{'embedding_loss': 0.0171, 'grad_norm': 0.3391193747520447, 'learning_rate': 1.1131649209283221e-05, 'epoch': 4.99}


                                                      
 13%|█▎        | 6278/47590 [2:12:58<22:13, 30.97it/s]


{'embedding_loss': 0.0134, 'grad_norm': 0.13634845614433289, 'learning_rate': 1.1063188882042857e-05, 'epoch': 5.02}


                                                      A
 13%|█▎        | 6278/47590 [2:13:00<22:13, 30.97it/s]

{'embedding_loss': 0.0135, 'grad_norm': 0.33730682730674744, 'learning_rate': 1.0994728554802493e-05, 'epoch': 5.05}


                                                      
 13%|█▎        | 6278/47590 [2:13:01<22:13, 30.97it/s]

{'embedding_loss': 0.014, 'grad_norm': 0.11890342086553574, 'learning_rate': 1.0926268227562128e-05, 'epoch': 5.08}


                                                      
 13%|█▎        | 6278/47590 [2:13:03<22:13, 30.97it/s]


{'embedding_loss': 0.0134, 'grad_norm': 0.3974756896495819, 'learning_rate': 1.0857807900321764e-05, 'epoch': 5.11}


                                                      A
 13%|█▎        | 6278/47590 [2:13:05<22:13, 30.97it/s]

{'embedding_loss': 0.0149, 'grad_norm': 0.35457295179367065, 'learning_rate': 1.0789347573081401e-05, 'epoch': 5.14}


                                                      
 13%|█▎        | 6278/47590 [2:13:07<22:13, 30.97it/s]


{'embedding_loss': 0.0127, 'grad_norm': 0.245176300406456, 'learning_rate': 1.0720887245841035e-05, 'epoch': 5.18}


                                                      A
 13%|█▎        | 6278/47590 [2:13:09<22:13, 30.97it/s]

{'embedding_loss': 0.013, 'grad_norm': 0.0877898782491684, 'learning_rate': 1.0652426918600671e-05, 'epoch': 5.21}


                                                      
 13%|█▎        | 6278/47590 [2:13:11<22:13, 30.97it/s]

{'embedding_loss': 0.0187, 'grad_norm': 0.5585946440696716, 'learning_rate': 1.0583966591360309e-05, 'epoch': 5.24}


                                                      
 13%|█▎        | 6278/47590 [2:13:14<22:13, 30.97it/s]

{'embedding_loss': 0.0131, 'grad_norm': 0.40784627199172974, 'learning_rate': 1.0515506264119943e-05, 'epoch': 5.27}


                                                      
 13%|█▎        | 6278/47590 [2:13:15<22:13, 30.97it/s]


{'embedding_loss': 0.0154, 'grad_norm': 0.44867560267448425, 'learning_rate': 1.0447045936879578e-05, 'epoch': 5.3}


                                                      A
 13%|█▎        | 6278/47590 [2:13:17<22:13, 30.97it/s]


{'embedding_loss': 0.0153, 'grad_norm': 0.1574753075838089, 'learning_rate': 1.0378585609639216e-05, 'epoch': 5.33}


                                                      A
 13%|█▎        | 6278/47590 [2:13:19<22:13, 30.97it/s]

{'embedding_loss': 0.0141, 'grad_norm': 0.3115479648113251, 'learning_rate': 1.0310125282398852e-05, 'epoch': 5.36}


                                                      
 13%|█▎        | 6278/47590 [2:13:21<22:13, 30.97it/s]

{'embedding_loss': 0.0153, 'grad_norm': 0.7486386895179749, 'learning_rate': 1.0241664955158486e-05, 'epoch': 5.39}


                                                      
 13%|█▎        | 6278/47590 [2:13:23<22:13, 30.97it/s]

{'embedding_loss': 0.0115, 'grad_norm': 0.06817831844091415, 'learning_rate': 1.0173204627918123e-05, 'epoch': 5.42}


                                                      
 13%|█▎        | 6278/47590 [2:13:25<22:13, 30.97it/s]

{'embedding_loss': 0.0152, 'grad_norm': 0.532524824142456, 'learning_rate': 1.0104744300677759e-05, 'epoch': 5.45}


                                                      
 13%|█▎        | 6278/47590 [2:13:26<22:13, 30.97it/s]


{'embedding_loss': 0.0164, 'grad_norm': 0.5155778527259827, 'learning_rate': 1.0036283973437393e-05, 'epoch': 5.48}


                                                      A
 13%|█▎        | 6278/47590 [2:13:28<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.4244561493396759, 'learning_rate': 9.96782364619703e-06, 'epoch': 5.51}


                                                      
 13%|█▎        | 6278/47590 [2:13:29<22:13, 30.97it/s]

{'embedding_loss': 0.014, 'grad_norm': 0.2513333559036255, 'learning_rate': 9.899363318956666e-06, 'epoch': 5.55}


                                                      
 13%|█▎        | 6278/47590 [2:13:32<22:13, 30.97it/s]

{'embedding_loss': 0.0109, 'grad_norm': 0.35611799359321594, 'learning_rate': 9.830902991716302e-06, 'epoch': 5.58}


                                                      
 13%|█▎        | 6278/47590 [2:13:34<22:13, 30.97it/s]

{'embedding_loss': 0.0131, 'grad_norm': 0.16062875092029572, 'learning_rate': 9.762442664475937e-06, 'epoch': 5.61}


                                                      
 13%|█▎        | 6278/47590 [2:13:36<22:13, 30.97it/s]

{'embedding_loss': 0.0168, 'grad_norm': 0.2978407144546509, 'learning_rate': 9.693982337235573e-06, 'epoch': 5.64}


                                                      
 13%|█▎        | 6278/47590 [2:13:38<22:13, 30.97it/s]

{'embedding_loss': 0.013, 'grad_norm': 0.48279279470443726, 'learning_rate': 9.625522009995209e-06, 'epoch': 5.67}


                                                      
 13%|█▎        | 6278/47590 [2:13:40<22:13, 30.97it/s]

{'embedding_loss': 0.0136, 'grad_norm': 0.2618716061115265, 'learning_rate': 9.557061682754844e-06, 'epoch': 5.7}


                                                      
 13%|█▎        | 6278/47590 [2:13:42<22:13, 30.97it/s]

{'embedding_loss': 0.0144, 'grad_norm': 0.1434340476989746, 'learning_rate': 9.48860135551448e-06, 'epoch': 5.73}


                                                      
 13%|█▎        | 6278/47590 [2:13:44<22:13, 30.97it/s]


{'embedding_loss': 0.0123, 'grad_norm': 0.23584984242916107, 'learning_rate': 9.420141028274116e-06, 'epoch': 5.76}


                                                      A
 13%|█▎        | 6278/47590 [2:13:45<22:13, 30.97it/s]

{'embedding_loss': 0.0142, 'grad_norm': 0.4273903965950012, 'learning_rate': 9.351680701033752e-06, 'epoch': 5.79}


                                                      
 13%|█▎        | 6278/47590 [2:13:47<22:13, 30.97it/s]

{'embedding_loss': 0.0136, 'grad_norm': 0.7752735614776611, 'learning_rate': 9.283220373793387e-06, 'epoch': 5.82}


                                                      
 13%|█▎        | 6278/47590 [2:13:49<22:13, 30.97it/s]

{'embedding_loss': 0.0163, 'grad_norm': 0.5394701957702637, 'learning_rate': 9.214760046553023e-06, 'epoch': 5.85}


                                                      
 13%|█▎        | 6278/47590 [2:13:51<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.21485860645771027, 'learning_rate': 9.146299719312659e-06, 'epoch': 5.88}


                                                      
 13%|█▎        | 6278/47590 [2:13:53<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.44495081901550293, 'learning_rate': 9.077839392072295e-06, 'epoch': 5.91}


                                                      
 13%|█▎        | 6278/47590 [2:13:56<22:13, 30.97it/s]

{'embedding_loss': 0.0115, 'grad_norm': 0.3478599488735199, 'learning_rate': 9.00937906483193e-06, 'epoch': 5.95}


                                                      
 13%|█▎        | 6278/47590 [2:13:58<22:13, 30.97it/s]

{'embedding_loss': 0.0162, 'grad_norm': 0.6685107946395874, 'learning_rate': 8.940918737591566e-06, 'epoch': 5.98}


                                                      
 13%|█▎        | 6278/47590 [2:14:00<22:13, 30.97it/s]

{'embedding_loss': 0.0116, 'grad_norm': 0.45250630378723145, 'learning_rate': 8.872458410351202e-06, 'epoch': 6.01}


                                                      
 13%|█▎        | 6278/47590 [2:14:02<22:13, 30.97it/s]

{'embedding_loss': 0.0128, 'grad_norm': 0.5054776668548584, 'learning_rate': 8.80399808311084e-06, 'epoch': 6.04}


                                                      
 13%|█▎        | 6278/47590 [2:14:04<22:13, 30.97it/s]


{'embedding_loss': 0.0114, 'grad_norm': 0.28241363167762756, 'learning_rate': 8.735537755870473e-06, 'epoch': 6.07}


                                                      A
 13%|█▎        | 6278/47590 [2:14:06<22:13, 30.97it/s]

{'embedding_loss': 0.0176, 'grad_norm': 0.5190147161483765, 'learning_rate': 8.667077428630109e-06, 'epoch': 6.1}


                                                      
 13%|█▎        | 6278/47590 [2:14:08<22:13, 30.97it/s]

{'embedding_loss': 0.0131, 'grad_norm': 0.2501208186149597, 'learning_rate': 8.598617101389746e-06, 'epoch': 6.13}


                                                      
 13%|█▎        | 6278/47590 [2:14:10<22:13, 30.97it/s]

{'embedding_loss': 0.0134, 'grad_norm': 0.3960076868534088, 'learning_rate': 8.53015677414938e-06, 'epoch': 6.16}


                                                      
 13%|█▎        | 6278/47590 [2:14:13<22:13, 30.97it/s]

{'embedding_loss': 0.0138, 'grad_norm': 0.47452089190483093, 'learning_rate': 8.461696446909016e-06, 'epoch': 6.19}


                                                      
 13%|█▎        | 6278/47590 [2:14:16<22:13, 30.97it/s]


{'embedding_loss': 0.0137, 'grad_norm': 0.40346747636795044, 'learning_rate': 8.393236119668654e-06, 'epoch': 6.22}


                                                      [A
 13%|█▎        | 6278/47590 [2:14:18<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.5908938646316528, 'learning_rate': 8.32477579242829e-06, 'epoch': 6.25}


                                                      
 13%|█▎        | 6278/47590 [2:14:20<22:13, 30.97it/s]

{'embedding_loss': 0.0128, 'grad_norm': 0.8141505122184753, 'learning_rate': 8.256315465187923e-06, 'epoch': 6.28}


                                                      
 13%|█▎        | 6278/47590 [2:14:21<22:13, 30.97it/s]

{'embedding_loss': 0.0132, 'grad_norm': 0.3025493323802948, 'learning_rate': 8.18785513794756e-06, 'epoch': 6.32}


                                                      
 13%|█▎        | 6278/47590 [2:14:23<22:13, 30.97it/s]


{'embedding_loss': 0.0126, 'grad_norm': 0.6346578598022461, 'learning_rate': 8.119394810707196e-06, 'epoch': 6.35}


                                                      [A
 13%|█▎        | 6278/47590 [2:14:25<22:13, 30.97it/s]

{'embedding_loss': 0.0118, 'grad_norm': 0.1441466510295868, 'learning_rate': 8.05093448346683e-06, 'epoch': 6.38}


                                                      
 13%|█▎        | 6278/47590 [2:14:27<22:13, 30.97it/s]

{'embedding_loss': 0.0108, 'grad_norm': 0.21440333127975464, 'learning_rate': 7.982474156226468e-06, 'epoch': 6.41}


                                                      
 13%|█▎        | 6278/47590 [2:14:29<22:13, 30.97it/s]

{'embedding_loss': 0.012, 'grad_norm': 0.22453835606575012, 'learning_rate': 7.914013828986104e-06, 'epoch': 6.44}


                                                      
 13%|█▎        | 6278/47590 [2:14:33<22:13, 30.97it/s]

{'embedding_loss': 0.0115, 'grad_norm': 0.19116921722888947, 'learning_rate': 7.845553501745738e-06, 'epoch': 6.47}


                                                      
 13%|█▎        | 6278/47590 [2:14:36<22:13, 30.97it/s]

{'embedding_loss': 0.0124, 'grad_norm': 0.4763047397136688, 'learning_rate': 7.777093174505375e-06, 'epoch': 6.5}


                                                      
 13%|█▎        | 6278/47590 [2:14:40<22:13, 30.97it/s]

{'embedding_loss': 0.0147, 'grad_norm': 0.18838657438755035, 'learning_rate': 7.70863284726501e-06, 'epoch': 6.53}


                                                      
 13%|█▎        | 6278/47590 [2:14:43<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.35521185398101807, 'learning_rate': 7.640172520024647e-06, 'epoch': 6.56}


                                                      
 13%|█▎        | 6278/47590 [2:14:45<22:13, 30.97it/s]

{'embedding_loss': 0.014, 'grad_norm': 0.7039448618888855, 'learning_rate': 7.571712192784282e-06, 'epoch': 6.59}


                                                      
 13%|█▎        | 6278/47590 [2:14:47<22:13, 30.97it/s]

{'embedding_loss': 0.0147, 'grad_norm': 0.7237414717674255, 'learning_rate': 7.503251865543918e-06, 'epoch': 6.62}


                                                      
 13%|█▎        | 6278/47590 [2:14:49<22:13, 30.97it/s]


{'embedding_loss': 0.0126, 'grad_norm': 0.16487430036067963, 'learning_rate': 7.434791538303554e-06, 'epoch': 6.65}


                                                      [A
 13%|█▎        | 6278/47590 [2:14:51<22:13, 30.97it/s]

{'embedding_loss': 0.0121, 'grad_norm': 0.2779931426048279, 'learning_rate': 7.3663312110631894e-06, 'epoch': 6.69}


                                                      
 13%|█▎        | 6278/47590 [2:14:53<22:13, 30.97it/s]

{'embedding_loss': 0.0119, 'grad_norm': 0.4305085241794586, 'learning_rate': 7.297870883822825e-06, 'epoch': 6.72}


                                                      
 13%|█▎        | 6278/47590 [2:14:55<22:13, 30.97it/s]

{'embedding_loss': 0.0136, 'grad_norm': 0.28573447465896606, 'learning_rate': 7.229410556582461e-06, 'epoch': 6.75}


                                                      
 13%|█▎        | 6278/47590 [2:14:57<22:13, 30.97it/s]

{'embedding_loss': 0.0131, 'grad_norm': 0.1977989822626114, 'learning_rate': 7.1609502293420974e-06, 'epoch': 6.78}


                                                      
 13%|█▎        | 6278/47590 [2:15:00<22:13, 30.97it/s]

{'embedding_loss': 0.0108, 'grad_norm': 0.30690598487854004, 'learning_rate': 7.092489902101732e-06, 'epoch': 6.81}


                                                      
 13%|█▎        | 6278/47590 [2:15:02<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.40847504138946533, 'learning_rate': 7.024029574861369e-06, 'epoch': 6.84}


                                                      
 13%|█▎        | 6278/47590 [2:15:04<22:13, 30.97it/s]

{'embedding_loss': 0.0122, 'grad_norm': 0.21605147421360016, 'learning_rate': 6.955569247621005e-06, 'epoch': 6.87}


                                                      
 13%|█▎        | 6278/47590 [2:15:07<22:13, 30.97it/s]

{'embedding_loss': 0.0145, 'grad_norm': 0.6673287749290466, 'learning_rate': 6.8871089203806395e-06, 'epoch': 6.9}


                                                      
 13%|█▎        | 6278/47590 [2:15:10<22:13, 30.97it/s]

{'embedding_loss': 0.0122, 'grad_norm': 0.20617592334747314, 'learning_rate': 6.818648593140276e-06, 'epoch': 6.93}


                                                      
 13%|█▎        | 6278/47590 [2:15:16<22:13, 30.97it/s]

{'embedding_loss': 0.0108, 'grad_norm': 0.3043263256549835, 'learning_rate': 6.750188265899912e-06, 'epoch': 6.96}


                                                      
 13%|█▎        | 6278/47590 [2:15:20<22:13, 30.97it/s]

{'embedding_loss': 0.0132, 'grad_norm': 0.3047933876514435, 'learning_rate': 6.681727938659547e-06, 'epoch': 6.99}


                                                      
 13%|█▎        | 6278/47590 [2:15:23<22:13, 30.97it/s]

{'embedding_loss': 0.0088, 'grad_norm': 0.3411635160446167, 'learning_rate': 6.613267611419183e-06, 'epoch': 7.02}


                                                      
 13%|█▎        | 6278/47590 [2:15:25<22:13, 30.97it/s]

{'embedding_loss': 0.0106, 'grad_norm': 0.30203789472579956, 'learning_rate': 6.544807284178819e-06, 'epoch': 7.05}


                                                      
 13%|█▎        | 6278/47590 [2:15:28<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.4068133234977722, 'learning_rate': 6.4763469569384556e-06, 'epoch': 7.09}


                                                      
 13%|█▎        | 6278/47590 [2:15:32<22:13, 30.97it/s]


{'embedding_loss': 0.0123, 'grad_norm': 0.11512594670057297, 'learning_rate': 6.4078866296980904e-06, 'epoch': 7.12}


                                                      [A
 13%|█▎        | 6278/47590 [2:15:34<22:13, 30.97it/s]

{'embedding_loss': 0.0113, 'grad_norm': 0.5621802806854248, 'learning_rate': 6.339426302457726e-06, 'epoch': 7.15}


                                                      
 13%|█▎        | 6278/47590 [2:15:37<22:13, 30.97it/s]

{'embedding_loss': 0.0108, 'grad_norm': 0.27626895904541016, 'learning_rate': 6.270965975217363e-06, 'epoch': 7.18}


                                                      
 13%|█▎        | 6278/47590 [2:15:40<22:13, 30.97it/s]

{'embedding_loss': 0.0146, 'grad_norm': 0.5735268592834473, 'learning_rate': 6.202505647976998e-06, 'epoch': 7.21}


                                                      
 13%|█▎        | 6278/47590 [2:15:42<22:13, 30.97it/s]


{'embedding_loss': 0.0116, 'grad_norm': 0.0888427272439003, 'learning_rate': 6.134045320736633e-06, 'epoch': 7.24}


                                                      [A
 13%|█▎        | 6278/47590 [2:15:44<22:13, 30.97it/s]

{'embedding_loss': 0.0124, 'grad_norm': 0.4720754027366638, 'learning_rate': 6.06558499349627e-06, 'epoch': 7.27}


                                                      
 13%|█▎        | 6278/47590 [2:15:45<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.4319685101509094, 'learning_rate': 5.997124666255906e-06, 'epoch': 7.3}


                                                      
 13%|█▎        | 6278/47590 [2:15:47<22:13, 30.97it/s]

{'embedding_loss': 0.0126, 'grad_norm': 0.03927375748753548, 'learning_rate': 5.9286643390155405e-06, 'epoch': 7.33}


                                                      
 13%|█▎        | 6278/47590 [2:15:48<22:13, 30.97it/s]

{'embedding_loss': 0.012, 'grad_norm': 0.32871121168136597, 'learning_rate': 5.860204011775177e-06, 'epoch': 7.36}


                                                      
 13%|█▎        | 6278/47590 [2:15:50<22:13, 30.97it/s]

{'embedding_loss': 0.0112, 'grad_norm': 0.38732683658599854, 'learning_rate': 5.791743684534813e-06, 'epoch': 7.39}


                                                      
 13%|█▎        | 6278/47590 [2:15:53<22:13, 30.97it/s]

{'embedding_loss': 0.0129, 'grad_norm': 0.33128851652145386, 'learning_rate': 5.723283357294448e-06, 'epoch': 7.42}


                                                      
 13%|█▎        | 6278/47590 [2:15:54<22:13, 30.97it/s]

{'embedding_loss': 0.0139, 'grad_norm': 0.43831080198287964, 'learning_rate': 5.654823030054084e-06, 'epoch': 7.46}


                                                      
 13%|█▎        | 6278/47590 [2:15:56<22:13, 30.97it/s]

{'embedding_loss': 0.0129, 'grad_norm': 0.3773360550403595, 'learning_rate': 5.58636270281372e-06, 'epoch': 7.49}


                                                      
 13%|█▎        | 6278/47590 [2:15:58<22:13, 30.97it/s]

{'embedding_loss': 0.0119, 'grad_norm': 0.27260395884513855, 'learning_rate': 5.517902375573355e-06, 'epoch': 7.52}


                                                      
 13%|█▎        | 6278/47590 [2:16:00<22:13, 30.97it/s]

{'embedding_loss': 0.0119, 'grad_norm': 0.11703986674547195, 'learning_rate': 5.4494420483329915e-06, 'epoch': 7.55}


                                                      
 13%|█▎        | 6278/47590 [2:16:02<22:13, 30.97it/s]

{'embedding_loss': 0.0132, 'grad_norm': 0.08718859404325485, 'learning_rate': 5.380981721092627e-06, 'epoch': 7.58}


                                                      
 13%|█▎        | 6278/47590 [2:16:03<22:13, 30.97it/s]


{'embedding_loss': 0.0125, 'grad_norm': 0.621203601360321, 'learning_rate': 5.312521393852264e-06, 'epoch': 7.61}


                                                      [A
 13%|█▎        | 6278/47590 [2:16:05<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.21034479141235352, 'learning_rate': 5.244061066611899e-06, 'epoch': 7.64}


                                                      
 13%|█▎        | 6278/47590 [2:16:07<22:13, 30.97it/s]

{'embedding_loss': 0.01, 'grad_norm': 0.3375319838523865, 'learning_rate': 5.175600739371534e-06, 'epoch': 7.67}


                                                      
 13%|█▎        | 6278/47590 [2:16:09<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.19483834505081177, 'learning_rate': 5.107140412131171e-06, 'epoch': 7.7}


                                                      
 13%|█▎        | 6278/47590 [2:16:12<22:13, 30.97it/s]

{'embedding_loss': 0.0144, 'grad_norm': 0.6559750437736511, 'learning_rate': 5.038680084890806e-06, 'epoch': 7.73}


                                                      
 13%|█▎        | 6278/47590 [2:16:14<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.2809711694717407, 'learning_rate': 4.9702197576504415e-06, 'epoch': 7.76}


                                                      
 13%|█▎        | 6278/47590 [2:16:16<22:13, 30.97it/s]

{'embedding_loss': 0.0119, 'grad_norm': 0.2387690246105194, 'learning_rate': 4.901759430410078e-06, 'epoch': 7.79}


                                                      
 13%|█▎        | 6278/47590 [2:16:19<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.17608456313610077, 'learning_rate': 4.833299103169714e-06, 'epoch': 7.83}


                                                      
 13%|█▎        | 6278/47590 [2:16:22<22:13, 30.97it/s]

{'embedding_loss': 0.01, 'grad_norm': 0.2645356357097626, 'learning_rate': 4.764838775929349e-06, 'epoch': 7.86}


                                                      
 13%|█▎        | 6278/47590 [2:16:24<22:13, 30.97it/s]

{'embedding_loss': 0.0125, 'grad_norm': 0.18503466248512268, 'learning_rate': 4.696378448688985e-06, 'epoch': 7.89}


                                                      
 13%|█▎        | 6278/47590 [2:16:26<22:13, 30.97it/s]

{'embedding_loss': 0.0087, 'grad_norm': 0.2461482286453247, 'learning_rate': 4.627918121448621e-06, 'epoch': 7.92}


                                                      
 13%|█▎        | 6278/47590 [2:16:29<22:13, 30.97it/s]

{'embedding_loss': 0.0103, 'grad_norm': 0.22264523804187775, 'learning_rate': 4.559457794208257e-06, 'epoch': 7.95}


                                                      
 13%|█▎        | 6278/47590 [2:16:31<22:13, 30.97it/s]

{'embedding_loss': 0.0121, 'grad_norm': 0.4166467487812042, 'learning_rate': 4.4909974669678925e-06, 'epoch': 7.98}


                                                      
 13%|█▎        | 6278/47590 [2:16:33<22:13, 30.97it/s]

{'embedding_loss': 0.0126, 'grad_norm': 0.6117488741874695, 'learning_rate': 4.422537139727528e-06, 'epoch': 8.01}


                                                      
 13%|█▎        | 6278/47590 [2:16:35<22:13, 30.97it/s]

{'embedding_loss': 0.0113, 'grad_norm': 0.35022759437561035, 'learning_rate': 4.354076812487164e-06, 'epoch': 8.04}


                                                      
 13%|█▎        | 6278/47590 [2:16:37<22:13, 30.97it/s]


{'embedding_loss': 0.011, 'grad_norm': 0.5367063879966736, 'learning_rate': 4.2856164852468e-06, 'epoch': 8.07}


                                                      [A
 13%|█▎        | 6278/47590 [2:16:39<22:13, 30.97it/s]


{'embedding_loss': 0.012, 'grad_norm': 0.6678882837295532, 'learning_rate': 4.217156158006435e-06, 'epoch': 8.1}


                                                      [A
 13%|█▎        | 6278/47590 [2:16:41<22:13, 30.97it/s]

{'embedding_loss': 0.0105, 'grad_norm': 0.1846688985824585, 'learning_rate': 4.148695830766071e-06, 'epoch': 8.13}


                                                      
 13%|█▎        | 6278/47590 [2:16:43<22:13, 30.97it/s]

{'embedding_loss': 0.0112, 'grad_norm': 0.11240231245756149, 'learning_rate': 4.080235503525708e-06, 'epoch': 8.16}


                                                      
 13%|█▎        | 6278/47590 [2:16:45<22:13, 30.97it/s]

{'embedding_loss': 0.0089, 'grad_norm': 0.317658007144928, 'learning_rate': 4.0117751762853425e-06, 'epoch': 8.19}


                                                      
 13%|█▎        | 6278/47590 [2:16:52<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.14284434914588928, 'learning_rate': 3.943314849044978e-06, 'epoch': 8.23}


                                                      
 13%|█▎        | 6278/47590 [2:16:55<22:13, 30.97it/s]

{'embedding_loss': 0.0102, 'grad_norm': 0.41461679339408875, 'learning_rate': 3.874854521804615e-06, 'epoch': 8.26}


                                                      
 13%|█▎        | 6278/47590 [2:16:57<22:13, 30.97it/s]

{'embedding_loss': 0.0099, 'grad_norm': 0.4612845480442047, 'learning_rate': 3.80639419456425e-06, 'epoch': 8.29}


                                                      
 13%|█▎        | 6278/47590 [2:16:58<22:13, 30.97it/s]

{'embedding_loss': 0.0102, 'grad_norm': 0.1961756944656372, 'learning_rate': 3.7379338673238863e-06, 'epoch': 8.32}


                                                      
 13%|█▎        | 6278/47590 [2:17:00<22:13, 30.97it/s]

{'embedding_loss': 0.0104, 'grad_norm': 0.3639453053474426, 'learning_rate': 3.669473540083522e-06, 'epoch': 8.35}


                                                      
 13%|█▎        | 6278/47590 [2:17:02<22:13, 30.97it/s]

{'embedding_loss': 0.0112, 'grad_norm': 0.6579586863517761, 'learning_rate': 3.6010132128431573e-06, 'epoch': 8.38}


                                                      
 13%|█▎        | 6278/47590 [2:17:04<22:13, 30.97it/s]

{'embedding_loss': 0.012, 'grad_norm': 0.9430457353591919, 'learning_rate': 3.5325528856027935e-06, 'epoch': 8.41}


                                                      
 13%|█▎        | 6278/47590 [2:17:05<22:13, 30.97it/s]

{'embedding_loss': 0.0098, 'grad_norm': 0.3449748456478119, 'learning_rate': 3.464092558362429e-06, 'epoch': 8.44}


                                                      
 13%|█▎        | 6278/47590 [2:17:08<22:13, 30.97it/s]

{'embedding_loss': 0.0137, 'grad_norm': 0.5329613089561462, 'learning_rate': 3.3956322311220654e-06, 'epoch': 8.47}


                                                      
 13%|█▎        | 6278/47590 [2:17:09<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.38069817423820496, 'learning_rate': 3.3271719038817007e-06, 'epoch': 8.5}


                                                      
 13%|█▎        | 6278/47590 [2:17:12<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.3307301998138428, 'learning_rate': 3.258711576641337e-06, 'epoch': 8.53}


                                                      
 13%|█▎        | 6278/47590 [2:17:17<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.6502041220664978, 'learning_rate': 3.1902512494009725e-06, 'epoch': 8.56}


                                                      
 13%|█▎        | 6278/47590 [2:17:19<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.3567027449607849, 'learning_rate': 3.121790922160608e-06, 'epoch': 8.6}


                                                      
 13%|█▎        | 6278/47590 [2:17:22<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.2807309329509735, 'learning_rate': 3.053330594920244e-06, 'epoch': 8.63}


                                                      
 13%|█▎        | 6278/47590 [2:17:27<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.25967174768447876, 'learning_rate': 2.9848702676798797e-06, 'epoch': 8.66}


                                                      
 13%|█▎        | 6278/47590 [2:17:30<22:13, 30.97it/s]

{'embedding_loss': 0.0151, 'grad_norm': 0.244980126619339, 'learning_rate': 2.916409940439516e-06, 'epoch': 8.69}


                                                      
 13%|█▎        | 6278/47590 [2:17:33<22:13, 30.97it/s]

{'embedding_loss': 0.0125, 'grad_norm': 0.5586204528808594, 'learning_rate': 2.847949613199151e-06, 'epoch': 8.72}


                                                      
 13%|█▎        | 6278/47590 [2:17:35<22:13, 30.97it/s]

{'embedding_loss': 0.0123, 'grad_norm': 0.14289438724517822, 'learning_rate': 2.779489285958787e-06, 'epoch': 8.75}


                                                      
 13%|█▎        | 6278/47590 [2:17:37<22:13, 30.97it/s]

{'embedding_loss': 0.0108, 'grad_norm': 0.3646852970123291, 'learning_rate': 2.711028958718423e-06, 'epoch': 8.78}


                                                      
 13%|█▎        | 6278/47590 [2:17:39<22:13, 30.97it/s]


{'embedding_loss': 0.0114, 'grad_norm': 0.14382250607013702, 'learning_rate': 2.6425686314780583e-06, 'epoch': 8.81}


                                                      [A
 13%|█▎        | 6278/47590 [2:17:42<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.3999421000480652, 'learning_rate': 2.5741083042376945e-06, 'epoch': 8.84}


                                                      
 13%|█▎        | 6278/47590 [2:17:44<22:13, 30.97it/s]

{'embedding_loss': 0.0105, 'grad_norm': 0.1019887626171112, 'learning_rate': 2.5056479769973302e-06, 'epoch': 8.87}


                                                      
 13%|█▎        | 6278/47590 [2:17:46<22:13, 30.97it/s]

{'embedding_loss': 0.0105, 'grad_norm': 0.19197647273540497, 'learning_rate': 2.437187649756966e-06, 'epoch': 8.9}


                                                      
 13%|█▎        | 6278/47590 [2:17:48<22:13, 30.97it/s]

{'embedding_loss': 0.0101, 'grad_norm': 0.24186736345291138, 'learning_rate': 2.3687273225166017e-06, 'epoch': 8.93}


                                                      
 13%|█▎        | 6278/47590 [2:17:51<22:13, 30.97it/s]


{'embedding_loss': 0.0116, 'grad_norm': 0.36330023407936096, 'learning_rate': 2.300266995276238e-06, 'epoch': 8.96}


                                                      [A
 13%|█▎        | 6278/47590 [2:17:53<22:13, 30.97it/s]

{'embedding_loss': 0.0115, 'grad_norm': 0.31181278824806213, 'learning_rate': 2.231806668035873e-06, 'epoch': 9.0}


                                                      
 13%|█▎        | 6278/47590 [2:17:55<22:13, 30.97it/s]

{'embedding_loss': 0.0099, 'grad_norm': 0.14751461148262024, 'learning_rate': 2.1633463407955093e-06, 'epoch': 9.03}


                                                      
 13%|█▎        | 6278/47590 [2:17:56<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.31212520599365234, 'learning_rate': 2.094886013555145e-06, 'epoch': 9.06}


                                                      
 13%|█▎        | 6278/47590 [2:17:58<22:13, 30.97it/s]


{'embedding_loss': 0.0093, 'grad_norm': 0.09394580870866776, 'learning_rate': 2.0264256863147807e-06, 'epoch': 9.09}


                                                      [A
 13%|█▎        | 6278/47590 [2:18:00<22:13, 30.97it/s]

{'embedding_loss': 0.0101, 'grad_norm': 0.14798367023468018, 'learning_rate': 1.9579653590744165e-06, 'epoch': 9.12}


                                                      
 13%|█▎        | 6278/47590 [2:18:02<22:13, 30.97it/s]

{'embedding_loss': 0.0112, 'grad_norm': 0.31228432059288025, 'learning_rate': 1.8895050318340524e-06, 'epoch': 9.15}


                                                      
 13%|█▎        | 6278/47590 [2:18:03<22:13, 30.97it/s]

{'embedding_loss': 0.0104, 'grad_norm': 0.5760446786880493, 'learning_rate': 1.821044704593688e-06, 'epoch': 9.18}


                                                      
 13%|█▎        | 6278/47590 [2:18:05<22:13, 30.97it/s]

{'embedding_loss': 0.008, 'grad_norm': 0.2046472430229187, 'learning_rate': 1.7525843773533238e-06, 'epoch': 9.21}


                                                      
 13%|█▎        | 6278/47590 [2:18:08<22:13, 30.97it/s]

{'embedding_loss': 0.0102, 'grad_norm': 0.4679780900478363, 'learning_rate': 1.6841240501129596e-06, 'epoch': 9.24}


                                                      
 13%|█▎        | 6278/47590 [2:18:11<22:13, 30.97it/s]

{'embedding_loss': 0.0114, 'grad_norm': 0.34253862500190735, 'learning_rate': 1.6156637228725955e-06, 'epoch': 9.27}


                                                      
 13%|█▎        | 6278/47590 [2:18:14<22:13, 30.97it/s]

{'embedding_loss': 0.0104, 'grad_norm': 0.308430939912796, 'learning_rate': 1.5472033956322312e-06, 'epoch': 9.3}


                                                      
 13%|█▎        | 6278/47590 [2:18:17<22:13, 30.97it/s]

{'embedding_loss': 0.0141, 'grad_norm': 0.3620949387550354, 'learning_rate': 1.4787430683918672e-06, 'epoch': 9.33}


                                                      
 13%|█▎        | 6278/47590 [2:18:19<22:13, 30.97it/s]


{'embedding_loss': 0.0093, 'grad_norm': 0.33553630113601685, 'learning_rate': 1.4102827411515027e-06, 'epoch': 9.37}


                                                      [A
 13%|█▎        | 6278/47590 [2:18:21<22:13, 30.97it/s]

{'embedding_loss': 0.0129, 'grad_norm': 0.43983808159828186, 'learning_rate': 1.3418224139111386e-06, 'epoch': 9.4}


                                                      
 13%|█▎        | 6278/47590 [2:18:24<22:13, 30.97it/s]

{'embedding_loss': 0.0119, 'grad_norm': 0.2583311200141907, 'learning_rate': 1.2733620866707743e-06, 'epoch': 9.43}


                                                      
 13%|█▎        | 6278/47590 [2:18:26<22:13, 30.97it/s]

{'embedding_loss': 0.0109, 'grad_norm': 0.41985929012298584, 'learning_rate': 1.2049017594304103e-06, 'epoch': 9.46}


                                                      
 13%|█▎        | 6278/47590 [2:18:28<22:13, 30.97it/s]

{'embedding_loss': 0.0113, 'grad_norm': 0.35752415657043457, 'learning_rate': 1.1364414321900458e-06, 'epoch': 9.49}


                                                      
 13%|█▎        | 6278/47590 [2:18:30<22:13, 30.97it/s]

{'embedding_loss': 0.0117, 'grad_norm': 0.15497304499149323, 'learning_rate': 1.0679811049496817e-06, 'epoch': 9.52}


                                                      
 13%|█▎        | 6278/47590 [2:18:33<22:13, 30.97it/s]

{'embedding_loss': 0.0106, 'grad_norm': 0.3149843215942383, 'learning_rate': 9.995207777093177e-07, 'epoch': 9.55}


                                                      
 13%|█▎        | 6278/47590 [2:18:35<22:13, 30.97it/s]

{'embedding_loss': 0.0102, 'grad_norm': 0.6752060055732727, 'learning_rate': 9.310604504689533e-07, 'epoch': 9.58}


                                                      
 13%|█▎        | 6278/47590 [2:18:37<22:13, 30.97it/s]

{'embedding_loss': 0.0112, 'grad_norm': 0.33735451102256775, 'learning_rate': 8.626001232285891e-07, 'epoch': 9.61}


                                                      
 13%|█▎        | 6278/47590 [2:18:38<22:13, 30.97it/s]

{'embedding_loss': 0.01, 'grad_norm': 0.32572853565216064, 'learning_rate': 7.94139795988225e-07, 'epoch': 9.64}


                                                      
 13%|█▎        | 6278/47590 [2:18:40<22:13, 30.97it/s]

{'embedding_loss': 0.0111, 'grad_norm': 0.6386342644691467, 'learning_rate': 7.256794687478606e-07, 'epoch': 9.67}


                                                      
 13%|█▎        | 6278/47590 [2:18:42<22:13, 30.97it/s]

{'embedding_loss': 0.011, 'grad_norm': 0.22989745438098907, 'learning_rate': 6.572191415074964e-07, 'epoch': 9.7}


                                                      
 13%|█▎        | 6278/47590 [2:18:43<22:13, 30.97it/s]

{'embedding_loss': 0.0115, 'grad_norm': 0.3035755157470703, 'learning_rate': 5.887588142671322e-07, 'epoch': 9.74}


                                                      
 13%|█▎        | 6278/47590 [2:18:45<22:13, 30.97it/s]

{'embedding_loss': 0.0122, 'grad_norm': 0.5817296504974365, 'learning_rate': 5.202984870267681e-07, 'epoch': 9.77}


                                                      
 13%|█▎        | 6278/47590 [2:18:47<22:13, 30.97it/s]


{'embedding_loss': 0.0116, 'grad_norm': 0.6715352535247803, 'learning_rate': 4.518381597864038e-07, 'epoch': 9.8}


                                                      [A
 13%|█▎        | 6278/47590 [2:18:48<22:13, 30.97it/s]

{'embedding_loss': 0.0112, 'grad_norm': 0.136265367269516, 'learning_rate': 3.833778325460396e-07, 'epoch': 9.83}


                                                      
 13%|█▎        | 6278/47590 [2:18:50<22:13, 30.97it/s]

{'embedding_loss': 0.0098, 'grad_norm': 0.4246184229850769, 'learning_rate': 3.1491750530567536e-07, 'epoch': 9.86}


                                                      
 13%|█▎        | 6278/47590 [2:18:52<22:13, 30.97it/s]

{'embedding_loss': 0.0102, 'grad_norm': 0.3928261399269104, 'learning_rate': 2.4645717806531114e-07, 'epoch': 9.89}


                                                      
 13%|█▎        | 6278/47590 [2:18:54<22:13, 30.97it/s]

{'embedding_loss': 0.0107, 'grad_norm': 0.3557782769203186, 'learning_rate': 1.7799685082494695e-07, 'epoch': 9.92}


                                                      
 13%|█▎        | 6278/47590 [2:18:56<22:13, 30.97it/s]

{'embedding_loss': 0.0122, 'grad_norm': 0.47588911652565, 'learning_rate': 1.0953652358458274e-07, 'epoch': 9.95}


                                                      
 13%|█▎        | 6278/47590 [2:18:57<22:13, 30.97it/s]


{'embedding_loss': 0.0117, 'grad_norm': 0.2696921229362488, 'learning_rate': 4.1076196344218526e-08, 'epoch': 9.98}


                                                      [A
100%|██████████| 16230/16230 [16:58<00:00, 15.94it/s]]


{'train_runtime': 1018.3114, 'train_samples_per_second': 509.864, 'train_steps_per_second': 15.938, 'train_loss': 0.03165035528106155, 'epoch': 10.0}


Now that all three models have been set up and trained, they can be tested and scored. Each model is run on its language's test data while its compute and run time are captured; the total compute and average runtime are then calculated, along with the precision, recall, and F1 for each individual category.

In [26]:
# Evaluate every trained per-language model on its test split.
# For each language the model predicts the whole test set N_RUNS times inside a
# torch profiler, so that total_time / N_RUNS and total_flops / N_RUNS give the
# per-pass runtime and compute summed over the three languages. Precision,
# recall and F1 are then computed per category from the last prediction pass.
N_RUNS = 10  # number of repeated inference passes per language

total_flops = 0
total_time = 0
score_rows = []
langs = ['java', 'python', 'pharo']
for lan in langs:
    print(lan)
    # to load newly trained models leave as is, otherwise change the directory to './models-pretrained/aight-l6-{lan}-trained'
    model = SetFitModel.from_pretrained(f'./models/aight-l6-{lan}-trained')
    with torch.profiler.profile(with_flops=True) as p:
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        begin = time.perf_counter()
        for _ in range(N_RUNS):
            y_pred = model(ds[f'{lan}_test']['combo']).numpy().T
        total_time += time.perf_counter() - begin
    # Profiler FLOP counts are converted to GFLOPs before accumulating.
    total_flops += sum(k.flops for k in p.key_averages()) / 1e9
    # Transposed so that row i holds every sample's value for category i.
    y_true = np.array(ds[f'{lan}_test']['labels']).T
    for i in range(len(y_pred)):
        assert len(y_pred[i]) == len(y_true[i])
        pred_pos = y_pred[i] == 1
        true_pos = y_true[i] == 1
        tp = int(np.sum(true_pos & pred_pos))
        fp = int(np.sum((y_true[i] == 0) & pred_pos))
        fn = int(np.sum(true_pos & (y_pred[i] == 0)))
        # A category with no predicted and/or no actual positives has an
        # undefined metric; report 0.0 instead of dividing by zero, which
        # would yield NaN values that the later mean silently skips.
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) else 0.0
        score_rows.append({'lan': lan, 'cat': labels[lan][i], 'precision': precision, 'recall': recall, 'f1': f1})
print("Compute in GFLOPs:", total_flops / N_RUNS)
print("Avg runtime in seconds:", total_time / N_RUNS)
scores = pd.DataFrame(score_rows)
scores

java
python
pharo
Compute in GFLOPs: 458.604438528
Avg runtime in seconds: 0.413419508934021


Unnamed: 0,lan,cat,precision,recall,f1
0,java,summary,0.86731,0.806054,0.835561
1,java,Ownership,1.0,1.0,1.0
2,java,Expand,0.318182,0.343137,0.330189
3,java,usage,0.882812,0.786543,0.831902
4,java,Pointer,0.800926,0.940217,0.865
5,java,deprecation,1.0,0.6,0.75
6,java,rational,0.26087,0.264706,0.262774
7,python,Usage,0.694215,0.694215,0.694215
8,python,Parameters,0.808,0.789062,0.798419
9,python,DevelopmentNotes,0.294118,0.487805,0.366972


Next, the models are scored as a whole using their average F1, measured GFLOPs, and average runtime.

In [27]:
max_avg_runtime = 5
max_avg_flops = 5000

# s𝑢𝑏𝑚𝑖𝑠𝑠𝑖𝑜𝑛_𝑠𝑐𝑜𝑟𝑒(𝑚𝑜𝑑𝑒𝑙)=(𝑎𝑣𝑔. 𝐹1)×0.60+((𝑚𝑎𝑥_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒−𝑚𝑒𝑎𝑠𝑢𝑟𝑒𝑑_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒)/𝑚𝑎𝑥_𝑎𝑣𝑔_𝑟𝑢𝑛𝑡𝑖𝑚𝑒)×0.2+((𝑚𝑎𝑥_GFLOPs−𝑚𝑒𝑎𝑠𝑢𝑟𝑒𝑑_GFLOPs)/𝑚𝑎𝑥_GFLOPs)×0.2
def score(avg_f1, avg_runtime, avg_flops):
    return (0.6 * avg_f1 +
      0.2 * ((max_avg_runtime - avg_runtime) / max_avg_runtime) +
      0.2 * ((max_avg_flops - avg_flops) / max_avg_flops))

# total_time and total_flops were accumulated over 10 inference passes per
# language, so dividing by 10 gives the per-pass figures.
avg_flops = total_flops / 10
avg_runtime = total_time / 10
# Mean F1 over every (language, category) row of the scores table.
avg_f1 = scores['f1'].mean()

round(score(avg_f1, avg_runtime, avg_flops), 2)

0.75

Finally, the average F1 across all categories is printed on its own, as an additional measure of the models.

In [28]:
# Show the unrounded mean F1 that was computed from the scores table.
print(avg_f1)

0.6371616155083775
