In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
!cp -r "/content/drive/My Drive/recurrent-transformer-models/model_artifacts/StatefulTransformer_best_model.pth" "StatefulTransformer_best_model.pth"

In [3]:
!rm -rf Recurrent-Neuron-Transformer

In [4]:
!git clone -b add-dev-container-debug-and-run-name https://github.com/ChrisHayduk/Recurrent-Neuron-Transformer.git

Cloning into 'Recurrent-Neuron-Transformer'...
remote: Enumerating objects: 696, done.[K
remote: Counting objects: 100% (248/248), done.[K
remote: Compressing objects: 100% (139/139), done.[K
remote: Total 696 (delta 185), reused 166 (delta 108), pack-reused 448[K
Receiving objects: 100% (696/696), 27.81 MiB | 13.13 MiB/s, done.
Resolving deltas: 100% (442/442), done.


In [5]:
!cd Recurrent-Neuron-Transformer/ && pip install -r requirements.txt



In [7]:
import torch
from torch import nn

class Neurons(nn.Module):
    def __init__(self, n_neurons, device):
        super(Neurons, self).__init__()
        self.device = device

        # Initialize matrix neuron parameters and number of neurons to create
        self.n_neurons = n_neurons
        self.params = nn.Parameter(torch.rand(n_neurons, 3, 3) * 2 - 1)
        self.gelu = nn.GELU()

    def forward(self, inputs, hidden_state=None):
        if hidden_state is not None:
            hidden_state = hidden_state.detach()
        else:
            hidden_state = torch.zeros(1, self.n_neurons, 1, device=self.device)

        batch_size = inputs.shape[0]
        seq_len = inputs.shape[1]

        hidden_batch = hidden_state.expand(batch_size, seq_len, self.n_neurons, 1)
        inputs = inputs.view(batch_size, seq_len, -1, 1)
        ones = torch.ones_like(inputs)


        # Concatenate along the last dimension
        stacked = torch.cat((inputs, hidden_batch, ones), dim=3)

        # Reshape stacked for matrix multiplication: [batch_size, seq_len, n_neurons, 3]
        stacked = stacked.view(batch_size, seq_len, self.n_neurons, 3)

        # Perform matrix multiplication
        dot = self.gelu(torch.matmul(self.params, stacked.unsqueeze(4)).squeeze(4))

        # Update hidden state without in-place operation
        new_hidden = dot[:, :, :, 1].unsqueeze(3).detach()

        return dot[:, :, :, 0], new_hidden

class RecurrentNeuronLayer(nn.Module):
    def __init__(self, input_size, output_size, device):
        super(RecurrentNeuronLayer, self).__init__()
        self.neurons = Neurons(output_size, device)
        self.weights = nn.Linear(input_size, output_size)
        self.device = device

    def forward(self, x, hidden_state=None):
        batch_size = x.shape[0]
        seq_len = x.shape[1]

        x = self.weights(x)
        x, updated_hidden_state = self.neurons(x, hidden_state)

        # Reshape the output to ensure it has the shape [batch_size, n_classes]
        final_output = x.view(batch_size, seq_len, -1)

        return final_output, updated_hidden_state

In [8]:
import numpy as np
import math
import torch
from torch import nn
import random
import torch.functional as F
from dataclasses import dataclass

@dataclass
class RecurrentModelConfig:
    max_length: int = 1024
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 12
    num_heads: int = 12
    hidden_dim: int = 768
    dropout: float = 0.0
    device: str = "cuda"
    recurrent_layers: str = "all"

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        if "proj" == config.recurrent_layers or "all" == config.recurrent_layers:
            self.c_fc = RecurrentNeuronLayer(config.hidden_dim, 4 * config.hidden_dim, config.device)
        else:
            self.c_fc = nn.Linear(config.hidden_dim, 4 * config.hidden_dim)

        self.gelu = nn.GELU()

        if "proj" == config.recurrent_layers or "all" == config.recurrent_layers:
            self.c_proj = RecurrentNeuronLayer(4 * config.hidden_dim, config.hidden_dim, config.device)
        else:
            self.c_proj = nn.Linear(4 * config.hidden_dim, config.hidden_dim)

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x, hidden_layers = None, layer_num=0):
        if isinstance(self.c_fc, RecurrentNeuronLayer):
            x, hidden_layers[f"c_fc_{layer_num}"] = self.c_fc(x, hidden_layers.get(f"c_fc_{layer_num}"))
        else:
            x =  self.c_fc(x)

        x = self.gelu(x)

        if isinstance(self.c_proj, RecurrentNeuronLayer):
            x, hidden_layers[f"c_proj_{layer_num}"] = self.c_proj(x, hidden_layers.get(f"c_proj_{layer_num}"))
        else:
            x =  self.c_proj(x)

        x = self.dropout(x)
        return x, hidden_layers

class RecurrentCausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.hidden_dim % config.num_heads == 0
        # key, query, value projections for all heads, but in a batch
        if "qkv" == config.recurrent_layers or "all" == config.recurrent_layers:
            self.c_attn = RecurrentNeuronLayer(config.hidden_dim, 3 * config.hidden_dim, config.device)
        else:
            self.c_attn = nn.Linear(config.hidden_dim, 3 * config.hidden_dim)
        # output projection

        if "qkv" == config.recurrent_layers or "all" == config.recurrent_layers:
            self.c_proj = RecurrentNeuronLayer(config.hidden_dim, config.hidden_dim, config.device)
        else:
            self.c_proj = nn.Linear(config.hidden_dim, config.hidden_dim)

        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        self.n_head = config.num_heads
        self.n_embd = config.hidden_dim
        self.dropout = config.dropout
        self.max_length = config.max_length

        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(self.max_length, self.max_length))
                                        .view(1, 1, self.max_length, self.max_length))

    def forward(self, x, hidden_layers=None, layer_num=0):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        if isinstance(self.c_attn, RecurrentNeuronLayer):
            proj_output, hidden_layers[f"c_attn_{layer_num}"]  = self.c_attn(x, hidden_layers.get(f"c_attn_{layer_num}"))
        else:
            proj_output = self.c_attn(x)

        q, k, v = proj_output.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        if isinstance(self.c_proj, RecurrentNeuronLayer):
            y, hidden_layers[f"c_proj_{layer_num}"] = self.c_proj(y, hidden_layers.get(f"c_proj_{layer_num}"))
        else:
            y = self.c_proj(y)

        y = self.resid_dropout(y)
        return y, hidden_layers

class RecurrentTransformerBlock(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.hidden_dim)
        self.attn = RecurrentCausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.hidden_dim)
        self.mlp = MLP(config)

    def forward(self, x, hidden_layers = None, layer_num = 0):
        new_x, hidden_layers = self.attn(self.ln_1(x), hidden_layers, layer_num)
        x = x + new_x
        new_x, hidden_layers = self.mlp(self.ln_2(x), hidden_layers, layer_num)
        x = x + new_x
        return x, hidden_layers

class RecurrentNeuronTransformer(nn.Module):
    """
    A single-layer Transformer which encodes a sequence of text and
    performs binary classification.

    The model has a vocab size of V, works on
    sequences of length T, has an hidden dimension of H, uses word vectors
    also of dimension H, and operates on minibatches of size N.
    """
    def __init__(self, config):
        """
        :config
        """
        super(RecurrentNeuronTransformer, self).__init__()
        assert config.hidden_dim % config.num_heads == 0
        assert config.recurrent_layers in set(["qkv", "proj", "all", "none"])

        print(config)

        self.num_heads = config.num_heads
        self.word_embedding_dim = config.hidden_dim
        self.hidden_dim = config.hidden_dim
        self.max_length = config.max_length
        self.vocab_size = config.vocab_size
        self.device = config.device
        self.dropout = config.dropout

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(self.vocab_size, self.word_embedding_dim),
            wpe = nn.Embedding(self.max_length, self.word_embedding_dim),
            drop = nn.Dropout(self.dropout),
            h = nn.ModuleList([RecurrentTransformerBlock(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(self.hidden_dim),
        ))


        self.lm_head = RecurrentNeuronLayer(self.hidden_dim, self.vocab_size, self.device)

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters
        print("Number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, RecurrentNeuronLayer):
            neuron_module = module.neurons
            torch.nn.init.normal_(neuron_module.params, mean=0.0, std=0.02)
            linear_module = module.weights
            torch.nn.init.normal_(linear_module.weight, mean=0.0, std=0.02)
            if linear_module.bias is not None:
                torch.nn.init.zeros_(linear_module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)


    def forward(self, inputs, hidden_layers):
        """
        This function computes the full Transformer forward pass.
        Put together all of the layers you've developed in the correct order.

        :param inputs: a PyTorch tensor of shape (N,T). These are integer lookups.

        :returns: the model outputs. Should be scores of shape (N,T,output_size).
        """

        embeddings = self.embed(inputs)
        x = self.transformer.drop(embeddings)
        for idx, block in enumerate(self.transformer.h):
            x, hidden_layers = block(x, hidden_layers, idx)
        x = self.transformer.ln_f(x)
        outputs, hidden_layers["lm_output"] = self.lm_head(x, hidden_layers.get("lm_output"))


        return outputs, hidden_layers


    def embed(self, inputs):
        """
        :param inputs: intTensor of shape (N,T)
        :returns embeddings: floatTensor of shape (N,T,H)
        """

        pos = torch.arange(0, self.max_length, dtype=torch.long, device=self.device) # shape (t)
        tok_emb = self.transformer.wte(inputs) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        embeddings  = tok_emb + pos_emb

        return embeddings

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, hidden_layers = None, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, hidden_layers = self(idx_cond, hidden_layers)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import tiktoken
import os

class TextDataset(Dataset):
    def __init__(self, tokens, seq_length, bpe_tokenizer, vocab_size, device):
        self.tokens = tokens
        self.tokenizer = tiktoken.get_encoding(bpe_tokenizer)
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.device = device

    def __len__(self):
        return len(self.tokens) - self.seq_length - 1

    def __getitem__(self, idx):
        input_seq = torch.tensor(self.tokens[idx : idx+self.seq_length], device=self.device)
        target_seq = torch.tensor(self.tokens[idx+1 : idx+1+self.seq_length], device=self.device)
        return input_seq, target_seq


class TextDataLoader:
    def __init__(self, file_path, seq_length, bpe_tokenizer, batch_size, vocab_size, device, split_ratio=0.8):
        self.file_path = file_path
        self.seq_length = seq_length
        self.bpe_tokenizer = bpe_tokenizer
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.device = device
        self.split_ratio = split_ratio

    def load_and_tokenize(self):
        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            return text
        except IOError:
            print(f"Error opening/reading {self.file_path}")
            return None

    def _create_datasets(self):
        text = self.load_and_tokenize()
        tokenizer = tiktoken.get_encoding(self.bpe_tokenizer)
        tokens = tokenizer.encode_ordinary(text)
        split_index = int(len(tokens) * self.split_ratio)
        train_tokens = tokens[:split_index]
        test_tokens = tokens[split_index:]
        train_dataset = TextDataset(train_tokens, self.seq_length, self.bpe_tokenizer, self.vocab_size, self.device)
        test_dataset = TextDataset(test_tokens, self.seq_length, self.bpe_tokenizer, self.vocab_size, self.device)
        return train_dataset, test_dataset

    def create_loaders(self):
        train_dataset, test_dataset = self._create_datasets()
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True)
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False, drop_last=True)
        return train_loader, test_loader


In [10]:
# Device configuration
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')
print(f"Using device: {DEVICE}")

Using device: cuda


In [11]:
model_config = RecurrentModelConfig(max_length=512, vocab_size=50257,
                                    n_layer=8, num_heads=8, hidden_dim=768,
                                    dropout=0.1, device=DEVICE, recurrent_layers="all")

model = RecurrentNeuronTransformer(config=model_config).to(DEVICE)

RecurrentModelConfig(max_length=512, vocab_size=50257, n_layer=8, num_heads=8, hidden_dim=768, dropout=0.1, device=device(type='cuda'), recurrent_layers='all')
Number of parameters: 134.90M


In [12]:
model.load_state_dict(torch.load('StatefulTransformer_best_model.pth'))

<All keys matched successfully>

In [13]:
data_loader = TextDataLoader(file_path="Recurrent-Neuron-Transformer/data/shakespeare/tinyshakespeare.txt",
                             seq_length=1024,
                             bpe_tokenizer='gpt2',
                             batch_size=12,
                             vocab_size=50257,
                             split_ratio=0.8,
                             device=DEVICE)
train_loader, test_loader = data_loader.create_loaders()

In [14]:
def recurrent_transformer_forward(model, input_seq, hidden_layers, target_seq):
    # Forward pass
    outputs, hidden_layers = model(inputs=input_seq, hidden_layers=hidden_layers)
    outputs = outputs.reshape(-1, outputs.size(-1))
    target_seq = target_seq.reshape(-1)

    # Calculate loss
    loss = nn.CrossEntropyLoss()(outputs, target_seq)

    return (outputs, hidden_layers), loss

In [16]:
import wandb

wandb.init(
    # set the wandb project where this run will be logged
    project="transformer-testing",
)

wandb.run.name = "StatefulTransformer_best_model-eval"
wandb.define_metric("epoch")
wandb.define_metric("eval_batch")

wandb.define_metric("epoch/*", step_metric="epoch")
wandb.define_metric("eval_batch/*", step_metric="eval_batch")



VBox(children=(Label(value='0.002 MB of 0.003 MB uploaded\r'), FloatProgress(value=0.8431876606683805, max=1.0…

<wandb.sdk.wandb_metric.Metric at 0x7ddf748971c0>

In [17]:
from tqdm import tqdm

context_window = 512
step_size = 511
distributed = False
rank = 0


model.eval()
epoch_val_loss = 0
eval_progress_bar = tqdm(test_loader, desc=f'Evaluating: Epoch 1', leave=False)
with torch.no_grad():
    for batch_idx, (input_chunk, target_chunk) in enumerate(eval_progress_bar):
        batch_loss = 0
        hidden_layers = dict()

        for i in range(0, input_chunk.size(1) - context_window, step_size):
            # Create the input and target sequences
            input_seq = input_chunk[:, i:i+context_window].to(DEVICE)
            target_seq = target_chunk[:, i+1:i+context_window+1].to(DEVICE)

            outputs = None
            loss = None
            (outputs, hidden_layers), loss = recurrent_transformer_forward(model, input_seq, hidden_layers, target_seq)

            batch_loss += loss.item()

        if (rank == 0 or not distributed) and (batch_idx == len(eval_progress_bar)-1 or (batch_idx % 10 == 0)):
            print({'eval_batch': batch_idx, 'eval_batch/loss': batch_loss})
            wandb.log({'eval_batch': batch_idx, 'eval_batch/loss': batch_loss})

        epoch_val_loss += batch_loss
        eval_progress_bar.set_postfix(loss=batch_loss)

avg_val_loss = epoch_val_loss / len(test_loader)

wandb.log({'epoch': 1, 'epoch/val_loss': avg_val_loss})


Evaluating: Epoch 1:   0%|          | 1/5548 [00:02<4:14:05,  2.75s/it, loss=35.5]

{'eval_batch': 0, 'eval_batch/loss': 35.47840881347656}


Evaluating: Epoch 1:   0%|          | 11/5548 [00:19<2:39:20,  1.73s/it, loss=36.2]

{'eval_batch': 10, 'eval_batch/loss': 36.19887351989746}


Evaluating: Epoch 1:   0%|          | 21/5548 [00:37<2:38:12,  1.72s/it, loss=36.3]

{'eval_batch': 20, 'eval_batch/loss': 36.27600860595703}


Evaluating: Epoch 1:   1%|          | 31/5548 [00:54<2:37:53,  1.72s/it, loss=37.5]

{'eval_batch': 30, 'eval_batch/loss': 37.4677848815918}


Evaluating: Epoch 1:   1%|          | 41/5548 [01:11<2:37:37,  1.72s/it, loss=38.1]

{'eval_batch': 40, 'eval_batch/loss': 38.110116958618164}


Evaluating: Epoch 1:   1%|          | 51/5548 [01:28<2:37:19,  1.72s/it, loss=38.7]

{'eval_batch': 50, 'eval_batch/loss': 38.706010818481445}


Evaluating: Epoch 1:   1%|          | 61/5548 [01:45<2:37:00,  1.72s/it, loss=38]

{'eval_batch': 60, 'eval_batch/loss': 37.9537353515625}


Evaluating: Epoch 1:   1%|▏         | 71/5548 [02:02<2:36:45,  1.72s/it, loss=37.1]

{'eval_batch': 70, 'eval_batch/loss': 37.138572692871094}


Evaluating: Epoch 1:   1%|▏         | 81/5548 [02:20<2:36:31,  1.72s/it, loss=36.9]

{'eval_batch': 80, 'eval_batch/loss': 36.9152946472168}


Evaluating: Epoch 1:   2%|▏         | 91/5548 [02:37<2:36:11,  1.72s/it, loss=36.2]

{'eval_batch': 90, 'eval_batch/loss': 36.213022232055664}


Evaluating: Epoch 1:   2%|▏         | 101/5548 [02:54<2:35:54,  1.72s/it, loss=36]

{'eval_batch': 100, 'eval_batch/loss': 36.00620079040527}


Evaluating: Epoch 1:   2%|▏         | 111/5548 [03:11<2:35:38,  1.72s/it, loss=36.8]

{'eval_batch': 110, 'eval_batch/loss': 36.77278137207031}


Evaluating: Epoch 1:   2%|▏         | 121/5548 [03:28<2:35:19,  1.72s/it, loss=35.3]

{'eval_batch': 120, 'eval_batch/loss': 35.307193756103516}


Evaluating: Epoch 1:   2%|▏         | 131/5548 [03:45<2:35:01,  1.72s/it, loss=34.2]

{'eval_batch': 130, 'eval_batch/loss': 34.21252250671387}


Evaluating: Epoch 1:   3%|▎         | 141/5548 [04:03<2:34:47,  1.72s/it, loss=33.9]

{'eval_batch': 140, 'eval_batch/loss': 33.93824577331543}


Evaluating: Epoch 1:   3%|▎         | 151/5548 [04:20<2:34:26,  1.72s/it, loss=33.2]

{'eval_batch': 150, 'eval_batch/loss': 33.15163993835449}


Evaluating: Epoch 1:   3%|▎         | 161/5548 [04:37<2:34:09,  1.72s/it, loss=32.6]

{'eval_batch': 160, 'eval_batch/loss': 32.64851474761963}


Evaluating: Epoch 1:   3%|▎         | 171/5548 [04:54<2:33:53,  1.72s/it, loss=32.2]

{'eval_batch': 170, 'eval_batch/loss': 32.24685192108154}


Evaluating: Epoch 1:   3%|▎         | 181/5548 [05:11<2:33:36,  1.72s/it, loss=32.9]

{'eval_batch': 180, 'eval_batch/loss': 32.903443336486816}


Evaluating: Epoch 1:   3%|▎         | 191/5548 [05:29<2:33:18,  1.72s/it, loss=33.1]

{'eval_batch': 190, 'eval_batch/loss': 33.050018310546875}


Evaluating: Epoch 1:   4%|▎         | 201/5548 [05:46<2:33:02,  1.72s/it, loss=32.3]

{'eval_batch': 200, 'eval_batch/loss': 32.2937536239624}


Evaluating: Epoch 1:   4%|▍         | 211/5548 [06:03<2:32:47,  1.72s/it, loss=32.1]

{'eval_batch': 210, 'eval_batch/loss': 32.10906982421875}


Evaluating: Epoch 1:   4%|▍         | 221/5548 [06:20<2:32:28,  1.72s/it, loss=31.9]

{'eval_batch': 220, 'eval_batch/loss': 31.936135292053223}


Evaluating: Epoch 1:   4%|▍         | 231/5548 [06:37<2:32:14,  1.72s/it, loss=32.4]

{'eval_batch': 230, 'eval_batch/loss': 32.36654472351074}


Evaluating: Epoch 1:   4%|▍         | 241/5548 [06:54<2:31:56,  1.72s/it, loss=32.5]

{'eval_batch': 240, 'eval_batch/loss': 32.48899841308594}


Evaluating: Epoch 1:   5%|▍         | 251/5548 [07:12<2:31:37,  1.72s/it, loss=33.2]

{'eval_batch': 250, 'eval_batch/loss': 33.20764446258545}


Evaluating: Epoch 1:   5%|▍         | 261/5548 [07:29<2:31:23,  1.72s/it, loss=32.7]

{'eval_batch': 260, 'eval_batch/loss': 32.725746154785156}


Evaluating: Epoch 1:   5%|▍         | 271/5548 [07:46<2:31:02,  1.72s/it, loss=33.2]

{'eval_batch': 270, 'eval_batch/loss': 33.180015563964844}


Evaluating: Epoch 1:   5%|▌         | 281/5548 [08:03<2:30:44,  1.72s/it, loss=33.2]

{'eval_batch': 280, 'eval_batch/loss': 33.221007347106934}


Evaluating: Epoch 1:   5%|▌         | 291/5548 [08:20<2:30:27,  1.72s/it, loss=33.6]

{'eval_batch': 290, 'eval_batch/loss': 33.61796569824219}


Evaluating: Epoch 1:   5%|▌         | 301/5548 [08:37<2:30:12,  1.72s/it, loss=33.6]

{'eval_batch': 300, 'eval_batch/loss': 33.600295066833496}


Evaluating: Epoch 1:   6%|▌         | 311/5548 [08:55<2:29:53,  1.72s/it, loss=34.2]

{'eval_batch': 310, 'eval_batch/loss': 34.246360778808594}


Evaluating: Epoch 1:   6%|▌         | 321/5548 [09:12<2:29:40,  1.72s/it, loss=33.2]

{'eval_batch': 320, 'eval_batch/loss': 33.208168029785156}


Evaluating: Epoch 1:   6%|▌         | 331/5548 [09:29<2:29:18,  1.72s/it, loss=34.5]

{'eval_batch': 330, 'eval_batch/loss': 34.47408103942871}


Evaluating: Epoch 1:   6%|▌         | 341/5548 [09:46<2:28:59,  1.72s/it, loss=35.3]

{'eval_batch': 340, 'eval_batch/loss': 35.30732345581055}


Evaluating: Epoch 1:   6%|▋         | 351/5548 [10:03<2:28:44,  1.72s/it, loss=35.6]

{'eval_batch': 350, 'eval_batch/loss': 35.57895469665527}


Evaluating: Epoch 1:   7%|▋         | 361/5548 [10:20<2:28:27,  1.72s/it, loss=34.7]

{'eval_batch': 360, 'eval_batch/loss': 34.70191955566406}


Evaluating: Epoch 1:   7%|▋         | 371/5548 [10:38<2:28:11,  1.72s/it, loss=36]

{'eval_batch': 370, 'eval_batch/loss': 35.99256896972656}


Evaluating: Epoch 1:   7%|▋         | 381/5548 [10:55<2:27:52,  1.72s/it, loss=36.1]

{'eval_batch': 380, 'eval_batch/loss': 36.098745346069336}


Evaluating: Epoch 1:   7%|▋         | 391/5548 [11:12<2:27:37,  1.72s/it, loss=36.1]

{'eval_batch': 390, 'eval_batch/loss': 36.144569396972656}


Evaluating: Epoch 1:   7%|▋         | 401/5548 [11:29<2:27:17,  1.72s/it, loss=36.6]

{'eval_batch': 400, 'eval_batch/loss': 36.60567855834961}


Evaluating: Epoch 1:   7%|▋         | 411/5548 [11:46<2:26:59,  1.72s/it, loss=36.7]

{'eval_batch': 410, 'eval_batch/loss': 36.65013122558594}


Evaluating: Epoch 1:   8%|▊         | 421/5548 [12:03<2:26:44,  1.72s/it, loss=35.9]

{'eval_batch': 420, 'eval_batch/loss': 35.928226470947266}


Evaluating: Epoch 1:   8%|▊         | 431/5548 [12:21<2:26:28,  1.72s/it, loss=34.6]

{'eval_batch': 430, 'eval_batch/loss': 34.59347724914551}


Evaluating: Epoch 1:   8%|▊         | 441/5548 [12:38<2:26:11,  1.72s/it, loss=34.7]

{'eval_batch': 440, 'eval_batch/loss': 34.73618125915527}


Evaluating: Epoch 1:   8%|▊         | 451/5548 [12:55<2:25:53,  1.72s/it, loss=33.5]

{'eval_batch': 450, 'eval_batch/loss': 33.54297351837158}


Evaluating: Epoch 1:   8%|▊         | 461/5548 [13:12<2:25:33,  1.72s/it, loss=33]

{'eval_batch': 460, 'eval_batch/loss': 33.00946807861328}


Evaluating: Epoch 1:   8%|▊         | 471/5548 [13:29<2:25:17,  1.72s/it, loss=33.5]

{'eval_batch': 470, 'eval_batch/loss': 33.539358139038086}


Evaluating: Epoch 1:   9%|▊         | 481/5548 [13:47<2:25:00,  1.72s/it, loss=33.8]

{'eval_batch': 480, 'eval_batch/loss': 33.75400352478027}


Evaluating: Epoch 1:   9%|▉         | 491/5548 [14:04<2:24:45,  1.72s/it, loss=33.9]

{'eval_batch': 490, 'eval_batch/loss': 33.941829681396484}


Evaluating: Epoch 1:   9%|▉         | 501/5548 [14:21<2:24:28,  1.72s/it, loss=34.7]

{'eval_batch': 500, 'eval_batch/loss': 34.65582466125488}


Evaluating: Epoch 1:   9%|▉         | 511/5548 [14:38<2:24:11,  1.72s/it, loss=35]

{'eval_batch': 510, 'eval_batch/loss': 34.9764347076416}


Evaluating: Epoch 1:   9%|▉         | 521/5548 [14:55<2:23:52,  1.72s/it, loss=36]

{'eval_batch': 520, 'eval_batch/loss': 35.977182388305664}


Evaluating: Epoch 1:  10%|▉         | 531/5548 [15:12<2:23:37,  1.72s/it, loss=36.5]

{'eval_batch': 530, 'eval_batch/loss': 36.4627799987793}


Evaluating: Epoch 1:  10%|▉         | 541/5548 [15:30<2:23:21,  1.72s/it, loss=37.1]

{'eval_batch': 540, 'eval_batch/loss': 37.10939407348633}


Evaluating: Epoch 1:  10%|▉         | 551/5548 [15:47<2:23:01,  1.72s/it, loss=35]

{'eval_batch': 550, 'eval_batch/loss': 34.984673500061035}


Evaluating: Epoch 1:  10%|█         | 561/5548 [16:04<2:22:44,  1.72s/it, loss=35.2]

{'eval_batch': 560, 'eval_batch/loss': 35.22229766845703}


Evaluating: Epoch 1:  10%|█         | 571/5548 [16:21<2:22:27,  1.72s/it, loss=33.9]

{'eval_batch': 570, 'eval_batch/loss': 33.923410415649414}


Evaluating: Epoch 1:  10%|█         | 581/5548 [16:38<2:22:08,  1.72s/it, loss=33.3]

{'eval_batch': 580, 'eval_batch/loss': 33.27431297302246}


Evaluating: Epoch 1:  11%|█         | 591/5548 [16:55<2:21:57,  1.72s/it, loss=33.2]

{'eval_batch': 590, 'eval_batch/loss': 33.16943168640137}


Evaluating: Epoch 1:  11%|█         | 601/5548 [17:13<2:21:38,  1.72s/it, loss=32.8]

{'eval_batch': 600, 'eval_batch/loss': 32.8173770904541}


Evaluating: Epoch 1:  11%|█         | 611/5548 [17:30<2:21:16,  1.72s/it, loss=32.9]

{'eval_batch': 610, 'eval_batch/loss': 32.867201805114746}


Evaluating: Epoch 1:  11%|█         | 621/5548 [17:47<2:21:01,  1.72s/it, loss=32.8]

{'eval_batch': 620, 'eval_batch/loss': 32.83212947845459}


Evaluating: Epoch 1:  11%|█▏        | 631/5548 [18:04<2:20:43,  1.72s/it, loss=33.7]

{'eval_batch': 630, 'eval_batch/loss': 33.71945762634277}


Evaluating: Epoch 1:  12%|█▏        | 641/5548 [18:21<2:20:29,  1.72s/it, loss=34.6]

{'eval_batch': 640, 'eval_batch/loss': 34.58840751647949}


Evaluating: Epoch 1:  12%|█▏        | 651/5548 [18:38<2:20:08,  1.72s/it, loss=33.8]

{'eval_batch': 650, 'eval_batch/loss': 33.7841796875}


Evaluating: Epoch 1:  12%|█▏        | 661/5548 [18:56<2:19:54,  1.72s/it, loss=34.2]

{'eval_batch': 660, 'eval_batch/loss': 34.2015438079834}


Evaluating: Epoch 1:  12%|█▏        | 671/5548 [19:13<2:19:38,  1.72s/it, loss=35]

{'eval_batch': 670, 'eval_batch/loss': 34.980926513671875}


Evaluating: Epoch 1:  12%|█▏        | 681/5548 [19:30<2:19:22,  1.72s/it, loss=35.3]

{'eval_batch': 680, 'eval_batch/loss': 35.26628875732422}


Evaluating: Epoch 1:  12%|█▏        | 691/5548 [19:47<2:19:08,  1.72s/it, loss=35.4]

{'eval_batch': 690, 'eval_batch/loss': 35.44474983215332}


Evaluating: Epoch 1:  13%|█▎        | 701/5548 [20:04<2:18:50,  1.72s/it, loss=37.4]

{'eval_batch': 700, 'eval_batch/loss': 37.350358963012695}


Evaluating: Epoch 1:  13%|█▎        | 711/5548 [20:22<2:18:26,  1.72s/it, loss=36.7]

{'eval_batch': 710, 'eval_batch/loss': 36.723201751708984}


Evaluating: Epoch 1:  13%|█▎        | 721/5548 [20:39<2:18:09,  1.72s/it, loss=37]

{'eval_batch': 720, 'eval_batch/loss': 37.005781173706055}


Evaluating: Epoch 1:  13%|█▎        | 731/5548 [20:56<2:17:52,  1.72s/it, loss=36.9]

{'eval_batch': 730, 'eval_batch/loss': 36.93928337097168}


Evaluating: Epoch 1:  13%|█▎        | 741/5548 [21:13<2:17:32,  1.72s/it, loss=37.3]

{'eval_batch': 740, 'eval_batch/loss': 37.31250190734863}


Evaluating: Epoch 1:  14%|█▎        | 751/5548 [21:30<2:17:16,  1.72s/it, loss=37.1]

{'eval_batch': 750, 'eval_batch/loss': 37.074798583984375}


Evaluating: Epoch 1:  14%|█▎        | 761/5548 [21:47<2:17:02,  1.72s/it, loss=35.5]

{'eval_batch': 760, 'eval_batch/loss': 35.47764015197754}


Evaluating: Epoch 1:  14%|█▍        | 771/5548 [22:05<2:16:46,  1.72s/it, loss=35.8]

{'eval_batch': 770, 'eval_batch/loss': 35.81391906738281}


Evaluating: Epoch 1:  14%|█▍        | 781/5548 [22:22<2:16:29,  1.72s/it, loss=34.3]

{'eval_batch': 780, 'eval_batch/loss': 34.340972900390625}


Evaluating: Epoch 1:  14%|█▍        | 791/5548 [22:39<2:16:12,  1.72s/it, loss=33]

{'eval_batch': 790, 'eval_batch/loss': 33.04964637756348}


Evaluating: Epoch 1:  14%|█▍        | 801/5548 [22:56<2:15:57,  1.72s/it, loss=32.9]

{'eval_batch': 800, 'eval_batch/loss': 32.91329383850098}


Evaluating: Epoch 1:  15%|█▍        | 811/5548 [23:13<2:15:38,  1.72s/it, loss=32.3]

{'eval_batch': 810, 'eval_batch/loss': 32.31292152404785}


Evaluating: Epoch 1:  15%|█▍        | 821/5548 [23:31<2:15:19,  1.72s/it, loss=31.6]

{'eval_batch': 820, 'eval_batch/loss': 31.588650703430176}


Evaluating: Epoch 1:  15%|█▍        | 831/5548 [23:48<2:15:04,  1.72s/it, loss=30.8]

{'eval_batch': 830, 'eval_batch/loss': 30.791733741760254}


Evaluating: Epoch 1:  15%|█▌        | 841/5548 [24:05<2:14:45,  1.72s/it, loss=31.3]

{'eval_batch': 840, 'eval_batch/loss': 31.337820053100586}


Evaluating: Epoch 1:  15%|█▌        | 851/5548 [24:22<2:14:30,  1.72s/it, loss=32.8]

{'eval_batch': 850, 'eval_batch/loss': 32.828593254089355}


Evaluating: Epoch 1:  16%|█▌        | 861/5548 [24:39<2:14:12,  1.72s/it, loss=32.8]

{'eval_batch': 860, 'eval_batch/loss': 32.79738903045654}


Evaluating: Epoch 1:  16%|█▌        | 871/5548 [24:56<2:13:55,  1.72s/it, loss=33.1]

{'eval_batch': 870, 'eval_batch/loss': 33.122941970825195}


Evaluating: Epoch 1:  16%|█▌        | 881/5548 [25:14<2:13:39,  1.72s/it, loss=31.4]

{'eval_batch': 880, 'eval_batch/loss': 31.406790733337402}


Evaluating: Epoch 1:  16%|█▌        | 891/5548 [25:31<2:13:23,  1.72s/it, loss=31.5]

{'eval_batch': 890, 'eval_batch/loss': 31.519232749938965}


Evaluating: Epoch 1:  16%|█▌        | 901/5548 [25:48<2:13:05,  1.72s/it, loss=32.7]

{'eval_batch': 900, 'eval_batch/loss': 32.718719482421875}


Evaluating: Epoch 1:  16%|█▋        | 911/5548 [26:05<2:12:48,  1.72s/it, loss=34.6]

{'eval_batch': 910, 'eval_batch/loss': 34.591739654541016}


Evaluating: Epoch 1:  17%|█▋        | 921/5548 [26:22<2:12:32,  1.72s/it, loss=34.4]

{'eval_batch': 920, 'eval_batch/loss': 34.36858558654785}


Evaluating: Epoch 1:  17%|█▋        | 931/5548 [26:40<2:12:12,  1.72s/it, loss=35.1]

{'eval_batch': 930, 'eval_batch/loss': 35.07039833068848}


Evaluating: Epoch 1:  17%|█▋        | 941/5548 [26:57<2:11:56,  1.72s/it, loss=34.8]

{'eval_batch': 940, 'eval_batch/loss': 34.81148338317871}


Evaluating: Epoch 1:  17%|█▋        | 951/5548 [27:14<2:11:37,  1.72s/it, loss=33.5]

{'eval_batch': 950, 'eval_batch/loss': 33.53291606903076}


Evaluating: Epoch 1:  17%|█▋        | 961/5548 [27:31<2:11:20,  1.72s/it, loss=34.1]

{'eval_batch': 960, 'eval_batch/loss': 34.115116119384766}


Evaluating: Epoch 1:  18%|█▊        | 971/5548 [27:48<2:11:05,  1.72s/it, loss=34.9]

{'eval_batch': 970, 'eval_batch/loss': 34.931570053100586}


Evaluating: Epoch 1:  18%|█▊        | 981/5548 [28:05<2:10:48,  1.72s/it, loss=35.9]

{'eval_batch': 980, 'eval_batch/loss': 35.851057052612305}


Evaluating: Epoch 1:  18%|█▊        | 991/5548 [28:23<2:10:31,  1.72s/it, loss=35.3]

{'eval_batch': 990, 'eval_batch/loss': 35.27055358886719}


Evaluating: Epoch 1:  18%|█▊        | 1001/5548 [28:40<2:10:14,  1.72s/it, loss=35.2]

{'eval_batch': 1000, 'eval_batch/loss': 35.21243476867676}


Evaluating: Epoch 1:  18%|█▊        | 1011/5548 [28:57<2:09:56,  1.72s/it, loss=34.2]

{'eval_batch': 1010, 'eval_batch/loss': 34.21570873260498}


Evaluating: Epoch 1:  18%|█▊        | 1021/5548 [29:14<2:09:40,  1.72s/it, loss=33.6]

{'eval_batch': 1020, 'eval_batch/loss': 33.61502456665039}


Evaluating: Epoch 1:  19%|█▊        | 1031/5548 [29:31<2:09:20,  1.72s/it, loss=34.2]

{'eval_batch': 1030, 'eval_batch/loss': 34.17324447631836}


Evaluating: Epoch 1:  19%|█▉        | 1041/5548 [29:49<2:09:03,  1.72s/it, loss=34.6]

{'eval_batch': 1040, 'eval_batch/loss': 34.57346534729004}


Evaluating: Epoch 1:  19%|█▉        | 1051/5548 [30:06<2:08:46,  1.72s/it, loss=34.3]

{'eval_batch': 1050, 'eval_batch/loss': 34.28474235534668}


Evaluating: Epoch 1:  19%|█▉        | 1061/5548 [30:23<2:08:27,  1.72s/it, loss=33]

{'eval_batch': 1060, 'eval_batch/loss': 32.99631404876709}


Evaluating: Epoch 1:  19%|█▉        | 1071/5548 [30:40<2:08:11,  1.72s/it, loss=31.3]

{'eval_batch': 1070, 'eval_batch/loss': 31.27827739715576}


Evaluating: Epoch 1:  19%|█▉        | 1081/5548 [30:57<2:07:57,  1.72s/it, loss=30.2]

{'eval_batch': 1080, 'eval_batch/loss': 30.24231719970703}


Evaluating: Epoch 1:  20%|█▉        | 1091/5548 [31:14<2:07:40,  1.72s/it, loss=30.7]

{'eval_batch': 1090, 'eval_batch/loss': 30.730732917785645}


Evaluating: Epoch 1:  20%|█▉        | 1101/5548 [31:32<2:07:21,  1.72s/it, loss=31]

{'eval_batch': 1100, 'eval_batch/loss': 31.024409294128418}


Evaluating: Epoch 1:  20%|██        | 1111/5548 [31:49<2:07:07,  1.72s/it, loss=30.9]

{'eval_batch': 1110, 'eval_batch/loss': 30.88969135284424}


Evaluating: Epoch 1:  20%|██        | 1121/5548 [32:06<2:06:50,  1.72s/it, loss=31.2]

{'eval_batch': 1120, 'eval_batch/loss': 31.176952362060547}


Evaluating: Epoch 1:  20%|██        | 1131/5548 [32:23<2:06:34,  1.72s/it, loss=31.9]

{'eval_batch': 1130, 'eval_batch/loss': 31.870434761047363}


Evaluating: Epoch 1:  21%|██        | 1141/5548 [32:40<2:06:15,  1.72s/it, loss=32.7]

{'eval_batch': 1140, 'eval_batch/loss': 32.71436023712158}


Evaluating: Epoch 1:  21%|██        | 1151/5548 [32:58<2:05:56,  1.72s/it, loss=34]

{'eval_batch': 1150, 'eval_batch/loss': 34.03734588623047}


Evaluating: Epoch 1:  21%|██        | 1161/5548 [33:15<2:05:40,  1.72s/it, loss=35.8]

{'eval_batch': 1160, 'eval_batch/loss': 35.79725456237793}


Evaluating: Epoch 1:  21%|██        | 1171/5548 [33:32<2:05:22,  1.72s/it, loss=35.1]

{'eval_batch': 1170, 'eval_batch/loss': 35.134626388549805}


Evaluating: Epoch 1:  21%|██▏       | 1181/5548 [33:49<2:05:07,  1.72s/it, loss=34.2]

{'eval_batch': 1180, 'eval_batch/loss': 34.177818298339844}


Evaluating: Epoch 1:  21%|██▏       | 1191/5548 [34:06<2:04:47,  1.72s/it, loss=33.2]

{'eval_batch': 1190, 'eval_batch/loss': 33.16470432281494}


Evaluating: Epoch 1:  22%|██▏       | 1201/5548 [34:23<2:04:30,  1.72s/it, loss=32.6]

{'eval_batch': 1200, 'eval_batch/loss': 32.60616588592529}


Evaluating: Epoch 1:  22%|██▏       | 1211/5548 [34:41<2:04:13,  1.72s/it, loss=33.1]

{'eval_batch': 1210, 'eval_batch/loss': 33.09245491027832}


Evaluating: Epoch 1:  22%|██▏       | 1221/5548 [34:58<2:03:55,  1.72s/it, loss=31.5]

{'eval_batch': 1220, 'eval_batch/loss': 31.49925422668457}


Evaluating: Epoch 1:  22%|██▏       | 1231/5548 [35:15<2:03:37,  1.72s/it, loss=32.1]

{'eval_batch': 1230, 'eval_batch/loss': 32.06264591217041}


Evaluating: Epoch 1:  22%|██▏       | 1241/5548 [35:32<2:03:22,  1.72s/it, loss=32.2]

{'eval_batch': 1240, 'eval_batch/loss': 32.16925621032715}


Evaluating: Epoch 1:  23%|██▎       | 1251/5548 [35:49<2:03:04,  1.72s/it, loss=32.2]

{'eval_batch': 1250, 'eval_batch/loss': 32.2090425491333}


Evaluating: Epoch 1:  23%|██▎       | 1261/5548 [36:07<2:02:47,  1.72s/it, loss=33.8]

{'eval_batch': 1260, 'eval_batch/loss': 33.77678966522217}


Evaluating: Epoch 1:  23%|██▎       | 1271/5548 [36:24<2:02:32,  1.72s/it, loss=34.8]

{'eval_batch': 1270, 'eval_batch/loss': 34.78746032714844}


Evaluating: Epoch 1:  23%|██▎       | 1281/5548 [36:41<2:02:13,  1.72s/it, loss=35.2]

{'eval_batch': 1280, 'eval_batch/loss': 35.19855880737305}


Evaluating: Epoch 1:  23%|██▎       | 1291/5548 [36:58<2:01:57,  1.72s/it, loss=35.4]

{'eval_batch': 1290, 'eval_batch/loss': 35.446706771850586}


Evaluating: Epoch 1:  23%|██▎       | 1301/5548 [37:15<2:01:36,  1.72s/it, loss=34]

{'eval_batch': 1300, 'eval_batch/loss': 34.00987243652344}


Evaluating: Epoch 1:  24%|██▎       | 1311/5548 [37:33<2:01:21,  1.72s/it, loss=33.9]

{'eval_batch': 1310, 'eval_batch/loss': 33.92878341674805}


Evaluating: Epoch 1:  24%|██▍       | 1321/5548 [37:50<2:01:05,  1.72s/it, loss=33.2]

{'eval_batch': 1320, 'eval_batch/loss': 33.17604064941406}


Evaluating: Epoch 1:  24%|██▍       | 1331/5548 [38:07<2:00:47,  1.72s/it, loss=33.2]

{'eval_batch': 1330, 'eval_batch/loss': 33.23315715789795}


Evaluating: Epoch 1:  24%|██▍       | 1341/5548 [38:24<2:00:28,  1.72s/it, loss=33]

{'eval_batch': 1340, 'eval_batch/loss': 33.020681381225586}


Evaluating: Epoch 1:  24%|██▍       | 1351/5548 [38:41<2:00:15,  1.72s/it, loss=32.3]

{'eval_batch': 1350, 'eval_batch/loss': 32.33270740509033}


Evaluating: Epoch 1:  25%|██▍       | 1361/5548 [38:58<1:59:53,  1.72s/it, loss=31.4]

{'eval_batch': 1360, 'eval_batch/loss': 31.388697624206543}


Evaluating: Epoch 1:  25%|██▍       | 1371/5548 [39:16<1:59:37,  1.72s/it, loss=32.1]

{'eval_batch': 1370, 'eval_batch/loss': 32.07538318634033}


Evaluating: Epoch 1:  25%|██▍       | 1381/5548 [39:33<1:59:20,  1.72s/it, loss=32.8]

{'eval_batch': 1380, 'eval_batch/loss': 32.78541564941406}


Evaluating: Epoch 1:  25%|██▌       | 1391/5548 [39:50<1:59:05,  1.72s/it, loss=33.6]

{'eval_batch': 1390, 'eval_batch/loss': 33.56455039978027}


Evaluating: Epoch 1:  25%|██▌       | 1401/5548 [40:07<1:58:46,  1.72s/it, loss=33.8]

{'eval_batch': 1400, 'eval_batch/loss': 33.84950256347656}


Evaluating: Epoch 1:  25%|██▌       | 1411/5548 [40:24<1:58:30,  1.72s/it, loss=33.7]

{'eval_batch': 1410, 'eval_batch/loss': 33.74632453918457}


Evaluating: Epoch 1:  26%|██▌       | 1421/5548 [40:42<1:58:12,  1.72s/it, loss=32.5]

{'eval_batch': 1420, 'eval_batch/loss': 32.53716564178467}


Evaluating: Epoch 1:  26%|██▌       | 1431/5548 [40:59<1:57:54,  1.72s/it, loss=32.6]

{'eval_batch': 1430, 'eval_batch/loss': 32.60449695587158}


Evaluating: Epoch 1:  26%|██▌       | 1441/5548 [41:16<1:57:39,  1.72s/it, loss=34.1]

{'eval_batch': 1440, 'eval_batch/loss': 34.06744194030762}


Evaluating: Epoch 1:  26%|██▌       | 1451/5548 [41:33<1:57:21,  1.72s/it, loss=34]

{'eval_batch': 1450, 'eval_batch/loss': 34.02201843261719}


Evaluating: Epoch 1:  26%|██▋       | 1461/5548 [41:50<1:57:04,  1.72s/it, loss=34.4]

{'eval_batch': 1460, 'eval_batch/loss': 34.35568618774414}


Evaluating: Epoch 1:  27%|██▋       | 1471/5548 [42:07<1:56:50,  1.72s/it, loss=33.5]

{'eval_batch': 1470, 'eval_batch/loss': 33.461849212646484}


Evaluating: Epoch 1:  27%|██▋       | 1481/5548 [42:25<1:56:30,  1.72s/it, loss=33]

{'eval_batch': 1480, 'eval_batch/loss': 33.000343322753906}


Evaluating: Epoch 1:  27%|██▋       | 1491/5548 [42:42<1:56:21,  1.72s/it, loss=34]

{'eval_batch': 1490, 'eval_batch/loss': 34.0103645324707}


Evaluating: Epoch 1:  27%|██▋       | 1501/5548 [42:59<1:55:57,  1.72s/it, loss=34.5]

{'eval_batch': 1500, 'eval_batch/loss': 34.46204948425293}


Evaluating: Epoch 1:  27%|██▋       | 1511/5548 [43:16<1:55:38,  1.72s/it, loss=33]

{'eval_batch': 1510, 'eval_batch/loss': 32.99678421020508}


Evaluating: Epoch 1:  27%|██▋       | 1521/5548 [43:33<1:55:20,  1.72s/it, loss=33.5]

{'eval_batch': 1520, 'eval_batch/loss': 33.4637451171875}


Evaluating: Epoch 1:  28%|██▊       | 1531/5548 [43:51<1:55:03,  1.72s/it, loss=32.6]

{'eval_batch': 1530, 'eval_batch/loss': 32.588528633117676}


Evaluating: Epoch 1:  28%|██▊       | 1541/5548 [44:08<1:54:44,  1.72s/it, loss=32.6]

{'eval_batch': 1540, 'eval_batch/loss': 32.55883026123047}


Evaluating: Epoch 1:  28%|██▊       | 1551/5548 [44:25<1:54:27,  1.72s/it, loss=33.4]

{'eval_batch': 1550, 'eval_batch/loss': 33.4459171295166}


Evaluating: Epoch 1:  28%|██▊       | 1561/5548 [44:42<1:54:11,  1.72s/it, loss=32.8]

{'eval_batch': 1560, 'eval_batch/loss': 32.80564785003662}


Evaluating: Epoch 1:  28%|██▊       | 1571/5548 [44:59<1:53:55,  1.72s/it, loss=34.1]

{'eval_batch': 1570, 'eval_batch/loss': 34.13545894622803}


Evaluating: Epoch 1:  28%|██▊       | 1581/5548 [45:17<1:53:40,  1.72s/it, loss=34.4]

{'eval_batch': 1580, 'eval_batch/loss': 34.44858455657959}


Evaluating: Epoch 1:  29%|██▊       | 1591/5548 [45:34<1:53:22,  1.72s/it, loss=35.6]

{'eval_batch': 1590, 'eval_batch/loss': 35.63049125671387}


Evaluating: Epoch 1:  29%|██▉       | 1601/5548 [45:51<1:53:03,  1.72s/it, loss=37.4]

{'eval_batch': 1600, 'eval_batch/loss': 37.41164016723633}


Evaluating: Epoch 1:  29%|██▉       | 1611/5548 [46:08<1:52:46,  1.72s/it, loss=37.4]

{'eval_batch': 1610, 'eval_batch/loss': 37.39718246459961}


Evaluating: Epoch 1:  29%|██▉       | 1621/5548 [46:25<1:52:29,  1.72s/it, loss=37.2]

{'eval_batch': 1620, 'eval_batch/loss': 37.23927879333496}


Evaluating: Epoch 1:  29%|██▉       | 1631/5548 [46:42<1:52:14,  1.72s/it, loss=37.5]

{'eval_batch': 1630, 'eval_batch/loss': 37.48202705383301}


Evaluating: Epoch 1:  30%|██▉       | 1641/5548 [47:00<1:51:53,  1.72s/it, loss=38.6]

{'eval_batch': 1640, 'eval_batch/loss': 38.61725616455078}


Evaluating: Epoch 1:  30%|██▉       | 1651/5548 [47:17<1:51:38,  1.72s/it, loss=39.5]

{'eval_batch': 1650, 'eval_batch/loss': 39.48603630065918}


Evaluating: Epoch 1:  30%|██▉       | 1661/5548 [47:34<1:51:21,  1.72s/it, loss=37.8]

{'eval_batch': 1660, 'eval_batch/loss': 37.79269981384277}


Evaluating: Epoch 1:  30%|███       | 1671/5548 [47:51<1:51:02,  1.72s/it, loss=37.2]

{'eval_batch': 1670, 'eval_batch/loss': 37.174949645996094}


Evaluating: Epoch 1:  30%|███       | 1681/5548 [48:08<1:50:46,  1.72s/it, loss=37.3]

{'eval_batch': 1680, 'eval_batch/loss': 37.253915786743164}


Evaluating: Epoch 1:  30%|███       | 1691/5548 [48:26<1:50:28,  1.72s/it, loss=37.1]

{'eval_batch': 1690, 'eval_batch/loss': 37.11957931518555}


Evaluating: Epoch 1:  31%|███       | 1701/5548 [48:43<1:50:12,  1.72s/it, loss=37.4]

{'eval_batch': 1700, 'eval_batch/loss': 37.354835510253906}


Evaluating: Epoch 1:  31%|███       | 1711/5548 [49:00<1:49:56,  1.72s/it, loss=38.4]

{'eval_batch': 1710, 'eval_batch/loss': 38.35024833679199}


Evaluating: Epoch 1:  31%|███       | 1721/5548 [49:17<1:49:41,  1.72s/it, loss=37.9]

{'eval_batch': 1720, 'eval_batch/loss': 37.874868392944336}


Evaluating: Epoch 1:  31%|███       | 1731/5548 [49:34<1:49:20,  1.72s/it, loss=37.5]

{'eval_batch': 1730, 'eval_batch/loss': 37.532596588134766}


Evaluating: Epoch 1:  31%|███▏      | 1741/5548 [49:52<1:49:07,  1.72s/it, loss=37.9]

{'eval_batch': 1740, 'eval_batch/loss': 37.90156173706055}


Evaluating: Epoch 1:  32%|███▏      | 1751/5548 [50:09<1:48:48,  1.72s/it, loss=39.2]

{'eval_batch': 1750, 'eval_batch/loss': 39.18448829650879}


Evaluating: Epoch 1:  32%|███▏      | 1761/5548 [50:26<1:48:31,  1.72s/it, loss=39.8]

{'eval_batch': 1760, 'eval_batch/loss': 39.811702728271484}


Evaluating: Epoch 1:  32%|███▏      | 1771/5548 [50:43<1:48:10,  1.72s/it, loss=40.8]

{'eval_batch': 1770, 'eval_batch/loss': 40.82103729248047}


Evaluating: Epoch 1:  32%|███▏      | 1781/5548 [51:00<1:47:55,  1.72s/it, loss=40.6]

{'eval_batch': 1780, 'eval_batch/loss': 40.64301300048828}


Evaluating: Epoch 1:  32%|███▏      | 1791/5548 [51:17<1:47:38,  1.72s/it, loss=40]

{'eval_batch': 1790, 'eval_batch/loss': 39.95937538146973}


Evaluating: Epoch 1:  32%|███▏      | 1801/5548 [51:35<1:47:20,  1.72s/it, loss=38.8]

{'eval_batch': 1800, 'eval_batch/loss': 38.801307678222656}


Evaluating: Epoch 1:  33%|███▎      | 1811/5548 [51:52<1:47:03,  1.72s/it, loss=39.5]

{'eval_batch': 1810, 'eval_batch/loss': 39.46071815490723}


Evaluating: Epoch 1:  33%|███▎      | 1821/5548 [52:09<1:46:44,  1.72s/it, loss=38.3]

{'eval_batch': 1820, 'eval_batch/loss': 38.31905746459961}


Evaluating: Epoch 1:  33%|███▎      | 1831/5548 [52:26<1:46:28,  1.72s/it, loss=37.8]

{'eval_batch': 1830, 'eval_batch/loss': 37.8264217376709}


Evaluating: Epoch 1:  33%|███▎      | 1841/5548 [52:43<1:46:10,  1.72s/it, loss=37.1]

{'eval_batch': 1840, 'eval_batch/loss': 37.10442924499512}


Evaluating: Epoch 1:  33%|███▎      | 1851/5548 [53:01<1:45:52,  1.72s/it, loss=36.6]

{'eval_batch': 1850, 'eval_batch/loss': 36.58176612854004}


Evaluating: Epoch 1:  34%|███▎      | 1861/5548 [53:18<1:45:37,  1.72s/it, loss=36.7]

{'eval_batch': 1860, 'eval_batch/loss': 36.693464279174805}


Evaluating: Epoch 1:  34%|███▎      | 1871/5548 [53:35<1:45:18,  1.72s/it, loss=35.7]

{'eval_batch': 1870, 'eval_batch/loss': 35.69824981689453}


Evaluating: Epoch 1:  34%|███▍      | 1881/5548 [53:52<1:45:00,  1.72s/it, loss=35.9]

{'eval_batch': 1880, 'eval_batch/loss': 35.8853759765625}


Evaluating: Epoch 1:  34%|███▍      | 1891/5548 [54:09<1:44:43,  1.72s/it, loss=36.7]

{'eval_batch': 1890, 'eval_batch/loss': 36.65440368652344}


Evaluating: Epoch 1:  34%|███▍      | 1901/5548 [54:27<1:44:27,  1.72s/it, loss=38.2]

{'eval_batch': 1900, 'eval_batch/loss': 38.18224334716797}


Evaluating: Epoch 1:  34%|███▍      | 1911/5548 [54:44<1:44:10,  1.72s/it, loss=38.1]

{'eval_batch': 1910, 'eval_batch/loss': 38.1374454498291}


Evaluating: Epoch 1:  35%|███▍      | 1921/5548 [55:01<1:43:52,  1.72s/it, loss=38.9]

{'eval_batch': 1920, 'eval_batch/loss': 38.9084587097168}


Evaluating: Epoch 1:  35%|███▍      | 1931/5548 [55:18<1:43:40,  1.72s/it, loss=38.7]

{'eval_batch': 1930, 'eval_batch/loss': 38.69309043884277}


Evaluating: Epoch 1:  35%|███▍      | 1941/5548 [55:35<1:43:18,  1.72s/it, loss=39.4]

{'eval_batch': 1940, 'eval_batch/loss': 39.367923736572266}


Evaluating: Epoch 1:  35%|███▌      | 1951/5548 [55:52<1:43:01,  1.72s/it, loss=39]

{'eval_batch': 1950, 'eval_batch/loss': 38.973392486572266}


Evaluating: Epoch 1:  35%|███▌      | 1961/5548 [56:10<1:42:44,  1.72s/it, loss=38.9]

{'eval_batch': 1960, 'eval_batch/loss': 38.900156021118164}


Evaluating: Epoch 1:  36%|███▌      | 1971/5548 [56:27<1:42:28,  1.72s/it, loss=38.9]

{'eval_batch': 1970, 'eval_batch/loss': 38.90628242492676}


Evaluating: Epoch 1:  36%|███▌      | 1981/5548 [56:44<1:42:09,  1.72s/it, loss=38.9]

{'eval_batch': 1980, 'eval_batch/loss': 38.90131759643555}


Evaluating: Epoch 1:  36%|███▌      | 1991/5548 [57:01<1:41:53,  1.72s/it, loss=37.9]

{'eval_batch': 1990, 'eval_batch/loss': 37.86369705200195}


Evaluating: Epoch 1:  36%|███▌      | 2001/5548 [57:18<1:41:38,  1.72s/it, loss=37.5]

{'eval_batch': 2000, 'eval_batch/loss': 37.472089767456055}


Evaluating: Epoch 1:  36%|███▌      | 2011/5548 [57:36<1:41:19,  1.72s/it, loss=37.5]

{'eval_batch': 2010, 'eval_batch/loss': 37.52688407897949}


Evaluating: Epoch 1:  36%|███▋      | 2021/5548 [57:53<1:40:59,  1.72s/it, loss=38]

{'eval_batch': 2020, 'eval_batch/loss': 38.01839637756348}


Evaluating: Epoch 1:  37%|███▋      | 2031/5548 [58:10<1:40:46,  1.72s/it, loss=38.8]

{'eval_batch': 2030, 'eval_batch/loss': 38.791391372680664}


Evaluating: Epoch 1:  37%|███▋      | 2041/5548 [58:27<1:40:24,  1.72s/it, loss=39.1]

{'eval_batch': 2040, 'eval_batch/loss': 39.1045036315918}


Evaluating: Epoch 1:  37%|███▋      | 2051/5548 [58:44<1:40:13,  1.72s/it, loss=39.6]

{'eval_batch': 2050, 'eval_batch/loss': 39.630136489868164}


Evaluating: Epoch 1:  37%|███▋      | 2061/5548 [59:02<1:39:54,  1.72s/it, loss=39.4]

{'eval_batch': 2060, 'eval_batch/loss': 39.35017776489258}


Evaluating: Epoch 1:  37%|███▋      | 2071/5548 [59:19<1:39:35,  1.72s/it, loss=38.7]

{'eval_batch': 2070, 'eval_batch/loss': 38.700035095214844}


Evaluating: Epoch 1:  38%|███▊      | 2081/5548 [59:36<1:39:16,  1.72s/it, loss=38.3]

{'eval_batch': 2080, 'eval_batch/loss': 38.272701263427734}


Evaluating: Epoch 1:  38%|███▊      | 2091/5548 [59:53<1:39:01,  1.72s/it, loss=37.9]

{'eval_batch': 2090, 'eval_batch/loss': 37.89211082458496}


Evaluating: Epoch 1:  38%|███▊      | 2101/5548 [1:00:10<1:38:43,  1.72s/it, loss=37.6]

{'eval_batch': 2100, 'eval_batch/loss': 37.61563491821289}


Evaluating: Epoch 1:  38%|███▊      | 2111/5548 [1:00:27<1:38:25,  1.72s/it, loss=36.9]

{'eval_batch': 2110, 'eval_batch/loss': 36.85079002380371}


Evaluating: Epoch 1:  38%|███▊      | 2121/5548 [1:00:45<1:38:09,  1.72s/it, loss=36.1]

{'eval_batch': 2120, 'eval_batch/loss': 36.1436882019043}


Evaluating: Epoch 1:  38%|███▊      | 2131/5548 [1:01:02<1:37:52,  1.72s/it, loss=36.3]

{'eval_batch': 2130, 'eval_batch/loss': 36.26883888244629}


Evaluating: Epoch 1:  39%|███▊      | 2141/5548 [1:01:19<1:37:35,  1.72s/it, loss=35.6]

{'eval_batch': 2140, 'eval_batch/loss': 35.56962013244629}


Evaluating: Epoch 1:  39%|███▉      | 2151/5548 [1:01:36<1:37:19,  1.72s/it, loss=36.3]

{'eval_batch': 2150, 'eval_batch/loss': 36.33218574523926}


Evaluating: Epoch 1:  39%|███▉      | 2161/5548 [1:01:53<1:37:03,  1.72s/it, loss=36.1]

{'eval_batch': 2160, 'eval_batch/loss': 36.08475875854492}


Evaluating: Epoch 1:  39%|███▉      | 2171/5548 [1:02:11<1:36:45,  1.72s/it, loss=36.4]

{'eval_batch': 2170, 'eval_batch/loss': 36.3974552154541}


Evaluating: Epoch 1:  39%|███▉      | 2181/5548 [1:02:28<1:36:28,  1.72s/it, loss=36.4]

{'eval_batch': 2180, 'eval_batch/loss': 36.40570831298828}


Evaluating: Epoch 1:  39%|███▉      | 2191/5548 [1:02:45<1:36:10,  1.72s/it, loss=36.9]

{'eval_batch': 2190, 'eval_batch/loss': 36.93981170654297}


Evaluating: Epoch 1:  40%|███▉      | 2201/5548 [1:03:02<1:35:51,  1.72s/it, loss=37.1]

{'eval_batch': 2200, 'eval_batch/loss': 37.11000633239746}


Evaluating: Epoch 1:  40%|███▉      | 2211/5548 [1:03:19<1:35:32,  1.72s/it, loss=37.9]

{'eval_batch': 2210, 'eval_batch/loss': 37.88332939147949}


Evaluating: Epoch 1:  40%|████      | 2221/5548 [1:03:36<1:35:19,  1.72s/it, loss=38.3]

{'eval_batch': 2220, 'eval_batch/loss': 38.346527099609375}


Evaluating: Epoch 1:  40%|████      | 2231/5548 [1:03:54<1:35:01,  1.72s/it, loss=39.4]

{'eval_batch': 2230, 'eval_batch/loss': 39.39349365234375}


Evaluating: Epoch 1:  40%|████      | 2241/5548 [1:04:11<1:34:42,  1.72s/it, loss=38.9]

{'eval_batch': 2240, 'eval_batch/loss': 38.93137741088867}


Evaluating: Epoch 1:  41%|████      | 2251/5548 [1:04:28<1:34:25,  1.72s/it, loss=39.3]

{'eval_batch': 2250, 'eval_batch/loss': 39.25544357299805}


Evaluating: Epoch 1:  41%|████      | 2261/5548 [1:04:45<1:34:12,  1.72s/it, loss=40]

{'eval_batch': 2260, 'eval_batch/loss': 39.98934745788574}


Evaluating: Epoch 1:  41%|████      | 2271/5548 [1:05:02<1:33:52,  1.72s/it, loss=40.3]

{'eval_batch': 2270, 'eval_batch/loss': 40.32940864562988}


Evaluating: Epoch 1:  41%|████      | 2281/5548 [1:05:20<1:33:34,  1.72s/it, loss=39.7]

{'eval_batch': 2280, 'eval_batch/loss': 39.71218299865723}


Evaluating: Epoch 1:  41%|████▏     | 2291/5548 [1:05:37<1:33:17,  1.72s/it, loss=39.7]

{'eval_batch': 2290, 'eval_batch/loss': 39.69555473327637}


Evaluating: Epoch 1:  41%|████▏     | 2301/5548 [1:05:54<1:33:00,  1.72s/it, loss=39.3]

{'eval_batch': 2300, 'eval_batch/loss': 39.265390396118164}


Evaluating: Epoch 1:  42%|████▏     | 2311/5548 [1:06:11<1:32:41,  1.72s/it, loss=38.8]

{'eval_batch': 2310, 'eval_batch/loss': 38.80459213256836}


Evaluating: Epoch 1:  42%|████▏     | 2321/5548 [1:06:28<1:32:27,  1.72s/it, loss=39.4]

{'eval_batch': 2320, 'eval_batch/loss': 39.36585807800293}


Evaluating: Epoch 1:  42%|████▏     | 2331/5548 [1:06:46<1:32:07,  1.72s/it, loss=39.9]

{'eval_batch': 2330, 'eval_batch/loss': 39.879276275634766}


Evaluating: Epoch 1:  42%|████▏     | 2341/5548 [1:07:03<1:31:54,  1.72s/it, loss=39.3]

{'eval_batch': 2340, 'eval_batch/loss': 39.29214286804199}


Evaluating: Epoch 1:  42%|████▏     | 2351/5548 [1:07:20<1:31:37,  1.72s/it, loss=38.8]

{'eval_batch': 2350, 'eval_batch/loss': 38.800537109375}


Evaluating: Epoch 1:  43%|████▎     | 2361/5548 [1:07:37<1:31:19,  1.72s/it, loss=38.9]

{'eval_batch': 2360, 'eval_batch/loss': 38.91949462890625}


Evaluating: Epoch 1:  43%|████▎     | 2371/5548 [1:07:54<1:31:08,  1.72s/it, loss=38.6]

{'eval_batch': 2370, 'eval_batch/loss': 38.582576751708984}


Evaluating: Epoch 1:  43%|████▎     | 2381/5548 [1:08:11<1:30:42,  1.72s/it, loss=38.9]

{'eval_batch': 2380, 'eval_batch/loss': 38.89648628234863}


Evaluating: Epoch 1:  43%|████▎     | 2391/5548 [1:08:29<1:30:25,  1.72s/it, loss=38.7]

{'eval_batch': 2390, 'eval_batch/loss': 38.67485427856445}


Evaluating: Epoch 1:  43%|████▎     | 2401/5548 [1:08:46<1:30:10,  1.72s/it, loss=38.4]

{'eval_batch': 2400, 'eval_batch/loss': 38.446144104003906}


Evaluating: Epoch 1:  43%|████▎     | 2411/5548 [1:09:03<1:29:52,  1.72s/it, loss=38]

{'eval_batch': 2410, 'eval_batch/loss': 38.0054931640625}


Evaluating: Epoch 1:  44%|████▎     | 2421/5548 [1:09:20<1:29:34,  1.72s/it, loss=37.5]

{'eval_batch': 2420, 'eval_batch/loss': 37.520402908325195}


Evaluating: Epoch 1:  44%|████▍     | 2431/5548 [1:09:37<1:29:16,  1.72s/it, loss=36.5]

{'eval_batch': 2430, 'eval_batch/loss': 36.4935359954834}


Evaluating: Epoch 1:  44%|████▍     | 2441/5548 [1:09:55<1:29:02,  1.72s/it, loss=36]

{'eval_batch': 2440, 'eval_batch/loss': 35.98830032348633}


Evaluating: Epoch 1:  44%|████▍     | 2451/5548 [1:10:12<1:28:43,  1.72s/it, loss=35.3]

{'eval_batch': 2450, 'eval_batch/loss': 35.30914115905762}


Evaluating: Epoch 1:  44%|████▍     | 2461/5548 [1:10:29<1:28:26,  1.72s/it, loss=35.2]

{'eval_batch': 2460, 'eval_batch/loss': 35.23014450073242}


Evaluating: Epoch 1:  45%|████▍     | 2471/5548 [1:10:46<1:28:05,  1.72s/it, loss=35.3]

{'eval_batch': 2470, 'eval_batch/loss': 35.29349327087402}


Evaluating: Epoch 1:  45%|████▍     | 2481/5548 [1:11:03<1:27:52,  1.72s/it, loss=35.4]

{'eval_batch': 2480, 'eval_batch/loss': 35.442575454711914}


Evaluating: Epoch 1:  45%|████▍     | 2491/5548 [1:11:21<1:27:34,  1.72s/it, loss=35.6]

{'eval_batch': 2490, 'eval_batch/loss': 35.60219955444336}


Evaluating: Epoch 1:  45%|████▌     | 2501/5548 [1:11:38<1:27:17,  1.72s/it, loss=35.3]

{'eval_batch': 2500, 'eval_batch/loss': 35.29859161376953}


Evaluating: Epoch 1:  45%|████▌     | 2511/5548 [1:11:55<1:26:57,  1.72s/it, loss=35.2]

{'eval_batch': 2510, 'eval_batch/loss': 35.190542221069336}


Evaluating: Epoch 1:  45%|████▌     | 2521/5548 [1:12:12<1:26:42,  1.72s/it, loss=35.7]

{'eval_batch': 2520, 'eval_batch/loss': 35.66641044616699}


Evaluating: Epoch 1:  46%|████▌     | 2531/5548 [1:12:29<1:26:22,  1.72s/it, loss=36.1]

{'eval_batch': 2530, 'eval_batch/loss': 36.121212005615234}


Evaluating: Epoch 1:  46%|████▌     | 2541/5548 [1:12:46<1:26:04,  1.72s/it, loss=35.6]

{'eval_batch': 2540, 'eval_batch/loss': 35.61720848083496}


Evaluating: Epoch 1:  46%|████▌     | 2551/5548 [1:13:04<1:25:49,  1.72s/it, loss=35.6]

{'eval_batch': 2550, 'eval_batch/loss': 35.58252143859863}


Evaluating: Epoch 1:  46%|████▌     | 2561/5548 [1:13:21<1:25:32,  1.72s/it, loss=35.9]

{'eval_batch': 2560, 'eval_batch/loss': 35.880165100097656}


Evaluating: Epoch 1:  46%|████▋     | 2571/5548 [1:13:38<1:25:13,  1.72s/it, loss=35.3]

{'eval_batch': 2570, 'eval_batch/loss': 35.26445388793945}


Evaluating: Epoch 1:  47%|████▋     | 2581/5548 [1:13:55<1:24:57,  1.72s/it, loss=34.6]

{'eval_batch': 2580, 'eval_batch/loss': 34.616485595703125}


Evaluating: Epoch 1:  47%|████▋     | 2591/5548 [1:14:12<1:24:37,  1.72s/it, loss=35.7]

{'eval_batch': 2590, 'eval_batch/loss': 35.749189376831055}


Evaluating: Epoch 1:  47%|████▋     | 2601/5548 [1:14:30<1:24:22,  1.72s/it, loss=35.9]

{'eval_batch': 2600, 'eval_batch/loss': 35.93139839172363}


Evaluating: Epoch 1:  47%|████▋     | 2611/5548 [1:14:47<1:24:05,  1.72s/it, loss=36]

{'eval_batch': 2610, 'eval_batch/loss': 35.99702262878418}


Evaluating: Epoch 1:  47%|████▋     | 2621/5548 [1:15:04<1:23:50,  1.72s/it, loss=36.2]

{'eval_batch': 2620, 'eval_batch/loss': 36.22531318664551}


Evaluating: Epoch 1:  47%|████▋     | 2631/5548 [1:15:21<1:23:32,  1.72s/it, loss=36.5]

{'eval_batch': 2630, 'eval_batch/loss': 36.49757385253906}


Evaluating: Epoch 1:  48%|████▊     | 2641/5548 [1:15:38<1:23:14,  1.72s/it, loss=36.2]

{'eval_batch': 2640, 'eval_batch/loss': 36.23713493347168}


Evaluating: Epoch 1:  48%|████▊     | 2651/5548 [1:15:55<1:23:01,  1.72s/it, loss=36.3]

{'eval_batch': 2650, 'eval_batch/loss': 36.34186935424805}


Evaluating: Epoch 1:  48%|████▊     | 2661/5548 [1:16:13<1:22:45,  1.72s/it, loss=36.9]

{'eval_batch': 2660, 'eval_batch/loss': 36.9259090423584}


Evaluating: Epoch 1:  48%|████▊     | 2671/5548 [1:16:30<1:22:25,  1.72s/it, loss=35.9]

{'eval_batch': 2670, 'eval_batch/loss': 35.92702865600586}


Evaluating: Epoch 1:  48%|████▊     | 2681/5548 [1:16:47<1:22:09,  1.72s/it, loss=36.1]

{'eval_batch': 2680, 'eval_batch/loss': 36.12816619873047}


Evaluating: Epoch 1:  49%|████▊     | 2691/5548 [1:17:04<1:21:50,  1.72s/it, loss=36.3]

{'eval_batch': 2690, 'eval_batch/loss': 36.34628677368164}


Evaluating: Epoch 1:  49%|████▊     | 2701/5548 [1:17:21<1:21:33,  1.72s/it, loss=36.1]

{'eval_batch': 2700, 'eval_batch/loss': 36.075923919677734}


Evaluating: Epoch 1:  49%|████▉     | 2711/5548 [1:17:39<1:21:18,  1.72s/it, loss=35.6]

{'eval_batch': 2710, 'eval_batch/loss': 35.58357048034668}


Evaluating: Epoch 1:  49%|████▉     | 2721/5548 [1:17:56<1:21:01,  1.72s/it, loss=35.7]

{'eval_batch': 2720, 'eval_batch/loss': 35.68546676635742}


Evaluating: Epoch 1:  49%|████▉     | 2731/5548 [1:18:13<1:20:41,  1.72s/it, loss=36.3]

{'eval_batch': 2730, 'eval_batch/loss': 36.31367874145508}


Evaluating: Epoch 1:  49%|████▉     | 2741/5548 [1:18:30<1:20:24,  1.72s/it, loss=36]

{'eval_batch': 2740, 'eval_batch/loss': 36.03886795043945}


Evaluating: Epoch 1:  50%|████▉     | 2751/5548 [1:18:47<1:20:08,  1.72s/it, loss=36.6]

{'eval_batch': 2750, 'eval_batch/loss': 36.63731002807617}


Evaluating: Epoch 1:  50%|████▉     | 2761/5548 [1:19:05<1:19:49,  1.72s/it, loss=35.8]

{'eval_batch': 2760, 'eval_batch/loss': 35.820966720581055}


Evaluating: Epoch 1:  50%|████▉     | 2771/5548 [1:19:22<1:19:32,  1.72s/it, loss=36]

{'eval_batch': 2770, 'eval_batch/loss': 36.005741119384766}


Evaluating: Epoch 1:  50%|█████     | 2781/5548 [1:19:39<1:19:18,  1.72s/it, loss=35.3]

{'eval_batch': 2780, 'eval_batch/loss': 35.28205871582031}


Evaluating: Epoch 1:  50%|█████     | 2791/5548 [1:19:56<1:19:02,  1.72s/it, loss=35]

{'eval_batch': 2790, 'eval_batch/loss': 34.95484161376953}


Evaluating: Epoch 1:  50%|█████     | 2801/5548 [1:20:13<1:18:43,  1.72s/it, loss=34.6]

{'eval_batch': 2800, 'eval_batch/loss': 34.63999366760254}


Evaluating: Epoch 1:  51%|█████     | 2811/5548 [1:20:30<1:18:25,  1.72s/it, loss=34.8]

{'eval_batch': 2810, 'eval_batch/loss': 34.8050651550293}


Evaluating: Epoch 1:  51%|█████     | 2821/5548 [1:20:48<1:18:09,  1.72s/it, loss=34.7]

{'eval_batch': 2820, 'eval_batch/loss': 34.66975402832031}


Evaluating: Epoch 1:  51%|█████     | 2831/5548 [1:21:05<1:17:48,  1.72s/it, loss=34.7]

{'eval_batch': 2830, 'eval_batch/loss': 34.71946334838867}


Evaluating: Epoch 1:  51%|█████     | 2841/5548 [1:21:22<1:17:36,  1.72s/it, loss=35.1]

{'eval_batch': 2840, 'eval_batch/loss': 35.12734603881836}


Evaluating: Epoch 1:  51%|█████▏    | 2851/5548 [1:21:39<1:17:18,  1.72s/it, loss=35.5]

{'eval_batch': 2850, 'eval_batch/loss': 35.46897506713867}


Evaluating: Epoch 1:  52%|█████▏    | 2861/5548 [1:21:56<1:16:58,  1.72s/it, loss=35.3]

{'eval_batch': 2860, 'eval_batch/loss': 35.28564453125}


Evaluating: Epoch 1:  52%|█████▏    | 2871/5548 [1:22:14<1:16:42,  1.72s/it, loss=35.9]

{'eval_batch': 2870, 'eval_batch/loss': 35.852224349975586}


Evaluating: Epoch 1:  52%|█████▏    | 2881/5548 [1:22:31<1:16:25,  1.72s/it, loss=36.4]

{'eval_batch': 2880, 'eval_batch/loss': 36.39608383178711}


Evaluating: Epoch 1:  52%|█████▏    | 2891/5548 [1:22:48<1:16:06,  1.72s/it, loss=36.3]

{'eval_batch': 2890, 'eval_batch/loss': 36.31805992126465}


Evaluating: Epoch 1:  52%|█████▏    | 2901/5548 [1:23:05<1:15:50,  1.72s/it, loss=36]

{'eval_batch': 2900, 'eval_batch/loss': 35.98348808288574}


Evaluating: Epoch 1:  52%|█████▏    | 2911/5548 [1:23:22<1:15:36,  1.72s/it, loss=36.1]

{'eval_batch': 2910, 'eval_batch/loss': 36.14469528198242}


Evaluating: Epoch 1:  53%|█████▎    | 2921/5548 [1:23:40<1:15:23,  1.72s/it, loss=35.8]

{'eval_batch': 2920, 'eval_batch/loss': 35.78573989868164}


Evaluating: Epoch 1:  53%|█████▎    | 2931/5548 [1:23:57<1:14:57,  1.72s/it, loss=36.5]

{'eval_batch': 2930, 'eval_batch/loss': 36.48948669433594}


Evaluating: Epoch 1:  53%|█████▎    | 2941/5548 [1:24:14<1:14:43,  1.72s/it, loss=36.9]

{'eval_batch': 2940, 'eval_batch/loss': 36.854190826416016}


Evaluating: Epoch 1:  53%|█████▎    | 2951/5548 [1:24:31<1:14:23,  1.72s/it, loss=37.9]

{'eval_batch': 2950, 'eval_batch/loss': 37.91320037841797}


Evaluating: Epoch 1:  53%|█████▎    | 2961/5548 [1:24:48<1:14:06,  1.72s/it, loss=38.4]

{'eval_batch': 2960, 'eval_batch/loss': 38.435380935668945}


Evaluating: Epoch 1:  54%|█████▎    | 2971/5548 [1:25:06<1:13:48,  1.72s/it, loss=38.2]

{'eval_batch': 2970, 'eval_batch/loss': 38.179019927978516}


Evaluating: Epoch 1:  54%|█████▎    | 2981/5548 [1:25:23<1:13:32,  1.72s/it, loss=37.6]

{'eval_batch': 2980, 'eval_batch/loss': 37.60087013244629}


Evaluating: Epoch 1:  54%|█████▍    | 2991/5548 [1:25:40<1:13:16,  1.72s/it, loss=38.4]

{'eval_batch': 2990, 'eval_batch/loss': 38.442068099975586}


Evaluating: Epoch 1:  54%|█████▍    | 3001/5548 [1:25:57<1:12:57,  1.72s/it, loss=38.6]

{'eval_batch': 3000, 'eval_batch/loss': 38.626047134399414}


Evaluating: Epoch 1:  54%|█████▍    | 3011/5548 [1:26:14<1:12:39,  1.72s/it, loss=38.7]

{'eval_batch': 3010, 'eval_batch/loss': 38.680667877197266}


Evaluating: Epoch 1:  54%|█████▍    | 3021/5548 [1:26:31<1:12:23,  1.72s/it, loss=37.6]

{'eval_batch': 3020, 'eval_batch/loss': 37.601051330566406}


Evaluating: Epoch 1:  55%|█████▍    | 3031/5548 [1:26:49<1:12:05,  1.72s/it, loss=37.7]

{'eval_batch': 3030, 'eval_batch/loss': 37.7264461517334}


Evaluating: Epoch 1:  55%|█████▍    | 3041/5548 [1:27:06<1:11:50,  1.72s/it, loss=38]

{'eval_batch': 3040, 'eval_batch/loss': 38.016361236572266}


Evaluating: Epoch 1:  55%|█████▍    | 3051/5548 [1:27:23<1:11:31,  1.72s/it, loss=38.2]

{'eval_batch': 3050, 'eval_batch/loss': 38.24338150024414}


Evaluating: Epoch 1:  55%|█████▌    | 3061/5548 [1:27:40<1:11:14,  1.72s/it, loss=39]

{'eval_batch': 3060, 'eval_batch/loss': 39.00096130371094}


Evaluating: Epoch 1:  55%|█████▌    | 3071/5548 [1:27:57<1:10:58,  1.72s/it, loss=39.2]

{'eval_batch': 3070, 'eval_batch/loss': 39.19660186767578}


Evaluating: Epoch 1:  56%|█████▌    | 3081/5548 [1:28:15<1:10:41,  1.72s/it, loss=38.7]

{'eval_batch': 3080, 'eval_batch/loss': 38.72867774963379}


Evaluating: Epoch 1:  56%|█████▌    | 3091/5548 [1:28:32<1:10:24,  1.72s/it, loss=38.1]

{'eval_batch': 3090, 'eval_batch/loss': 38.08689308166504}


Evaluating: Epoch 1:  56%|█████▌    | 3101/5548 [1:28:49<1:10:05,  1.72s/it, loss=38.1]

{'eval_batch': 3100, 'eval_batch/loss': 38.10932922363281}


Evaluating: Epoch 1:  56%|█████▌    | 3111/5548 [1:29:06<1:09:48,  1.72s/it, loss=38.8]

{'eval_batch': 3110, 'eval_batch/loss': 38.796810150146484}


Evaluating: Epoch 1:  56%|█████▋    | 3121/5548 [1:29:23<1:09:30,  1.72s/it, loss=38]

{'eval_batch': 3120, 'eval_batch/loss': 38.03606414794922}


Evaluating: Epoch 1:  56%|█████▋    | 3131/5548 [1:29:41<1:09:13,  1.72s/it, loss=37.7]

{'eval_batch': 3130, 'eval_batch/loss': 37.67591857910156}


Evaluating: Epoch 1:  57%|█████▋    | 3141/5548 [1:29:58<1:08:56,  1.72s/it, loss=36]

{'eval_batch': 3140, 'eval_batch/loss': 36.01360321044922}


Evaluating: Epoch 1:  57%|█████▋    | 3151/5548 [1:30:15<1:08:41,  1.72s/it, loss=38]

{'eval_batch': 3150, 'eval_batch/loss': 38.003971099853516}


Evaluating: Epoch 1:  57%|█████▋    | 3161/5548 [1:30:32<1:08:23,  1.72s/it, loss=39.1]

{'eval_batch': 3160, 'eval_batch/loss': 39.06696701049805}


Evaluating: Epoch 1:  57%|█████▋    | 3171/5548 [1:30:49<1:08:08,  1.72s/it, loss=39.8]

{'eval_batch': 3170, 'eval_batch/loss': 39.802978515625}


Evaluating: Epoch 1:  57%|█████▋    | 3181/5548 [1:31:07<1:07:51,  1.72s/it, loss=40]

{'eval_batch': 3180, 'eval_batch/loss': 40.00892639160156}


Evaluating: Epoch 1:  58%|█████▊    | 3191/5548 [1:31:24<1:07:32,  1.72s/it, loss=39]

{'eval_batch': 3190, 'eval_batch/loss': 38.95804405212402}


Evaluating: Epoch 1:  58%|█████▊    | 3201/5548 [1:31:41<1:07:15,  1.72s/it, loss=38.9]

{'eval_batch': 3200, 'eval_batch/loss': 38.93367576599121}


Evaluating: Epoch 1:  58%|█████▊    | 3211/5548 [1:31:58<1:06:57,  1.72s/it, loss=38.5]

{'eval_batch': 3210, 'eval_batch/loss': 38.51875686645508}


Evaluating: Epoch 1:  58%|█████▊    | 3221/5548 [1:32:15<1:06:40,  1.72s/it, loss=39.7]

{'eval_batch': 3220, 'eval_batch/loss': 39.700531005859375}


Evaluating: Epoch 1:  58%|█████▊    | 3231/5548 [1:32:32<1:06:22,  1.72s/it, loss=39.1]

{'eval_batch': 3230, 'eval_batch/loss': 39.05671691894531}


Evaluating: Epoch 1:  58%|█████▊    | 3241/5548 [1:32:50<1:06:08,  1.72s/it, loss=37.3]

{'eval_batch': 3240, 'eval_batch/loss': 37.33254432678223}


Evaluating: Epoch 1:  59%|█████▊    | 3251/5548 [1:33:07<1:05:49,  1.72s/it, loss=36.3]

{'eval_batch': 3250, 'eval_batch/loss': 36.29073143005371}


Evaluating: Epoch 1:  59%|█████▉    | 3261/5548 [1:33:24<1:05:29,  1.72s/it, loss=35.9]

{'eval_batch': 3260, 'eval_batch/loss': 35.90559768676758}


Evaluating: Epoch 1:  59%|█████▉    | 3271/5548 [1:33:41<1:05:14,  1.72s/it, loss=35.5]

{'eval_batch': 3270, 'eval_batch/loss': 35.53789138793945}


Evaluating: Epoch 1:  59%|█████▉    | 3281/5548 [1:33:58<1:04:58,  1.72s/it, loss=35.9]

{'eval_batch': 3280, 'eval_batch/loss': 35.90509223937988}


Evaluating: Epoch 1:  59%|█████▉    | 3291/5548 [1:34:16<1:04:39,  1.72s/it, loss=36.4]

{'eval_batch': 3290, 'eval_batch/loss': 36.416208267211914}


Evaluating: Epoch 1:  59%|█████▉    | 3301/5548 [1:34:33<1:04:21,  1.72s/it, loss=36.6]

{'eval_batch': 3300, 'eval_batch/loss': 36.638227462768555}


Evaluating: Epoch 1:  60%|█████▉    | 3311/5548 [1:34:50<1:04:05,  1.72s/it, loss=36.2]

{'eval_batch': 3310, 'eval_batch/loss': 36.23738479614258}


Evaluating: Epoch 1:  60%|█████▉    | 3321/5548 [1:35:07<1:03:46,  1.72s/it, loss=36.1]

{'eval_batch': 3320, 'eval_batch/loss': 36.08338928222656}


Evaluating: Epoch 1:  60%|██████    | 3331/5548 [1:35:24<1:03:32,  1.72s/it, loss=35.8]

{'eval_batch': 3330, 'eval_batch/loss': 35.76339340209961}


Evaluating: Epoch 1:  60%|██████    | 3341/5548 [1:35:42<1:03:12,  1.72s/it, loss=34.6]

{'eval_batch': 3340, 'eval_batch/loss': 34.629228591918945}


Evaluating: Epoch 1:  60%|██████    | 3351/5548 [1:35:59<1:02:56,  1.72s/it, loss=34.8]

{'eval_batch': 3350, 'eval_batch/loss': 34.77866172790527}


Evaluating: Epoch 1:  61%|██████    | 3361/5548 [1:36:16<1:02:38,  1.72s/it, loss=35.1]

{'eval_batch': 3360, 'eval_batch/loss': 35.08767318725586}


Evaluating: Epoch 1:  61%|██████    | 3371/5548 [1:36:33<1:02:22,  1.72s/it, loss=35.6]

{'eval_batch': 3370, 'eval_batch/loss': 35.57904243469238}


Evaluating: Epoch 1:  61%|██████    | 3381/5548 [1:36:50<1:02:06,  1.72s/it, loss=34.2]

{'eval_batch': 3380, 'eval_batch/loss': 34.224422454833984}


Evaluating: Epoch 1:  61%|██████    | 3391/5548 [1:37:08<1:01:49,  1.72s/it, loss=34.3]

{'eval_batch': 3390, 'eval_batch/loss': 34.251893043518066}


Evaluating: Epoch 1:  61%|██████▏   | 3401/5548 [1:37:25<1:01:33,  1.72s/it, loss=34.8]

{'eval_batch': 3400, 'eval_batch/loss': 34.7702579498291}


Evaluating: Epoch 1:  61%|██████▏   | 3411/5548 [1:37:42<1:01:16,  1.72s/it, loss=35.9]

{'eval_batch': 3410, 'eval_batch/loss': 35.88168716430664}


Evaluating: Epoch 1:  62%|██████▏   | 3421/5548 [1:37:59<1:00:55,  1.72s/it, loss=36.6]

{'eval_batch': 3420, 'eval_batch/loss': 36.622642517089844}


Evaluating: Epoch 1:  62%|██████▏   | 3431/5548 [1:38:16<1:00:39,  1.72s/it, loss=37.1]

{'eval_batch': 3430, 'eval_batch/loss': 37.136735916137695}


Evaluating: Epoch 1:  62%|██████▏   | 3441/5548 [1:38:33<1:00:21,  1.72s/it, loss=37.8]

{'eval_batch': 3440, 'eval_batch/loss': 37.761722564697266}


Evaluating: Epoch 1:  62%|██████▏   | 3451/5548 [1:38:51<1:00:04,  1.72s/it, loss=38.4]

{'eval_batch': 3450, 'eval_batch/loss': 38.363014221191406}


Evaluating: Epoch 1:  62%|██████▏   | 3461/5548 [1:39:08<59:46,  1.72s/it, loss=38]

{'eval_batch': 3460, 'eval_batch/loss': 38.01530838012695}


Evaluating: Epoch 1:  63%|██████▎   | 3471/5548 [1:39:25<59:29,  1.72s/it, loss=38.1]

{'eval_batch': 3470, 'eval_batch/loss': 38.10506248474121}


Evaluating: Epoch 1:  63%|██████▎   | 3481/5548 [1:39:42<59:13,  1.72s/it, loss=38.8]

{'eval_batch': 3480, 'eval_batch/loss': 38.77931213378906}


Evaluating: Epoch 1:  63%|██████▎   | 3491/5548 [1:39:59<58:56,  1.72s/it, loss=39.1]

{'eval_batch': 3490, 'eval_batch/loss': 39.101070404052734}


Evaluating: Epoch 1:  63%|██████▎   | 3501/5548 [1:40:17<58:38,  1.72s/it, loss=39.3]

{'eval_batch': 3500, 'eval_batch/loss': 39.293954849243164}


Evaluating: Epoch 1:  63%|██████▎   | 3511/5548 [1:40:34<58:21,  1.72s/it, loss=39.1]

{'eval_batch': 3510, 'eval_batch/loss': 39.14394760131836}


Evaluating: Epoch 1:  63%|██████▎   | 3521/5548 [1:40:51<58:05,  1.72s/it, loss=38.4]

{'eval_batch': 3520, 'eval_batch/loss': 38.44694137573242}


Evaluating: Epoch 1:  64%|██████▎   | 3531/5548 [1:41:08<57:48,  1.72s/it, loss=38.8]

{'eval_batch': 3530, 'eval_batch/loss': 38.81532096862793}


Evaluating: Epoch 1:  64%|██████▍   | 3541/5548 [1:41:25<57:28,  1.72s/it, loss=38.6]

{'eval_batch': 3540, 'eval_batch/loss': 38.6069450378418}


Evaluating: Epoch 1:  64%|██████▍   | 3551/5548 [1:41:43<57:10,  1.72s/it, loss=38.9]

{'eval_batch': 3550, 'eval_batch/loss': 38.9094123840332}


Evaluating: Epoch 1:  64%|██████▍   | 3561/5548 [1:42:00<56:56,  1.72s/it, loss=39.1]

{'eval_batch': 3560, 'eval_batch/loss': 39.111297607421875}


Evaluating: Epoch 1:  64%|██████▍   | 3571/5548 [1:42:17<56:37,  1.72s/it, loss=37.3]

{'eval_batch': 3570, 'eval_batch/loss': 37.34714126586914}


Evaluating: Epoch 1:  65%|██████▍   | 3581/5548 [1:42:34<56:21,  1.72s/it, loss=36.8]

{'eval_batch': 3580, 'eval_batch/loss': 36.77954864501953}


Evaluating: Epoch 1:  65%|██████▍   | 3591/5548 [1:42:51<56:04,  1.72s/it, loss=37.7]

{'eval_batch': 3590, 'eval_batch/loss': 37.69309997558594}


Evaluating: Epoch 1:  65%|██████▍   | 3601/5548 [1:43:08<55:47,  1.72s/it, loss=37.5]

{'eval_batch': 3600, 'eval_batch/loss': 37.548282623291016}


Evaluating: Epoch 1:  65%|██████▌   | 3611/5548 [1:43:26<55:29,  1.72s/it, loss=36.9]

{'eval_batch': 3610, 'eval_batch/loss': 36.88118553161621}


Evaluating: Epoch 1:  65%|██████▌   | 3621/5548 [1:43:43<55:11,  1.72s/it, loss=37.4]

{'eval_batch': 3620, 'eval_batch/loss': 37.35736846923828}


Evaluating: Epoch 1:  65%|██████▌   | 3631/5548 [1:44:00<54:56,  1.72s/it, loss=37.7]

{'eval_batch': 3630, 'eval_batch/loss': 37.72953987121582}


Evaluating: Epoch 1:  66%|██████▌   | 3641/5548 [1:44:17<54:38,  1.72s/it, loss=37.4]

{'eval_batch': 3640, 'eval_batch/loss': 37.42586898803711}


Evaluating: Epoch 1:  66%|██████▌   | 3651/5548 [1:44:34<54:20,  1.72s/it, loss=37.2]

{'eval_batch': 3650, 'eval_batch/loss': 37.16552734375}


Evaluating: Epoch 1:  66%|██████▌   | 3661/5548 [1:44:52<54:03,  1.72s/it, loss=37.3]

{'eval_batch': 3660, 'eval_batch/loss': 37.30816078186035}


Evaluating: Epoch 1:  66%|██████▌   | 3671/5548 [1:45:09<53:45,  1.72s/it, loss=37.4]

{'eval_batch': 3670, 'eval_batch/loss': 37.38419151306152}


Evaluating: Epoch 1:  66%|██████▋   | 3681/5548 [1:45:26<53:29,  1.72s/it, loss=37.9]

{'eval_batch': 3680, 'eval_batch/loss': 37.874250411987305}


Evaluating: Epoch 1:  67%|██████▋   | 3691/5548 [1:45:43<53:15,  1.72s/it, loss=37.5]

{'eval_batch': 3690, 'eval_batch/loss': 37.52758026123047}


Evaluating: Epoch 1:  67%|██████▋   | 3701/5548 [1:46:00<52:54,  1.72s/it, loss=36.3]

{'eval_batch': 3700, 'eval_batch/loss': 36.257537841796875}


Evaluating: Epoch 1:  67%|██████▋   | 3711/5548 [1:46:18<52:37,  1.72s/it, loss=35.3]

{'eval_batch': 3710, 'eval_batch/loss': 35.28420639038086}


Evaluating: Epoch 1:  67%|██████▋   | 3721/5548 [1:46:35<52:19,  1.72s/it, loss=36.1]

{'eval_batch': 3720, 'eval_batch/loss': 36.085906982421875}


Evaluating: Epoch 1:  67%|██████▋   | 3731/5548 [1:46:52<52:02,  1.72s/it, loss=35.8]

{'eval_batch': 3730, 'eval_batch/loss': 35.822418212890625}


Evaluating: Epoch 1:  67%|██████▋   | 3741/5548 [1:47:09<51:44,  1.72s/it, loss=35.1]

{'eval_batch': 3740, 'eval_batch/loss': 35.05666732788086}


Evaluating: Epoch 1:  68%|██████▊   | 3751/5548 [1:47:26<51:30,  1.72s/it, loss=34.6]

{'eval_batch': 3750, 'eval_batch/loss': 34.61689758300781}


Evaluating: Epoch 1:  68%|██████▊   | 3761/5548 [1:47:43<51:10,  1.72s/it, loss=34.6]

{'eval_batch': 3760, 'eval_batch/loss': 34.62940216064453}


Evaluating: Epoch 1:  68%|██████▊   | 3771/5548 [1:48:01<50:53,  1.72s/it, loss=33.5]

{'eval_batch': 3770, 'eval_batch/loss': 33.48413276672363}


Evaluating: Epoch 1:  68%|██████▊   | 3781/5548 [1:48:18<50:37,  1.72s/it, loss=33.9]

{'eval_batch': 3780, 'eval_batch/loss': 33.91010665893555}


Evaluating: Epoch 1:  68%|██████▊   | 3791/5548 [1:48:35<50:19,  1.72s/it, loss=34.4]

{'eval_batch': 3790, 'eval_batch/loss': 34.39322280883789}


Evaluating: Epoch 1:  69%|██████▊   | 3801/5548 [1:48:52<50:05,  1.72s/it, loss=34.3]

{'eval_batch': 3800, 'eval_batch/loss': 34.25179481506348}


Evaluating: Epoch 1:  69%|██████▊   | 3811/5548 [1:49:09<49:44,  1.72s/it, loss=33.9]

{'eval_batch': 3810, 'eval_batch/loss': 33.93852424621582}


Evaluating: Epoch 1:  69%|██████▉   | 3821/5548 [1:49:27<49:31,  1.72s/it, loss=34.3]

{'eval_batch': 3820, 'eval_batch/loss': 34.26632308959961}


Evaluating: Epoch 1:  69%|██████▉   | 3831/5548 [1:49:44<49:11,  1.72s/it, loss=34]

{'eval_batch': 3830, 'eval_batch/loss': 33.98585510253906}


Evaluating: Epoch 1:  69%|██████▉   | 3841/5548 [1:50:01<48:53,  1.72s/it, loss=34.7]

{'eval_batch': 3840, 'eval_batch/loss': 34.654178619384766}


Evaluating: Epoch 1:  69%|██████▉   | 3851/5548 [1:50:18<48:36,  1.72s/it, loss=35.5]

{'eval_batch': 3850, 'eval_batch/loss': 35.48468589782715}


Evaluating: Epoch 1:  70%|██████▉   | 3861/5548 [1:50:35<48:18,  1.72s/it, loss=34.8]

{'eval_batch': 3860, 'eval_batch/loss': 34.76033401489258}


Evaluating: Epoch 1:  70%|██████▉   | 3871/5548 [1:50:53<48:03,  1.72s/it, loss=35.2]

{'eval_batch': 3870, 'eval_batch/loss': 35.183387756347656}


Evaluating: Epoch 1:  70%|██████▉   | 3881/5548 [1:51:10<47:46,  1.72s/it, loss=35.1]

{'eval_batch': 3880, 'eval_batch/loss': 35.09094047546387}


Evaluating: Epoch 1:  70%|███████   | 3891/5548 [1:51:27<47:28,  1.72s/it, loss=35.2]

{'eval_batch': 3890, 'eval_batch/loss': 35.213592529296875}


Evaluating: Epoch 1:  70%|███████   | 3901/5548 [1:51:44<47:10,  1.72s/it, loss=36]

{'eval_batch': 3900, 'eval_batch/loss': 35.970455169677734}


Evaluating: Epoch 1:  70%|███████   | 3911/5548 [1:52:01<46:52,  1.72s/it, loss=36.5]

{'eval_batch': 3910, 'eval_batch/loss': 36.49787139892578}


Evaluating: Epoch 1:  71%|███████   | 3921/5548 [1:52:18<46:37,  1.72s/it, loss=36.7]

{'eval_batch': 3920, 'eval_batch/loss': 36.698347091674805}


Evaluating: Epoch 1:  71%|███████   | 3931/5548 [1:52:36<46:19,  1.72s/it, loss=36.3]

{'eval_batch': 3930, 'eval_batch/loss': 36.26103401184082}


Evaluating: Epoch 1:  71%|███████   | 3941/5548 [1:52:53<46:02,  1.72s/it, loss=36.4]

{'eval_batch': 3940, 'eval_batch/loss': 36.40568161010742}


Evaluating: Epoch 1:  71%|███████   | 3951/5548 [1:53:10<45:45,  1.72s/it, loss=36.4]

{'eval_batch': 3950, 'eval_batch/loss': 36.43734169006348}


Evaluating: Epoch 1:  71%|███████▏  | 3961/5548 [1:53:27<45:28,  1.72s/it, loss=36.6]

{'eval_batch': 3960, 'eval_batch/loss': 36.62887954711914}


Evaluating: Epoch 1:  72%|███████▏  | 3971/5548 [1:53:44<45:09,  1.72s/it, loss=36.2]

{'eval_batch': 3970, 'eval_batch/loss': 36.186838150024414}


Evaluating: Epoch 1:  72%|███████▏  | 3981/5548 [1:54:02<44:53,  1.72s/it, loss=36.4]

{'eval_batch': 3980, 'eval_batch/loss': 36.40401077270508}


Evaluating: Epoch 1:  72%|███████▏  | 3991/5548 [1:54:19<44:37,  1.72s/it, loss=36.1]

{'eval_batch': 3990, 'eval_batch/loss': 36.076494216918945}


Evaluating: Epoch 1:  72%|███████▏  | 4001/5548 [1:54:36<44:18,  1.72s/it, loss=35.7]

{'eval_batch': 4000, 'eval_batch/loss': 35.65831184387207}


Evaluating: Epoch 1:  72%|███████▏  | 4011/5548 [1:54:53<44:02,  1.72s/it, loss=34.7]

{'eval_batch': 4010, 'eval_batch/loss': 34.71890640258789}


Evaluating: Epoch 1:  72%|███████▏  | 4021/5548 [1:55:10<43:43,  1.72s/it, loss=34.7]

{'eval_batch': 4020, 'eval_batch/loss': 34.69748306274414}


Evaluating: Epoch 1:  73%|███████▎  | 4031/5548 [1:55:28<43:26,  1.72s/it, loss=34.8]

{'eval_batch': 4030, 'eval_batch/loss': 34.83276557922363}


Evaluating: Epoch 1:  73%|███████▎  | 4041/5548 [1:55:45<43:10,  1.72s/it, loss=35.5]

{'eval_batch': 4040, 'eval_batch/loss': 35.53190612792969}


Evaluating: Epoch 1:  73%|███████▎  | 4051/5548 [1:56:02<42:53,  1.72s/it, loss=35.2]

{'eval_batch': 4050, 'eval_batch/loss': 35.18122863769531}


Evaluating: Epoch 1:  73%|███████▎  | 4061/5548 [1:56:19<42:35,  1.72s/it, loss=35.4]

{'eval_batch': 4060, 'eval_batch/loss': 35.41582679748535}


Evaluating: Epoch 1:  73%|███████▎  | 4071/5548 [1:56:36<42:17,  1.72s/it, loss=35.1]

{'eval_batch': 4070, 'eval_batch/loss': 35.079349517822266}


Evaluating: Epoch 1:  74%|███████▎  | 4081/5548 [1:56:53<42:01,  1.72s/it, loss=35.8]

{'eval_batch': 4080, 'eval_batch/loss': 35.82579040527344}


Evaluating: Epoch 1:  74%|███████▎  | 4091/5548 [1:57:11<41:43,  1.72s/it, loss=36.4]

{'eval_batch': 4090, 'eval_batch/loss': 36.36322212219238}


Evaluating: Epoch 1:  74%|███████▍  | 4101/5548 [1:57:28<41:26,  1.72s/it, loss=37.2]

{'eval_batch': 4100, 'eval_batch/loss': 37.152618408203125}


Evaluating: Epoch 1:  74%|███████▍  | 4111/5548 [1:57:45<41:08,  1.72s/it, loss=36.5]

{'eval_batch': 4110, 'eval_batch/loss': 36.53469276428223}


Evaluating: Epoch 1:  74%|███████▍  | 4121/5548 [1:58:02<40:52,  1.72s/it, loss=36.6]

{'eval_batch': 4120, 'eval_batch/loss': 36.59512519836426}


Evaluating: Epoch 1:  74%|███████▍  | 4131/5548 [1:58:19<40:35,  1.72s/it, loss=37]

{'eval_batch': 4130, 'eval_batch/loss': 36.97833824157715}


Evaluating: Epoch 1:  75%|███████▍  | 4141/5548 [1:58:37<40:18,  1.72s/it, loss=36.5]

{'eval_batch': 4140, 'eval_batch/loss': 36.49256896972656}


Evaluating: Epoch 1:  75%|███████▍  | 4151/5548 [1:58:54<40:03,  1.72s/it, loss=36.1]

{'eval_batch': 4150, 'eval_batch/loss': 36.05562210083008}


Evaluating: Epoch 1:  75%|███████▌  | 4161/5548 [1:59:11<39:43,  1.72s/it, loss=36.2]

{'eval_batch': 4160, 'eval_batch/loss': 36.20392990112305}


Evaluating: Epoch 1:  75%|███████▌  | 4171/5548 [1:59:28<39:26,  1.72s/it, loss=36.6]

{'eval_batch': 4170, 'eval_batch/loss': 36.63943862915039}


Evaluating: Epoch 1:  75%|███████▌  | 4181/5548 [1:59:45<39:10,  1.72s/it, loss=36.8]

{'eval_batch': 4180, 'eval_batch/loss': 36.780744552612305}


Evaluating: Epoch 1:  76%|███████▌  | 4191/5548 [2:00:03<38:51,  1.72s/it, loss=35.5]

{'eval_batch': 4190, 'eval_batch/loss': 35.5408878326416}


Evaluating: Epoch 1:  76%|███████▌  | 4201/5548 [2:00:20<38:36,  1.72s/it, loss=35.2]

{'eval_batch': 4200, 'eval_batch/loss': 35.205427169799805}


Evaluating: Epoch 1:  76%|███████▌  | 4211/5548 [2:00:37<38:19,  1.72s/it, loss=35.1]

{'eval_batch': 4210, 'eval_batch/loss': 35.14249610900879}


Evaluating: Epoch 1:  76%|███████▌  | 4221/5548 [2:00:54<38:00,  1.72s/it, loss=35.6]

{'eval_batch': 4220, 'eval_batch/loss': 35.62408256530762}


Evaluating: Epoch 1:  76%|███████▋  | 4231/5548 [2:01:11<37:43,  1.72s/it, loss=36.1]

{'eval_batch': 4230, 'eval_batch/loss': 36.1056022644043}


Evaluating: Epoch 1:  76%|███████▋  | 4241/5548 [2:01:28<37:26,  1.72s/it, loss=35.6]

{'eval_batch': 4240, 'eval_batch/loss': 35.608076095581055}


Evaluating: Epoch 1:  77%|███████▋  | 4251/5548 [2:01:46<37:10,  1.72s/it, loss=35.1]

{'eval_batch': 4250, 'eval_batch/loss': 35.05879592895508}


Evaluating: Epoch 1:  77%|███████▋  | 4261/5548 [2:02:03<36:52,  1.72s/it, loss=35.4]

{'eval_batch': 4260, 'eval_batch/loss': 35.3780632019043}


Evaluating: Epoch 1:  77%|███████▋  | 4271/5548 [2:02:20<36:34,  1.72s/it, loss=35.6]

{'eval_batch': 4270, 'eval_batch/loss': 35.592926025390625}


Evaluating: Epoch 1:  77%|███████▋  | 4281/5548 [2:02:37<36:16,  1.72s/it, loss=35.6]

{'eval_batch': 4280, 'eval_batch/loss': 35.642290115356445}


Evaluating: Epoch 1:  77%|███████▋  | 4291/5548 [2:02:54<36:00,  1.72s/it, loss=35.1]

{'eval_batch': 4290, 'eval_batch/loss': 35.07611846923828}


Evaluating: Epoch 1:  78%|███████▊  | 4301/5548 [2:03:12<35:43,  1.72s/it, loss=35.2]

{'eval_batch': 4300, 'eval_batch/loss': 35.2181510925293}


Evaluating: Epoch 1:  78%|███████▊  | 4311/5548 [2:03:29<35:25,  1.72s/it, loss=35.2]

{'eval_batch': 4310, 'eval_batch/loss': 35.24582099914551}


Evaluating: Epoch 1:  78%|███████▊  | 4321/5548 [2:03:46<35:10,  1.72s/it, loss=35.9]

{'eval_batch': 4320, 'eval_batch/loss': 35.85322189331055}


Evaluating: Epoch 1:  78%|███████▊  | 4331/5548 [2:04:03<34:51,  1.72s/it, loss=35.3]

{'eval_batch': 4330, 'eval_batch/loss': 35.27832794189453}


Evaluating: Epoch 1:  78%|███████▊  | 4341/5548 [2:04:20<34:34,  1.72s/it, loss=35]

{'eval_batch': 4340, 'eval_batch/loss': 35.0205020904541}


Evaluating: Epoch 1:  78%|███████▊  | 4351/5548 [2:04:37<34:17,  1.72s/it, loss=35.9]

{'eval_batch': 4350, 'eval_batch/loss': 35.93532180786133}


Evaluating: Epoch 1:  79%|███████▊  | 4361/5548 [2:04:55<34:00,  1.72s/it, loss=36.4]

{'eval_batch': 4360, 'eval_batch/loss': 36.36236572265625}


Evaluating: Epoch 1:  79%|███████▉  | 4371/5548 [2:05:12<33:43,  1.72s/it, loss=35.7]

{'eval_batch': 4370, 'eval_batch/loss': 35.73520469665527}


Evaluating: Epoch 1:  79%|███████▉  | 4381/5548 [2:05:29<33:25,  1.72s/it, loss=35]

{'eval_batch': 4380, 'eval_batch/loss': 34.98158645629883}


Evaluating: Epoch 1:  79%|███████▉  | 4391/5548 [2:05:46<33:08,  1.72s/it, loss=35.5]

{'eval_batch': 4390, 'eval_batch/loss': 35.51062774658203}


Evaluating: Epoch 1:  79%|███████▉  | 4401/5548 [2:06:03<32:51,  1.72s/it, loss=35.7]

{'eval_batch': 4400, 'eval_batch/loss': 35.730051040649414}


Evaluating: Epoch 1:  80%|███████▉  | 4411/5548 [2:06:21<32:34,  1.72s/it, loss=36.4]

{'eval_batch': 4410, 'eval_batch/loss': 36.37460136413574}


Evaluating: Epoch 1:  80%|███████▉  | 4421/5548 [2:06:38<32:17,  1.72s/it, loss=36.5]

{'eval_batch': 4420, 'eval_batch/loss': 36.460166931152344}


Evaluating: Epoch 1:  80%|███████▉  | 4431/5548 [2:06:55<31:59,  1.72s/it, loss=36.6]

{'eval_batch': 4430, 'eval_batch/loss': 36.55426788330078}


Evaluating: Epoch 1:  80%|████████  | 4441/5548 [2:07:12<31:42,  1.72s/it, loss=36.2]

{'eval_batch': 4440, 'eval_batch/loss': 36.15825271606445}


Evaluating: Epoch 1:  80%|████████  | 4451/5548 [2:07:29<31:24,  1.72s/it, loss=36.1]

{'eval_batch': 4450, 'eval_batch/loss': 36.14207458496094}


Evaluating: Epoch 1:  80%|████████  | 4461/5548 [2:07:47<31:08,  1.72s/it, loss=37.4]

{'eval_batch': 4460, 'eval_batch/loss': 37.368919372558594}


Evaluating: Epoch 1:  81%|████████  | 4471/5548 [2:08:04<30:50,  1.72s/it, loss=37.7]

{'eval_batch': 4470, 'eval_batch/loss': 37.677385330200195}


Evaluating: Epoch 1:  81%|████████  | 4481/5548 [2:08:21<30:32,  1.72s/it, loss=37]

{'eval_batch': 4480, 'eval_batch/loss': 37.015106201171875}


Evaluating: Epoch 1:  81%|████████  | 4491/5548 [2:08:38<30:16,  1.72s/it, loss=36.7]

{'eval_batch': 4490, 'eval_batch/loss': 36.73760223388672}


Evaluating: Epoch 1:  81%|████████  | 4501/5548 [2:08:55<30:00,  1.72s/it, loss=36]

{'eval_batch': 4500, 'eval_batch/loss': 35.97088432312012}


Evaluating: Epoch 1:  81%|████████▏ | 4511/5548 [2:09:12<29:42,  1.72s/it, loss=35.2]

{'eval_batch': 4510, 'eval_batch/loss': 35.1943359375}


Evaluating: Epoch 1:  81%|████████▏ | 4521/5548 [2:09:30<29:26,  1.72s/it, loss=35.5]

{'eval_batch': 4520, 'eval_batch/loss': 35.47471618652344}


Evaluating: Epoch 1:  82%|████████▏ | 4531/5548 [2:09:47<29:07,  1.72s/it, loss=35.6]

{'eval_batch': 4530, 'eval_batch/loss': 35.62083435058594}


Evaluating: Epoch 1:  82%|████████▏ | 4541/5548 [2:10:04<28:50,  1.72s/it, loss=34.7]

{'eval_batch': 4540, 'eval_batch/loss': 34.706546783447266}


Evaluating: Epoch 1:  82%|████████▏ | 4551/5548 [2:10:21<28:33,  1.72s/it, loss=34.9]

{'eval_batch': 4550, 'eval_batch/loss': 34.89005088806152}


Evaluating: Epoch 1:  82%|████████▏ | 4561/5548 [2:10:38<28:16,  1.72s/it, loss=35.3]

{'eval_batch': 4560, 'eval_batch/loss': 35.26682662963867}


Evaluating: Epoch 1:  82%|████████▏ | 4571/5548 [2:10:56<27:59,  1.72s/it, loss=35.4]

{'eval_batch': 4570, 'eval_batch/loss': 35.36161804199219}


Evaluating: Epoch 1:  83%|████████▎ | 4581/5548 [2:11:13<27:41,  1.72s/it, loss=35.7]

{'eval_batch': 4580, 'eval_batch/loss': 35.6571159362793}


Evaluating: Epoch 1:  83%|████████▎ | 4591/5548 [2:11:30<27:25,  1.72s/it, loss=36.3]

{'eval_batch': 4590, 'eval_batch/loss': 36.32925796508789}


Evaluating: Epoch 1:  83%|████████▎ | 4601/5548 [2:11:47<27:07,  1.72s/it, loss=37.3]

{'eval_batch': 4600, 'eval_batch/loss': 37.261634826660156}


Evaluating: Epoch 1:  83%|████████▎ | 4611/5548 [2:12:04<26:49,  1.72s/it, loss=37.3]

{'eval_batch': 4610, 'eval_batch/loss': 37.31845283508301}


Evaluating: Epoch 1:  83%|████████▎ | 4621/5548 [2:12:22<26:33,  1.72s/it, loss=37.5]

{'eval_batch': 4620, 'eval_batch/loss': 37.4932861328125}


Evaluating: Epoch 1:  83%|████████▎ | 4631/5548 [2:12:39<26:17,  1.72s/it, loss=37.2]

{'eval_batch': 4630, 'eval_batch/loss': 37.24727439880371}


Evaluating: Epoch 1:  84%|████████▎ | 4641/5548 [2:12:56<25:58,  1.72s/it, loss=37.9]

{'eval_batch': 4640, 'eval_batch/loss': 37.92671012878418}


Evaluating: Epoch 1:  84%|████████▍ | 4651/5548 [2:13:13<25:41,  1.72s/it, loss=38.6]

{'eval_batch': 4650, 'eval_batch/loss': 38.583702087402344}


Evaluating: Epoch 1:  84%|████████▍ | 4661/5548 [2:13:30<25:24,  1.72s/it, loss=39.7]

{'eval_batch': 4660, 'eval_batch/loss': 39.69218826293945}


Evaluating: Epoch 1:  84%|████████▍ | 4671/5548 [2:13:47<25:07,  1.72s/it, loss=39.4]

{'eval_batch': 4670, 'eval_batch/loss': 39.383358001708984}


Evaluating: Epoch 1:  84%|████████▍ | 4681/5548 [2:14:05<24:50,  1.72s/it, loss=39.6]

{'eval_batch': 4680, 'eval_batch/loss': 39.636009216308594}


Evaluating: Epoch 1:  85%|████████▍ | 4691/5548 [2:14:22<24:32,  1.72s/it, loss=39.7]

{'eval_batch': 4690, 'eval_batch/loss': 39.66374206542969}


Evaluating: Epoch 1:  85%|████████▍ | 4701/5548 [2:14:39<24:15,  1.72s/it, loss=40.4]

{'eval_batch': 4700, 'eval_batch/loss': 40.414146423339844}


Evaluating: Epoch 1:  85%|████████▍ | 4711/5548 [2:14:56<23:58,  1.72s/it, loss=40.9]

{'eval_batch': 4710, 'eval_batch/loss': 40.92243957519531}


Evaluating: Epoch 1:  85%|████████▌ | 4721/5548 [2:15:13<23:42,  1.72s/it, loss=40.3]

{'eval_batch': 4720, 'eval_batch/loss': 40.25469398498535}


Evaluating: Epoch 1:  85%|████████▌ | 4731/5548 [2:15:31<23:24,  1.72s/it, loss=39.6]

{'eval_batch': 4730, 'eval_batch/loss': 39.59516525268555}


Evaluating: Epoch 1:  85%|████████▌ | 4741/5548 [2:15:48<23:08,  1.72s/it, loss=39.6]

{'eval_batch': 4740, 'eval_batch/loss': 39.63091850280762}


Evaluating: Epoch 1:  86%|████████▌ | 4751/5548 [2:16:05<22:50,  1.72s/it, loss=39.5]

{'eval_batch': 4750, 'eval_batch/loss': 39.54807090759277}


Evaluating: Epoch 1:  86%|████████▌ | 4761/5548 [2:16:22<22:33,  1.72s/it, loss=38.8]

{'eval_batch': 4760, 'eval_batch/loss': 38.84940147399902}


Evaluating: Epoch 1:  86%|████████▌ | 4771/5548 [2:16:39<22:15,  1.72s/it, loss=38.3]

{'eval_batch': 4770, 'eval_batch/loss': 38.2575798034668}


Evaluating: Epoch 1:  86%|████████▌ | 4781/5548 [2:16:57<21:58,  1.72s/it, loss=37.9]

{'eval_batch': 4780, 'eval_batch/loss': 37.93568801879883}


Evaluating: Epoch 1:  86%|████████▋ | 4791/5548 [2:17:14<21:41,  1.72s/it, loss=37.8]

{'eval_batch': 4790, 'eval_batch/loss': 37.757184982299805}


Evaluating: Epoch 1:  87%|████████▋ | 4801/5548 [2:17:31<21:24,  1.72s/it, loss=37.6]

{'eval_batch': 4800, 'eval_batch/loss': 37.60546112060547}


Evaluating: Epoch 1:  87%|████████▋ | 4811/5548 [2:17:48<21:06,  1.72s/it, loss=37.3]

{'eval_batch': 4810, 'eval_batch/loss': 37.34784126281738}


Evaluating: Epoch 1:  87%|████████▋ | 4821/5548 [2:18:05<20:49,  1.72s/it, loss=38.1]

{'eval_batch': 4820, 'eval_batch/loss': 38.07160949707031}


Evaluating: Epoch 1:  87%|████████▋ | 4831/5548 [2:18:23<20:31,  1.72s/it, loss=38.7]

{'eval_batch': 4830, 'eval_batch/loss': 38.73558044433594}


Evaluating: Epoch 1:  87%|████████▋ | 4841/5548 [2:18:40<20:15,  1.72s/it, loss=38.6]

{'eval_batch': 4840, 'eval_batch/loss': 38.58721733093262}


Evaluating: Epoch 1:  87%|████████▋ | 4851/5548 [2:18:57<19:58,  1.72s/it, loss=38.6]

{'eval_batch': 4850, 'eval_batch/loss': 38.57098960876465}


Evaluating: Epoch 1:  88%|████████▊ | 4861/5548 [2:19:14<19:41,  1.72s/it, loss=39.3]

{'eval_batch': 4860, 'eval_batch/loss': 39.3331241607666}


Evaluating: Epoch 1:  88%|████████▊ | 4871/5548 [2:19:31<19:23,  1.72s/it, loss=39.1]

{'eval_batch': 4870, 'eval_batch/loss': 39.098201751708984}


Evaluating: Epoch 1:  88%|████████▊ | 4881/5548 [2:19:48<19:07,  1.72s/it, loss=39.6]

{'eval_batch': 4880, 'eval_batch/loss': 39.60741424560547}


Evaluating: Epoch 1:  88%|████████▊ | 4891/5548 [2:20:06<18:50,  1.72s/it, loss=39.2]

{'eval_batch': 4890, 'eval_batch/loss': 39.20194625854492}


Evaluating: Epoch 1:  88%|████████▊ | 4901/5548 [2:20:23<18:32,  1.72s/it, loss=40.4]

{'eval_batch': 4900, 'eval_batch/loss': 40.40437889099121}


Evaluating: Epoch 1:  89%|████████▊ | 4911/5548 [2:20:40<18:14,  1.72s/it, loss=39.7]

{'eval_batch': 4910, 'eval_batch/loss': 39.701894760131836}


Evaluating: Epoch 1:  89%|████████▊ | 4921/5548 [2:20:57<17:57,  1.72s/it, loss=39.9]

{'eval_batch': 4920, 'eval_batch/loss': 39.85071563720703}


Evaluating: Epoch 1:  89%|████████▉ | 4931/5548 [2:21:14<17:40,  1.72s/it, loss=39.6]

{'eval_batch': 4930, 'eval_batch/loss': 39.6025276184082}


Evaluating: Epoch 1:  89%|████████▉ | 4941/5548 [2:21:32<17:23,  1.72s/it, loss=40]

{'eval_batch': 4940, 'eval_batch/loss': 39.95425224304199}


Evaluating: Epoch 1:  89%|████████▉ | 4951/5548 [2:21:49<17:06,  1.72s/it, loss=40.8]

{'eval_batch': 4950, 'eval_batch/loss': 40.750850677490234}


Evaluating: Epoch 1:  89%|████████▉ | 4961/5548 [2:22:06<16:49,  1.72s/it, loss=41.6]

{'eval_batch': 4960, 'eval_batch/loss': 41.57584190368652}


Evaluating: Epoch 1:  90%|████████▉ | 4971/5548 [2:22:23<16:31,  1.72s/it, loss=41.1]

{'eval_batch': 4970, 'eval_batch/loss': 41.1301155090332}


Evaluating: Epoch 1:  90%|████████▉ | 4981/5548 [2:22:40<16:14,  1.72s/it, loss=41.5]

{'eval_batch': 4980, 'eval_batch/loss': 41.455020904541016}


Evaluating: Epoch 1:  90%|████████▉ | 4991/5548 [2:22:58<15:57,  1.72s/it, loss=41.3]

{'eval_batch': 4990, 'eval_batch/loss': 41.29164695739746}


Evaluating: Epoch 1:  90%|█████████ | 5001/5548 [2:23:15<15:40,  1.72s/it, loss=41.5]

{'eval_batch': 5000, 'eval_batch/loss': 41.47633743286133}


Evaluating: Epoch 1:  90%|█████████ | 5011/5548 [2:23:32<15:22,  1.72s/it, loss=40.8]

{'eval_batch': 5010, 'eval_batch/loss': 40.77934455871582}


Evaluating: Epoch 1:  91%|█████████ | 5021/5548 [2:23:49<15:05,  1.72s/it, loss=40.9]

{'eval_batch': 5020, 'eval_batch/loss': 40.897796630859375}


Evaluating: Epoch 1:  91%|█████████ | 5031/5548 [2:24:06<14:48,  1.72s/it, loss=40]

{'eval_batch': 5030, 'eval_batch/loss': 39.985633850097656}


Evaluating: Epoch 1:  91%|█████████ | 5041/5548 [2:24:24<14:31,  1.72s/it, loss=40.6]

{'eval_batch': 5040, 'eval_batch/loss': 40.56594467163086}


Evaluating: Epoch 1:  91%|█████████ | 5051/5548 [2:24:41<14:14,  1.72s/it, loss=40.7]

{'eval_batch': 5050, 'eval_batch/loss': 40.73270225524902}


Evaluating: Epoch 1:  91%|█████████ | 5061/5548 [2:24:58<13:56,  1.72s/it, loss=39.9]

{'eval_batch': 5060, 'eval_batch/loss': 39.907907485961914}


Evaluating: Epoch 1:  91%|█████████▏| 5071/5548 [2:25:15<13:39,  1.72s/it, loss=39.7]

{'eval_batch': 5070, 'eval_batch/loss': 39.7388916015625}


Evaluating: Epoch 1:  92%|█████████▏| 5081/5548 [2:25:32<13:22,  1.72s/it, loss=39.5]

{'eval_batch': 5080, 'eval_batch/loss': 39.548757553100586}


Evaluating: Epoch 1:  92%|█████████▏| 5091/5548 [2:25:49<13:05,  1.72s/it, loss=39.8]

{'eval_batch': 5090, 'eval_batch/loss': 39.8449649810791}


Evaluating: Epoch 1:  92%|█████████▏| 5101/5548 [2:26:07<12:48,  1.72s/it, loss=39.9]

{'eval_batch': 5100, 'eval_batch/loss': 39.92135047912598}


Evaluating: Epoch 1:  92%|█████████▏| 5111/5548 [2:26:24<12:31,  1.72s/it, loss=40.1]

{'eval_batch': 5110, 'eval_batch/loss': 40.07301712036133}


Evaluating: Epoch 1:  92%|█████████▏| 5121/5548 [2:26:41<12:14,  1.72s/it, loss=40]

{'eval_batch': 5120, 'eval_batch/loss': 40.00337219238281}


Evaluating: Epoch 1:  92%|█████████▏| 5131/5548 [2:26:58<11:57,  1.72s/it, loss=40.2]

{'eval_batch': 5130, 'eval_batch/loss': 40.16444206237793}


Evaluating: Epoch 1:  93%|█████████▎| 5141/5548 [2:27:15<11:39,  1.72s/it, loss=40.4]

{'eval_batch': 5140, 'eval_batch/loss': 40.410451889038086}


Evaluating: Epoch 1:  93%|█████████▎| 5151/5548 [2:27:33<11:22,  1.72s/it, loss=40.5]

{'eval_batch': 5150, 'eval_batch/loss': 40.47251892089844}


Evaluating: Epoch 1:  93%|█████████▎| 5161/5548 [2:27:50<11:05,  1.72s/it, loss=40.7]

{'eval_batch': 5160, 'eval_batch/loss': 40.74667739868164}


Evaluating: Epoch 1:  93%|█████████▎| 5171/5548 [2:28:07<10:47,  1.72s/it, loss=41]

{'eval_batch': 5170, 'eval_batch/loss': 40.98725128173828}


Evaluating: Epoch 1:  93%|█████████▎| 5181/5548 [2:28:24<10:31,  1.72s/it, loss=40.8]

{'eval_batch': 5180, 'eval_batch/loss': 40.79499626159668}


Evaluating: Epoch 1:  94%|█████████▎| 5191/5548 [2:28:41<10:13,  1.72s/it, loss=40.8]

{'eval_batch': 5190, 'eval_batch/loss': 40.77344512939453}


Evaluating: Epoch 1:  94%|█████████▎| 5201/5548 [2:28:59<09:56,  1.72s/it, loss=40.5]

{'eval_batch': 5200, 'eval_batch/loss': 40.495174407958984}


Evaluating: Epoch 1:  94%|█████████▍| 5211/5548 [2:29:16<09:39,  1.72s/it, loss=39.8]

{'eval_batch': 5210, 'eval_batch/loss': 39.79622459411621}


Evaluating: Epoch 1:  94%|█████████▍| 5221/5548 [2:29:33<09:22,  1.72s/it, loss=39.5]

{'eval_batch': 5220, 'eval_batch/loss': 39.50445747375488}


Evaluating: Epoch 1:  94%|█████████▍| 5231/5548 [2:29:50<09:05,  1.72s/it, loss=39.6]

{'eval_batch': 5230, 'eval_batch/loss': 39.55698776245117}


Evaluating: Epoch 1:  94%|█████████▍| 5241/5548 [2:30:07<08:47,  1.72s/it, loss=39.3]

{'eval_batch': 5240, 'eval_batch/loss': 39.32628059387207}


Evaluating: Epoch 1:  95%|█████████▍| 5251/5548 [2:30:25<08:30,  1.72s/it, loss=39.1]

{'eval_batch': 5250, 'eval_batch/loss': 39.08178901672363}


Evaluating: Epoch 1:  95%|█████████▍| 5261/5548 [2:30:42<08:13,  1.72s/it, loss=38]

{'eval_batch': 5260, 'eval_batch/loss': 37.959177017211914}


Evaluating: Epoch 1:  95%|█████████▌| 5271/5548 [2:30:59<07:56,  1.72s/it, loss=38.5]

{'eval_batch': 5270, 'eval_batch/loss': 38.53308868408203}


Evaluating: Epoch 1:  95%|█████████▌| 5281/5548 [2:31:16<07:39,  1.72s/it, loss=38.4]

{'eval_batch': 5280, 'eval_batch/loss': 38.36081886291504}


Evaluating: Epoch 1:  95%|█████████▌| 5291/5548 [2:31:33<07:22,  1.72s/it, loss=38.7]

{'eval_batch': 5290, 'eval_batch/loss': 38.66064453125}


Evaluating: Epoch 1:  96%|█████████▌| 5301/5548 [2:31:51<07:04,  1.72s/it, loss=38.7]

{'eval_batch': 5300, 'eval_batch/loss': 38.7259407043457}


Evaluating: Epoch 1:  96%|█████████▌| 5311/5548 [2:32:08<06:47,  1.72s/it, loss=38.8]

{'eval_batch': 5310, 'eval_batch/loss': 38.81399917602539}


Evaluating: Epoch 1:  96%|█████████▌| 5321/5548 [2:32:25<06:30,  1.72s/it, loss=37.5]

{'eval_batch': 5320, 'eval_batch/loss': 37.50754165649414}


Evaluating: Epoch 1:  96%|█████████▌| 5331/5548 [2:32:42<06:12,  1.72s/it, loss=37.8]

{'eval_batch': 5330, 'eval_batch/loss': 37.80684280395508}


Evaluating: Epoch 1:  96%|█████████▋| 5341/5548 [2:32:59<05:55,  1.72s/it, loss=38.2]

{'eval_batch': 5340, 'eval_batch/loss': 38.22621536254883}


Evaluating: Epoch 1:  96%|█████████▋| 5351/5548 [2:33:16<05:38,  1.72s/it, loss=38.4]

{'eval_batch': 5350, 'eval_batch/loss': 38.37932205200195}


Evaluating: Epoch 1:  97%|█████████▋| 5361/5548 [2:33:34<05:21,  1.72s/it, loss=38.2]

{'eval_batch': 5360, 'eval_batch/loss': 38.23305130004883}


Evaluating: Epoch 1:  97%|█████████▋| 5371/5548 [2:33:51<05:04,  1.72s/it, loss=38.4]

{'eval_batch': 5370, 'eval_batch/loss': 38.365692138671875}


Evaluating: Epoch 1:  97%|█████████▋| 5381/5548 [2:34:08<04:46,  1.72s/it, loss=38.3]

{'eval_batch': 5380, 'eval_batch/loss': 38.32936477661133}


Evaluating: Epoch 1:  97%|█████████▋| 5391/5548 [2:34:25<04:29,  1.72s/it, loss=37.9]

{'eval_batch': 5390, 'eval_batch/loss': 37.85367202758789}


Evaluating: Epoch 1:  97%|█████████▋| 5401/5548 [2:34:42<04:12,  1.72s/it, loss=38.3]

{'eval_batch': 5400, 'eval_batch/loss': 38.26959991455078}


Evaluating: Epoch 1:  98%|█████████▊| 5411/5548 [2:35:00<03:55,  1.72s/it, loss=38.7]

{'eval_batch': 5410, 'eval_batch/loss': 38.744699478149414}


Evaluating: Epoch 1:  98%|█████████▊| 5421/5548 [2:35:17<03:38,  1.72s/it, loss=38.6]

{'eval_batch': 5420, 'eval_batch/loss': 38.57577705383301}


Evaluating: Epoch 1:  98%|█████████▊| 5431/5548 [2:35:34<03:21,  1.72s/it, loss=37.8]

{'eval_batch': 5430, 'eval_batch/loss': 37.768781661987305}


Evaluating: Epoch 1:  98%|█████████▊| 5441/5548 [2:35:51<03:03,  1.72s/it, loss=38]

{'eval_batch': 5440, 'eval_batch/loss': 37.98581314086914}


Evaluating: Epoch 1:  98%|█████████▊| 5451/5548 [2:36:08<02:46,  1.72s/it, loss=37.9]

{'eval_batch': 5450, 'eval_batch/loss': 37.85912322998047}


Evaluating: Epoch 1:  98%|█████████▊| 5461/5548 [2:36:26<02:29,  1.72s/it, loss=37.4]

{'eval_batch': 5460, 'eval_batch/loss': 37.445587158203125}


Evaluating: Epoch 1:  99%|█████████▊| 5471/5548 [2:36:43<02:12,  1.72s/it, loss=37.3]

{'eval_batch': 5470, 'eval_batch/loss': 37.2822380065918}


Evaluating: Epoch 1:  99%|█████████▉| 5481/5548 [2:37:00<01:55,  1.72s/it, loss=37.1]

{'eval_batch': 5480, 'eval_batch/loss': 37.10631561279297}


Evaluating: Epoch 1:  99%|█████████▉| 5491/5548 [2:37:17<01:37,  1.72s/it, loss=37.9]

{'eval_batch': 5490, 'eval_batch/loss': 37.94866943359375}


Evaluating: Epoch 1:  99%|█████████▉| 5501/5548 [2:37:34<01:20,  1.72s/it, loss=38.1]

{'eval_batch': 5500, 'eval_batch/loss': 38.11554527282715}


Evaluating: Epoch 1:  99%|█████████▉| 5511/5548 [2:37:51<01:03,  1.72s/it, loss=37.6]

{'eval_batch': 5510, 'eval_batch/loss': 37.59030342102051}


Evaluating: Epoch 1: 100%|█████████▉| 5521/5548 [2:38:09<00:46,  1.72s/it, loss=37.7]

{'eval_batch': 5520, 'eval_batch/loss': 37.736183166503906}


Evaluating: Epoch 1: 100%|█████████▉| 5531/5548 [2:38:26<00:29,  1.72s/it, loss=37.5]

{'eval_batch': 5530, 'eval_batch/loss': 37.47984313964844}


Evaluating: Epoch 1: 100%|█████████▉| 5541/5548 [2:38:43<00:12,  1.72s/it, loss=37.7]

{'eval_batch': 5540, 'eval_batch/loss': 37.658782958984375}


                                                                                     

{'eval_batch': 5547, 'eval_batch/loss': 37.74500846862793}




In [None]:
torch.cuda.empty_cache()