In [1]:
#!/usr/bin/env python
# coding: utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,5,6,7" 

import datetime
import pkg_resources
import time
import scipy.stats as stats
import gc
import re
import operator
import sys
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold,KFold
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from nltk.stem import PorterStemmer
from sklearn.metrics import roc_auc_score
from torch.nn.parallel.data_parallel import data_parallel
from apex.parallel import DistributedDataParallel
from torch.utils.data.sampler import *
import torch.optim as optim
from torch.autograd import Variable
from tqdm import tqdm
import os
from pytorch_pretrained_bert import BertTokenizer, BertAdam, BertModel
from pytorch_pretrained_bert.modeling import BertEmbeddings, BertEncoder, BertConfig,  BertPreTrainedModel
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam
import warnings
warnings.filterwarnings(action='once')
import pickle
from apex import amp
import shutil
import argparse
device = torch.device('cuda')


# Index of the KFold split (0-based) whose held-out part becomes the
# validation set; compared against the split counter in the fold loop below.
use_split = 0

# When True the epoch loop trains the model and saves checkpoints; when False
# the loop only runs validation after loading previously saved weights.
TRAIN = True

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# When True, the tokenized sequences are always recomputed and never cached.
debug = False

# Row length (in token ids) requested from convert_lines.
MAX_SEQUENCE_LENGTH = 220
# Seed for the KFold shuffle and for the numpy / torch random generators.
SEED = 1314112342
# Number of passes of the epoch loop.
EPOCHS = 1
# Directory that train.csv is read from.
Data_dir="../input/jigsaw-unintended-bias-in-toxicity-classification"
# NOTE(review): same path as Data_dir; not referenced in the visible code.
Input_dir = "../input/jigsaw-unintended-bias-in-toxicity-classification"
# Directory holding the cached token sequences (sequences.train.pkl).
WORK_DIR = "output"


# NOTE(review): train_size / valid_size / num_to_load are only referenced by
# commented-out code further down; the active split comes from KFold.
if debug:
    train_size = 500                        #Train size to match time limit
    valid_size = 500                          #Validation Size
    
else:
    train_size = 1600000                        #Train size to match time limit
    valid_size = 200000 
    
num_to_load = train_size
valid_size = valid_size  # no-op self-assignment
# Label column read by calculate_overall_auc; rebound to 'target_binary'
# inside the epoch loop before the metrics are computed.
TOXICITY_COLUMN = 'target'
# NOTE(review): only referenced by commented-out code in the visible file.
bert_loc = 'bert-base-uncased'


# # Evaluate functions

# From baseline kernel
def calculate_overall_auc(df, model_name):
    """Return the ROC AUC of ``df[model_name]`` against the binarized label.

    The label is taken from the ``TOXICITY_COLUMN`` column of ``df`` and is
    treated as positive when strictly greater than 0.5.

    NOTE: a second ``calculate_overall_auc`` defined further down in this file
    rebinds the name, so that later definition is the one in effect afterwards.
    """
    is_positive = df[TOXICITY_COLUMN] > 0.5
    scores = df[model_name]
    return metrics.roc_auc_score(is_positive, scores)

def power_mean(series, p):
    """Return the power mean of ``series`` with exponent ``p``.

    Computes ``(sum(x ** p) / len(series)) ** (1 / p)``. The terms are added
    with the builtin ``sum``, so a NaN entry makes the result NaN.
    """
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the averaged per-subgroup bias AUCs.

    For each of the SUBGROUP_AUC, BPSN_AUC and BNSP_AUC columns of ``bias_df``
    the power mean (exponent ``POWER``) is taken; the three results are
    averaged into a bias score. The return value is
    ``OVERALL_MODEL_WEIGHT * overall_auc + (1 - OVERALL_MODEL_WEIGHT) * bias_score``.
    """
    column_means = [
        power_mean(bias_df[column], POWER)
        for column in (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    ]
    bias_score = np.average(column_means)
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part



# Column names of the per-subgroup metrics table built by
# compute_bias_metrics_for_model and read back by get_final_metric.
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``.

    If ``metrics.roc_auc_score`` raises ``ValueError`` the result is NaN
    instead of an exception (presumably the case where only one class is
    present in ``y_true`` -- confirm against the scikit-learn docs).
    """
    try:
        auc = metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, model_name):
    """Return the AUC restricted to rows whose ``subgroup`` value exceeds 0.5.

    Labels from column ``label`` are binarized with ``> 0.5``; scores come
    from column ``model_name``.
    """
    in_subgroup = df[subgroup] > 0.5
    members = df[in_subgroup]
    member_labels = members[label] > 0.5
    return compute_auc(member_labels, members[model_name])

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    Args:
        df: DataFrame holding the subgroup, label and score columns.
        subgroup: name of the identity column; a row belongs to the subgroup
            when its value is > 0.5.
        label: name of the label column; a row is positive when > 0.5.
        model_name: name of the column holding the model scores.

    Returns:
        The value returned by ``compute_auc`` on the combined rows.
    """
    subgroup_negative_examples = df[(df[subgroup] > 0.5) & (df[label] <= 0.5)]
    non_subgroup_positive_examples = df[(df[subgroup] <= 0.5) & (df[label] > 0.5)]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # frames in the same order on every pandas version.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label] > 0.5, examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    Args:
        df: DataFrame holding the subgroup, label and score columns.
        subgroup: name of the identity column; a row belongs to the subgroup
            when its value is > 0.5.
        label: name of the label column; a row is positive when > 0.5.
        model_name: name of the column holding the model scores.

    Returns:
        The value returned by ``compute_auc`` on the combined rows.
    """
    subgroup_positive_examples = df[(df[subgroup] > 0.5) & (df[label] > 0.5)]
    non_subgroup_negative_examples = df[(df[subgroup] <= 0.5) & (df[label] <= 0.5)]
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # frames in the same order on every pandas version.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label] > 0.5, examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Build a table with one row of bias AUCs per subgroup for one model.

    Each row holds the subgroup name, the number of rows whose subgroup value
    exceeds 0.5, and the subgroup / BPSN / BNSP AUCs of score column ``model``
    against label column ``label_col``. Rows are sorted by ascending subgroup
    AUC. ``include_asegs`` is accepted but not used.
    """
    def _metrics_row(subgroup):
        # Key order fixes the column order of the resulting DataFrame.
        return {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]>0.5]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        }

    table = pd.DataFrame([_metrics_row(subgroup) for subgroup in subgroups])
    return table.sort_values('subgroup_auc', ascending=True)

def calculate_overall_auc(df, model_name):
    """Return the ROC AUC of ``df[model_name]`` against the toxicity label.

    The label is read from the ``TOXICITY_COLUMN`` column and binarized with
    ``> 0.5``, the same threshold the per-subgroup AUC helpers in this file
    apply. For a column that already holds 0/1 values this leaves the result
    unchanged; for a continuous column it avoids passing non-binary targets
    to ``roc_auc_score``.

    Args:
        df: DataFrame containing ``TOXICITY_COLUMN`` and ``model_name``.
        model_name: name of the column holding the model scores.

    Returns:
        The value of ``metrics.roc_auc_score`` for the whole frame.
    """
    true_labels = df[TOXICITY_COLUMN] > 0.5
    predicted_labels = df[model_name]
    return metrics.roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    """Return ``(sum(x ** p for x in series) / len(series)) ** (1 / p)``.

    Terms are accumulated one by one starting from 0, so a NaN entry
    propagates into the result.
    """
    accumulated = 0
    for term in np.power(series, p):
        accumulated = accumulated + term
    return np.power(accumulated / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Combine the overall AUC and the per-subgroup bias AUCs into one score.

    The power mean (exponent ``POWER``) of each of the three bias columns of
    ``bias_df`` is computed and the three values are averaged. That average is
    weighted by ``1 - OVERALL_MODEL_WEIGHT`` and added to
    ``OVERALL_MODEL_WEIGHT * overall_auc``.
    """
    subgroup_mean = power_mean(bias_df[SUBGROUP_AUC], POWER)
    bpsn_mean = power_mean(bias_df[BPSN_AUC], POWER)
    bnsp_mean = power_mean(bias_df[BNSP_AUC], POWER)
    bias_score = np.average([subgroup_mean, bpsn_mean, bnsp_mean])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)

################################################################################################

# Converting the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming

# Supported ways of shortening a comment that has too many tokens.
truncate_options = ['head',
                    'tail',
                    'both']

# Strategy used when convert_lines is called without an explicit option.
truncate_option = truncate_options[2]


def _truncate_tokens(tokens, budget, option):
    """Cut ``tokens`` (longer than ``budget``) down to exactly ``budget`` items."""
    if option == 'head':
        return tokens[:budget]
    if option == 'tail':
        # Index from the front: tokens[-0:] would keep the whole list.
        return tokens[len(tokens) - budget:]
    if option == 'both':
        # Split the budget between both ends; an odd budget gives the extra
        # slot to the head so no position is wasted on padding.
        tail_len = budget // 2
        head_len = budget - tail_len
        return tokens[:head_len] + tokens[len(tokens) - tail_len:]
    raise ValueError("unknown truncate_option %r, expected one of %r"
                     % (option, truncate_options))


def convert_lines(example, max_seq_length,tokenizer, truncate_option=truncate_option):
    """Tokenize texts into fixed-length rows of BERT token ids.

    Every text is tokenized, shortened to ``max_seq_length - 2`` tokens if
    necessary, wrapped in "[CLS]" / "[SEP]", converted to ids and right-padded
    with 0 so that each row has exactly ``max_seq_length`` entries. The number
    of texts that had to be shortened is printed.

    Args:
        example: iterable of strings.
        max_seq_length: row length including the two special tokens.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        truncate_option: 'head' keeps the first tokens, 'tail' the last ones,
            'both' keeps tokens from both ends.

    Returns:
        ``np.array`` built from the list of id rows.

    Raises:
        ValueError: if ``max_seq_length`` is below 2, or if a text needs
            shortening and ``truncate_option`` is not a supported value.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must be at least 2 to hold [CLS] and [SEP]")
    budget = max_seq_length - 2  # room left after the two special tokens
    all_tokens = []
    longer = 0
    for text in tqdm(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a) > budget:
            tokens_a = _truncate_tokens(tokens_a, budget, truncate_option)
            longer += 1
        token_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"])
        padding = [0] * (budget - len(tokens_a))
        all_tokens.append(token_ids + padding)
    print(longer)
    return np.array(all_tokens)


# WordPiece tokenizer built from a local vocabulary file (lower-casing on).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased/bert-base-uncased-vocab.txt', 
                                          cache_dir=None,
                                          do_lower_case=True)

train_df = pd.read_csv(os.path.join(Data_dir,"train.csv"))
print('loaded %d records' % len(train_df))

# Make sure all comment_text values are strings. Missing values are replaced
# BEFORE the cast: astype(str) turns NaN into the literal text 'nan', after
# which fillna has nothing left to fill.
# NOTE(review): a sequences.train.pkl cached by an earlier run was built with
# 'nan' for missing comments; delete it if the data contains missing comments.
train_df['comment_text'] = train_df['comment_text'].fillna("DUMMY_VALUE").astype(str)

# Tokenizing is slow, so the id matrix is cached in WORK_DIR between runs
# (debug runs always recompute and never write the cache).
sequences_path = os.path.join(WORK_DIR,"sequences.train.pkl")
if debug or not os.path.exists(sequences_path):
    sequences = convert_lines(train_df["comment_text"],MAX_SEQUENCE_LENGTH,tokenizer)
    if not debug:
        # Create the cache directory on first use so the write cannot fail
        # on a missing folder.
        if not os.path.isdir(WORK_DIR):
            os.makedirs(WORK_DIR)
        pd.to_pickle(sequences,sequences_path)
else:
    # NOTE(review): read_pickle executes pickled code; only load caches this
    # script wrote itself.
    sequences = pd.read_pickle(sequences_path)
# Remaining missing values (e.g. identity columns) are treated as 0.
train_df=train_df.fillna(0)

################################################################################################


# List all identities
# Identity columns used for the sample weights and the bias metrics.
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
# Placeholder; y_columns is rebuilt a few lines below.
y_columns=['target']

# The raw text is no longer needed once `sequences` exists.
train_df = train_df.drop(['comment_text'],axis=1)
# convert target to 0,1
#---------------------- 0602
train_df['target_binary']=(train_df['target']>=0.5).astype(float)
# Column 0 is the binary main target; the remaining columns are auxiliary
# targets (the training loop slices the label tensor as [:,0] and [:,1:]).
y_columns = ['target_binary'] + ['target',
                                 'severe_toxicity',
                                 'obscene',
                                 'identity_attack',
                                 'insult',
                                 'threat',
                                 # 'sexual_explicit'
                                ] # + identity_columns
#-------------------------------------
# 10-fold shuffled split; only the fold whose index equals use_split is used.
# NOTE(review): the variable is called skf but a plain (non-stratified) KFold
# is created.
skf = KFold(n_splits=10, shuffle=True,random_state=SEED).split(sequences,train_df[y_columns].values)
for split,(train_idx,test_idx) in enumerate(skf):
    if split == use_split:
        print(len(train_idx),len(test_idx))
        X = sequences[train_idx]                
        y = train_df[y_columns].values[train_idx]
        X_val = sequences[test_idx]                
        y_val = train_df[y_columns].values[test_idx]
        break

#X = sequences[:num_to_load]                
#y = train_df[y_columns].values[:num_to_load]
#X_val = sequences[num_to_load:]                
#y_val = train_df[y_columns].values[num_to_load:]


#test_df=train_df.tail(valid_size).copy()
#train_df=train_df.head(num_to_load)
# train_idx / test_idx are the values left over from the fold loop above.
# NOTE(review): if use_split is not in 0..9 the loop never breaks, X / y stay
# undefined and these indices come from the last fold.
test_df = train_df.iloc[test_idx].copy()
train_df = train_df.iloc[train_idx]

#---------------- 0601
# Per-sample loss weights: every row starts at 1/4 and gains another 1/4 for
# each of the three conditions below that it satisfies.
# Plain int replaces the np.int alias, which was removed in NumPy 1.24; the
# boolean masks are equivalent to the previous sum/astype arithmetic.
identity_values = train_df[identity_columns].fillna(0).values
target_values = train_df['target'].values

# Overall
weights = np.ones((len(train_df),)) / 4

# Subgroup: at least one identity column is >= 0.5
has_identity = (identity_values >= 0.5).any(axis=1)
weights += has_identity.astype(int) / 4

# Background Positive
# NOTE(review): this mask is True when ANY identity column is < 0.5, not when
# ALL of them are, so it also matches rows that mention an identity. The
# behaviour is kept as it was; confirm whether .all(axis=1) was intended.
has_identity_below_threshold = (identity_values < 0.5).any(axis=1)
weights += ((target_values >= 0.5) & has_identity_below_threshold).astype(int) / 4


# Subgroup Negative: non-toxic row with at least one identity >= 0.5
weights += ((target_values < 0.5) & has_identity).astype(int) / 4


# loss_weight = 1.0 / weights.mean() 
# weights = weights * loss_weight 
#----------------

# Training samples are (token ids, label vector, sample weight) triples.
train_dataset = torch.utils.data.TensorDataset(torch.tensor(X,dtype=torch.long), 
                                               torch.tensor(y,dtype=torch.float),
                                               torch.tensor(weights,dtype=torch.float)
                                              )
# Validation samples carry no weight: (token ids, label vector).
valid_dataset = torch.utils.data.TensorDataset(torch.tensor(X_val, dtype=torch.long), torch.tensor(y_val, dtype=torch.float))

#lr=2e-5
lr=4e-5
batch_size = 128
# The optimizer steps every `accumulation_steps` batches (1 = every batch).
accumulation_steps = 1
# Seed numpy and torch (CPU and current CUDA device) and request
# deterministic cuDNN kernels.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


################################################################################################

class BertClassification(BertPreTrainedModel):
    """BERT encoder with its own first-token pooler and a linear classifier.

    Instead of the pooled output returned by ``BertModel``, the hidden state
    at position 0 of the last encoder layer is passed through a separate
    dense + tanh layer, then dropout and a linear layer with ``num_labels``
    outputs.
    """

    def __init__(self, config, num_labels=2):
        """Create the encoder, the pooler layers and the classifier head.

        Args:
            config: BERT configuration; supplies ``hidden_size`` and
                ``hidden_dropout_prob``.
            num_labels: number of output logits per example.
        """
        super(BertClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)

        # Dense + tanh applied to the first-token hidden state (new pooler).
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return logits, or the cross-entropy loss when ``labels`` is given.

        Args:
            input_ids: token id tensor; indexed as [batch, position] below.
            token_type_ids: forwarded to ``BertModel`` unchanged.
            attention_mask: forwarded to ``BertModel`` unchanged.
            labels: optional class indices; when given the scalar
                cross-entropy loss is returned instead of the logits.
        """
        # The pooled output of BertModel is discarded in favour of the pooler
        # defined in __init__.
        encoded_layers, _ = self.bert(input_ids, token_type_ids, attention_mask,
                                      output_all_encoded_layers=True)

        # Last encoder layer, first position of every sequence.
        hidden_states = encoded_layers[-1]
        sent_embed = hidden_states[:, 0]

        pooled_output = self.activation(self.dense(sent_embed))
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is None:
            return logits
        # nn.CrossEntropyLoss is referenced through the imported `nn` module;
        # the bare name CrossEntropyLoss is never imported in this file and
        # raised NameError on this path.
        loss_fct = nn.CrossEntropyLoss()
        return loss_fct(logits.view(-1, self.num_labels), labels.view(-1))


# In[6]:


################################################################################################

# model = BertForSequenceClassification.from_pretrained(bert_loc,
#                                                       cache_dir=None,
#                                                       num_labels=len(y_columns))
# torch.cuda.set_device(0)
# torch.distributed.init_process_group(backend='nccl',init_method='env://')
# torch.backends.cudnn.benchmark = True
# One logit per label column: the training loop pairs y_pred[:,0] / y_pred[:,1:]
# with the same slices of the label tensor, so the head width must follow
# y_columns instead of a hard-coded count.
model = BertClassification.from_pretrained('../working', cache_dir=None, num_labels=len(y_columns))

##model.load_state_dict(torch.load('output/BERT_iteration_12000_lr_2e-05.bin'))
model.zero_grad()
model = model.to(device)
param_optimizer = list(model.named_parameters())

# Freeze the embedding layer; its parameters receive no gradients.
for param in model.bert.embeddings.parameters():
    param.requires_grad = False

# Biases and LayerNorm parameters are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

train = train_dataset
valid = valid_dataset

# Total number of optimizer steps, passed to BertAdam as t_total.
num_train_optimization_steps = int(EPOCHS*len(train)/batch_size/accumulation_steps)

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     warmup=0.05,
                     t_total=num_train_optimization_steps)

# Mixed precision (apex amp, opt level O1); done before the model is wrapped
# in DataParallel further down.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)

# End-of-epoch weights are written to / read from this file. It is bound
# before the loop so the validation-only path (TRAIN = False) can load it
# instead of failing with a NameError.
# NOTE(review): the name says "2e-5" although lr is 4e-5; the string is kept
# unchanged so existing checkpoints are still found.
output_model_file = 'output/RERUN_BERT_epoch_1_2e-5.bin'

if TRAIN:
    # add parallel support
    # Wrapped once, outside the epoch loop: wrapping per epoch would nest
    # DataParallel inside DataParallel as soon as EPOCHS > 1.
    model = nn.DataParallel(model, device_ids=[0,1,2,3,4,5,6,7])

tq = tqdm(range(EPOCHS))


for epoch in tq:
    if TRAIN:
        model.train()
        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)

        avg_loss = 0.
        lossf = None  # exponentially smoothed loss shown in the progress bar
        tk0 = tqdm(enumerate(train_loader),total=len(train_loader),leave=False)
        optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
        for i,(x_batch, y_batch, w_batch) in tk0:

            # Token id 0 is padding, so the attention mask is x_batch > 0.
            y_pred = model(x_batch.cuda(), attention_mask=(x_batch>0).cuda(), labels=None)
            y_batch = y_batch.cuda()
            w_batch = w_batch.cuda()

            # Main target (column 0) uses the per-sample weights.
            loss_main =  F.binary_cross_entropy_with_logits(input=y_pred[:,0],
                                                    target=y_batch[:,0],
                                                    weight=w_batch)

            # Auxiliary targets (remaining columns) are unweighted.
            loss_aux =  F.binary_cross_entropy_with_logits(input=y_pred[:, 1:],
                                                        target=y_batch[:,1:])

            loss = loss_main + loss_aux * 6

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
                optimizer.step()                            # Now we can do an optimizer step
                optimizer.zero_grad()

            batch_loss = loss.item()
            if lossf is not None:
                lossf = 0.98*lossf+0.02*batch_loss
            else:
                lossf = batch_loss
            tk0.set_postfix(loss = lossf)
            avg_loss += batch_loss / len(train_loader)

            if i % 1000 == 0:
                print('saving model checkpoint at iteration={}'.format(i))
                torch.save(model.module.state_dict(), 'output/RERUN_BERT_iteration_{}_lr_{}.bin'.format(i, lr))

        tq.set_postfix(avg_loss=avg_loss)

        torch.save(model.module.state_dict(), output_model_file)

    SEED += 1
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    ################################################################################################

    ################################################################################################
    ### validation
    model.eval()
    if not TRAIN:
        print("Start validation")
        model.load_state_dict(torch.load(output_model_file))
    # Same batch_size variable for the loader and the bookkeeping below, so
    # the two can no longer drift apart.
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    valid_preds = np.zeros((len(X_val)))
    filled = 0  # number of validation rows already written to valid_preds

    # Inference only: no_grad avoids building autograd graphs.
    with torch.no_grad():
        for x_batch, y_batch in tqdm(valid_loader):
            logits = model(x_batch.cuda(), attention_mask=(x_batch>0).cuda(), labels=None)[:,0]
            probs = torch.sigmoid(logits).cpu().numpy()
            valid_preds[filled:filled + len(probs)] = probs
            filled += len(probs)

    valid_auc = roc_auc_score(y_val[:,0], valid_preds[:])

    print("valid_auc: ", valid_auc)

    MODEL_NAME = 'BERT'
    # valid_preds already holds sigmoid probabilities; applying sigmoid a
    # second time (as before) squashed them into roughly [0.5, 0.73].
    test_df[MODEL_NAME]=valid_preds
    # calculate_overall_auc reads the label column from this module-level name.
    TOXICITY_COLUMN = 'target_binary'
    bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
    print(bias_metrics_df)

    final_metric = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, MODEL_NAME))
    print('final metric is', final_metric)
    

loaded 1804874 records
1624386 180488


  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/12691 [00:00<?, ?it/s][A
  0%|          | 0/12691 [00:28<?, ?it/s, loss=4.17][A

saving model checkpoint at iteration=0



  0%|          | 1/12691 [00:29<103:14:10, 29.29s/it, loss=4.17][A
  0%|          | 1/12691 [00:30<103:14:10, 29.29s/it, loss=4.17][A
  0%|          | 2/12691 [00:30<73:31:13, 20.86s/it, loss=4.17] [A
  0%|          | 2/12691 [00:31<73:31:13, 20.86s/it, loss=4.17][A
  0%|          | 3/12691 [00:31<52:43:56, 14.96s/it, loss=4.17][A
  0%|          | 3/12691 [00:32<52:43:56, 14.96s/it, loss=4.17][A
  0%|          | 4/12691 [00:32<38:05:33, 10.81s/it, loss=4.17][A
  0%|          | 4/12691 [00:33<38:05:33, 10.81s/it, loss=4.17][A
  0%|          | 5/12691 [00:33<27:53:03,  7.91s/it, loss=4.17][A
  0%|          | 5/12691 [00:35<27:53:03,  7.91s/it, loss=4.17][A
  0%|          | 6/12691 [00:35<20:42:27,  5.88s/it, loss=4.17][A
  0%|          | 6/12691 [00:36<20:42:27,  5.88s/it, loss=4.16][A
  0%|          | 7/12691 [00:36<15:45:46,  4.47s/it, loss=4.16][A
  0%|          | 7/12691 [00:37<15:45:46,  4.47s/it, loss=4.16][A
  0%|          | 8/12691 [00:37<12:13:11,  3.47s/it, loss=

  0%|          | 62/12691 [01:44<4:10:13,  1.19s/it, loss=2.73][A
  0%|          | 62/12691 [01:46<4:10:13,  1.19s/it, loss=2.7] [A
  0%|          | 63/12691 [01:46<4:13:43,  1.21s/it, loss=2.7][A
  0%|          | 63/12691 [01:47<4:13:43,  1.21s/it, loss=2.67][A
  1%|          | 64/12691 [01:47<4:10:45,  1.19s/it, loss=2.67][A
  1%|          | 64/12691 [01:48<4:10:45,  1.19s/it, loss=2.64][A
  1%|          | 65/12691 [01:48<4:10:32,  1.19s/it, loss=2.64][A
  1%|          | 65/12691 [01:49<4:10:32,  1.19s/it, loss=2.62][A
  1%|          | 66/12691 [01:49<4:08:41,  1.18s/it, loss=2.62][A
  1%|          | 66/12691 [01:50<4:08:41,  1.18s/it, loss=2.59][A
  1%|          | 67/12691 [01:50<4:06:11,  1.17s/it, loss=2.59][A
  1%|          | 67/12691 [01:52<4:06:11,  1.17s/it, loss=2.56][A
  1%|          | 68/12691 [01:52<4:04:01,  1.16s/it, loss=2.56][A
  1%|          | 68/12691 [01:53<4:04:01,  1.16s/it, loss=2.53][A
  1%|          | 69/12691 [01:53<4:12:54,  1.20s/it, loss=2.53]

  1%|          | 122/12691 [02:56<4:03:46,  1.16s/it, loss=1.55][A
  1%|          | 123/12691 [02:56<4:01:12,  1.15s/it, loss=1.55][A
  1%|          | 123/12691 [02:57<4:01:12,  1.15s/it, loss=1.54][A
  1%|          | 124/12691 [02:57<4:07:34,  1.18s/it, loss=1.54][A
  1%|          | 124/12691 [02:58<4:07:34,  1.18s/it, loss=1.52][A
  1%|          | 125/12691 [02:58<4:07:27,  1.18s/it, loss=1.52][A
  1%|          | 125/12691 [03:00<4:07:27,  1.18s/it, loss=1.52][A
  1%|          | 126/12691 [03:00<4:06:42,  1.18s/it, loss=1.52][A
  1%|          | 126/12691 [03:01<4:06:42,  1.18s/it, loss=1.5] [A
  1%|          | 127/12691 [03:01<4:07:03,  1.18s/it, loss=1.5][A
  1%|          | 127/12691 [03:02<4:07:03,  1.18s/it, loss=1.5][A
  1%|          | 128/12691 [03:02<4:10:53,  1.20s/it, loss=1.5][A
  1%|          | 128/12691 [03:03<4:10:53,  1.20s/it, loss=1.48][A
  1%|          | 129/12691 [03:03<4:08:28,  1.19s/it, loss=1.48][A
  1%|          | 129/12691 [03:04<4:08:28,  1.19s/i

  1%|▏         | 182/12691 [04:06<4:02:13,  1.16s/it, loss=1.07][A
  1%|▏         | 183/12691 [04:06<3:57:21,  1.14s/it, loss=1.07][A
  1%|▏         | 183/12691 [04:07<3:57:21,  1.14s/it, loss=1.06][A
  1%|▏         | 184/12691 [04:07<3:53:45,  1.12s/it, loss=1.06][A
  1%|▏         | 184/12691 [04:08<3:53:45,  1.12s/it, loss=1.06][A
  1%|▏         | 185/12691 [04:08<3:52:04,  1.11s/it, loss=1.06][A
  1%|▏         | 185/12691 [04:09<3:52:04,  1.11s/it, loss=1.05][A
  1%|▏         | 186/12691 [04:09<3:56:59,  1.14s/it, loss=1.05][A
  1%|▏         | 186/12691 [04:10<3:56:59,  1.14s/it, loss=1.04][A
  1%|▏         | 187/12691 [04:10<3:54:45,  1.13s/it, loss=1.04][A
  1%|▏         | 187/12691 [04:12<3:54:45,  1.13s/it, loss=1.04][A
  1%|▏         | 188/12691 [04:12<3:54:18,  1.12s/it, loss=1.04][A
  1%|▏         | 188/12691 [04:13<3:54:18,  1.12s/it, loss=1.03][A
  1%|▏         | 189/12691 [04:13<3:54:04,  1.12s/it, loss=1.03][A
  1%|▏         | 189/12691 [04:14<3:54:04,  1.12

  2%|▏         | 242/12691 [05:15<4:15:50,  1.23s/it, loss=0.867][A
  2%|▏         | 242/12691 [05:16<4:15:50,  1.23s/it, loss=0.867][A
  2%|▏         | 243/12691 [05:16<4:09:16,  1.20s/it, loss=0.867][A
  2%|▏         | 243/12691 [05:17<4:09:16,  1.20s/it, loss=0.866][A
  2%|▏         | 244/12691 [05:17<4:03:35,  1.17s/it, loss=0.866][A
  2%|▏         | 244/12691 [05:18<4:03:35,  1.17s/it, loss=0.864][A
  2%|▏         | 245/12691 [05:18<4:01:08,  1.16s/it, loss=0.864][A
  2%|▏         | 245/12691 [05:19<4:01:08,  1.16s/it, loss=0.86] [A
  2%|▏         | 246/12691 [05:19<3:59:45,  1.16s/it, loss=0.86][A
  2%|▏         | 246/12691 [05:21<3:59:45,  1.16s/it, loss=0.858][A
  2%|▏         | 247/12691 [05:21<3:58:38,  1.15s/it, loss=0.858][A
  2%|▏         | 247/12691 [05:22<3:58:38,  1.15s/it, loss=0.856][A
  2%|▏         | 248/12691 [05:22<4:06:31,  1.19s/it, loss=0.856][A
  2%|▏         | 248/12691 [05:23<4:06:31,  1.19s/it, loss=0.854][A
  2%|▏         | 249/12691 [05:23<4

  2%|▏         | 301/12691 [06:25<4:08:31,  1.20s/it, loss=0.784][A
  2%|▏         | 301/12691 [06:26<4:08:31,  1.20s/it, loss=0.787][A
  2%|▏         | 302/12691 [06:26<4:04:06,  1.18s/it, loss=0.787][A
  2%|▏         | 302/12691 [06:27<4:04:06,  1.18s/it, loss=0.784][A
  2%|▏         | 303/12691 [06:27<4:11:59,  1.22s/it, loss=0.784][A
  2%|▏         | 303/12691 [06:28<4:11:59,  1.22s/it, loss=0.782][A
  2%|▏         | 304/12691 [06:28<4:08:43,  1.20s/it, loss=0.782][A
  2%|▏         | 304/12691 [06:30<4:08:43,  1.20s/it, loss=0.778][A
  2%|▏         | 305/12691 [06:30<4:10:06,  1.21s/it, loss=0.778][A
  2%|▏         | 305/12691 [06:31<4:10:06,  1.21s/it, loss=0.777][A
  2%|▏         | 306/12691 [06:31<4:05:14,  1.19s/it, loss=0.777][A
  2%|▏         | 306/12691 [06:32<4:05:14,  1.19s/it, loss=0.776][A
  2%|▏         | 307/12691 [06:32<4:05:07,  1.19s/it, loss=0.776][A
  2%|▏         | 307/12691 [06:33<4:05:07,  1.19s/it, loss=0.774][A
  2%|▏         | 308/12691 [06:33<

  3%|▎         | 360/12691 [07:37<4:09:18,  1.21s/it, loss=0.75][A
  3%|▎         | 360/12691 [07:38<4:09:18,  1.21s/it, loss=0.747][A
  3%|▎         | 361/12691 [07:38<4:04:53,  1.19s/it, loss=0.747][A
  3%|▎         | 361/12691 [07:39<4:04:53,  1.19s/it, loss=0.748][A
  3%|▎         | 362/12691 [07:39<4:04:22,  1.19s/it, loss=0.748][A
  3%|▎         | 362/12691 [07:41<4:04:22,  1.19s/it, loss=0.747][A
  3%|▎         | 363/12691 [07:41<4:05:42,  1.20s/it, loss=0.747][A
  3%|▎         | 363/12691 [07:42<4:05:42,  1.20s/it, loss=0.745][A
  3%|▎         | 364/12691 [07:42<4:03:34,  1.19s/it, loss=0.745][A
  3%|▎         | 364/12691 [07:43<4:03:34,  1.19s/it, loss=0.745][A
  3%|▎         | 365/12691 [07:43<4:15:58,  1.25s/it, loss=0.745][A
  3%|▎         | 365/12691 [07:44<4:15:58,  1.25s/it, loss=0.742][A
  3%|▎         | 366/12691 [07:44<4:08:53,  1.21s/it, loss=0.742][A
  3%|▎         | 366/12691 [07:45<4:08:53,  1.21s/it, loss=0.74] [A
  3%|▎         | 367/12691 [07:45<4

  3%|▎         | 419/12691 [08:46<4:00:42,  1.18s/it, loss=0.733][A
  3%|▎         | 419/12691 [08:47<4:00:42,  1.18s/it, loss=0.732][A
  3%|▎         | 420/12691 [08:47<3:57:12,  1.16s/it, loss=0.732][A
  3%|▎         | 420/12691 [08:49<3:57:12,  1.16s/it, loss=0.73] [A
  3%|▎         | 421/12691 [08:49<4:01:57,  1.18s/it, loss=0.73][A
  3%|▎         | 421/12691 [08:50<4:01:57,  1.18s/it, loss=0.731][A
  3%|▎         | 422/12691 [08:50<3:58:58,  1.17s/it, loss=0.731][A
  3%|▎         | 422/12691 [08:51<3:58:58,  1.17s/it, loss=0.732][A
  3%|▎         | 423/12691 [08:51<3:59:12,  1.17s/it, loss=0.732][A
  3%|▎         | 423/12691 [08:52<3:59:12,  1.17s/it, loss=0.731][A
  3%|▎         | 424/12691 [08:52<3:56:25,  1.16s/it, loss=0.731][A
  3%|▎         | 424/12691 [08:53<3:56:25,  1.16s/it, loss=0.729][A
  3%|▎         | 425/12691 [08:53<3:54:56,  1.15s/it, loss=0.729][A
  3%|▎         | 425/12691 [08:54<3:54:56,  1.15s/it, loss=0.728][A
  3%|▎         | 426/12691 [08:54<3

  4%|▍         | 478/12691 [09:55<3:50:47,  1.13s/it, loss=0.706][A
  4%|▍         | 478/12691 [09:57<3:50:47,  1.13s/it, loss=0.705][A
  4%|▍         | 479/12691 [09:57<3:49:58,  1.13s/it, loss=0.705][A
  4%|▍         | 479/12691 [09:58<3:49:58,  1.13s/it, loss=0.705][A
  4%|▍         | 480/12691 [09:58<3:46:35,  1.11s/it, loss=0.705][A
  4%|▍         | 480/12691 [09:59<3:46:35,  1.11s/it, loss=0.707][A
  4%|▍         | 481/12691 [09:59<3:47:07,  1.12s/it, loss=0.707][A
  4%|▍         | 481/12691 [10:00<3:47:07,  1.12s/it, loss=0.71] [A
  4%|▍         | 482/12691 [10:00<3:51:05,  1.14s/it, loss=0.71][A
  4%|▍         | 482/12691 [10:01<3:51:05,  1.14s/it, loss=0.71][A
  4%|▍         | 483/12691 [10:01<3:57:39,  1.17s/it, loss=0.71][A
  4%|▍         | 483/12691 [10:02<3:57:39,  1.17s/it, loss=0.708][A
  4%|▍         | 484/12691 [10:02<3:57:34,  1.17s/it, loss=0.708][A
  4%|▍         | 484/12691 [10:04<3:57:34,  1.17s/it, loss=0.706][A
  4%|▍         | 485/12691 [10:04<3:5

  4%|▍         | 537/12691 [11:03<4:00:39,  1.19s/it, loss=0.707][A
  4%|▍         | 537/12691 [11:05<4:00:39,  1.19s/it, loss=0.706][A
  4%|▍         | 538/12691 [11:05<4:10:15,  1.24s/it, loss=0.706][A
  4%|▍         | 538/12691 [11:06<4:10:15,  1.24s/it, loss=0.703][A
  4%|▍         | 539/12691 [11:06<4:05:24,  1.21s/it, loss=0.703][A
  4%|▍         | 539/12691 [11:07<4:05:24,  1.21s/it, loss=0.703][A
  4%|▍         | 540/12691 [11:07<3:59:36,  1.18s/it, loss=0.703][A
  4%|▍         | 540/12691 [11:08<3:59:36,  1.18s/it, loss=0.703][A
  4%|▍         | 541/12691 [11:08<3:55:05,  1.16s/it, loss=0.703][A
  4%|▍         | 541/12691 [11:09<3:55:05,  1.16s/it, loss=0.702][A
  4%|▍         | 542/12691 [11:09<3:53:35,  1.15s/it, loss=0.702][A
  4%|▍         | 542/12691 [11:10<3:53:35,  1.15s/it, loss=0.704][A
  4%|▍         | 543/12691 [11:10<3:51:48,  1.14s/it, loss=0.704][A
  4%|▍         | 543/12691 [11:12<3:51:48,  1.14s/it, loss=0.703][A
  4%|▍         | 544/12691 [11:12<

  5%|▍         | 596/12691 [12:13<3:55:15,  1.17s/it, loss=0.697][A
  5%|▍         | 597/12691 [12:13<3:52:43,  1.15s/it, loss=0.697][A
  5%|▍         | 597/12691 [12:14<3:52:43,  1.15s/it, loss=0.698][A
  5%|▍         | 598/12691 [12:14<3:52:07,  1.15s/it, loss=0.698][A
  5%|▍         | 598/12691 [12:15<3:52:07,  1.15s/it, loss=0.699][A
  5%|▍         | 599/12691 [12:15<3:51:54,  1.15s/it, loss=0.699][A
  5%|▍         | 599/12691 [12:16<3:51:54,  1.15s/it, loss=0.698][A
  5%|▍         | 600/12691 [12:16<4:00:12,  1.19s/it, loss=0.698][A
  5%|▍         | 600/12691 [12:17<4:00:12,  1.19s/it, loss=0.695][A
  5%|▍         | 601/12691 [12:17<3:56:37,  1.17s/it, loss=0.695][A
  5%|▍         | 601/12691 [12:19<3:56:37,  1.17s/it, loss=0.692][A
  5%|▍         | 602/12691 [12:19<3:56:38,  1.17s/it, loss=0.692][A
  5%|▍         | 602/12691 [12:20<3:56:38,  1.17s/it, loss=0.695][A
  5%|▍         | 603/12691 [12:20<3:55:28,  1.17s/it, loss=0.695][A
  5%|▍         | 603/12691 [12:21<

  5%|▌         | 655/12691 [13:21<3:47:31,  1.13s/it, loss=0.696][A
  5%|▌         | 656/12691 [13:21<3:51:48,  1.16s/it, loss=0.696][A
  5%|▌         | 656/12691 [13:22<3:51:48,  1.16s/it, loss=0.692][A
  5%|▌         | 657/12691 [13:22<3:49:41,  1.15s/it, loss=0.692][A
  5%|▌         | 657/12691 [13:23<3:49:41,  1.15s/it, loss=0.691][A
  5%|▌         | 658/12691 [13:23<3:48:20,  1.14s/it, loss=0.691][A
  5%|▌         | 658/12691 [13:24<3:48:20,  1.14s/it, loss=0.691][A
  5%|▌         | 659/12691 [13:24<3:47:33,  1.13s/it, loss=0.691][A
  5%|▌         | 659/12691 [13:25<3:47:33,  1.13s/it, loss=0.692][A
  5%|▌         | 660/12691 [13:25<3:47:38,  1.14s/it, loss=0.692][A
  5%|▌         | 660/12691 [13:27<3:47:38,  1.14s/it, loss=0.692][A
  5%|▌         | 661/12691 [13:27<3:47:21,  1.13s/it, loss=0.692][A
  5%|▌         | 661/12691 [13:28<3:47:21,  1.13s/it, loss=0.695][A
  5%|▌         | 662/12691 [13:28<3:51:17,  1.15s/it, loss=0.695][A
  5%|▌         | 662/12691 [13:29<

  6%|▌         | 714/12691 [14:30<3:53:13,  1.17s/it, loss=0.693][A
  6%|▌         | 715/12691 [14:30<3:51:03,  1.16s/it, loss=0.693][A
  6%|▌         | 715/12691 [14:31<3:51:03,  1.16s/it, loss=0.691][A
  6%|▌         | 716/12691 [14:31<3:49:19,  1.15s/it, loss=0.691][A
  6%|▌         | 716/12691 [14:32<3:49:19,  1.15s/it, loss=0.689][A
  6%|▌         | 717/12691 [14:32<3:53:50,  1.17s/it, loss=0.689][A
  6%|▌         | 717/12691 [14:33<3:53:50,  1.17s/it, loss=0.691][A
  6%|▌         | 718/12691 [14:33<3:51:58,  1.16s/it, loss=0.691][A
  6%|▌         | 718/12691 [14:34<3:51:58,  1.16s/it, loss=0.689][A
  6%|▌         | 719/12691 [14:34<3:49:42,  1.15s/it, loss=0.689][A
  6%|▌         | 719/12691 [14:35<3:49:42,  1.15s/it, loss=0.69] [A
  6%|▌         | 720/12691 [14:35<3:48:50,  1.15s/it, loss=0.69][A
  6%|▌         | 720/12691 [14:36<3:48:50,  1.15s/it, loss=0.688][A
  6%|▌         | 721/12691 [14:36<3:47:37,  1.14s/it, loss=0.688][A
  6%|▌         | 721/12691 [14:38<3

  6%|▌         | 773/12691 [15:40<3:53:07,  1.17s/it, loss=0.69] [A
  6%|▌         | 774/12691 [15:40<3:57:16,  1.19s/it, loss=0.69][A
  6%|▌         | 774/12691 [15:41<3:57:16,  1.19s/it, loss=0.691][A
  6%|▌         | 775/12691 [15:41<3:54:26,  1.18s/it, loss=0.691][A
  6%|▌         | 775/12691 [15:42<3:54:26,  1.18s/it, loss=0.697][A
  6%|▌         | 776/12691 [15:42<3:50:58,  1.16s/it, loss=0.697][A
  6%|▌         | 776/12691 [15:43<3:50:58,  1.16s/it, loss=0.697][A
  6%|▌         | 777/12691 [15:43<3:53:32,  1.18s/it, loss=0.697][A
  6%|▌         | 777/12691 [15:44<3:53:32,  1.18s/it, loss=0.701][A
  6%|▌         | 778/12691 [15:44<3:54:17,  1.18s/it, loss=0.701][A
  6%|▌         | 778/12691 [15:45<3:54:17,  1.18s/it, loss=0.701][A
  6%|▌         | 779/12691 [15:45<3:58:34,  1.20s/it, loss=0.701][A
  6%|▌         | 779/12691 [15:47<3:58:34,  1.20s/it, loss=0.701][A
  6%|▌         | 780/12691 [15:47<3:54:02,  1.18s/it, loss=0.701][A
  6%|▌         | 780/12691 [15:48<3

  7%|▋         | 832/12691 [16:50<3:47:28,  1.15s/it, loss=0.699][A
  7%|▋         | 833/12691 [16:50<3:47:13,  1.15s/it, loss=0.699][A
  7%|▋         | 833/12691 [16:51<3:47:13,  1.15s/it, loss=0.7]  [A
  7%|▋         | 834/12691 [16:51<3:47:53,  1.15s/it, loss=0.7][A
  7%|▋         | 834/12691 [16:52<3:47:53,  1.15s/it, loss=0.7][A
  7%|▋         | 835/12691 [16:52<3:53:53,  1.18s/it, loss=0.7][A
  7%|▋         | 835/12691 [16:53<3:53:53,  1.18s/it, loss=0.699][A
  7%|▋         | 836/12691 [16:53<3:50:20,  1.17s/it, loss=0.699][A
  7%|▋         | 836/12691 [16:54<3:50:20,  1.17s/it, loss=0.697][A
  7%|▋         | 837/12691 [16:54<3:48:03,  1.15s/it, loss=0.697][A
  7%|▋         | 837/12691 [16:55<3:48:03,  1.15s/it, loss=0.697][A
  7%|▋         | 838/12691 [16:55<3:47:19,  1.15s/it, loss=0.697][A
  7%|▋         | 838/12691 [16:56<3:47:19,  1.15s/it, loss=0.695][A
  7%|▋         | 839/12691 [16:56<3:46:39,  1.15s/it, loss=0.695][A
  7%|▋         | 839/12691 [16:58<3:46:3

  7%|▋         | 891/12691 [17:58<3:45:44,  1.15s/it, loss=0.695][A
  7%|▋         | 892/12691 [17:58<3:44:08,  1.14s/it, loss=0.695][A
  7%|▋         | 892/12691 [17:59<3:44:08,  1.14s/it, loss=0.694][A
  7%|▋         | 893/12691 [17:59<3:50:25,  1.17s/it, loss=0.694][A
  7%|▋         | 893/12691 [18:00<3:50:25,  1.17s/it, loss=0.692][A
  7%|▋         | 894/12691 [18:00<3:48:00,  1.16s/it, loss=0.692][A
  7%|▋         | 894/12691 [18:01<3:48:00,  1.16s/it, loss=0.692][A
  7%|▋         | 895/12691 [18:01<3:47:04,  1.15s/it, loss=0.692][A
  7%|▋         | 895/12691 [18:03<3:47:04,  1.15s/it, loss=0.689][A
  7%|▋         | 896/12691 [18:03<3:46:05,  1.15s/it, loss=0.689][A
  7%|▋         | 896/12691 [18:04<3:46:05,  1.15s/it, loss=0.689][A
  7%|▋         | 897/12691 [18:04<3:50:27,  1.17s/it, loss=0.689][A
  7%|▋         | 897/12691 [18:05<3:50:27,  1.17s/it, loss=0.687][A
  7%|▋         | 898/12691 [18:05<3:47:01,  1.16s/it, loss=0.687][A
  7%|▋         | 898/12691 [18:06<

  7%|▋         | 950/12691 [19:06<3:40:37,  1.13s/it, loss=0.676][A
  7%|▋         | 951/12691 [19:06<3:40:23,  1.13s/it, loss=0.676][A
  7%|▋         | 951/12691 [19:07<3:40:23,  1.13s/it, loss=0.677][A
  8%|▊         | 952/12691 [19:07<3:40:56,  1.13s/it, loss=0.677][A
  8%|▊         | 952/12691 [19:08<3:40:56,  1.13s/it, loss=0.676][A
  8%|▊         | 953/12691 [19:08<3:45:53,  1.15s/it, loss=0.676][A
  8%|▊         | 953/12691 [19:09<3:45:53,  1.15s/it, loss=0.679][A
  8%|▊         | 954/12691 [19:09<3:44:17,  1.15s/it, loss=0.679][A
  8%|▊         | 954/12691 [19:10<3:44:17,  1.15s/it, loss=0.68] [A
  8%|▊         | 955/12691 [19:10<3:41:47,  1.13s/it, loss=0.68][A
  8%|▊         | 955/12691 [19:11<3:41:47,  1.13s/it, loss=0.679][A
  8%|▊         | 956/12691 [19:11<3:39:57,  1.12s/it, loss=0.679][A
  8%|▊         | 956/12691 [19:13<3:39:57,  1.12s/it, loss=0.677][A
  8%|▊         | 957/12691 [19:13<3:39:16,  1.12s/it, loss=0.677][A
  8%|▊         | 957/12691 [19:14<3

saving model checkpoint at iteration=1000



  8%|▊         | 1001/12691 [20:05<4:42:06,  1.45s/it, loss=0.676][A
  8%|▊         | 1001/12691 [20:06<4:42:06,  1.45s/it, loss=0.678][A
  8%|▊         | 1002/12691 [20:06<4:28:44,  1.38s/it, loss=0.678][A
  8%|▊         | 1002/12691 [20:07<4:28:44,  1.38s/it, loss=0.675][A
  8%|▊         | 1003/12691 [20:07<4:13:27,  1.30s/it, loss=0.675][A
  8%|▊         | 1003/12691 [20:08<4:13:27,  1.30s/it, loss=0.676][A
  8%|▊         | 1004/12691 [20:08<4:03:13,  1.25s/it, loss=0.676][A
  8%|▊         | 1004/12691 [20:09<4:03:13,  1.25s/it, loss=0.672][A
  8%|▊         | 1005/12691 [20:09<3:55:25,  1.21s/it, loss=0.672][A
  8%|▊         | 1005/12691 [20:10<3:55:25,  1.21s/it, loss=0.669][A
  8%|▊         | 1006/12691 [20:10<3:51:12,  1.19s/it, loss=0.669][A
  8%|▊         | 1006/12691 [20:12<3:51:12,  1.19s/it, loss=0.669][A
  8%|▊         | 1007/12691 [20:12<3:45:17,  1.16s/it, loss=0.669][A
  8%|▊         | 1007/12691 [20:13<3:45:17,  1.16s/it, loss=0.668][A
  8%|▊         | 10

  8%|▊         | 1059/12691 [21:13<3:49:14,  1.18s/it, loss=0.672][A
  8%|▊         | 1060/12691 [21:13<3:46:55,  1.17s/it, loss=0.672][A
  8%|▊         | 1060/12691 [21:14<3:46:55,  1.17s/it, loss=0.674][A
  8%|▊         | 1061/12691 [21:14<3:45:15,  1.16s/it, loss=0.674][A
  8%|▊         | 1061/12691 [21:15<3:45:15,  1.16s/it, loss=0.673][A
  8%|▊         | 1062/12691 [21:15<3:47:13,  1.17s/it, loss=0.673][A
  8%|▊         | 1062/12691 [21:16<3:47:13,  1.17s/it, loss=0.668][A
  8%|▊         | 1063/12691 [21:16<3:44:40,  1.16s/it, loss=0.668][A
  8%|▊         | 1063/12691 [21:18<3:44:40,  1.16s/it, loss=0.667][A
  8%|▊         | 1064/12691 [21:18<3:49:02,  1.18s/it, loss=0.667][A
  8%|▊         | 1064/12691 [21:19<3:49:02,  1.18s/it, loss=0.665][A
  8%|▊         | 1065/12691 [21:19<3:47:01,  1.17s/it, loss=0.665][A
  8%|▊         | 1065/12691 [21:20<3:47:01,  1.17s/it, loss=0.665][A
  8%|▊         | 1066/12691 [21:20<3:45:07,  1.16s/it, loss=0.665][A
  8%|▊         | 106

  9%|▉         | 1118/12691 [22:21<3:43:18,  1.16s/it, loss=0.671][A
  9%|▉         | 1118/12691 [22:22<3:43:18,  1.16s/it, loss=0.668][A
  9%|▉         | 1119/12691 [22:22<3:49:16,  1.19s/it, loss=0.668][A
  9%|▉         | 1119/12691 [22:24<3:49:16,  1.19s/it, loss=0.667][A
  9%|▉         | 1120/12691 [22:24<3:47:54,  1.18s/it, loss=0.667][A
  9%|▉         | 1120/12691 [22:25<3:47:54,  1.18s/it, loss=0.665][A
  9%|▉         | 1121/12691 [22:25<3:45:13,  1.17s/it, loss=0.665][A
  9%|▉         | 1121/12691 [22:26<3:45:13,  1.17s/it, loss=0.665][A
  9%|▉         | 1122/12691 [22:26<3:42:26,  1.15s/it, loss=0.665][A
  9%|▉         | 1122/12691 [22:27<3:42:26,  1.15s/it, loss=0.663][A
  9%|▉         | 1123/12691 [22:27<3:41:29,  1.15s/it, loss=0.663][A
  9%|▉         | 1123/12691 [22:28<3:41:29,  1.15s/it, loss=0.662][A
  9%|▉         | 1124/12691 [22:28<3:42:16,  1.15s/it, loss=0.662][A
  9%|▉         | 1124/12691 [22:29<3:42:16,  1.15s/it, loss=0.663][A
  9%|▉         | 112

  9%|▉         | 1176/12691 [23:30<3:38:29,  1.14s/it, loss=0.68] [A
  9%|▉         | 1177/12691 [23:30<3:36:35,  1.13s/it, loss=0.68][A
  9%|▉         | 1177/12691 [23:31<3:36:35,  1.13s/it, loss=0.679][A
  9%|▉         | 1178/12691 [23:31<3:40:47,  1.15s/it, loss=0.679][A
  9%|▉         | 1178/12691 [23:32<3:40:47,  1.15s/it, loss=0.675][A
  9%|▉         | 1179/12691 [23:32<3:38:41,  1.14s/it, loss=0.675][A
  9%|▉         | 1179/12691 [23:33<3:38:41,  1.14s/it, loss=0.674][A
  9%|▉         | 1180/12691 [23:33<3:36:36,  1.13s/it, loss=0.674][A
  9%|▉         | 1180/12691 [23:34<3:36:36,  1.13s/it, loss=0.675][A
  9%|▉         | 1181/12691 [23:34<3:36:11,  1.13s/it, loss=0.675][A
  9%|▉         | 1181/12691 [23:35<3:36:11,  1.13s/it, loss=0.674][A
  9%|▉         | 1182/12691 [23:35<3:39:57,  1.15s/it, loss=0.674][A
  9%|▉         | 1182/12691 [23:36<3:39:57,  1.15s/it, loss=0.675][A
  9%|▉         | 1183/12691 [23:36<3:38:31,  1.14s/it, loss=0.675][A
  9%|▉         | 1183

 10%|▉         | 1235/12691 [24:36<3:37:49,  1.14s/it, loss=0.679][A
 10%|▉         | 1235/12691 [24:37<3:37:49,  1.14s/it, loss=0.68] [A
 10%|▉         | 1236/12691 [24:37<3:39:24,  1.15s/it, loss=0.68][A
 10%|▉         | 1236/12691 [24:38<3:39:24,  1.15s/it, loss=0.683][A
 10%|▉         | 1237/12691 [24:38<3:50:13,  1.21s/it, loss=0.683][A
 10%|▉         | 1237/12691 [24:39<3:50:13,  1.21s/it, loss=0.683][A
 10%|▉         | 1238/12691 [24:39<3:48:49,  1.20s/it, loss=0.683][A
 10%|▉         | 1238/12691 [24:40<3:48:49,  1.20s/it, loss=0.684][A
 10%|▉         | 1239/12691 [24:40<3:49:51,  1.20s/it, loss=0.684][A
 10%|▉         | 1239/12691 [24:42<3:49:51,  1.20s/it, loss=0.686][A
 10%|▉         | 1240/12691 [24:42<3:47:28,  1.19s/it, loss=0.686][A
 10%|▉         | 1240/12691 [24:43<3:47:28,  1.19s/it, loss=0.686][A
 10%|▉         | 1241/12691 [24:43<3:44:43,  1.18s/it, loss=0.686][A
 10%|▉         | 1241/12691 [24:44<3:44:43,  1.18s/it, loss=0.687][A
 10%|▉         | 1242

 10%|█         | 1293/12691 [25:46<3:46:32,  1.19s/it, loss=0.673][A
 10%|█         | 1294/12691 [25:46<3:43:28,  1.18s/it, loss=0.673][A
 10%|█         | 1294/12691 [25:47<3:43:28,  1.18s/it, loss=0.672][A
 10%|█         | 1295/12691 [25:47<3:47:03,  1.20s/it, loss=0.672][A
 10%|█         | 1295/12691 [25:49<3:47:03,  1.20s/it, loss=0.67] [A
 10%|█         | 1296/12691 [25:49<3:50:35,  1.21s/it, loss=0.67][A
 10%|█         | 1296/12691 [25:50<3:50:35,  1.21s/it, loss=0.667][A
 10%|█         | 1297/12691 [25:50<3:54:04,  1.23s/it, loss=0.667][A
 10%|█         | 1297/12691 [25:51<3:54:04,  1.23s/it, loss=0.664][A
 10%|█         | 1298/12691 [25:51<3:57:05,  1.25s/it, loss=0.664][A
 10%|█         | 1298/12691 [25:53<3:57:05,  1.25s/it, loss=0.664][A
 10%|█         | 1299/12691 [25:53<4:03:36,  1.28s/it, loss=0.664][A
 10%|█         | 1299/12691 [25:54<4:03:36,  1.28s/it, loss=0.661][A
 10%|█         | 1300/12691 [25:54<4:02:07,  1.28s/it, loss=0.661][A
 10%|█         | 1300

 11%|█         | 1352/12691 [26:58<3:48:01,  1.21s/it, loss=0.674][A
 11%|█         | 1352/12691 [26:59<3:48:01,  1.21s/it, loss=0.671][A
 11%|█         | 1353/12691 [26:59<3:46:05,  1.20s/it, loss=0.671][A
 11%|█         | 1353/12691 [27:00<3:46:05,  1.20s/it, loss=0.671][A
 11%|█         | 1354/12691 [27:00<3:44:45,  1.19s/it, loss=0.671][A
 11%|█         | 1354/12691 [27:01<3:44:45,  1.19s/it, loss=0.673][A
 11%|█         | 1355/12691 [27:01<3:50:27,  1.22s/it, loss=0.673][A
 11%|█         | 1355/12691 [27:03<3:50:27,  1.22s/it, loss=0.671][A
 11%|█         | 1356/12691 [27:03<3:49:34,  1.22s/it, loss=0.671][A
 11%|█         | 1356/12691 [27:04<3:49:34,  1.22s/it, loss=0.671][A
 11%|█         | 1357/12691 [27:04<3:46:52,  1.20s/it, loss=0.671][A
 11%|█         | 1357/12691 [27:05<3:46:52,  1.20s/it, loss=0.669][A
 11%|█         | 1358/12691 [27:05<3:45:35,  1.19s/it, loss=0.669][A
 11%|█         | 1358/12691 [27:06<3:45:35,  1.19s/it, loss=0.668][A
 11%|█         | 135

 11%|█         | 1410/12691 [28:09<3:54:26,  1.25s/it, loss=0.669][A
 11%|█         | 1411/12691 [28:09<3:51:58,  1.23s/it, loss=0.669][A
 11%|█         | 1411/12691 [28:10<3:51:58,  1.23s/it, loss=0.67] [A
 11%|█         | 1412/12691 [28:10<3:47:40,  1.21s/it, loss=0.67][A
 11%|█         | 1412/12691 [28:12<3:47:40,  1.21s/it, loss=0.671][A
 11%|█         | 1413/12691 [28:12<3:49:06,  1.22s/it, loss=0.671][A
 11%|█         | 1413/12691 [28:13<3:49:06,  1.22s/it, loss=0.67] [A
 11%|█         | 1414/12691 [28:13<3:48:39,  1.22s/it, loss=0.67][A
 11%|█         | 1414/12691 [28:14<3:48:39,  1.22s/it, loss=0.669][A
 11%|█         | 1415/12691 [28:14<3:44:19,  1.19s/it, loss=0.669][A
 11%|█         | 1415/12691 [28:15<3:44:19,  1.19s/it, loss=0.669][A
 11%|█         | 1416/12691 [28:15<3:53:35,  1.24s/it, loss=0.669][A
 11%|█         | 1416/12691 [28:17<3:53:35,  1.24s/it, loss=0.668][A
 11%|█         | 1417/12691 [28:17<3:53:18,  1.24s/it, loss=0.668][A
 11%|█         | 1417/

 12%|█▏        | 1469/12691 [29:18<3:42:59,  1.19s/it, loss=0.674][A
 12%|█▏        | 1469/12691 [29:19<3:42:59,  1.19s/it, loss=0.672][A
 12%|█▏        | 1470/12691 [29:19<3:39:28,  1.17s/it, loss=0.672][A
 12%|█▏        | 1470/12691 [29:21<3:39:28,  1.17s/it, loss=0.672][A
 12%|█▏        | 1471/12691 [29:21<3:37:44,  1.16s/it, loss=0.672][A
 12%|█▏        | 1471/12691 [29:22<3:37:44,  1.16s/it, loss=0.673][A
 12%|█▏        | 1472/12691 [29:22<3:40:02,  1.18s/it, loss=0.673][A
 12%|█▏        | 1472/12691 [29:23<3:40:02,  1.18s/it, loss=0.671][A
 12%|█▏        | 1473/12691 [29:23<3:39:41,  1.18s/it, loss=0.671][A
 12%|█▏        | 1473/12691 [29:24<3:39:41,  1.18s/it, loss=0.672][A
 12%|█▏        | 1474/12691 [29:24<3:37:18,  1.16s/it, loss=0.672][A
 12%|█▏        | 1474/12691 [29:25<3:37:18,  1.16s/it, loss=0.675][A
 12%|█▏        | 1475/12691 [29:25<3:37:19,  1.16s/it, loss=0.675][A
 12%|█▏        | 1475/12691 [29:26<3:37:19,  1.16s/it, loss=0.678][A
 12%|█▏        | 147

 12%|█▏        | 1527/12691 [30:29<3:45:30,  1.21s/it, loss=0.667][A
 12%|█▏        | 1528/12691 [30:29<3:40:21,  1.18s/it, loss=0.667][A
 12%|█▏        | 1528/12691 [30:30<3:40:21,  1.18s/it, loss=0.668][A
 12%|█▏        | 1529/12691 [30:30<3:35:51,  1.16s/it, loss=0.668][A
 12%|█▏        | 1529/12691 [30:31<3:35:51,  1.16s/it, loss=0.67] [A
 12%|█▏        | 1530/12691 [30:31<3:35:45,  1.16s/it, loss=0.67][A
 12%|█▏        | 1530/12691 [30:32<3:35:45,  1.16s/it, loss=0.67][A
 12%|█▏        | 1531/12691 [30:32<3:34:49,  1.15s/it, loss=0.67][A
 12%|█▏        | 1531/12691 [30:33<3:34:49,  1.15s/it, loss=0.669][A
 12%|█▏        | 1532/12691 [30:33<3:33:57,  1.15s/it, loss=0.669][A
 12%|█▏        | 1532/12691 [30:35<3:33:57,  1.15s/it, loss=0.671][A
 12%|█▏        | 1533/12691 [30:35<3:31:56,  1.14s/it, loss=0.671][A
 12%|█▏        | 1533/12691 [30:36<3:31:56,  1.14s/it, loss=0.671][A
 12%|█▏        | 1534/12691 [30:36<3:35:25,  1.16s/it, loss=0.671][A
 12%|█▏        | 1534/1

 12%|█▏        | 1586/12691 [31:37<3:40:16,  1.19s/it, loss=0.668][A
 12%|█▏        | 1586/12691 [31:39<3:40:16,  1.19s/it, loss=0.667][A
 13%|█▎        | 1587/12691 [31:39<3:38:17,  1.18s/it, loss=0.667][A
 13%|█▎        | 1587/12691 [31:40<3:38:17,  1.18s/it, loss=0.666][A
 13%|█▎        | 1588/12691 [31:40<3:36:12,  1.17s/it, loss=0.666][A
 13%|█▎        | 1588/12691 [31:41<3:36:12,  1.17s/it, loss=0.664][A
 13%|█▎        | 1589/12691 [31:41<3:43:53,  1.21s/it, loss=0.664][A
 13%|█▎        | 1589/12691 [31:42<3:43:53,  1.21s/it, loss=0.665][A
 13%|█▎        | 1590/12691 [31:42<3:39:44,  1.19s/it, loss=0.665][A
 13%|█▎        | 1590/12691 [31:43<3:39:44,  1.19s/it, loss=0.666][A
 13%|█▎        | 1591/12691 [31:43<3:38:05,  1.18s/it, loss=0.666][A
 13%|█▎        | 1591/12691 [31:44<3:38:05,  1.18s/it, loss=0.664][A
 13%|█▎        | 1592/12691 [31:44<3:35:58,  1.17s/it, loss=0.664][A
 13%|█▎        | 1592/12691 [31:46<3:35:58,  1.17s/it, loss=0.664][A
 13%|█▎        | 159

 13%|█▎        | 1644/12691 [32:47<3:29:47,  1.14s/it, loss=0.675][A
 13%|█▎        | 1645/12691 [32:47<3:34:26,  1.16s/it, loss=0.675][A
 13%|█▎        | 1645/12691 [32:48<3:34:26,  1.16s/it, loss=0.676][A
 13%|█▎        | 1646/12691 [32:48<3:32:23,  1.15s/it, loss=0.676][A
 13%|█▎        | 1646/12691 [32:49<3:32:23,  1.15s/it, loss=0.675][A
 13%|█▎        | 1647/12691 [32:49<3:31:17,  1.15s/it, loss=0.675][A
 13%|█▎        | 1647/12691 [32:50<3:31:17,  1.15s/it, loss=0.673][A
 13%|█▎        | 1648/12691 [32:50<3:30:34,  1.14s/it, loss=0.673][A
 13%|█▎        | 1648/12691 [32:51<3:30:34,  1.14s/it, loss=0.675][A
 13%|█▎        | 1649/12691 [32:51<3:30:08,  1.14s/it, loss=0.675][A
 13%|█▎        | 1649/12691 [32:52<3:30:08,  1.14s/it, loss=0.674][A
 13%|█▎        | 1650/12691 [32:52<3:29:31,  1.14s/it, loss=0.674][A
 13%|█▎        | 1650/12691 [32:54<3:29:31,  1.14s/it, loss=0.674][A
 13%|█▎        | 1651/12691 [32:54<3:38:25,  1.19s/it, loss=0.674][A
 13%|█▎        | 165

 13%|█▎        | 1703/12691 [33:55<3:41:31,  1.21s/it, loss=0.672][A
 13%|█▎        | 1703/12691 [33:56<3:41:31,  1.21s/it, loss=0.671][A
 13%|█▎        | 1704/12691 [33:56<3:38:03,  1.19s/it, loss=0.671][A
 13%|█▎        | 1704/12691 [33:57<3:38:03,  1.19s/it, loss=0.671][A
 13%|█▎        | 1705/12691 [33:57<3:36:14,  1.18s/it, loss=0.671][A
 13%|█▎        | 1705/12691 [33:58<3:36:14,  1.18s/it, loss=0.673][A
 13%|█▎        | 1706/12691 [33:58<3:33:21,  1.17s/it, loss=0.673][A
 13%|█▎        | 1706/12691 [34:00<3:33:21,  1.17s/it, loss=0.673][A
 13%|█▎        | 1707/12691 [34:00<3:37:32,  1.19s/it, loss=0.673][A
 13%|█▎        | 1707/12691 [34:01<3:37:32,  1.19s/it, loss=0.674][A
 13%|█▎        | 1708/12691 [34:01<3:34:38,  1.17s/it, loss=0.674][A
 13%|█▎        | 1708/12691 [34:02<3:34:38,  1.17s/it, loss=0.675][A
 13%|█▎        | 1709/12691 [34:02<3:32:21,  1.16s/it, loss=0.675][A
 13%|█▎        | 1709/12691 [34:03<3:32:21,  1.16s/it, loss=0.674][A
 13%|█▎        | 171

 14%|█▍        | 1761/12691 [35:05<3:36:06,  1.19s/it, loss=0.666][A
 14%|█▍        | 1762/12691 [35:05<3:38:42,  1.20s/it, loss=0.666][A
 14%|█▍        | 1762/12691 [35:06<3:38:42,  1.20s/it, loss=0.667][A
 14%|█▍        | 1763/12691 [35:06<3:34:52,  1.18s/it, loss=0.667][A
 14%|█▍        | 1763/12691 [35:08<3:34:52,  1.18s/it, loss=0.666][A
 14%|█▍        | 1764/12691 [35:08<3:34:22,  1.18s/it, loss=0.666][A
 14%|█▍        | 1764/12691 [35:09<3:34:22,  1.18s/it, loss=0.666][A
 14%|█▍        | 1765/12691 [35:09<3:32:35,  1.17s/it, loss=0.666][A
 14%|█▍        | 1765/12691 [35:10<3:32:35,  1.17s/it, loss=0.666][A
 14%|█▍        | 1766/12691 [35:10<3:33:00,  1.17s/it, loss=0.666][A
 14%|█▍        | 1766/12691 [35:11<3:33:00,  1.17s/it, loss=0.664][A
 14%|█▍        | 1767/12691 [35:11<3:34:18,  1.18s/it, loss=0.664][A
 14%|█▍        | 1767/12691 [35:12<3:34:18,  1.18s/it, loss=0.663][A
 14%|█▍        | 1768/12691 [35:12<3:34:35,  1.18s/it, loss=0.663][A
 14%|█▍        | 176

 14%|█▍        | 1820/12691 [36:15<3:42:23,  1.23s/it, loss=0.662][A
 14%|█▍        | 1820/12691 [36:16<3:42:23,  1.23s/it, loss=0.664][A
 14%|█▍        | 1821/12691 [36:16<3:43:58,  1.24s/it, loss=0.664][A
 14%|█▍        | 1821/12691 [36:17<3:43:58,  1.24s/it, loss=0.664][A
 14%|█▍        | 1822/12691 [36:17<3:44:59,  1.24s/it, loss=0.664][A
 14%|█▍        | 1822/12691 [36:18<3:44:59,  1.24s/it, loss=0.663][A
 14%|█▍        | 1823/12691 [36:18<3:46:36,  1.25s/it, loss=0.663][A
 14%|█▍        | 1823/12691 [36:20<3:46:36,  1.25s/it, loss=0.667][A
 14%|█▍        | 1824/12691 [36:20<3:54:06,  1.29s/it, loss=0.667][A
 14%|█▍        | 1824/12691 [36:21<3:54:06,  1.29s/it, loss=0.665][A
 14%|█▍        | 1825/12691 [36:21<3:46:15,  1.25s/it, loss=0.665][A
 14%|█▍        | 1825/12691 [36:22<3:46:15,  1.25s/it, loss=0.666][A
 14%|█▍        | 1826/12691 [36:22<3:44:21,  1.24s/it, loss=0.666][A
 14%|█▍        | 1826/12691 [36:23<3:44:21,  1.24s/it, loss=0.666][A
 14%|█▍        | 182

 15%|█▍        | 1878/12691 [37:24<3:29:18,  1.16s/it, loss=0.657][A
 15%|█▍        | 1879/12691 [37:24<3:28:03,  1.15s/it, loss=0.657][A
 15%|█▍        | 1879/12691 [37:25<3:28:03,  1.15s/it, loss=0.653][A
 15%|█▍        | 1880/12691 [37:25<3:30:45,  1.17s/it, loss=0.653][A
 15%|█▍        | 1880/12691 [37:26<3:30:45,  1.17s/it, loss=0.654][A
 15%|█▍        | 1881/12691 [37:26<3:28:41,  1.16s/it, loss=0.654][A
 15%|█▍        | 1881/12691 [37:27<3:28:41,  1.16s/it, loss=0.653][A
 15%|█▍        | 1882/12691 [37:27<3:29:45,  1.16s/it, loss=0.653][A
 15%|█▍        | 1882/12691 [37:29<3:29:45,  1.16s/it, loss=0.653][A
 15%|█▍        | 1883/12691 [37:29<3:26:40,  1.15s/it, loss=0.653][A
 15%|█▍        | 1883/12691 [37:30<3:26:40,  1.15s/it, loss=0.652][A
 15%|█▍        | 1884/12691 [37:30<3:25:26,  1.14s/it, loss=0.652][A
 15%|█▍        | 1884/12691 [37:31<3:25:26,  1.14s/it, loss=0.654][A
 15%|█▍        | 1885/12691 [37:31<3:26:19,  1.15s/it, loss=0.654][A
 15%|█▍        | 188

 15%|█▌        | 1937/12691 [38:31<3:26:37,  1.15s/it, loss=0.678][A
 15%|█▌        | 1937/12691 [38:32<3:26:37,  1.15s/it, loss=0.68] [A
 15%|█▌        | 1938/12691 [38:32<3:25:38,  1.15s/it, loss=0.68][A
 15%|█▌        | 1938/12691 [38:34<3:25:38,  1.15s/it, loss=0.68][A
 15%|█▌        | 1939/12691 [38:34<3:23:52,  1.14s/it, loss=0.68][A
 15%|█▌        | 1939/12691 [38:35<3:23:52,  1.14s/it, loss=0.685][A
 15%|█▌        | 1940/12691 [38:35<3:23:54,  1.14s/it, loss=0.685][A
 15%|█▌        | 1940/12691 [38:36<3:23:54,  1.14s/it, loss=0.682][A
 15%|█▌        | 1941/12691 [38:36<3:28:54,  1.17s/it, loss=0.682][A
 15%|█▌        | 1941/12691 [38:37<3:28:54,  1.17s/it, loss=0.683][A
 15%|█▌        | 1942/12691 [38:37<3:27:47,  1.16s/it, loss=0.683][A
 15%|█▌        | 1942/12691 [38:38<3:27:47,  1.16s/it, loss=0.682][A
 15%|█▌        | 1943/12691 [38:38<3:29:08,  1.17s/it, loss=0.682][A
 15%|█▌        | 1943/12691 [38:39<3:29:08,  1.17s/it, loss=0.683][A
 15%|█▌        | 1944/1

 16%|█▌        | 1995/12691 [39:41<3:36:17,  1.21s/it, loss=0.673][A
 16%|█▌        | 1996/12691 [39:41<3:36:30,  1.21s/it, loss=0.673][A
 16%|█▌        | 1996/12691 [39:43<3:36:30,  1.21s/it, loss=0.671][A
 16%|█▌        | 1997/12691 [39:43<3:41:09,  1.24s/it, loss=0.671][A
 16%|█▌        | 1997/12691 [39:44<3:41:09,  1.24s/it, loss=0.672][A
 16%|█▌        | 1998/12691 [39:44<3:37:59,  1.22s/it, loss=0.672][A
 16%|█▌        | 1998/12691 [39:45<3:37:59,  1.22s/it, loss=0.671][A
 16%|█▌        | 1999/12691 [39:45<3:35:48,  1.21s/it, loss=0.671][A
 16%|█▌        | 1999/12691 [39:46<3:35:48,  1.21s/it, loss=0.672][A
 16%|█▌        | 2000/12691 [39:46<3:32:17,  1.19s/it, loss=0.672][A
 16%|█▌        | 2000/12691 [39:47<3:32:17,  1.19s/it, loss=0.67] [A

saving model checkpoint at iteration=2000



 16%|█▌        | 2001/12691 [39:48<4:30:31,  1.52s/it, loss=0.67][A
 16%|█▌        | 2001/12691 [39:50<4:30:31,  1.52s/it, loss=0.669][A
 16%|█▌        | 2002/12691 [39:50<4:15:40,  1.44s/it, loss=0.669][A
 16%|█▌        | 2002/12691 [39:51<4:15:40,  1.44s/it, loss=0.67] [A
 16%|█▌        | 2003/12691 [39:51<4:09:17,  1.40s/it, loss=0.67][A
 16%|█▌        | 2003/12691 [39:52<4:09:17,  1.40s/it, loss=0.671][A
 16%|█▌        | 2004/12691 [39:52<3:57:52,  1.34s/it, loss=0.671][A
 16%|█▌        | 2004/12691 [39:53<3:57:52,  1.34s/it, loss=0.671][A
 16%|█▌        | 2005/12691 [39:53<3:52:55,  1.31s/it, loss=0.671][A
 16%|█▌        | 2005/12691 [39:55<3:52:55,  1.31s/it, loss=0.672][A
 16%|█▌        | 2006/12691 [39:55<3:48:53,  1.29s/it, loss=0.672][A
 16%|█▌        | 2006/12691 [39:56<3:48:53,  1.29s/it, loss=0.672][A
 16%|█▌        | 2007/12691 [39:56<3:44:04,  1.26s/it, loss=0.672][A
 16%|█▌        | 2007/12691 [39:57<3:44:04,  1.26s/it, loss=0.673][A
 16%|█▌        | 2008

 16%|█▌        | 2059/12691 [40:59<3:45:59,  1.28s/it, loss=0.656][A
 16%|█▌        | 2060/12691 [40:59<3:45:47,  1.27s/it, loss=0.656][A
 16%|█▌        | 2060/12691 [41:00<3:45:47,  1.27s/it, loss=0.654][A
 16%|█▌        | 2061/12691 [41:00<3:42:26,  1.26s/it, loss=0.654][A
 16%|█▌        | 2061/12691 [41:01<3:42:26,  1.26s/it, loss=0.653][A
 16%|█▌        | 2062/12691 [41:01<3:41:01,  1.25s/it, loss=0.653][A
 16%|█▌        | 2062/12691 [41:02<3:41:01,  1.25s/it, loss=0.653][A
 16%|█▋        | 2063/12691 [41:02<3:40:21,  1.24s/it, loss=0.653][A
 16%|█▋        | 2063/12691 [41:04<3:40:21,  1.24s/it, loss=0.655][A
 16%|█▋        | 2064/12691 [41:04<3:35:10,  1.21s/it, loss=0.655][A
 16%|█▋        | 2064/12691 [41:05<3:35:10,  1.21s/it, loss=0.656][A
 16%|█▋        | 2065/12691 [41:05<3:35:34,  1.22s/it, loss=0.656][A
 16%|█▋        | 2065/12691 [41:06<3:35:34,  1.22s/it, loss=0.655][A
 16%|█▋        | 2066/12691 [41:06<3:34:07,  1.21s/it, loss=0.655][A
 16%|█▋        | 206

 17%|█▋        | 2118/12691 [42:07<3:21:55,  1.15s/it, loss=0.649][A
 17%|█▋        | 2118/12691 [42:08<3:21:55,  1.15s/it, loss=0.649][A
 17%|█▋        | 2119/12691 [42:08<3:19:24,  1.13s/it, loss=0.649][A
 17%|█▋        | 2119/12691 [42:09<3:19:24,  1.13s/it, loss=0.648][A
 17%|█▋        | 2120/12691 [42:09<3:16:38,  1.12s/it, loss=0.648][A
 17%|█▋        | 2120/12691 [42:11<3:16:38,  1.12s/it, loss=0.652][A
 17%|█▋        | 2121/12691 [42:11<3:21:52,  1.15s/it, loss=0.652][A
 17%|█▋        | 2121/12691 [42:12<3:21:52,  1.15s/it, loss=0.65] [A
 17%|█▋        | 2122/12691 [42:12<3:19:38,  1.13s/it, loss=0.65][A
 17%|█▋        | 2122/12691 [42:13<3:19:38,  1.13s/it, loss=0.651][A
 17%|█▋        | 2123/12691 [42:13<3:18:17,  1.13s/it, loss=0.651][A
 17%|█▋        | 2123/12691 [42:14<3:18:17,  1.13s/it, loss=0.652][A
 17%|█▋        | 2124/12691 [42:14<3:18:03,  1.12s/it, loss=0.652][A
 17%|█▋        | 2124/12691 [42:15<3:18:03,  1.12s/it, loss=0.652][A
 17%|█▋        | 2125

 17%|█▋        | 2176/12691 [43:18<3:31:09,  1.20s/it, loss=0.661][A
 17%|█▋        | 2177/12691 [43:18<3:29:14,  1.19s/it, loss=0.661][A
 17%|█▋        | 2177/12691 [43:19<3:29:14,  1.19s/it, loss=0.661][A
 17%|█▋        | 2178/12691 [43:19<3:28:34,  1.19s/it, loss=0.661][A
 17%|█▋        | 2178/12691 [43:20<3:28:34,  1.19s/it, loss=0.66] [A
 17%|█▋        | 2179/12691 [43:20<3:26:54,  1.18s/it, loss=0.66][A
 17%|█▋        | 2179/12691 [43:21<3:26:54,  1.18s/it, loss=0.657][A
 17%|█▋        | 2180/12691 [43:21<3:28:57,  1.19s/it, loss=0.657][A
 17%|█▋        | 2180/12691 [43:22<3:28:57,  1.19s/it, loss=0.658][A
 17%|█▋        | 2181/12691 [43:22<3:30:20,  1.20s/it, loss=0.658][A
 17%|█▋        | 2181/12691 [43:24<3:30:20,  1.20s/it, loss=0.661][A
 17%|█▋        | 2182/12691 [43:24<3:30:22,  1.20s/it, loss=0.661][A
 17%|█▋        | 2182/12691 [43:25<3:30:22,  1.20s/it, loss=0.665][A
 17%|█▋        | 2183/12691 [43:25<3:25:52,  1.18s/it, loss=0.665][A
 17%|█▋        | 2183

 18%|█▊        | 2235/12691 [44:25<3:19:28,  1.14s/it, loss=0.66][A
 18%|█▊        | 2235/12691 [44:26<3:19:28,  1.14s/it, loss=0.658][A
 18%|█▊        | 2236/12691 [44:26<3:18:33,  1.14s/it, loss=0.658][A
 18%|█▊        | 2236/12691 [44:27<3:18:33,  1.14s/it, loss=0.658][A
 18%|█▊        | 2237/12691 [44:27<3:22:25,  1.16s/it, loss=0.658][A
 18%|█▊        | 2237/12691 [44:29<3:22:25,  1.16s/it, loss=0.659][A
 18%|█▊        | 2238/12691 [44:29<3:25:04,  1.18s/it, loss=0.659][A
 18%|█▊        | 2238/12691 [44:30<3:25:04,  1.18s/it, loss=0.66] [A
 18%|█▊        | 2239/12691 [44:30<3:22:28,  1.16s/it, loss=0.66][A
 18%|█▊        | 2239/12691 [44:31<3:22:28,  1.16s/it, loss=0.657][A
 18%|█▊        | 2240/12691 [44:31<3:18:32,  1.14s/it, loss=0.657][A
 18%|█▊        | 2240/12691 [44:32<3:18:32,  1.14s/it, loss=0.658][A
 18%|█▊        | 2241/12691 [44:32<3:16:40,  1.13s/it, loss=0.658][A
 18%|█▊        | 2241/12691 [44:33<3:16:40,  1.13s/it, loss=0.658][A
 18%|█▊        | 2242/

 18%|█▊        | 2293/12691 [45:32<3:14:36,  1.12s/it, loss=0.656][A
 18%|█▊        | 2294/12691 [45:32<3:17:06,  1.14s/it, loss=0.656][A
 18%|█▊        | 2294/12691 [45:34<3:17:06,  1.14s/it, loss=0.654][A
 18%|█▊        | 2295/12691 [45:34<3:15:38,  1.13s/it, loss=0.654][A
 18%|█▊        | 2295/12691 [45:35<3:15:38,  1.13s/it, loss=0.658][A
 18%|█▊        | 2296/12691 [45:35<3:16:07,  1.13s/it, loss=0.658][A
 18%|█▊        | 2296/12691 [45:36<3:16:07,  1.13s/it, loss=0.658][A
 18%|█▊        | 2297/12691 [45:36<3:14:31,  1.12s/it, loss=0.658][A
 18%|█▊        | 2297/12691 [45:37<3:14:31,  1.12s/it, loss=0.658][A
 18%|█▊        | 2298/12691 [45:37<3:14:14,  1.12s/it, loss=0.658][A
 18%|█▊        | 2298/12691 [45:38<3:14:14,  1.12s/it, loss=0.657][A
 18%|█▊        | 2299/12691 [45:38<3:14:49,  1.12s/it, loss=0.657][A
 18%|█▊        | 2299/12691 [45:39<3:14:49,  1.12s/it, loss=0.657][A
 18%|█▊        | 2300/12691 [45:39<3:21:03,  1.16s/it, loss=0.657][A
 18%|█▊        | 230

 19%|█▊        | 2352/12691 [46:41<3:22:15,  1.17s/it, loss=0.668][A
 19%|█▊        | 2352/12691 [46:42<3:22:15,  1.17s/it, loss=0.67] [A
 19%|█▊        | 2353/12691 [46:42<3:19:45,  1.16s/it, loss=0.67][A
 19%|█▊        | 2353/12691 [46:43<3:19:45,  1.16s/it, loss=0.669][A
 19%|█▊        | 2354/12691 [46:43<3:19:28,  1.16s/it, loss=0.669][A
 19%|█▊        | 2354/12691 [46:44<3:19:28,  1.16s/it, loss=0.67] [A
 19%|█▊        | 2355/12691 [46:44<3:19:12,  1.16s/it, loss=0.67][A
 19%|█▊        | 2355/12691 [46:45<3:19:12,  1.16s/it, loss=0.672][A
 19%|█▊        | 2356/12691 [46:45<3:29:57,  1.22s/it, loss=0.672][A
 19%|█▊        | 2356/12691 [46:47<3:29:57,  1.22s/it, loss=0.675][A
 19%|█▊        | 2357/12691 [46:47<3:27:43,  1.21s/it, loss=0.675][A
 19%|█▊        | 2357/12691 [46:48<3:27:43,  1.21s/it, loss=0.674][A
 19%|█▊        | 2358/12691 [46:48<3:26:49,  1.20s/it, loss=0.674][A
 19%|█▊        | 2358/12691 [46:49<3:26:49,  1.20s/it, loss=0.672][A
 19%|█▊        | 2359/

 19%|█▉        | 2410/12691 [47:52<3:23:33,  1.19s/it, loss=0.675][A
 19%|█▉        | 2411/12691 [47:52<3:31:38,  1.24s/it, loss=0.675][A
 19%|█▉        | 2411/12691 [47:53<3:31:38,  1.24s/it, loss=0.674][A
 19%|█▉        | 2412/12691 [47:53<3:31:43,  1.24s/it, loss=0.674][A
 19%|█▉        | 2412/12691 [47:54<3:31:43,  1.24s/it, loss=0.672][A
 19%|█▉        | 2413/12691 [47:54<3:27:58,  1.21s/it, loss=0.672][A
 19%|█▉        | 2413/12691 [47:55<3:27:58,  1.21s/it, loss=0.673][A
 19%|█▉        | 2414/12691 [47:55<3:24:54,  1.20s/it, loss=0.673][A
 19%|█▉        | 2414/12691 [47:56<3:24:54,  1.20s/it, loss=0.672][A
 19%|█▉        | 2415/12691 [47:56<3:25:13,  1.20s/it, loss=0.672][A
 19%|█▉        | 2415/12691 [47:58<3:25:13,  1.20s/it, loss=0.672][A
 19%|█▉        | 2416/12691 [47:58<3:27:23,  1.21s/it, loss=0.672][A
 19%|█▉        | 2416/12691 [47:59<3:27:23,  1.21s/it, loss=0.674][A
 19%|█▉        | 2417/12691 [47:59<3:29:07,  1.22s/it, loss=0.674][A
 19%|█▉        | 241

 19%|█▉        | 2469/12691 [49:00<3:25:11,  1.20s/it, loss=0.671][A
 19%|█▉        | 2469/12691 [49:01<3:25:11,  1.20s/it, loss=0.675][A
 19%|█▉        | 2470/12691 [49:01<3:24:02,  1.20s/it, loss=0.675][A
 19%|█▉        | 2470/12691 [49:03<3:24:02,  1.20s/it, loss=0.677][A
 19%|█▉        | 2471/12691 [49:03<3:23:16,  1.19s/it, loss=0.677][A
 19%|█▉        | 2471/12691 [49:04<3:23:16,  1.19s/it, loss=0.675][A
 19%|█▉        | 2472/12691 [49:04<3:20:37,  1.18s/it, loss=0.675][A
 19%|█▉        | 2472/12691 [49:05<3:20:37,  1.18s/it, loss=0.675][A
 19%|█▉        | 2473/12691 [49:05<3:24:59,  1.20s/it, loss=0.675][A
 19%|█▉        | 2473/12691 [49:06<3:24:59,  1.20s/it, loss=0.677][A
 19%|█▉        | 2474/12691 [49:06<3:26:04,  1.21s/it, loss=0.677][A
 19%|█▉        | 2474/12691 [49:08<3:26:04,  1.21s/it, loss=0.676][A
 20%|█▉        | 2475/12691 [49:08<3:28:21,  1.22s/it, loss=0.676][A
 20%|█▉        | 2475/12691 [49:09<3:28:21,  1.22s/it, loss=0.674][A
 20%|█▉        | 247

 20%|█▉        | 2527/12691 [50:10<3:17:18,  1.16s/it, loss=0.663][A
 20%|█▉        | 2528/12691 [50:10<3:15:58,  1.16s/it, loss=0.663][A
 20%|█▉        | 2528/12691 [50:11<3:15:58,  1.16s/it, loss=0.661][A
 20%|█▉        | 2529/12691 [50:11<3:19:54,  1.18s/it, loss=0.661][A
 20%|█▉        | 2529/12691 [50:13<3:19:54,  1.18s/it, loss=0.661][A
 20%|█▉        | 2530/12691 [50:13<3:17:13,  1.16s/it, loss=0.661][A
 20%|█▉        | 2530/12691 [50:14<3:17:13,  1.16s/it, loss=0.661][A
 20%|█▉        | 2531/12691 [50:14<3:14:29,  1.15s/it, loss=0.661][A
 20%|█▉        | 2531/12691 [50:15<3:14:29,  1.15s/it, loss=0.661][A
 20%|█▉        | 2532/12691 [50:15<3:15:13,  1.15s/it, loss=0.661][A
 20%|█▉        | 2532/12691 [50:16<3:15:13,  1.15s/it, loss=0.661][A
 20%|█▉        | 2533/12691 [50:16<3:13:54,  1.15s/it, loss=0.661][A
 20%|█▉        | 2533/12691 [50:17<3:13:54,  1.15s/it, loss=0.659][A
 20%|█▉        | 2534/12691 [50:17<3:14:17,  1.15s/it, loss=0.659][A
 20%|█▉        | 253

 20%|██        | 2586/12691 [51:20<3:32:12,  1.26s/it, loss=0.66][A
 20%|██        | 2586/12691 [51:21<3:32:12,  1.26s/it, loss=0.662][A
 20%|██        | 2587/12691 [51:21<3:30:19,  1.25s/it, loss=0.662][A
 20%|██        | 2587/12691 [51:22<3:30:19,  1.25s/it, loss=0.659][A
 20%|██        | 2588/12691 [51:22<3:30:27,  1.25s/it, loss=0.659][A
 20%|██        | 2588/12691 [51:23<3:30:27,  1.25s/it, loss=0.66] [A
 20%|██        | 2589/12691 [51:23<3:31:24,  1.26s/it, loss=0.66][A
 20%|██        | 2589/12691 [51:25<3:31:24,  1.26s/it, loss=0.659][A
 20%|██        | 2590/12691 [51:25<3:36:36,  1.29s/it, loss=0.659][A
 20%|██        | 2590/12691 [51:26<3:36:36,  1.29s/it, loss=0.657][A
 20%|██        | 2591/12691 [51:26<3:35:14,  1.28s/it, loss=0.657][A
 20%|██        | 2591/12691 [51:27<3:35:14,  1.28s/it, loss=0.656][A
 20%|██        | 2592/12691 [51:27<3:33:49,  1.27s/it, loss=0.656][A
 20%|██        | 2592/12691 [51:28<3:33:49,  1.27s/it, loss=0.656][A
 20%|██        | 2593/

 21%|██        | 2644/12691 [52:32<3:18:03,  1.18s/it, loss=0.662][A
 21%|██        | 2645/12691 [52:32<3:18:03,  1.18s/it, loss=0.662][A
 21%|██        | 2645/12691 [52:34<3:18:03,  1.18s/it, loss=0.662][A
 21%|██        | 2646/12691 [52:34<3:26:50,  1.24s/it, loss=0.662][A
 21%|██        | 2646/12691 [52:35<3:26:50,  1.24s/it, loss=0.662][A
 21%|██        | 2647/12691 [52:35<3:21:47,  1.21s/it, loss=0.662][A
 21%|██        | 2647/12691 [52:36<3:21:47,  1.21s/it, loss=0.661][A
 21%|██        | 2648/12691 [52:36<3:18:53,  1.19s/it, loss=0.661][A
 21%|██        | 2648/12691 [52:37<3:18:53,  1.19s/it, loss=0.66] [A
 21%|██        | 2649/12691 [52:37<3:16:47,  1.18s/it, loss=0.66][A
 21%|██        | 2649/12691 [52:38<3:16:47,  1.18s/it, loss=0.659][A
 21%|██        | 2650/12691 [52:38<3:14:38,  1.16s/it, loss=0.659][A
 21%|██        | 2650/12691 [52:39<3:14:38,  1.16s/it, loss=0.658][A
 21%|██        | 2651/12691 [52:39<3:12:05,  1.15s/it, loss=0.658][A
 21%|██        | 2651

 21%|██▏       | 2703/12691 [53:40<3:18:32,  1.19s/it, loss=0.664][A
 21%|██▏       | 2703/12691 [53:41<3:18:32,  1.19s/it, loss=0.665][A
 21%|██▏       | 2704/12691 [53:41<3:16:14,  1.18s/it, loss=0.665][A
 21%|██▏       | 2704/12691 [53:42<3:16:14,  1.18s/it, loss=0.666][A
 21%|██▏       | 2705/12691 [53:42<3:14:38,  1.17s/it, loss=0.666][A
 21%|██▏       | 2705/12691 [53:43<3:14:38,  1.17s/it, loss=0.665][A
 21%|██▏       | 2706/12691 [53:43<3:14:19,  1.17s/it, loss=0.665][A
 21%|██▏       | 2706/12691 [53:45<3:14:19,  1.17s/it, loss=0.662][A
 21%|██▏       | 2707/12691 [53:45<3:13:28,  1.16s/it, loss=0.662][A
 21%|██▏       | 2707/12691 [53:46<3:13:28,  1.16s/it, loss=0.667][A
 21%|██▏       | 2708/12691 [53:46<3:20:01,  1.20s/it, loss=0.667][A
 21%|██▏       | 2708/12691 [53:47<3:20:01,  1.20s/it, loss=0.668][A
 21%|██▏       | 2709/12691 [53:47<3:17:19,  1.19s/it, loss=0.668][A
 21%|██▏       | 2709/12691 [53:48<3:17:19,  1.19s/it, loss=0.668][A
 21%|██▏       | 271

 22%|██▏       | 2761/12691 [54:49<3:13:06,  1.17s/it, loss=0.657][A
 22%|██▏       | 2762/12691 [54:49<3:13:49,  1.17s/it, loss=0.657][A
 22%|██▏       | 2762/12691 [54:51<3:13:49,  1.17s/it, loss=0.655][A
 22%|██▏       | 2763/12691 [54:51<3:21:50,  1.22s/it, loss=0.655][A
 22%|██▏       | 2763/12691 [54:52<3:21:50,  1.22s/it, loss=0.656][A
 22%|██▏       | 2764/12691 [54:52<3:17:36,  1.19s/it, loss=0.656][A
 22%|██▏       | 2764/12691 [54:53<3:17:36,  1.19s/it, loss=0.652][A
 22%|██▏       | 2765/12691 [54:53<3:14:29,  1.18s/it, loss=0.652][A
 22%|██▏       | 2765/12691 [54:54<3:14:29,  1.18s/it, loss=0.651][A
 22%|██▏       | 2766/12691 [54:54<3:13:23,  1.17s/it, loss=0.651][A
 22%|██▏       | 2766/12691 [54:55<3:13:23,  1.17s/it, loss=0.655][A
 22%|██▏       | 2767/12691 [54:55<3:11:34,  1.16s/it, loss=0.655][A
 22%|██▏       | 2767/12691 [54:56<3:11:34,  1.16s/it, loss=0.658][A
 22%|██▏       | 2768/12691 [54:56<3:11:40,  1.16s/it, loss=0.658][A
 22%|██▏       | 276

 22%|██▏       | 2820/12691 [55:58<3:11:31,  1.16s/it, loss=0.667][A
 22%|██▏       | 2820/12691 [55:59<3:11:31,  1.16s/it, loss=0.668][A
 22%|██▏       | 2821/12691 [55:59<3:10:16,  1.16s/it, loss=0.668][A
 22%|██▏       | 2821/12691 [56:01<3:10:16,  1.16s/it, loss=0.669][A
 22%|██▏       | 2822/12691 [56:01<3:14:27,  1.18s/it, loss=0.669][A
 22%|██▏       | 2822/12691 [56:02<3:14:27,  1.18s/it, loss=0.669][A
 22%|██▏       | 2823/12691 [56:02<3:16:15,  1.19s/it, loss=0.669][A
 22%|██▏       | 2823/12691 [56:03<3:16:15,  1.19s/it, loss=0.667][A
 22%|██▏       | 2824/12691 [56:03<3:17:58,  1.20s/it, loss=0.667][A
 22%|██▏       | 2824/12691 [56:04<3:17:58,  1.20s/it, loss=0.67] [A
 22%|██▏       | 2825/12691 [56:04<3:21:30,  1.23s/it, loss=0.67][A
 22%|██▏       | 2825/12691 [56:05<3:21:30,  1.23s/it, loss=0.669][A
 22%|██▏       | 2826/12691 [56:05<3:17:42,  1.20s/it, loss=0.669][A
 22%|██▏       | 2826/12691 [56:07<3:17:42,  1.20s/it, loss=0.667][A
 22%|██▏       | 2827

 23%|██▎       | 2878/12691 [57:07<3:08:20,  1.15s/it, loss=0.67] [A
 23%|██▎       | 2879/12691 [57:07<3:10:47,  1.17s/it, loss=0.67][A
 23%|██▎       | 2879/12691 [57:08<3:10:47,  1.17s/it, loss=0.67][A
 23%|██▎       | 2880/12691 [57:08<3:08:01,  1.15s/it, loss=0.67][A
 23%|██▎       | 2880/12691 [57:09<3:08:01,  1.15s/it, loss=0.67][A
 23%|██▎       | 2881/12691 [57:09<3:13:22,  1.18s/it, loss=0.67][A
 23%|██▎       | 2881/12691 [57:10<3:13:22,  1.18s/it, loss=0.672][A
 23%|██▎       | 2882/12691 [57:10<3:11:11,  1.17s/it, loss=0.672][A
 23%|██▎       | 2882/12691 [57:12<3:11:11,  1.17s/it, loss=0.676][A
 23%|██▎       | 2883/12691 [57:12<3:10:50,  1.17s/it, loss=0.676][A
 23%|██▎       | 2883/12691 [57:13<3:10:50,  1.17s/it, loss=0.673][A
 23%|██▎       | 2884/12691 [57:13<3:09:20,  1.16s/it, loss=0.673][A
 23%|██▎       | 2884/12691 [57:14<3:09:20,  1.16s/it, loss=0.674][A
 23%|██▎       | 2885/12691 [57:14<3:10:06,  1.16s/it, loss=0.674][A
 23%|██▎       | 2885/126

 23%|██▎       | 2937/12691 [58:16<3:13:59,  1.19s/it, loss=0.661][A
 23%|██▎       | 2937/12691 [58:17<3:13:59,  1.19s/it, loss=0.661][A
 23%|██▎       | 2938/12691 [58:17<3:11:01,  1.18s/it, loss=0.661][A
 23%|██▎       | 2938/12691 [58:18<3:11:01,  1.18s/it, loss=0.661][A
 23%|██▎       | 2939/12691 [58:18<3:09:28,  1.17s/it, loss=0.661][A
 23%|██▎       | 2939/12691 [58:19<3:09:28,  1.17s/it, loss=0.66] [A
 23%|██▎       | 2940/12691 [58:19<3:07:04,  1.15s/it, loss=0.66][A
 23%|██▎       | 2940/12691 [58:20<3:07:04,  1.15s/it, loss=0.66][A
 23%|██▎       | 2941/12691 [58:20<3:06:53,  1.15s/it, loss=0.66][A
 23%|██▎       | 2941/12691 [58:22<3:06:53,  1.15s/it, loss=0.658][A
 23%|██▎       | 2942/12691 [58:22<3:07:11,  1.15s/it, loss=0.658][A
 23%|██▎       | 2942/12691 [58:23<3:07:11,  1.15s/it, loss=0.66] [A
 23%|██▎       | 2943/12691 [58:23<3:12:15,  1.18s/it, loss=0.66][A
 23%|██▎       | 2943/12691 [58:24<3:12:15,  1.18s/it, loss=0.66][A
 23%|██▎       | 2944/126

 24%|██▎       | 2995/12691 [59:25<3:05:23,  1.15s/it, loss=0.663][A
 24%|██▎       | 2996/12691 [59:25<3:05:04,  1.15s/it, loss=0.663][A
 24%|██▎       | 2996/12691 [59:26<3:05:04,  1.15s/it, loss=0.66] [A
 24%|██▎       | 2997/12691 [59:26<3:03:12,  1.13s/it, loss=0.66][A
 24%|██▎       | 2997/12691 [59:27<3:03:12,  1.13s/it, loss=0.661][A
 24%|██▎       | 2998/12691 [59:27<3:06:23,  1.15s/it, loss=0.661][A
 24%|██▎       | 2998/12691 [59:28<3:06:23,  1.15s/it, loss=0.658][A
 24%|██▎       | 2999/12691 [59:28<3:03:36,  1.14s/it, loss=0.658][A
 24%|██▎       | 2999/12691 [59:29<3:03:36,  1.14s/it, loss=0.658][A
 24%|██▎       | 3000/12691 [59:29<3:02:01,  1.13s/it, loss=0.658][A
 24%|██▎       | 3000/12691 [59:30<3:02:01,  1.13s/it, loss=0.659][A

saving model checkpoint at iteration=3000



 24%|██▎       | 3001/12691 [59:31<3:50:40,  1.43s/it, loss=0.659][A
 24%|██▎       | 3001/12691 [59:32<3:50:40,  1.43s/it, loss=0.657][A
 24%|██▎       | 3002/12691 [59:32<3:38:20,  1.35s/it, loss=0.657][A
 24%|██▎       | 3002/12691 [59:34<3:38:20,  1.35s/it, loss=0.658][A
 24%|██▎       | 3003/12691 [59:34<3:29:43,  1.30s/it, loss=0.658][A
 24%|██▎       | 3003/12691 [59:35<3:29:43,  1.30s/it, loss=0.659][A
 24%|██▎       | 3004/12691 [59:35<3:21:47,  1.25s/it, loss=0.659][A
 24%|██▎       | 3004/12691 [59:36<3:21:47,  1.25s/it, loss=0.658][A
 24%|██▎       | 3005/12691 [59:36<3:22:30,  1.25s/it, loss=0.658][A
 24%|██▎       | 3005/12691 [59:37<3:22:30,  1.25s/it, loss=0.66] [A
 24%|██▎       | 3006/12691 [59:37<3:18:29,  1.23s/it, loss=0.66][A
 24%|██▎       | 3006/12691 [59:38<3:18:29,  1.23s/it, loss=0.659][A
 24%|██▎       | 3007/12691 [59:38<3:12:11,  1.19s/it, loss=0.659][A
 24%|██▎       | 3007/12691 [59:39<3:12:11,  1.19s/it, loss=0.659][A
 24%|██▎       | 300

 24%|██▍       | 3058/12691 [1:00:39<3:11:24,  1.19s/it, loss=0.653][A
 24%|██▍       | 3059/12691 [1:00:39<3:12:27,  1.20s/it, loss=0.653][A
 24%|██▍       | 3059/12691 [1:00:40<3:12:27,  1.20s/it, loss=0.653][A
 24%|██▍       | 3060/12691 [1:00:40<3:19:59,  1.25s/it, loss=0.653][A
 24%|██▍       | 3060/12691 [1:00:41<3:19:59,  1.25s/it, loss=0.651][A
 24%|██▍       | 3061/12691 [1:00:41<3:19:17,  1.24s/it, loss=0.651][A
 24%|██▍       | 3061/12691 [1:00:42<3:19:17,  1.24s/it, loss=0.651][A
 24%|██▍       | 3062/12691 [1:00:42<3:14:15,  1.21s/it, loss=0.651][A
 24%|██▍       | 3062/12691 [1:00:44<3:14:15,  1.21s/it, loss=0.652][A
 24%|██▍       | 3063/12691 [1:00:44<3:10:27,  1.19s/it, loss=0.652][A
 24%|██▍       | 3063/12691 [1:00:45<3:10:27,  1.19s/it, loss=0.652][A
 24%|██▍       | 3064/12691 [1:00:45<3:09:24,  1.18s/it, loss=0.652][A
 24%|██▍       | 3064/12691 [1:00:46<3:09:24,  1.18s/it, loss=0.649][A
 24%|██▍       | 3065/12691 [1:00:46<3:11:44,  1.20s/it, loss=0.

 25%|██▍       | 3115/12691 [1:01:46<3:02:08,  1.14s/it, loss=0.648][A
 25%|██▍       | 3115/12691 [1:01:47<3:02:08,  1.14s/it, loss=0.649][A
 25%|██▍       | 3116/12691 [1:01:47<3:04:59,  1.16s/it, loss=0.649][A
 25%|██▍       | 3116/12691 [1:01:49<3:04:59,  1.16s/it, loss=0.652][A
 25%|██▍       | 3117/12691 [1:01:49<3:02:06,  1.14s/it, loss=0.652][A
 25%|██▍       | 3117/12691 [1:01:50<3:02:06,  1.14s/it, loss=0.651][A
 25%|██▍       | 3118/12691 [1:01:50<3:01:59,  1.14s/it, loss=0.651][A
 25%|██▍       | 3118/12691 [1:01:51<3:01:59,  1.14s/it, loss=0.651][A
 25%|██▍       | 3119/12691 [1:01:51<3:03:09,  1.15s/it, loss=0.651][A
 25%|██▍       | 3119/12691 [1:01:52<3:03:09,  1.15s/it, loss=0.65] [A
 25%|██▍       | 3120/12691 [1:01:52<3:02:22,  1.14s/it, loss=0.65][A
 25%|██▍       | 3120/12691 [1:01:53<3:02:22,  1.14s/it, loss=0.651][A
 25%|██▍       | 3121/12691 [1:01:53<3:01:57,  1.14s/it, loss=0.651][A
 25%|██▍       | 3121/12691 [1:01:54<3:01:57,  1.14s/it, loss=0.6

 25%|██▍       | 3171/12691 [1:02:51<2:59:19,  1.13s/it, loss=0.665][A
 25%|██▍       | 3172/12691 [1:02:51<2:59:49,  1.13s/it, loss=0.665][A
 25%|██▍       | 3172/12691 [1:02:52<2:59:49,  1.13s/it, loss=0.668][A
 25%|██▌       | 3173/12691 [1:02:52<2:59:02,  1.13s/it, loss=0.668][A
 25%|██▌       | 3173/12691 [1:02:53<2:59:02,  1.13s/it, loss=0.669][A
 25%|██▌       | 3174/12691 [1:02:53<2:58:12,  1.12s/it, loss=0.669][A
 25%|██▌       | 3174/12691 [1:02:54<2:58:12,  1.12s/it, loss=0.669][A
 25%|██▌       | 3175/12691 [1:02:54<2:57:02,  1.12s/it, loss=0.669][A
 25%|██▌       | 3175/12691 [1:02:55<2:57:02,  1.12s/it, loss=0.67] [A
 25%|██▌       | 3176/12691 [1:02:55<2:56:39,  1.11s/it, loss=0.67][A
 25%|██▌       | 3176/12691 [1:02:56<2:56:39,  1.11s/it, loss=0.667][A
 25%|██▌       | 3177/12691 [1:02:56<2:56:05,  1.11s/it, loss=0.667][A
 25%|██▌       | 3177/12691 [1:02:58<2:56:05,  1.11s/it, loss=0.67] [A
 25%|██▌       | 3178/12691 [1:02:58<3:00:50,  1.14s/it, loss=0.6

 25%|██▌       | 3228/12691 [1:03:56<3:07:13,  1.19s/it, loss=0.675][A
 25%|██▌       | 3228/12691 [1:03:58<3:07:13,  1.19s/it, loss=0.675][A
 25%|██▌       | 3229/12691 [1:03:58<3:08:51,  1.20s/it, loss=0.675][A
 25%|██▌       | 3229/12691 [1:03:59<3:08:51,  1.20s/it, loss=0.676][A
 25%|██▌       | 3230/12691 [1:03:59<3:07:57,  1.19s/it, loss=0.676][A
 25%|██▌       | 3230/12691 [1:04:00<3:07:57,  1.19s/it, loss=0.674][A
 25%|██▌       | 3231/12691 [1:04:00<3:05:39,  1.18s/it, loss=0.674][A
 25%|██▌       | 3231/12691 [1:04:01<3:05:39,  1.18s/it, loss=0.671][A
 25%|██▌       | 3232/12691 [1:04:01<3:04:52,  1.17s/it, loss=0.671][A
 25%|██▌       | 3232/12691 [1:04:02<3:04:52,  1.17s/it, loss=0.669][A
 25%|██▌       | 3233/12691 [1:04:02<3:10:26,  1.21s/it, loss=0.669][A
 25%|██▌       | 3233/12691 [1:04:04<3:10:26,  1.21s/it, loss=0.668][A
 25%|██▌       | 3234/12691 [1:04:04<3:10:41,  1.21s/it, loss=0.668][A
 25%|██▌       | 3234/12691 [1:04:05<3:10:41,  1.21s/it, loss=0.

 26%|██▌       | 3284/12691 [1:05:04<3:07:04,  1.19s/it, loss=0.669][A
 26%|██▌       | 3285/12691 [1:05:04<3:05:32,  1.18s/it, loss=0.669][A
 26%|██▌       | 3285/12691 [1:05:05<3:05:32,  1.18s/it, loss=0.67] [A
 26%|██▌       | 3286/12691 [1:05:05<3:03:49,  1.17s/it, loss=0.67][A
 26%|██▌       | 3286/12691 [1:05:06<3:03:49,  1.17s/it, loss=0.674][A
 26%|██▌       | 3287/12691 [1:05:06<3:02:18,  1.16s/it, loss=0.674][A
 26%|██▌       | 3287/12691 [1:05:07<3:02:18,  1.16s/it, loss=0.677][A
 26%|██▌       | 3288/12691 [1:05:07<3:01:00,  1.15s/it, loss=0.677][A
 26%|██▌       | 3288/12691 [1:05:09<3:01:00,  1.15s/it, loss=0.678][A
 26%|██▌       | 3289/12691 [1:05:09<3:07:43,  1.20s/it, loss=0.678][A
 26%|██▌       | 3289/12691 [1:05:10<3:07:43,  1.20s/it, loss=0.677][A
 26%|██▌       | 3290/12691 [1:05:10<3:06:37,  1.19s/it, loss=0.677][A
 26%|██▌       | 3290/12691 [1:05:11<3:06:37,  1.19s/it, loss=0.678][A
 26%|██▌       | 3291/12691 [1:05:11<3:05:10,  1.18s/it, loss=0.6

 26%|██▋       | 3341/12691 [1:06:11<3:05:31,  1.19s/it, loss=0.662][A
 26%|██▋       | 3341/12691 [1:06:12<3:05:31,  1.19s/it, loss=0.659][A
 26%|██▋       | 3342/12691 [1:06:12<3:04:22,  1.18s/it, loss=0.659][A
 26%|██▋       | 3342/12691 [1:06:13<3:04:22,  1.18s/it, loss=0.658][A
 26%|██▋       | 3343/12691 [1:06:13<3:07:06,  1.20s/it, loss=0.658][A
 26%|██▋       | 3343/12691 [1:06:15<3:07:06,  1.20s/it, loss=0.661][A
 26%|██▋       | 3344/12691 [1:06:15<3:10:44,  1.22s/it, loss=0.661][A
 26%|██▋       | 3344/12691 [1:06:16<3:10:44,  1.22s/it, loss=0.661][A
 26%|██▋       | 3345/12691 [1:06:16<3:08:29,  1.21s/it, loss=0.661][A
 26%|██▋       | 3345/12691 [1:06:17<3:08:29,  1.21s/it, loss=0.662][A
 26%|██▋       | 3346/12691 [1:06:17<3:09:12,  1.21s/it, loss=0.662][A
 26%|██▋       | 3346/12691 [1:06:18<3:09:12,  1.21s/it, loss=0.66] [A
 26%|██▋       | 3347/12691 [1:06:18<3:08:51,  1.21s/it, loss=0.66][A
 26%|██▋       | 3347/12691 [1:06:19<3:08:51,  1.21s/it, loss=0.6

 27%|██▋       | 3397/12691 [1:07:20<3:15:08,  1.26s/it, loss=0.658][A
 27%|██▋       | 3398/12691 [1:07:20<3:14:09,  1.25s/it, loss=0.658][A
 27%|██▋       | 3398/12691 [1:07:22<3:14:09,  1.25s/it, loss=0.66] [A
 27%|██▋       | 3399/12691 [1:07:22<3:12:40,  1.24s/it, loss=0.66][A
 27%|██▋       | 3399/12691 [1:07:23<3:12:40,  1.24s/it, loss=0.663][A
 27%|██▋       | 3400/12691 [1:07:23<3:18:50,  1.28s/it, loss=0.663][A
 27%|██▋       | 3400/12691 [1:07:24<3:18:50,  1.28s/it, loss=0.662][A
 27%|██▋       | 3401/12691 [1:07:24<3:16:05,  1.27s/it, loss=0.662][A
 27%|██▋       | 3401/12691 [1:07:25<3:16:05,  1.27s/it, loss=0.663][A
 27%|██▋       | 3402/12691 [1:07:25<3:13:38,  1.25s/it, loss=0.663][A
 27%|██▋       | 3402/12691 [1:07:27<3:13:38,  1.25s/it, loss=0.663][A
 27%|██▋       | 3403/12691 [1:07:27<3:11:54,  1.24s/it, loss=0.663][A
 27%|██▋       | 3403/12691 [1:07:28<3:11:54,  1.24s/it, loss=0.664][A
 27%|██▋       | 3404/12691 [1:07:28<3:12:46,  1.25s/it, loss=0.6

 27%|██▋       | 3454/12691 [1:08:30<3:06:58,  1.21s/it, loss=0.666][A
 27%|██▋       | 3454/12691 [1:08:31<3:06:58,  1.21s/it, loss=0.664][A
 27%|██▋       | 3455/12691 [1:08:31<3:11:08,  1.24s/it, loss=0.664][A
 27%|██▋       | 3455/12691 [1:08:32<3:11:08,  1.24s/it, loss=0.662][A
 27%|██▋       | 3456/12691 [1:08:32<3:11:37,  1.25s/it, loss=0.662][A
 27%|██▋       | 3456/12691 [1:08:33<3:11:37,  1.25s/it, loss=0.662][A
 27%|██▋       | 3457/12691 [1:08:33<3:08:28,  1.22s/it, loss=0.662][A
 27%|██▋       | 3457/12691 [1:08:35<3:08:28,  1.22s/it, loss=0.664][A
 27%|██▋       | 3458/12691 [1:08:35<3:07:10,  1.22s/it, loss=0.664][A
 27%|██▋       | 3458/12691 [1:08:36<3:07:10,  1.22s/it, loss=0.664][A
 27%|██▋       | 3459/12691 [1:08:36<3:06:49,  1.21s/it, loss=0.664][A
 27%|██▋       | 3459/12691 [1:08:37<3:06:49,  1.21s/it, loss=0.663][A
 27%|██▋       | 3460/12691 [1:08:37<3:07:45,  1.22s/it, loss=0.663][A
 27%|██▋       | 3460/12691 [1:08:38<3:07:45,  1.22s/it, loss=0.

 28%|██▊       | 3510/12691 [1:09:37<3:04:23,  1.21s/it, loss=0.659][A
 28%|██▊       | 3511/12691 [1:09:37<3:04:04,  1.20s/it, loss=0.659][A
 28%|██▊       | 3511/12691 [1:09:39<3:04:04,  1.20s/it, loss=0.658][A
 28%|██▊       | 3512/12691 [1:09:39<3:03:31,  1.20s/it, loss=0.658][A
 28%|██▊       | 3512/12691 [1:09:40<3:03:31,  1.20s/it, loss=0.66] [A
 28%|██▊       | 3513/12691 [1:09:40<3:02:05,  1.19s/it, loss=0.66][A
 28%|██▊       | 3513/12691 [1:09:41<3:02:05,  1.19s/it, loss=0.66][A
 28%|██▊       | 3514/12691 [1:09:41<2:59:39,  1.17s/it, loss=0.66][A
 28%|██▊       | 3514/12691 [1:09:42<2:59:39,  1.17s/it, loss=0.66][A
 28%|██▊       | 3515/12691 [1:09:42<2:57:56,  1.16s/it, loss=0.66][A
 28%|██▊       | 3515/12691 [1:09:43<2:57:56,  1.16s/it, loss=0.657][A
 28%|██▊       | 3516/12691 [1:09:43<2:55:57,  1.15s/it, loss=0.657][A
 28%|██▊       | 3516/12691 [1:09:44<2:55:57,  1.15s/it, loss=0.656][A
 28%|██▊       | 3517/12691 [1:09:44<3:01:20,  1.19s/it, loss=0.656]

 28%|██▊       | 3567/12691 [1:10:43<3:01:07,  1.19s/it, loss=0.659][A
 28%|██▊       | 3567/12691 [1:10:44<3:01:07,  1.19s/it, loss=0.657][A
 28%|██▊       | 3568/12691 [1:10:44<2:58:18,  1.17s/it, loss=0.657][A
 28%|██▊       | 3568/12691 [1:10:46<2:58:18,  1.17s/it, loss=0.657][A
 28%|██▊       | 3569/12691 [1:10:46<2:57:40,  1.17s/it, loss=0.657][A
 28%|██▊       | 3569/12691 [1:10:47<2:57:40,  1.17s/it, loss=0.658][A
 28%|██▊       | 3570/12691 [1:10:47<2:56:42,  1.16s/it, loss=0.658][A
 28%|██▊       | 3570/12691 [1:10:48<2:56:42,  1.16s/it, loss=0.657][A
 28%|██▊       | 3571/12691 [1:10:48<2:56:10,  1.16s/it, loss=0.657][A
 28%|██▊       | 3571/12691 [1:10:49<2:56:10,  1.16s/it, loss=0.656][A
 28%|██▊       | 3572/12691 [1:10:49<3:05:10,  1.22s/it, loss=0.656][A
 28%|██▊       | 3572/12691 [1:10:51<3:05:10,  1.22s/it, loss=0.661][A
 28%|██▊       | 3573/12691 [1:10:51<3:06:52,  1.23s/it, loss=0.661][A
 28%|██▊       | 3573/12691 [1:10:52<3:06:52,  1.23s/it, loss=0.

 29%|██▊       | 3623/12691 [1:11:54<3:04:20,  1.22s/it, loss=0.663][A
 29%|██▊       | 3624/12691 [1:11:54<3:01:15,  1.20s/it, loss=0.663][A
 29%|██▊       | 3624/12691 [1:11:55<3:01:15,  1.20s/it, loss=0.663][A
 29%|██▊       | 3625/12691 [1:11:55<3:03:30,  1.21s/it, loss=0.663][A
 29%|██▊       | 3625/12691 [1:11:56<3:03:30,  1.21s/it, loss=0.662][A
 29%|██▊       | 3626/12691 [1:11:56<3:00:36,  1.20s/it, loss=0.662][A
 29%|██▊       | 3626/12691 [1:11:57<3:00:36,  1.20s/it, loss=0.662][A
 29%|██▊       | 3627/12691 [1:11:57<3:01:47,  1.20s/it, loss=0.662][A
 29%|██▊       | 3627/12691 [1:11:58<3:01:47,  1.20s/it, loss=0.659][A
 29%|██▊       | 3628/12691 [1:11:58<2:58:27,  1.18s/it, loss=0.659][A
 29%|██▊       | 3628/12691 [1:12:00<2:58:27,  1.18s/it, loss=0.66] [A
 29%|██▊       | 3629/12691 [1:12:00<2:56:56,  1.17s/it, loss=0.66][A
 29%|██▊       | 3629/12691 [1:12:01<2:56:56,  1.17s/it, loss=0.662][A
 29%|██▊       | 3630/12691 [1:12:01<2:56:03,  1.17s/it, loss=0.6

 29%|██▉       | 3680/12691 [1:13:01<2:54:49,  1.16s/it, loss=0.645][A
 29%|██▉       | 3680/12691 [1:13:02<2:54:49,  1.16s/it, loss=0.649][A
 29%|██▉       | 3681/12691 [1:13:02<2:53:44,  1.16s/it, loss=0.649][A
 29%|██▉       | 3681/12691 [1:13:03<2:53:44,  1.16s/it, loss=0.648][A
 29%|██▉       | 3682/12691 [1:13:03<2:57:53,  1.18s/it, loss=0.648][A
 29%|██▉       | 3682/12691 [1:13:05<2:57:53,  1.18s/it, loss=0.644][A
 29%|██▉       | 3683/12691 [1:13:05<3:04:22,  1.23s/it, loss=0.644][A
 29%|██▉       | 3683/12691 [1:13:06<3:04:22,  1.23s/it, loss=0.644][A
 29%|██▉       | 3684/12691 [1:13:06<3:04:21,  1.23s/it, loss=0.644][A
 29%|██▉       | 3684/12691 [1:13:07<3:04:21,  1.23s/it, loss=0.644][A
 29%|██▉       | 3685/12691 [1:13:07<3:01:46,  1.21s/it, loss=0.644][A
 29%|██▉       | 3685/12691 [1:13:08<3:01:46,  1.21s/it, loss=0.643][A
 29%|██▉       | 3686/12691 [1:13:08<2:59:07,  1.19s/it, loss=0.643][A
 29%|██▉       | 3686/12691 [1:13:09<2:59:07,  1.19s/it, loss=0.

 29%|██▉       | 3736/12691 [1:14:07<2:56:33,  1.18s/it, loss=0.656][A
 29%|██▉       | 3737/12691 [1:14:07<2:53:55,  1.17s/it, loss=0.656][A
 29%|██▉       | 3737/12691 [1:14:08<2:53:55,  1.17s/it, loss=0.656][A
 29%|██▉       | 3738/12691 [1:14:08<2:51:34,  1.15s/it, loss=0.656][A
 29%|██▉       | 3738/12691 [1:14:09<2:51:34,  1.15s/it, loss=0.657][A
 29%|██▉       | 3739/12691 [1:14:09<2:55:23,  1.18s/it, loss=0.657][A
 29%|██▉       | 3739/12691 [1:14:10<2:55:23,  1.18s/it, loss=0.657][A
 29%|██▉       | 3740/12691 [1:14:10<2:52:02,  1.15s/it, loss=0.657][A
 29%|██▉       | 3740/12691 [1:14:11<2:52:02,  1.15s/it, loss=0.656][A
 29%|██▉       | 3741/12691 [1:14:11<2:53:27,  1.16s/it, loss=0.656][A
 29%|██▉       | 3741/12691 [1:14:12<2:53:27,  1.16s/it, loss=0.656][A
 29%|██▉       | 3742/12691 [1:14:12<2:52:59,  1.16s/it, loss=0.656][A
 29%|██▉       | 3742/12691 [1:14:13<2:52:59,  1.16s/it, loss=0.655][A
 29%|██▉       | 3743/12691 [1:14:13<2:51:04,  1.15s/it, loss=0.

 30%|██▉       | 3793/12691 [1:15:12<2:48:39,  1.14s/it, loss=0.661][A
 30%|██▉       | 3793/12691 [1:15:13<2:48:39,  1.14s/it, loss=0.661][A
 30%|██▉       | 3794/12691 [1:15:13<2:53:09,  1.17s/it, loss=0.661][A
 30%|██▉       | 3794/12691 [1:15:15<2:53:09,  1.17s/it, loss=0.662][A
 30%|██▉       | 3795/12691 [1:15:15<2:51:23,  1.16s/it, loss=0.662][A
 30%|██▉       | 3795/12691 [1:15:16<2:51:23,  1.16s/it, loss=0.66] [A
 30%|██▉       | 3796/12691 [1:15:16<2:50:08,  1.15s/it, loss=0.66][A
 30%|██▉       | 3796/12691 [1:15:17<2:50:08,  1.15s/it, loss=0.66][A
 30%|██▉       | 3797/12691 [1:15:17<2:49:18,  1.14s/it, loss=0.66][A
 30%|██▉       | 3797/12691 [1:15:18<2:49:18,  1.14s/it, loss=0.662][A
 30%|██▉       | 3798/12691 [1:15:18<2:47:47,  1.13s/it, loss=0.662][A
 30%|██▉       | 3798/12691 [1:15:19<2:47:47,  1.13s/it, loss=0.663][A
 30%|██▉       | 3799/12691 [1:15:19<2:48:04,  1.13s/it, loss=0.663][A
 30%|██▉       | 3799/12691 [1:15:20<2:48:04,  1.13s/it, loss=0.664

 30%|███       | 3849/12691 [1:16:21<2:58:11,  1.21s/it, loss=0.65][A
 30%|███       | 3850/12691 [1:16:21<3:04:07,  1.25s/it, loss=0.65][A
 30%|███       | 3850/12691 [1:16:22<3:04:07,  1.25s/it, loss=0.651][A
 30%|███       | 3851/12691 [1:16:22<2:58:26,  1.21s/it, loss=0.651][A
 30%|███       | 3851/12691 [1:16:23<2:58:26,  1.21s/it, loss=0.651][A
 30%|███       | 3852/12691 [1:16:23<2:55:31,  1.19s/it, loss=0.651][A
 30%|███       | 3852/12691 [1:16:24<2:55:31,  1.19s/it, loss=0.654][A
 30%|███       | 3853/12691 [1:16:24<2:53:57,  1.18s/it, loss=0.654][A
 30%|███       | 3853/12691 [1:16:26<2:53:57,  1.18s/it, loss=0.655][A
 30%|███       | 3854/12691 [1:16:26<2:52:47,  1.17s/it, loss=0.655][A
 30%|███       | 3854/12691 [1:16:27<2:52:47,  1.17s/it, loss=0.656][A
 30%|███       | 3855/12691 [1:16:27<2:50:07,  1.16s/it, loss=0.656][A
 30%|███       | 3855/12691 [1:16:28<2:50:07,  1.16s/it, loss=0.653][A
 30%|███       | 3856/12691 [1:16:28<2:51:07,  1.16s/it, loss=0.65

 31%|███       | 3906/12691 [1:17:28<2:55:42,  1.20s/it, loss=0.643][A
 31%|███       | 3907/12691 [1:17:28<2:52:50,  1.18s/it, loss=0.643][A
 31%|███       | 3907/12691 [1:17:29<2:52:50,  1.18s/it, loss=0.643][A
 31%|███       | 3908/12691 [1:17:29<2:56:13,  1.20s/it, loss=0.643][A
 31%|███       | 3908/12691 [1:17:30<2:56:13,  1.20s/it, loss=0.64] [A
 31%|███       | 3909/12691 [1:17:30<2:58:24,  1.22s/it, loss=0.64][A
 31%|███       | 3909/12691 [1:17:32<2:58:24,  1.22s/it, loss=0.644][A
 31%|███       | 3910/12691 [1:17:32<3:00:40,  1.23s/it, loss=0.644][A
 31%|███       | 3910/12691 [1:17:33<3:00:40,  1.23s/it, loss=0.643][A
 31%|███       | 3911/12691 [1:17:33<3:01:05,  1.24s/it, loss=0.643][A
 31%|███       | 3911/12691 [1:17:34<3:01:05,  1.24s/it, loss=0.64] [A
 31%|███       | 3912/12691 [1:17:34<3:08:25,  1.29s/it, loss=0.64][A
 31%|███       | 3912/12691 [1:17:36<3:08:25,  1.29s/it, loss=0.637][A
 31%|███       | 3913/12691 [1:17:36<3:05:23,  1.27s/it, loss=0.63

 31%|███       | 3963/12691 [1:18:39<2:53:10,  1.19s/it, loss=0.643][A
 31%|███       | 3964/12691 [1:18:39<2:55:24,  1.21s/it, loss=0.643][A
 31%|███       | 3964/12691 [1:18:40<2:55:24,  1.21s/it, loss=0.641][A
 31%|███       | 3965/12691 [1:18:40<2:55:10,  1.20s/it, loss=0.641][A
 31%|███       | 3965/12691 [1:18:41<2:55:10,  1.20s/it, loss=0.638][A
 31%|███▏      | 3966/12691 [1:18:41<2:56:42,  1.22s/it, loss=0.638][A
 31%|███▏      | 3966/12691 [1:18:43<2:56:42,  1.22s/it, loss=0.638][A
 31%|███▏      | 3967/12691 [1:18:43<3:02:28,  1.26s/it, loss=0.638][A
 31%|███▏      | 3967/12691 [1:18:44<3:02:28,  1.26s/it, loss=0.638][A
 31%|███▏      | 3968/12691 [1:18:44<3:03:07,  1.26s/it, loss=0.638][A
 31%|███▏      | 3968/12691 [1:18:45<3:03:07,  1.26s/it, loss=0.64] [A
 31%|███▏      | 3969/12691 [1:18:45<2:59:57,  1.24s/it, loss=0.64][A
 31%|███▏      | 3969/12691 [1:18:46<2:59:57,  1.24s/it, loss=0.638][A
 31%|███▏      | 3970/12691 [1:18:46<2:55:26,  1.21s/it, loss=0.6

saving model checkpoint at iteration=4000



 32%|███▏      | 4001/12691 [1:19:25<3:33:50,  1.48s/it, loss=0.654][A
 32%|███▏      | 4001/12691 [1:19:26<3:33:50,  1.48s/it, loss=0.652][A
 32%|███▏      | 4002/12691 [1:19:26<3:19:26,  1.38s/it, loss=0.652][A
 32%|███▏      | 4002/12691 [1:19:27<3:19:26,  1.38s/it, loss=0.652][A
 32%|███▏      | 4003/12691 [1:19:27<3:08:25,  1.30s/it, loss=0.652][A
 32%|███▏      | 4003/12691 [1:19:29<3:08:25,  1.30s/it, loss=0.653][A
 32%|███▏      | 4004/12691 [1:19:29<3:09:10,  1.31s/it, loss=0.653][A
 32%|███▏      | 4004/12691 [1:19:30<3:09:10,  1.31s/it, loss=0.655][A
 32%|███▏      | 4005/12691 [1:19:30<3:00:48,  1.25s/it, loss=0.655][A
 32%|███▏      | 4005/12691 [1:19:31<3:00:48,  1.25s/it, loss=0.651][A
 32%|███▏      | 4006/12691 [1:19:31<2:56:20,  1.22s/it, loss=0.651][A
 32%|███▏      | 4006/12691 [1:19:32<2:56:20,  1.22s/it, loss=0.652][A
 32%|███▏      | 4007/12691 [1:19:32<2:52:27,  1.19s/it, loss=0.652][A
 32%|███▏      | 4007/12691 [1:19:33<2:52:27,  1.19s/it, loss=0

 32%|███▏      | 4057/12691 [1:20:32<2:53:20,  1.20s/it, loss=0.651][A
 32%|███▏      | 4058/12691 [1:20:32<2:50:59,  1.19s/it, loss=0.651][A
 32%|███▏      | 4058/12691 [1:20:34<2:50:59,  1.19s/it, loss=0.651][A
 32%|███▏      | 4059/12691 [1:20:34<2:50:13,  1.18s/it, loss=0.651][A
 32%|███▏      | 4059/12691 [1:20:35<2:50:13,  1.18s/it, loss=0.648][A
 32%|███▏      | 4060/12691 [1:20:35<2:56:08,  1.22s/it, loss=0.648][A
 32%|███▏      | 4060/12691 [1:20:36<2:56:08,  1.22s/it, loss=0.645][A
 32%|███▏      | 4061/12691 [1:20:36<2:52:16,  1.20s/it, loss=0.645][A
 32%|███▏      | 4061/12691 [1:20:37<2:52:16,  1.20s/it, loss=0.644][A
 32%|███▏      | 4062/12691 [1:20:37<2:50:36,  1.19s/it, loss=0.644][A
 32%|███▏      | 4062/12691 [1:20:38<2:50:36,  1.19s/it, loss=0.643][A
 32%|███▏      | 4063/12691 [1:20:38<2:51:56,  1.20s/it, loss=0.643][A
 32%|███▏      | 4063/12691 [1:20:40<2:51:56,  1.20s/it, loss=0.644][A
 32%|███▏      | 4064/12691 [1:20:40<2:50:32,  1.19s/it, loss=0.

 32%|███▏      | 4114/12691 [1:21:40<2:53:23,  1.21s/it, loss=0.67][A
 32%|███▏      | 4114/12691 [1:21:41<2:53:23,  1.21s/it, loss=0.67][A
 32%|███▏      | 4115/12691 [1:21:41<2:54:57,  1.22s/it, loss=0.67][A
 32%|███▏      | 4115/12691 [1:21:42<2:54:57,  1.22s/it, loss=0.672][A
 32%|███▏      | 4116/12691 [1:21:42<2:51:45,  1.20s/it, loss=0.672][A
 32%|███▏      | 4116/12691 [1:21:44<2:51:45,  1.20s/it, loss=0.67] [A
 32%|███▏      | 4117/12691 [1:21:44<2:53:52,  1.22s/it, loss=0.67][A
 32%|███▏      | 4117/12691 [1:21:45<2:53:52,  1.22s/it, loss=0.669][A
 32%|███▏      | 4118/12691 [1:21:45<2:51:17,  1.20s/it, loss=0.669][A
 32%|███▏      | 4118/12691 [1:21:46<2:51:17,  1.20s/it, loss=0.669][A
 32%|███▏      | 4119/12691 [1:21:46<2:48:36,  1.18s/it, loss=0.669][A
 32%|███▏      | 4119/12691 [1:21:47<2:48:36,  1.18s/it, loss=0.666][A
 32%|███▏      | 4120/12691 [1:21:47<2:49:33,  1.19s/it, loss=0.666][A
 32%|███▏      | 4120/12691 [1:21:48<2:49:33,  1.19s/it, loss=0.664]

 33%|███▎      | 4171/12691 [1:22:48<2:50:28,  1.20s/it, loss=0.645][A
 33%|███▎      | 4171/12691 [1:22:49<2:50:28,  1.20s/it, loss=0.645][A
 33%|███▎      | 4172/12691 [1:22:49<2:48:34,  1.19s/it, loss=0.645][A
 33%|███▎      | 4172/12691 [1:22:50<2:48:34,  1.19s/it, loss=0.645][A
 33%|███▎      | 4173/12691 [1:22:50<2:47:23,  1.18s/it, loss=0.645][A
 33%|███▎      | 4173/12691 [1:22:51<2:47:23,  1.18s/it, loss=0.646][A
 33%|███▎      | 4174/12691 [1:22:51<2:46:05,  1.17s/it, loss=0.646][A
 33%|███▎      | 4174/12691 [1:22:53<2:46:05,  1.17s/it, loss=0.648][A
 33%|███▎      | 4175/12691 [1:22:53<2:44:54,  1.16s/it, loss=0.648][A
 33%|███▎      | 4175/12691 [1:22:54<2:44:54,  1.16s/it, loss=0.65] [A
 33%|███▎      | 4176/12691 [1:22:54<2:44:25,  1.16s/it, loss=0.65][A
 33%|███▎      | 4176/12691 [1:22:55<2:44:25,  1.16s/it, loss=0.649][A
 33%|███▎      | 4177/12691 [1:22:55<2:52:09,  1.21s/it, loss=0.649][A
 33%|███▎      | 4177/12691 [1:22:56<2:52:09,  1.21s/it, loss=0.6

 33%|███▎      | 4227/12691 [1:23:54<2:41:32,  1.15s/it, loss=0.657][A
 33%|███▎      | 4228/12691 [1:23:54<2:42:37,  1.15s/it, loss=0.657][A
 33%|███▎      | 4228/12691 [1:23:55<2:42:37,  1.15s/it, loss=0.654][A
 33%|███▎      | 4229/12691 [1:23:55<2:42:04,  1.15s/it, loss=0.654][A
 33%|███▎      | 4229/12691 [1:23:56<2:42:04,  1.15s/it, loss=0.656][A
 33%|███▎      | 4230/12691 [1:23:56<2:40:06,  1.14s/it, loss=0.656][A
 33%|███▎      | 4230/12691 [1:23:57<2:40:06,  1.14s/it, loss=0.653][A
 33%|███▎      | 4231/12691 [1:23:57<2:38:57,  1.13s/it, loss=0.653][A
 33%|███▎      | 4231/12691 [1:23:59<2:38:57,  1.13s/it, loss=0.653][A
 33%|███▎      | 4232/12691 [1:23:59<2:41:56,  1.15s/it, loss=0.653][A
 33%|███▎      | 4232/12691 [1:24:00<2:41:56,  1.15s/it, loss=0.648][A
 33%|███▎      | 4233/12691 [1:24:00<2:41:50,  1.15s/it, loss=0.648][A
 33%|███▎      | 4233/12691 [1:24:01<2:41:50,  1.15s/it, loss=0.651][A
 33%|███▎      | 4234/12691 [1:24:01<2:40:41,  1.14s/it, loss=0.

 34%|███▍      | 4284/12691 [1:24:59<2:45:30,  1.18s/it, loss=0.654][A
 34%|███▍      | 4284/12691 [1:25:00<2:45:30,  1.18s/it, loss=0.654][A
 34%|███▍      | 4285/12691 [1:25:00<2:43:45,  1.17s/it, loss=0.654][A
 34%|███▍      | 4285/12691 [1:25:01<2:43:45,  1.17s/it, loss=0.654][A
 34%|███▍      | 4286/12691 [1:25:01<2:44:31,  1.17s/it, loss=0.654][A
 34%|███▍      | 4286/12691 [1:25:02<2:44:31,  1.17s/it, loss=0.654][A
 34%|███▍      | 4287/12691 [1:25:02<2:43:15,  1.17s/it, loss=0.654][A
 34%|███▍      | 4287/12691 [1:25:04<2:43:15,  1.17s/it, loss=0.651][A
 34%|███▍      | 4288/12691 [1:25:04<2:47:58,  1.20s/it, loss=0.651][A
 34%|███▍      | 4288/12691 [1:25:05<2:47:58,  1.20s/it, loss=0.653][A
 34%|███▍      | 4289/12691 [1:25:05<2:45:22,  1.18s/it, loss=0.653][A
 34%|███▍      | 4289/12691 [1:25:06<2:45:22,  1.18s/it, loss=0.653][A
 34%|███▍      | 4290/12691 [1:25:06<2:43:47,  1.17s/it, loss=0.653][A
 34%|███▍      | 4290/12691 [1:25:07<2:43:47,  1.17s/it, loss=0.

 34%|███▍      | 4340/12691 [1:26:05<2:42:00,  1.16s/it, loss=0.657][A
 34%|███▍      | 4341/12691 [1:26:05<2:40:44,  1.16s/it, loss=0.657][A
 34%|███▍      | 4341/12691 [1:26:06<2:40:44,  1.16s/it, loss=0.654][A
 34%|███▍      | 4342/12691 [1:26:06<2:39:57,  1.15s/it, loss=0.654][A
 34%|███▍      | 4342/12691 [1:26:08<2:39:57,  1.15s/it, loss=0.656][A
 34%|███▍      | 4343/12691 [1:26:08<2:41:25,  1.16s/it, loss=0.656][A
 34%|███▍      | 4343/12691 [1:26:09<2:41:25,  1.16s/it, loss=0.657][A
 34%|███▍      | 4344/12691 [1:26:09<2:41:02,  1.16s/it, loss=0.657][A
 34%|███▍      | 4344/12691 [1:26:10<2:41:02,  1.16s/it, loss=0.657][A
 34%|███▍      | 4345/12691 [1:26:10<2:40:13,  1.15s/it, loss=0.657][A
 34%|███▍      | 4345/12691 [1:26:11<2:40:13,  1.15s/it, loss=0.657][A
 34%|███▍      | 4346/12691 [1:26:11<2:43:34,  1.18s/it, loss=0.657][A
 34%|███▍      | 4346/12691 [1:26:12<2:43:34,  1.18s/it, loss=0.655][A
 34%|███▍      | 4347/12691 [1:26:12<2:43:19,  1.17s/it, loss=0.

 35%|███▍      | 4397/12691 [1:27:10<2:37:46,  1.14s/it, loss=0.66][A
 35%|███▍      | 4397/12691 [1:27:11<2:37:46,  1.14s/it, loss=0.663][A
 35%|███▍      | 4398/12691 [1:27:11<2:40:04,  1.16s/it, loss=0.663][A
 35%|███▍      | 4398/12691 [1:27:12<2:40:04,  1.16s/it, loss=0.66] [A
 35%|███▍      | 4399/12691 [1:27:12<2:45:48,  1.20s/it, loss=0.66][A
 35%|███▍      | 4399/12691 [1:27:14<2:45:48,  1.20s/it, loss=0.66][A
 35%|███▍      | 4400/12691 [1:27:14<2:42:38,  1.18s/it, loss=0.66][A
 35%|███▍      | 4400/12691 [1:27:15<2:42:38,  1.18s/it, loss=0.659][A
 35%|███▍      | 4401/12691 [1:27:15<2:41:47,  1.17s/it, loss=0.659][A
 35%|███▍      | 4401/12691 [1:27:16<2:41:47,  1.17s/it, loss=0.658][A
 35%|███▍      | 4402/12691 [1:27:16<2:39:33,  1.15s/it, loss=0.658][A
 35%|███▍      | 4402/12691 [1:27:17<2:39:33,  1.15s/it, loss=0.659][A
 35%|███▍      | 4403/12691 [1:27:17<2:38:02,  1.14s/it, loss=0.659][A
 35%|███▍      | 4403/12691 [1:27:18<2:38:02,  1.14s/it, loss=0.66] 

 35%|███▌      | 4453/12691 [1:28:16<2:40:50,  1.17s/it, loss=0.651][A
 35%|███▌      | 4454/12691 [1:28:16<2:38:37,  1.16s/it, loss=0.651][A
 35%|███▌      | 4454/12691 [1:28:17<2:38:37,  1.16s/it, loss=0.649][A
 35%|███▌      | 4455/12691 [1:28:17<2:40:56,  1.17s/it, loss=0.649][A
 35%|███▌      | 4455/12691 [1:28:18<2:40:56,  1.17s/it, loss=0.652][A
 35%|███▌      | 4456/12691 [1:28:18<2:39:56,  1.17s/it, loss=0.652][A
 35%|███▌      | 4456/12691 [1:28:20<2:39:56,  1.17s/it, loss=0.653][A
 35%|███▌      | 4457/12691 [1:28:20<2:42:47,  1.19s/it, loss=0.653][A
 35%|███▌      | 4457/12691 [1:28:21<2:42:47,  1.19s/it, loss=0.655][A
 35%|███▌      | 4458/12691 [1:28:21<2:42:23,  1.18s/it, loss=0.655][A
 35%|███▌      | 4458/12691 [1:28:22<2:42:23,  1.18s/it, loss=0.659][A
 35%|███▌      | 4459/12691 [1:28:22<2:40:36,  1.17s/it, loss=0.659][A
 35%|███▌      | 4459/12691 [1:28:23<2:40:36,  1.17s/it, loss=0.658][A
 35%|███▌      | 4460/12691 [1:28:23<2:41:39,  1.18s/it, loss=0.

 36%|███▌      | 4510/12691 [1:29:22<2:38:17,  1.16s/it, loss=0.655][A
 36%|███▌      | 4510/12691 [1:29:23<2:38:17,  1.16s/it, loss=0.656][A
 36%|███▌      | 4511/12691 [1:29:23<2:45:06,  1.21s/it, loss=0.656][A
 36%|███▌      | 4511/12691 [1:29:24<2:45:06,  1.21s/it, loss=0.659][A
 36%|███▌      | 4512/12691 [1:29:24<2:44:28,  1.21s/it, loss=0.659][A
 36%|███▌      | 4512/12691 [1:29:25<2:44:28,  1.21s/it, loss=0.657][A
 36%|███▌      | 4513/12691 [1:29:25<2:40:37,  1.18s/it, loss=0.657][A
 36%|███▌      | 4513/12691 [1:29:26<2:40:37,  1.18s/it, loss=0.658][A
 36%|███▌      | 4514/12691 [1:29:26<2:39:13,  1.17s/it, loss=0.658][A
 36%|███▌      | 4514/12691 [1:29:28<2:39:13,  1.17s/it, loss=0.656][A
 36%|███▌      | 4515/12691 [1:29:28<2:39:20,  1.17s/it, loss=0.656][A
 36%|███▌      | 4515/12691 [1:29:29<2:39:20,  1.17s/it, loss=0.658][A
 36%|███▌      | 4516/12691 [1:29:29<2:39:28,  1.17s/it, loss=0.658][A
 36%|███▌      | 4516/12691 [1:29:30<2:39:28,  1.17s/it, loss=0.

 36%|███▌      | 4566/12691 [1:30:28<2:40:58,  1.19s/it, loss=0.662][A
 36%|███▌      | 4567/12691 [1:30:28<2:39:51,  1.18s/it, loss=0.662][A
 36%|███▌      | 4567/12691 [1:30:29<2:39:51,  1.18s/it, loss=0.664][A
 36%|███▌      | 4568/12691 [1:30:29<2:38:20,  1.17s/it, loss=0.664][A
 36%|███▌      | 4568/12691 [1:30:30<2:38:20,  1.17s/it, loss=0.665][A
 36%|███▌      | 4569/12691 [1:30:30<2:33:48,  1.14s/it, loss=0.665][A
 36%|███▌      | 4569/12691 [1:30:31<2:33:48,  1.14s/it, loss=0.666][A
 36%|███▌      | 4570/12691 [1:30:31<2:34:08,  1.14s/it, loss=0.666][A
 36%|███▌      | 4570/12691 [1:30:32<2:34:08,  1.14s/it, loss=0.666][A
 36%|███▌      | 4571/12691 [1:30:32<2:35:11,  1.15s/it, loss=0.666][A
 36%|███▌      | 4571/12691 [1:30:34<2:35:11,  1.15s/it, loss=0.669][A
 36%|███▌      | 4572/12691 [1:30:34<2:40:27,  1.19s/it, loss=0.669][A
 36%|███▌      | 4572/12691 [1:30:35<2:40:27,  1.19s/it, loss=0.67] [A
 36%|███▌      | 4573/12691 [1:30:35<2:39:13,  1.18s/it, loss=0.

 36%|███▋      | 4623/12691 [1:31:36<2:51:19,  1.27s/it, loss=0.657][A
 36%|███▋      | 4623/12691 [1:31:37<2:51:19,  1.27s/it, loss=0.661][A
 36%|███▋      | 4624/12691 [1:31:37<2:49:05,  1.26s/it, loss=0.661][A
 36%|███▋      | 4624/12691 [1:31:38<2:49:05,  1.26s/it, loss=0.664][A
 36%|███▋      | 4625/12691 [1:31:38<2:47:19,  1.24s/it, loss=0.664][A
 36%|███▋      | 4625/12691 [1:31:39<2:47:19,  1.24s/it, loss=0.665][A
 36%|███▋      | 4626/12691 [1:31:39<2:43:27,  1.22s/it, loss=0.665][A
 36%|███▋      | 4626/12691 [1:31:41<2:43:27,  1.22s/it, loss=0.664][A
 36%|███▋      | 4627/12691 [1:31:41<2:40:22,  1.19s/it, loss=0.664][A
 36%|███▋      | 4627/12691 [1:31:42<2:40:22,  1.19s/it, loss=0.662][A
 36%|███▋      | 4628/12691 [1:31:42<2:43:21,  1.22s/it, loss=0.662][A
 36%|███▋      | 4628/12691 [1:31:43<2:43:21,  1.22s/it, loss=0.661][A
 36%|███▋      | 4629/12691 [1:31:43<2:41:15,  1.20s/it, loss=0.661][A
 36%|███▋      | 4629/12691 [1:31:44<2:41:15,  1.20s/it, loss=0.

 37%|███▋      | 4679/12691 [1:32:45<2:38:41,  1.19s/it, loss=0.651][A
 37%|███▋      | 4680/12691 [1:32:45<2:38:36,  1.19s/it, loss=0.651][A
 37%|███▋      | 4680/12691 [1:32:46<2:38:36,  1.19s/it, loss=0.649][A
 37%|███▋      | 4681/12691 [1:32:46<2:40:00,  1.20s/it, loss=0.649][A
 37%|███▋      | 4681/12691 [1:32:47<2:40:00,  1.20s/it, loss=0.65] [A
 37%|███▋      | 4682/12691 [1:32:47<2:35:10,  1.16s/it, loss=0.65][A
 37%|███▋      | 4682/12691 [1:32:48<2:35:10,  1.16s/it, loss=0.651][A
 37%|███▋      | 4683/12691 [1:32:48<2:33:10,  1.15s/it, loss=0.651][A
 37%|███▋      | 4683/12691 [1:32:49<2:33:10,  1.15s/it, loss=0.652][A
 37%|███▋      | 4684/12691 [1:32:49<2:37:16,  1.18s/it, loss=0.652][A
 37%|███▋      | 4684/12691 [1:32:50<2:37:16,  1.18s/it, loss=0.653][A
 37%|███▋      | 4685/12691 [1:32:50<2:38:10,  1.19s/it, loss=0.653][A
 37%|███▋      | 4685/12691 [1:32:52<2:38:10,  1.19s/it, loss=0.653][A
 37%|███▋      | 4686/12691 [1:32:52<2:36:39,  1.17s/it, loss=0.6

 37%|███▋      | 4736/12691 [1:33:50<2:32:10,  1.15s/it, loss=0.65][A
 37%|███▋      | 4736/12691 [1:33:51<2:32:10,  1.15s/it, loss=0.648][A
 37%|███▋      | 4737/12691 [1:33:51<2:35:00,  1.17s/it, loss=0.648][A
 37%|███▋      | 4737/12691 [1:33:52<2:35:00,  1.17s/it, loss=0.65] [A
 37%|███▋      | 4738/12691 [1:33:52<2:37:56,  1.19s/it, loss=0.65][A
 37%|███▋      | 4738/12691 [1:33:53<2:37:56,  1.19s/it, loss=0.649][A
 37%|███▋      | 4739/12691 [1:33:53<2:43:56,  1.24s/it, loss=0.649][A
 37%|███▋      | 4739/12691 [1:33:55<2:43:56,  1.24s/it, loss=0.65] [A
 37%|███▋      | 4740/12691 [1:33:55<2:42:48,  1.23s/it, loss=0.65][A
 37%|███▋      | 4740/12691 [1:33:56<2:42:48,  1.23s/it, loss=0.652][A
 37%|███▋      | 4741/12691 [1:33:56<2:40:00,  1.21s/it, loss=0.652][A
 37%|███▋      | 4741/12691 [1:33:57<2:40:00,  1.21s/it, loss=0.653][A
 37%|███▋      | 4742/12691 [1:33:57<2:37:55,  1.19s/it, loss=0.653][A
 37%|███▋      | 4742/12691 [1:33:58<2:37:55,  1.19s/it, loss=0.652

 38%|███▊      | 4792/12691 [1:34:58<2:33:45,  1.17s/it, loss=0.648][A
 38%|███▊      | 4793/12691 [1:34:58<2:33:35,  1.17s/it, loss=0.648][A
 38%|███▊      | 4793/12691 [1:34:59<2:33:35,  1.17s/it, loss=0.65] [A
 38%|███▊      | 4794/12691 [1:34:59<2:34:11,  1.17s/it, loss=0.65][A
 38%|███▊      | 4794/12691 [1:35:01<2:34:11,  1.17s/it, loss=0.652][A
 38%|███▊      | 4795/12691 [1:35:01<2:41:11,  1.22s/it, loss=0.652][A
 38%|███▊      | 4795/12691 [1:35:02<2:41:11,  1.22s/it, loss=0.649][A
 38%|███▊      | 4796/12691 [1:35:02<2:40:17,  1.22s/it, loss=0.649][A
 38%|███▊      | 4796/12691 [1:35:03<2:40:17,  1.22s/it, loss=0.65] [A
 38%|███▊      | 4797/12691 [1:35:03<2:38:44,  1.21s/it, loss=0.65][A
 38%|███▊      | 4797/12691 [1:35:04<2:38:44,  1.21s/it, loss=0.649][A
 38%|███▊      | 4798/12691 [1:35:04<2:35:35,  1.18s/it, loss=0.649][A
 38%|███▊      | 4798/12691 [1:35:05<2:35:35,  1.18s/it, loss=0.649][A
 38%|███▊      | 4799/12691 [1:35:05<2:35:01,  1.18s/it, loss=0.64

 38%|███▊      | 4849/12691 [1:36:07<2:42:12,  1.24s/it, loss=0.652][A
 38%|███▊      | 4849/12691 [1:36:08<2:42:12,  1.24s/it, loss=0.651][A
 38%|███▊      | 4850/12691 [1:36:08<2:41:51,  1.24s/it, loss=0.651][A
 38%|███▊      | 4850/12691 [1:36:09<2:41:51,  1.24s/it, loss=0.652][A
 38%|███▊      | 4851/12691 [1:36:09<2:46:18,  1.27s/it, loss=0.652][A
 38%|███▊      | 4851/12691 [1:36:10<2:46:18,  1.27s/it, loss=0.655][A
 38%|███▊      | 4852/12691 [1:36:10<2:41:52,  1.24s/it, loss=0.655][A
 38%|███▊      | 4852/12691 [1:36:12<2:41:52,  1.24s/it, loss=0.653][A
 38%|███▊      | 4853/12691 [1:36:12<2:40:48,  1.23s/it, loss=0.653][A
 38%|███▊      | 4853/12691 [1:36:13<2:40:48,  1.23s/it, loss=0.653][A
 38%|███▊      | 4854/12691 [1:36:13<2:40:22,  1.23s/it, loss=0.653][A
 38%|███▊      | 4854/12691 [1:36:14<2:40:22,  1.23s/it, loss=0.652][A
 38%|███▊      | 4855/12691 [1:36:14<2:39:05,  1.22s/it, loss=0.652][A
 38%|███▊      | 4855/12691 [1:36:15<2:39:05,  1.22s/it, loss=0.

 39%|███▊      | 4905/12691 [1:37:15<2:30:03,  1.16s/it, loss=0.634][A
 39%|███▊      | 4906/12691 [1:37:15<2:37:37,  1.21s/it, loss=0.634][A
 39%|███▊      | 4906/12691 [1:37:17<2:37:37,  1.21s/it, loss=0.636][A
 39%|███▊      | 4907/12691 [1:37:17<2:35:41,  1.20s/it, loss=0.636][A
 39%|███▊      | 4907/12691 [1:37:18<2:35:41,  1.20s/it, loss=0.638][A
 39%|███▊      | 4908/12691 [1:37:18<2:32:18,  1.17s/it, loss=0.638][A
 39%|███▊      | 4908/12691 [1:37:19<2:32:18,  1.17s/it, loss=0.638][A
 39%|███▊      | 4909/12691 [1:37:19<2:34:03,  1.19s/it, loss=0.638][A
 39%|███▊      | 4909/12691 [1:37:20<2:34:03,  1.19s/it, loss=0.639][A
 39%|███▊      | 4910/12691 [1:37:20<2:31:52,  1.17s/it, loss=0.639][A
 39%|███▊      | 4910/12691 [1:37:21<2:31:52,  1.17s/it, loss=0.638][A
 39%|███▊      | 4911/12691 [1:37:21<2:29:43,  1.15s/it, loss=0.638][A
 39%|███▊      | 4911/12691 [1:37:22<2:29:43,  1.15s/it, loss=0.638][A
 39%|███▊      | 4912/12691 [1:37:22<2:29:27,  1.15s/it, loss=0.

 39%|███▉      | 4962/12691 [1:38:21<2:30:48,  1.17s/it, loss=0.66][A
 39%|███▉      | 4962/12691 [1:38:23<2:30:48,  1.17s/it, loss=0.659][A
 39%|███▉      | 4963/12691 [1:38:23<2:32:11,  1.18s/it, loss=0.659][A
 39%|███▉      | 4963/12691 [1:38:24<2:32:11,  1.18s/it, loss=0.659][A
 39%|███▉      | 4964/12691 [1:38:24<2:33:22,  1.19s/it, loss=0.659][A
 39%|███▉      | 4964/12691 [1:38:25<2:33:22,  1.19s/it, loss=0.658][A
 39%|███▉      | 4965/12691 [1:38:25<2:35:19,  1.21s/it, loss=0.658][A
 39%|███▉      | 4965/12691 [1:38:26<2:35:19,  1.21s/it, loss=0.656][A
 39%|███▉      | 4966/12691 [1:38:26<2:35:01,  1.20s/it, loss=0.656][A
 39%|███▉      | 4966/12691 [1:38:28<2:35:01,  1.20s/it, loss=0.654][A
 39%|███▉      | 4967/12691 [1:38:28<2:36:06,  1.21s/it, loss=0.654][A
 39%|███▉      | 4967/12691 [1:38:29<2:36:06,  1.21s/it, loss=0.653][A
 39%|███▉      | 4968/12691 [1:38:29<2:35:57,  1.21s/it, loss=0.653][A
 39%|███▉      | 4968/12691 [1:38:30<2:35:57,  1.21s/it, loss=0.6

saving model checkpoint at iteration=5000



 39%|███▉      | 5001/12691 [1:39:09<3:09:40,  1.48s/it, loss=0.663][A
 39%|███▉      | 5001/12691 [1:39:10<3:09:40,  1.48s/it, loss=0.664][A
 39%|███▉      | 5002/12691 [1:39:10<2:56:41,  1.38s/it, loss=0.664][A
 39%|███▉      | 5002/12691 [1:39:11<2:56:41,  1.38s/it, loss=0.664][A
 39%|███▉      | 5003/12691 [1:39:11<2:50:31,  1.33s/it, loss=0.664][A
 39%|███▉      | 5003/12691 [1:39:12<2:50:31,  1.33s/it, loss=0.668][A
 39%|███▉      | 5004/12691 [1:39:12<2:43:11,  1.27s/it, loss=0.668][A
 39%|███▉      | 5004/12691 [1:39:14<2:43:11,  1.27s/it, loss=0.669][A
 39%|███▉      | 5005/12691 [1:39:14<2:41:17,  1.26s/it, loss=0.669][A
 39%|███▉      | 5005/12691 [1:39:15<2:41:17,  1.26s/it, loss=0.669][A
 39%|███▉      | 5006/12691 [1:39:15<2:38:59,  1.24s/it, loss=0.669][A
 39%|███▉      | 5006/12691 [1:39:16<2:38:59,  1.24s/it, loss=0.67] [A
 39%|███▉      | 5007/12691 [1:39:16<2:34:54,  1.21s/it, loss=0.67][A
 39%|███▉      | 5007/12691 [1:39:17<2:34:54,  1.21s/it, loss=0.

 40%|███▉      | 5057/12691 [1:40:16<2:33:51,  1.21s/it, loss=0.656][A
 40%|███▉      | 5058/12691 [1:40:16<2:31:49,  1.19s/it, loss=0.656][A
 40%|███▉      | 5058/12691 [1:40:17<2:31:49,  1.19s/it, loss=0.653][A
 40%|███▉      | 5059/12691 [1:40:17<2:28:36,  1.17s/it, loss=0.653][A
 40%|███▉      | 5059/12691 [1:40:18<2:28:36,  1.17s/it, loss=0.652][A
 40%|███▉      | 5060/12691 [1:40:18<2:29:43,  1.18s/it, loss=0.652][A
 40%|███▉      | 5060/12691 [1:40:19<2:29:43,  1.18s/it, loss=0.652][A
 40%|███▉      | 5061/12691 [1:40:19<2:34:22,  1.21s/it, loss=0.652][A
 40%|███▉      | 5061/12691 [1:40:21<2:34:22,  1.21s/it, loss=0.65] [A
 40%|███▉      | 5062/12691 [1:40:21<2:31:15,  1.19s/it, loss=0.65][A
 40%|███▉      | 5062/12691 [1:40:22<2:31:15,  1.19s/it, loss=0.647][A
 40%|███▉      | 5063/12691 [1:40:22<2:30:45,  1.19s/it, loss=0.647][A
 40%|███▉      | 5063/12691 [1:40:23<2:30:45,  1.19s/it, loss=0.646][A
 40%|███▉      | 5064/12691 [1:40:23<2:29:55,  1.18s/it, loss=0.6

 40%|████      | 5114/12691 [1:41:22<2:28:14,  1.17s/it, loss=0.648][A
 40%|████      | 5114/12691 [1:41:23<2:28:14,  1.17s/it, loss=0.649][A
 40%|████      | 5115/12691 [1:41:23<2:29:47,  1.19s/it, loss=0.649][A
 40%|████      | 5115/12691 [1:41:25<2:29:47,  1.19s/it, loss=0.65] [A
 40%|████      | 5116/12691 [1:41:25<2:37:31,  1.25s/it, loss=0.65][A
 40%|████      | 5116/12691 [1:41:26<2:37:31,  1.25s/it, loss=0.648][A
 40%|████      | 5117/12691 [1:41:26<2:34:43,  1.23s/it, loss=0.648][A
 40%|████      | 5117/12691 [1:41:27<2:34:43,  1.23s/it, loss=0.648][A
 40%|████      | 5118/12691 [1:41:27<2:32:27,  1.21s/it, loss=0.648][A
 40%|████      | 5118/12691 [1:41:28<2:32:27,  1.21s/it, loss=0.65] [A
 40%|████      | 5119/12691 [1:41:28<2:29:33,  1.19s/it, loss=0.65][A
 40%|████      | 5119/12691 [1:41:29<2:29:33,  1.19s/it, loss=0.652][A
 40%|████      | 5120/12691 [1:41:29<2:30:11,  1.19s/it, loss=0.652][A
 40%|████      | 5120/12691 [1:41:31<2:30:11,  1.19s/it, loss=0.65

 41%|████      | 5170/12691 [1:42:30<2:22:43,  1.14s/it, loss=0.645][A
 41%|████      | 5171/12691 [1:42:30<2:21:16,  1.13s/it, loss=0.645][A
 41%|████      | 5171/12691 [1:42:31<2:21:16,  1.13s/it, loss=0.647][A
 41%|████      | 5172/12691 [1:42:31<2:26:43,  1.17s/it, loss=0.647][A
 41%|████      | 5172/12691 [1:42:32<2:26:43,  1.17s/it, loss=0.651][A
 41%|████      | 5173/12691 [1:42:32<2:28:48,  1.19s/it, loss=0.651][A
 41%|████      | 5173/12691 [1:42:34<2:28:48,  1.19s/it, loss=0.651][A
 41%|████      | 5174/12691 [1:42:34<2:29:53,  1.20s/it, loss=0.651][A
 41%|████      | 5174/12691 [1:42:35<2:29:53,  1.20s/it, loss=0.652][A
 41%|████      | 5175/12691 [1:42:35<2:30:30,  1.20s/it, loss=0.652][A
 41%|████      | 5175/12691 [1:42:36<2:30:30,  1.20s/it, loss=0.651][A
 41%|████      | 5176/12691 [1:42:36<2:29:10,  1.19s/it, loss=0.651][A
 41%|████      | 5176/12691 [1:42:37<2:29:10,  1.19s/it, loss=0.651][A
 41%|████      | 5177/12691 [1:42:37<2:25:57,  1.17s/it, loss=0.

 41%|████      | 5227/12691 [1:43:37<2:29:16,  1.20s/it, loss=0.646][A
 41%|████      | 5227/12691 [1:43:39<2:29:16,  1.20s/it, loss=0.645][A
 41%|████      | 5228/12691 [1:43:39<2:35:28,  1.25s/it, loss=0.645][A
 41%|████      | 5228/12691 [1:43:40<2:35:28,  1.25s/it, loss=0.645][A
 41%|████      | 5229/12691 [1:43:40<2:33:52,  1.24s/it, loss=0.645][A
 41%|████      | 5229/12691 [1:43:41<2:33:52,  1.24s/it, loss=0.644][A
 41%|████      | 5230/12691 [1:43:41<2:33:52,  1.24s/it, loss=0.644][A
 41%|████      | 5230/12691 [1:43:42<2:33:52,  1.24s/it, loss=0.646][A
 41%|████      | 5231/12691 [1:43:42<2:31:00,  1.21s/it, loss=0.646][A
 41%|████      | 5231/12691 [1:43:44<2:31:00,  1.21s/it, loss=0.649][A
 41%|████      | 5232/12691 [1:43:44<2:32:23,  1.23s/it, loss=0.649][A
 41%|████      | 5232/12691 [1:43:45<2:32:23,  1.23s/it, loss=0.649][A
 41%|████      | 5233/12691 [1:43:45<2:32:03,  1.22s/it, loss=0.649][A
 41%|████      | 5233/12691 [1:43:46<2:32:03,  1.22s/it, loss=0.

 42%|████▏     | 5283/12691 [1:44:44<2:25:46,  1.18s/it, loss=0.647][A
 42%|████▏     | 5284/12691 [1:44:44<2:23:22,  1.16s/it, loss=0.647][A
 42%|████▏     | 5284/12691 [1:44:45<2:23:22,  1.16s/it, loss=0.645][A
 42%|████▏     | 5285/12691 [1:44:45<2:21:27,  1.15s/it, loss=0.645][A
 42%|████▏     | 5285/12691 [1:44:46<2:21:27,  1.15s/it, loss=0.647][A
 42%|████▏     | 5286/12691 [1:44:46<2:20:33,  1.14s/it, loss=0.647][A
 42%|████▏     | 5286/12691 [1:44:48<2:20:33,  1.14s/it, loss=0.645][A
 42%|████▏     | 5287/12691 [1:44:48<2:19:18,  1.13s/it, loss=0.645][A
 42%|████▏     | 5287/12691 [1:44:49<2:19:18,  1.13s/it, loss=0.643][A
 42%|████▏     | 5288/12691 [1:44:49<2:18:52,  1.13s/it, loss=0.643][A
 42%|████▏     | 5288/12691 [1:44:50<2:18:52,  1.13s/it, loss=0.644][A
 42%|████▏     | 5289/12691 [1:44:50<2:20:33,  1.14s/it, loss=0.644][A
 42%|████▏     | 5289/12691 [1:44:51<2:20:33,  1.14s/it, loss=0.647][A
 42%|████▏     | 5290/12691 [1:44:51<2:26:29,  1.19s/it, loss=0.

 42%|████▏     | 5340/12691 [1:45:51<2:32:20,  1.24s/it, loss=0.639][A
 42%|████▏     | 5341/12691 [1:45:51<2:32:35,  1.25s/it, loss=0.639][A
 42%|████▏     | 5341/12691 [1:45:52<2:32:35,  1.25s/it, loss=0.64] [A
 42%|████▏     | 5342/12691 [1:45:52<2:29:23,  1.22s/it, loss=0.64][A
 42%|████▏     | 5342/12691 [1:45:53<2:29:23,  1.22s/it, loss=0.642][A
 42%|████▏     | 5343/12691 [1:45:53<2:26:48,  1.20s/it, loss=0.642][A
 42%|████▏     | 5343/12691 [1:45:54<2:26:48,  1.20s/it, loss=0.643][A
 42%|████▏     | 5344/12691 [1:45:55<2:25:34,  1.19s/it, loss=0.643][A
 42%|████▏     | 5344/12691 [1:45:56<2:25:34,  1.19s/it, loss=0.642][A
 42%|████▏     | 5345/12691 [1:45:56<2:29:09,  1.22s/it, loss=0.642][A
 42%|████▏     | 5345/12691 [1:45:57<2:29:09,  1.22s/it, loss=0.643][A
 42%|████▏     | 5346/12691 [1:45:57<2:27:33,  1.21s/it, loss=0.643][A
 42%|████▏     | 5346/12691 [1:45:58<2:27:33,  1.21s/it, loss=0.64] [A
 42%|████▏     | 5347/12691 [1:45:58<2:27:20,  1.20s/it, loss=0.6

 43%|████▎     | 5397/12691 [1:46:58<2:24:20,  1.19s/it, loss=0.651][A
 43%|████▎     | 5397/12691 [1:46:59<2:24:20,  1.19s/it, loss=0.649][A
 43%|████▎     | 5398/12691 [1:46:59<2:25:28,  1.20s/it, loss=0.649][A
 43%|████▎     | 5398/12691 [1:47:01<2:25:28,  1.20s/it, loss=0.649][A
 43%|████▎     | 5399/12691 [1:47:01<2:25:38,  1.20s/it, loss=0.649][A
 43%|████▎     | 5399/12691 [1:47:02<2:25:38,  1.20s/it, loss=0.648][A
 43%|████▎     | 5400/12691 [1:47:02<2:25:39,  1.20s/it, loss=0.648][A
 43%|████▎     | 5400/12691 [1:47:03<2:25:39,  1.20s/it, loss=0.646][A
 43%|████▎     | 5401/12691 [1:47:03<2:24:42,  1.19s/it, loss=0.646][A
 43%|████▎     | 5401/12691 [1:47:04<2:24:42,  1.19s/it, loss=0.645][A
 43%|████▎     | 5402/12691 [1:47:04<2:23:00,  1.18s/it, loss=0.645][A
 43%|████▎     | 5402/12691 [1:47:05<2:23:00,  1.18s/it, loss=0.648][A
 43%|████▎     | 5403/12691 [1:47:05<2:24:40,  1.19s/it, loss=0.648][A
 43%|████▎     | 5403/12691 [1:47:06<2:24:40,  1.19s/it, loss=0.

 43%|████▎     | 5453/12691 [1:48:05<2:22:05,  1.18s/it, loss=0.662][A
 43%|████▎     | 5454/12691 [1:48:05<2:22:16,  1.18s/it, loss=0.662][A
 43%|████▎     | 5454/12691 [1:48:06<2:22:16,  1.18s/it, loss=0.662][A
 43%|████▎     | 5455/12691 [1:48:06<2:21:51,  1.18s/it, loss=0.662][A
 43%|████▎     | 5455/12691 [1:48:07<2:21:51,  1.18s/it, loss=0.663][A
 43%|████▎     | 5456/12691 [1:48:07<2:26:32,  1.22s/it, loss=0.663][A
 43%|████▎     | 5456/12691 [1:48:09<2:26:32,  1.22s/it, loss=0.665][A
 43%|████▎     | 5457/12691 [1:48:09<2:25:15,  1.20s/it, loss=0.665][A
 43%|████▎     | 5457/12691 [1:48:10<2:25:15,  1.20s/it, loss=0.664][A
 43%|████▎     | 5458/12691 [1:48:10<2:23:49,  1.19s/it, loss=0.664][A
 43%|████▎     | 5458/12691 [1:48:11<2:23:49,  1.19s/it, loss=0.665][A
 43%|████▎     | 5459/12691 [1:48:11<2:23:13,  1.19s/it, loss=0.665][A
 43%|████▎     | 5459/12691 [1:48:12<2:23:13,  1.19s/it, loss=0.664][A
 43%|████▎     | 5460/12691 [1:48:12<2:22:44,  1.18s/it, loss=0.

 43%|████▎     | 5510/12691 [1:49:11<2:18:16,  1.16s/it, loss=0.641][A
 43%|████▎     | 5510/12691 [1:49:12<2:18:16,  1.16s/it, loss=0.641][A
 43%|████▎     | 5511/12691 [1:49:12<2:18:15,  1.16s/it, loss=0.641][A
 43%|████▎     | 5511/12691 [1:49:14<2:18:15,  1.16s/it, loss=0.64] [A
 43%|████▎     | 5512/12691 [1:49:14<2:22:29,  1.19s/it, loss=0.64][A
 43%|████▎     | 5512/12691 [1:49:15<2:22:29,  1.19s/it, loss=0.645][A
 43%|████▎     | 5513/12691 [1:49:15<2:23:48,  1.20s/it, loss=0.645][A
 43%|████▎     | 5513/12691 [1:49:16<2:23:48,  1.20s/it, loss=0.644][A
 43%|████▎     | 5514/12691 [1:49:16<2:25:40,  1.22s/it, loss=0.644][A
 43%|████▎     | 5514/12691 [1:49:17<2:25:40,  1.22s/it, loss=0.642][A
 43%|████▎     | 5515/12691 [1:49:17<2:21:26,  1.18s/it, loss=0.642][A
 43%|████▎     | 5515/12691 [1:49:18<2:21:26,  1.18s/it, loss=0.643][A
 43%|████▎     | 5516/12691 [1:49:18<2:20:30,  1.17s/it, loss=0.643][A
 43%|████▎     | 5516/12691 [1:49:19<2:20:30,  1.17s/it, loss=0.6

 44%|████▍     | 5566/12691 [1:50:19<2:17:46,  1.16s/it, loss=0.64] [A
 44%|████▍     | 5567/12691 [1:50:19<2:18:17,  1.16s/it, loss=0.64][A
 44%|████▍     | 5567/12691 [1:50:20<2:18:17,  1.16s/it, loss=0.643][A
 44%|████▍     | 5568/12691 [1:50:20<2:21:17,  1.19s/it, loss=0.643][A
 44%|████▍     | 5568/12691 [1:50:22<2:21:17,  1.19s/it, loss=0.642][A
 44%|████▍     | 5569/12691 [1:50:22<2:21:25,  1.19s/it, loss=0.642][A
 44%|████▍     | 5569/12691 [1:50:23<2:21:25,  1.19s/it, loss=0.642][A
 44%|████▍     | 5570/12691 [1:50:23<2:21:21,  1.19s/it, loss=0.642][A
 44%|████▍     | 5570/12691 [1:50:24<2:21:21,  1.19s/it, loss=0.643][A
 44%|████▍     | 5571/12691 [1:50:24<2:19:19,  1.17s/it, loss=0.643][A
 44%|████▍     | 5571/12691 [1:50:25<2:19:19,  1.17s/it, loss=0.639][A
 44%|████▍     | 5572/12691 [1:50:25<2:17:59,  1.16s/it, loss=0.639][A
 44%|████▍     | 5572/12691 [1:50:26<2:17:59,  1.16s/it, loss=0.639][A
 44%|████▍     | 5573/12691 [1:50:26<2:17:26,  1.16s/it, loss=0.6

 44%|████▍     | 5623/12691 [1:51:27<2:26:57,  1.25s/it, loss=0.654][A
 44%|████▍     | 5623/12691 [1:51:28<2:26:57,  1.25s/it, loss=0.656][A
 44%|████▍     | 5624/12691 [1:51:28<2:26:17,  1.24s/it, loss=0.656][A
 44%|████▍     | 5624/12691 [1:51:29<2:26:17,  1.24s/it, loss=0.66] [A
 44%|████▍     | 5625/12691 [1:51:29<2:23:04,  1.21s/it, loss=0.66][A
 44%|████▍     | 5625/12691 [1:51:31<2:23:04,  1.21s/it, loss=0.661][A
 44%|████▍     | 5626/12691 [1:51:31<2:21:50,  1.20s/it, loss=0.661][A
 44%|████▍     | 5626/12691 [1:51:32<2:21:50,  1.20s/it, loss=0.66] [A
 44%|████▍     | 5627/12691 [1:51:32<2:21:39,  1.20s/it, loss=0.66][A
 44%|████▍     | 5627/12691 [1:51:33<2:21:39,  1.20s/it, loss=0.657][A
 44%|████▍     | 5628/12691 [1:51:33<2:23:03,  1.22s/it, loss=0.657][A
 44%|████▍     | 5628/12691 [1:51:34<2:23:03,  1.22s/it, loss=0.656][A
 44%|████▍     | 5629/12691 [1:51:34<2:27:22,  1.25s/it, loss=0.656][A
 44%|████▍     | 5629/12691 [1:51:36<2:27:22,  1.25s/it, loss=0.65

 45%|████▍     | 5679/12691 [1:52:36<2:26:17,  1.25s/it, loss=0.651][A
 45%|████▍     | 5680/12691 [1:52:36<2:26:31,  1.25s/it, loss=0.651][A
 45%|████▍     | 5680/12691 [1:52:37<2:26:31,  1.25s/it, loss=0.649][A
 45%|████▍     | 5681/12691 [1:52:37<2:21:56,  1.21s/it, loss=0.649][A
 45%|████▍     | 5681/12691 [1:52:38<2:21:56,  1.21s/it, loss=0.65] [A
 45%|████▍     | 5682/12691 [1:52:38<2:22:52,  1.22s/it, loss=0.65][A
 45%|████▍     | 5682/12691 [1:52:39<2:22:52,  1.22s/it, loss=0.65][A
 45%|████▍     | 5683/12691 [1:52:39<2:22:03,  1.22s/it, loss=0.65][A
 45%|████▍     | 5683/12691 [1:52:41<2:22:03,  1.22s/it, loss=0.65][A
 45%|████▍     | 5684/12691 [1:52:41<2:22:06,  1.22s/it, loss=0.65][A
 45%|████▍     | 5684/12691 [1:52:42<2:22:06,  1.22s/it, loss=0.649][A
 45%|████▍     | 5685/12691 [1:52:42<2:22:27,  1.22s/it, loss=0.649][A
 45%|████▍     | 5685/12691 [1:52:43<2:22:27,  1.22s/it, loss=0.647][A
 45%|████▍     | 5686/12691 [1:52:43<2:22:44,  1.22s/it, loss=0.647]

 45%|████▌     | 5736/12691 [1:53:44<2:23:51,  1.24s/it, loss=0.651][A
 45%|████▌     | 5736/12691 [1:53:45<2:23:51,  1.24s/it, loss=0.65] [A
 45%|████▌     | 5737/12691 [1:53:45<2:23:32,  1.24s/it, loss=0.65][A
 45%|████▌     | 5737/12691 [1:53:46<2:23:32,  1.24s/it, loss=0.653][A
 45%|████▌     | 5738/12691 [1:53:46<2:23:23,  1.24s/it, loss=0.653][A
 45%|████▌     | 5738/12691 [1:53:47<2:23:23,  1.24s/it, loss=0.653][A
 45%|████▌     | 5739/12691 [1:53:47<2:22:48,  1.23s/it, loss=0.653][A
 45%|████▌     | 5739/12691 [1:53:49<2:22:48,  1.23s/it, loss=0.654][A
 45%|████▌     | 5740/12691 [1:53:49<2:23:51,  1.24s/it, loss=0.654][A
 45%|████▌     | 5740/12691 [1:53:50<2:23:51,  1.24s/it, loss=0.653][A
 45%|████▌     | 5741/12691 [1:53:50<2:25:04,  1.25s/it, loss=0.653][A
 45%|████▌     | 5741/12691 [1:53:51<2:25:04,  1.25s/it, loss=0.651][A
 45%|████▌     | 5742/12691 [1:53:51<2:22:09,  1.23s/it, loss=0.651][A
 45%|████▌     | 5742/12691 [1:53:52<2:22:09,  1.23s/it, loss=0.6

 46%|████▌     | 5792/12691 [1:54:52<2:17:53,  1.20s/it, loss=0.643][A
 46%|████▌     | 5793/12691 [1:54:52<2:16:31,  1.19s/it, loss=0.643][A
 46%|████▌     | 5793/12691 [1:54:53<2:16:31,  1.19s/it, loss=0.64] [A
 46%|████▌     | 5794/12691 [1:54:53<2:14:40,  1.17s/it, loss=0.64][A
 46%|████▌     | 5794/12691 [1:54:54<2:14:40,  1.17s/it, loss=0.639][A
 46%|████▌     | 5795/12691 [1:54:54<2:12:32,  1.15s/it, loss=0.639][A
 46%|████▌     | 5795/12691 [1:54:55<2:12:32,  1.15s/it, loss=0.636][A
 46%|████▌     | 5796/12691 [1:54:55<2:16:54,  1.19s/it, loss=0.636][A
 46%|████▌     | 5796/12691 [1:54:56<2:16:54,  1.19s/it, loss=0.634][A
 46%|████▌     | 5797/12691 [1:54:56<2:17:38,  1.20s/it, loss=0.634][A
 46%|████▌     | 5797/12691 [1:54:57<2:17:38,  1.20s/it, loss=0.634][A
 46%|████▌     | 5798/12691 [1:54:57<2:14:20,  1.17s/it, loss=0.634][A
 46%|████▌     | 5798/12691 [1:54:59<2:14:20,  1.17s/it, loss=0.635][A
 46%|████▌     | 5799/12691 [1:54:59<2:14:05,  1.17s/it, loss=0.6

 46%|████▌     | 5849/12691 [1:55:59<2:17:25,  1.21s/it, loss=0.63][A
 46%|████▌     | 5849/12691 [1:56:00<2:17:25,  1.21s/it, loss=0.631][A
 46%|████▌     | 5850/12691 [1:56:00<2:15:27,  1.19s/it, loss=0.631][A
 46%|████▌     | 5850/12691 [1:56:01<2:15:27,  1.19s/it, loss=0.629][A
 46%|████▌     | 5851/12691 [1:56:01<2:14:57,  1.18s/it, loss=0.629][A
 46%|████▌     | 5851/12691 [1:56:03<2:14:57,  1.18s/it, loss=0.631][A
 46%|████▌     | 5852/12691 [1:56:03<2:18:54,  1.22s/it, loss=0.631][A
 46%|████▌     | 5852/12691 [1:56:04<2:18:54,  1.22s/it, loss=0.631][A
 46%|████▌     | 5853/12691 [1:56:04<2:17:33,  1.21s/it, loss=0.631][A
 46%|████▌     | 5853/12691 [1:56:05<2:17:33,  1.21s/it, loss=0.629][A
 46%|████▌     | 5854/12691 [1:56:05<2:15:54,  1.19s/it, loss=0.629][A
 46%|████▌     | 5854/12691 [1:56:06<2:15:54,  1.19s/it, loss=0.631][A
 46%|████▌     | 5855/12691 [1:56:06<2:18:41,  1.22s/it, loss=0.631][A
 46%|████▌     | 5855/12691 [1:56:07<2:18:41,  1.22s/it, loss=0.6

 47%|████▋     | 5905/12691 [1:57:11<2:24:41,  1.28s/it, loss=0.658][A
 47%|████▋     | 5906/12691 [1:57:11<2:23:10,  1.27s/it, loss=0.658][A
 47%|████▋     | 5906/12691 [1:57:12<2:23:10,  1.27s/it, loss=0.659][A
 47%|████▋     | 5907/12691 [1:57:12<2:24:18,  1.28s/it, loss=0.659][A
 47%|████▋     | 5907/12691 [1:57:13<2:24:18,  1.28s/it, loss=0.657][A
 47%|████▋     | 5908/12691 [1:57:13<2:22:07,  1.26s/it, loss=0.657][A
 47%|████▋     | 5908/12691 [1:57:14<2:22:07,  1.26s/it, loss=0.659][A
 47%|████▋     | 5909/12691 [1:57:14<2:22:24,  1.26s/it, loss=0.659][A
 47%|████▋     | 5909/12691 [1:57:16<2:22:24,  1.26s/it, loss=0.661][A
 47%|████▋     | 5910/12691 [1:57:16<2:21:19,  1.25s/it, loss=0.661][A
 47%|████▋     | 5910/12691 [1:57:17<2:21:19,  1.25s/it, loss=0.661][A
 47%|████▋     | 5911/12691 [1:57:17<2:19:43,  1.24s/it, loss=0.661][A
 47%|████▋     | 5911/12691 [1:57:18<2:19:43,  1.24s/it, loss=0.659][A
 47%|████▋     | 5912/12691 [1:57:18<2:21:09,  1.25s/it, loss=0.

 47%|████▋     | 5962/12691 [1:58:19<2:11:40,  1.17s/it, loss=0.662][A
 47%|████▋     | 5962/12691 [1:58:20<2:11:40,  1.17s/it, loss=0.662][A
 47%|████▋     | 5963/12691 [1:58:20<2:16:34,  1.22s/it, loss=0.662][A
 47%|████▋     | 5963/12691 [1:58:22<2:16:34,  1.22s/it, loss=0.665][A
 47%|████▋     | 5964/12691 [1:58:22<2:16:02,  1.21s/it, loss=0.665][A
 47%|████▋     | 5964/12691 [1:58:23<2:16:02,  1.21s/it, loss=0.667][A
 47%|████▋     | 5965/12691 [1:58:23<2:14:30,  1.20s/it, loss=0.667][A
 47%|████▋     | 5965/12691 [1:58:24<2:14:30,  1.20s/it, loss=0.671][A
 47%|████▋     | 5966/12691 [1:58:24<2:13:15,  1.19s/it, loss=0.671][A
 47%|████▋     | 5966/12691 [1:58:25<2:13:15,  1.19s/it, loss=0.666][A
 47%|████▋     | 5967/12691 [1:58:25<2:10:23,  1.16s/it, loss=0.666][A
 47%|████▋     | 5967/12691 [1:58:26<2:10:23,  1.16s/it, loss=0.667][A
 47%|████▋     | 5968/12691 [1:58:26<2:11:28,  1.17s/it, loss=0.667][A
 47%|████▋     | 5968/12691 [1:58:27<2:11:28,  1.17s/it, loss=0.

saving model checkpoint at iteration=6000



 47%|████▋     | 6001/12691 [1:59:06<2:49:03,  1.52s/it, loss=0.651][A
 47%|████▋     | 6001/12691 [1:59:07<2:49:03,  1.52s/it, loss=0.65] [A
 47%|████▋     | 6002/12691 [1:59:07<2:35:42,  1.40s/it, loss=0.65][A
 47%|████▋     | 6002/12691 [1:59:08<2:35:42,  1.40s/it, loss=0.652][A
 47%|████▋     | 6003/12691 [1:59:08<2:27:21,  1.32s/it, loss=0.652][A
 47%|████▋     | 6003/12691 [1:59:10<2:27:21,  1.32s/it, loss=0.651][A
 47%|████▋     | 6004/12691 [1:59:10<2:24:25,  1.30s/it, loss=0.651][A
 47%|████▋     | 6004/12691 [1:59:11<2:24:25,  1.30s/it, loss=0.654][A
 47%|████▋     | 6005/12691 [1:59:11<2:20:05,  1.26s/it, loss=0.654][A
 47%|████▋     | 6005/12691 [1:59:12<2:20:05,  1.26s/it, loss=0.654][A
 47%|████▋     | 6006/12691 [1:59:12<2:24:20,  1.30s/it, loss=0.654][A
 47%|████▋     | 6006/12691 [1:59:13<2:24:20,  1.30s/it, loss=0.655][A
 47%|████▋     | 6007/12691 [1:59:13<2:18:55,  1.25s/it, loss=0.655][A
 47%|████▋     | 6007/12691 [1:59:15<2:18:55,  1.25s/it, loss=0.

 48%|████▊     | 6057/12691 [2:00:15<2:15:57,  1.23s/it, loss=0.655][A
 48%|████▊     | 6058/12691 [2:00:15<2:14:11,  1.21s/it, loss=0.655][A
 48%|████▊     | 6058/12691 [2:00:16<2:14:11,  1.21s/it, loss=0.654][A
 48%|████▊     | 6059/12691 [2:00:16<2:13:15,  1.21s/it, loss=0.654][A
 48%|████▊     | 6059/12691 [2:00:18<2:13:15,  1.21s/it, loss=0.654][A
 48%|████▊     | 6060/12691 [2:00:18<2:11:16,  1.19s/it, loss=0.654][A
 48%|████▊     | 6060/12691 [2:00:19<2:11:16,  1.19s/it, loss=0.651][A
 48%|████▊     | 6061/12691 [2:00:19<2:09:41,  1.17s/it, loss=0.651][A
 48%|████▊     | 6061/12691 [2:00:20<2:09:41,  1.17s/it, loss=0.649][A
 48%|████▊     | 6062/12691 [2:00:20<2:12:40,  1.20s/it, loss=0.649][A
 48%|████▊     | 6062/12691 [2:00:21<2:12:40,  1.20s/it, loss=0.649][A
 48%|████▊     | 6063/12691 [2:00:21<2:10:37,  1.18s/it, loss=0.649][A
 48%|████▊     | 6063/12691 [2:00:22<2:10:37,  1.18s/it, loss=0.648][A
 48%|████▊     | 6064/12691 [2:00:22<2:09:36,  1.17s/it, loss=0.

 48%|████▊     | 6114/12691 [2:01:21<2:08:15,  1.17s/it, loss=0.664][A
 48%|████▊     | 6114/12691 [2:01:22<2:08:15,  1.17s/it, loss=0.661][A
 48%|████▊     | 6115/12691 [2:01:22<2:08:16,  1.17s/it, loss=0.661][A
 48%|████▊     | 6115/12691 [2:01:23<2:08:16,  1.17s/it, loss=0.664][A
 48%|████▊     | 6116/12691 [2:01:23<2:09:53,  1.19s/it, loss=0.664][A
 48%|████▊     | 6116/12691 [2:01:24<2:09:53,  1.19s/it, loss=0.665][A
 48%|████▊     | 6117/12691 [2:01:24<2:13:10,  1.22s/it, loss=0.665][A
 48%|████▊     | 6117/12691 [2:01:26<2:13:10,  1.22s/it, loss=0.665][A
 48%|████▊     | 6118/12691 [2:01:26<2:12:10,  1.21s/it, loss=0.665][A
 48%|████▊     | 6118/12691 [2:01:27<2:12:10,  1.21s/it, loss=0.664][A
 48%|████▊     | 6119/12691 [2:01:27<2:10:41,  1.19s/it, loss=0.664][A
 48%|████▊     | 6119/12691 [2:01:28<2:10:41,  1.19s/it, loss=0.661][A
 48%|████▊     | 6120/12691 [2:01:28<2:09:26,  1.18s/it, loss=0.661][A
 48%|████▊     | 6120/12691 [2:01:29<2:09:26,  1.18s/it, loss=0.

 49%|████▊     | 6170/12691 [2:02:29<2:05:17,  1.15s/it, loss=0.645][A
 49%|████▊     | 6171/12691 [2:02:29<2:04:14,  1.14s/it, loss=0.645][A
 49%|████▊     | 6171/12691 [2:02:30<2:04:14,  1.14s/it, loss=0.643][A
 49%|████▊     | 6172/12691 [2:02:30<2:01:42,  1.12s/it, loss=0.643][A
 49%|████▊     | 6172/12691 [2:02:31<2:01:42,  1.12s/it, loss=0.643][A
 49%|████▊     | 6173/12691 [2:02:31<2:05:11,  1.15s/it, loss=0.643][A
 49%|████▊     | 6173/12691 [2:02:32<2:05:11,  1.15s/it, loss=0.645][A
 49%|████▊     | 6174/12691 [2:02:32<2:07:34,  1.17s/it, loss=0.645][A
 49%|████▊     | 6174/12691 [2:02:34<2:07:34,  1.17s/it, loss=0.643][A
 49%|████▊     | 6175/12691 [2:02:34<2:07:48,  1.18s/it, loss=0.643][A
 49%|████▊     | 6175/12691 [2:02:35<2:07:48,  1.18s/it, loss=0.642][A
 49%|████▊     | 6176/12691 [2:02:35<2:06:46,  1.17s/it, loss=0.642][A
 49%|████▊     | 6176/12691 [2:02:36<2:06:46,  1.17s/it, loss=0.647][A
 49%|████▊     | 6177/12691 [2:02:36<2:04:58,  1.15s/it, loss=0.

 49%|████▉     | 6227/12691 [2:03:34<2:05:21,  1.16s/it, loss=0.646][A
 49%|████▉     | 6227/12691 [2:03:35<2:05:21,  1.16s/it, loss=0.646][A
 49%|████▉     | 6228/12691 [2:03:35<2:04:18,  1.15s/it, loss=0.646][A
 49%|████▉     | 6228/12691 [2:03:37<2:04:18,  1.15s/it, loss=0.644][A
 49%|████▉     | 6229/12691 [2:03:37<2:06:32,  1.17s/it, loss=0.644][A
 49%|████▉     | 6229/12691 [2:03:38<2:06:32,  1.17s/it, loss=0.644][A
 49%|████▉     | 6230/12691 [2:03:38<2:05:44,  1.17s/it, loss=0.644][A
 49%|████▉     | 6230/12691 [2:03:39<2:05:44,  1.17s/it, loss=0.645][A
 49%|████▉     | 6231/12691 [2:03:39<2:06:00,  1.17s/it, loss=0.645][A
 49%|████▉     | 6231/12691 [2:03:40<2:06:00,  1.17s/it, loss=0.644][A
 49%|████▉     | 6232/12691 [2:03:40<2:05:01,  1.16s/it, loss=0.644][A
 49%|████▉     | 6232/12691 [2:03:41<2:05:01,  1.16s/it, loss=0.643][A
 49%|████▉     | 6233/12691 [2:03:41<2:04:04,  1.15s/it, loss=0.643][A
 49%|████▉     | 6233/12691 [2:03:42<2:04:04,  1.15s/it, loss=0.

 50%|████▉     | 6283/12691 [2:04:42<2:02:45,  1.15s/it, loss=0.643][A
 50%|████▉     | 6284/12691 [2:04:42<2:04:08,  1.16s/it, loss=0.643][A
 50%|████▉     | 6284/12691 [2:04:43<2:04:08,  1.16s/it, loss=0.642][A
 50%|████▉     | 6285/12691 [2:04:43<2:02:49,  1.15s/it, loss=0.642][A
 50%|████▉     | 6285/12691 [2:04:44<2:02:49,  1.15s/it, loss=0.642][A
 50%|████▉     | 6286/12691 [2:04:44<2:02:15,  1.15s/it, loss=0.642][A
 50%|████▉     | 6286/12691 [2:04:45<2:02:15,  1.15s/it, loss=0.64] [A
 50%|████▉     | 6287/12691 [2:04:45<2:01:29,  1.14s/it, loss=0.64][A
 50%|████▉     | 6287/12691 [2:04:47<2:01:29,  1.14s/it, loss=0.639][A
 50%|████▉     | 6288/12691 [2:04:47<2:01:43,  1.14s/it, loss=0.639][A
 50%|████▉     | 6288/12691 [2:04:48<2:01:43,  1.14s/it, loss=0.64] [A
 50%|████▉     | 6289/12691 [2:04:48<2:00:46,  1.13s/it, loss=0.64][A
 50%|████▉     | 6289/12691 [2:04:49<2:00:46,  1.13s/it, loss=0.643][A
 50%|████▉     | 6290/12691 [2:04:49<2:03:41,  1.16s/it, loss=0.64

 50%|████▉     | 6340/12691 [2:05:48<2:07:57,  1.21s/it, loss=0.649][A
 50%|████▉     | 6340/12691 [2:05:49<2:07:57,  1.21s/it, loss=0.647][A
 50%|████▉     | 6341/12691 [2:05:49<2:08:44,  1.22s/it, loss=0.647][A
 50%|████▉     | 6341/12691 [2:05:51<2:08:44,  1.22s/it, loss=0.645][A
 50%|████▉     | 6342/12691 [2:05:51<2:10:02,  1.23s/it, loss=0.645][A
 50%|████▉     | 6342/12691 [2:05:52<2:10:02,  1.23s/it, loss=0.643][A
 50%|████▉     | 6343/12691 [2:05:52<2:11:40,  1.24s/it, loss=0.643][A
 50%|████▉     | 6343/12691 [2:05:53<2:11:40,  1.24s/it, loss=0.644][A
 50%|████▉     | 6344/12691 [2:05:53<2:12:05,  1.25s/it, loss=0.644][A
 50%|████▉     | 6344/12691 [2:05:54<2:12:05,  1.25s/it, loss=0.644][A
 50%|████▉     | 6345/12691 [2:05:54<2:11:36,  1.24s/it, loss=0.644][A
 50%|████▉     | 6345/12691 [2:05:56<2:11:36,  1.24s/it, loss=0.642][A
 50%|█████     | 6346/12691 [2:05:56<2:14:42,  1.27s/it, loss=0.642][A
 50%|█████     | 6346/12691 [2:05:57<2:14:42,  1.27s/it, loss=0.

 50%|█████     | 6396/12691 [2:06:57<2:07:00,  1.21s/it, loss=0.652][A
 50%|█████     | 6397/12691 [2:06:57<2:05:15,  1.19s/it, loss=0.652][A
 50%|█████     | 6397/12691 [2:06:58<2:05:15,  1.19s/it, loss=0.65] [A
 50%|█████     | 6398/12691 [2:06:58<2:03:09,  1.17s/it, loss=0.65][A
 50%|█████     | 6398/12691 [2:06:59<2:03:09,  1.17s/it, loss=0.653][A
 50%|█████     | 6399/12691 [2:06:59<2:02:17,  1.17s/it, loss=0.653][A
 50%|█████     | 6399/12691 [2:07:00<2:02:17,  1.17s/it, loss=0.655][A
 50%|█████     | 6400/12691 [2:07:00<2:01:36,  1.16s/it, loss=0.655][A
 50%|█████     | 6400/12691 [2:07:01<2:01:36,  1.16s/it, loss=0.657][A
 50%|█████     | 6401/12691 [2:07:01<2:04:15,  1.19s/it, loss=0.657][A
 50%|█████     | 6401/12691 [2:07:03<2:04:15,  1.19s/it, loss=0.66] [A
 50%|█████     | 6402/12691 [2:07:03<2:05:50,  1.20s/it, loss=0.66][A
 50%|█████     | 6402/12691 [2:07:04<2:05:50,  1.20s/it, loss=0.661][A
 50%|█████     | 6403/12691 [2:07:04<2:04:12,  1.19s/it, loss=0.66

 51%|█████     | 6453/12691 [2:08:03<2:03:32,  1.19s/it, loss=0.662][A
 51%|█████     | 6453/12691 [2:08:05<2:03:32,  1.19s/it, loss=0.663][A
 51%|█████     | 6454/12691 [2:08:05<2:04:02,  1.19s/it, loss=0.663][A
 51%|█████     | 6454/12691 [2:08:06<2:04:02,  1.19s/it, loss=0.661][A
 51%|█████     | 6455/12691 [2:08:06<2:04:15,  1.20s/it, loss=0.661][A
 51%|█████     | 6455/12691 [2:08:07<2:04:15,  1.20s/it, loss=0.662][A
 51%|█████     | 6456/12691 [2:08:07<2:03:29,  1.19s/it, loss=0.662][A
 51%|█████     | 6456/12691 [2:08:08<2:03:29,  1.19s/it, loss=0.663][A
 51%|█████     | 6457/12691 [2:08:08<2:08:25,  1.24s/it, loss=0.663][A
 51%|█████     | 6457/12691 [2:08:09<2:08:25,  1.24s/it, loss=0.662][A
 51%|█████     | 6458/12691 [2:08:09<2:07:43,  1.23s/it, loss=0.662][A
 51%|█████     | 6458/12691 [2:08:11<2:07:43,  1.23s/it, loss=0.661][A
 51%|█████     | 6459/12691 [2:08:11<2:08:33,  1.24s/it, loss=0.661][A
 51%|█████     | 6459/12691 [2:08:12<2:08:33,  1.24s/it, loss=0.

 51%|█████▏    | 6509/12691 [2:09:11<2:01:46,  1.18s/it, loss=0.665][A
 51%|█████▏    | 6510/12691 [2:09:11<2:00:57,  1.17s/it, loss=0.665][A
 51%|█████▏    | 6510/12691 [2:09:12<2:00:57,  1.17s/it, loss=0.664][A
 51%|█████▏    | 6511/12691 [2:09:12<2:00:36,  1.17s/it, loss=0.664][A
 51%|█████▏    | 6511/12691 [2:09:13<2:00:36,  1.17s/it, loss=0.663][A
 51%|█████▏    | 6512/12691 [2:09:13<2:02:25,  1.19s/it, loss=0.663][A
 51%|█████▏    | 6512/12691 [2:09:15<2:02:25,  1.19s/it, loss=0.666][A
 51%|█████▏    | 6513/12691 [2:09:15<2:07:09,  1.23s/it, loss=0.666][A
 51%|█████▏    | 6513/12691 [2:09:16<2:07:09,  1.23s/it, loss=0.665][A
 51%|█████▏    | 6514/12691 [2:09:16<2:04:42,  1.21s/it, loss=0.665][A
 51%|█████▏    | 6514/12691 [2:09:17<2:04:42,  1.21s/it, loss=0.664][A
 51%|█████▏    | 6515/12691 [2:09:17<2:03:03,  1.20s/it, loss=0.664][A
 51%|█████▏    | 6515/12691 [2:09:18<2:03:03,  1.20s/it, loss=0.664][A
 51%|█████▏    | 6516/12691 [2:09:18<2:01:47,  1.18s/it, loss=0.

 52%|█████▏    | 6566/12691 [2:10:18<2:01:44,  1.19s/it, loss=0.659][A
 52%|█████▏    | 6566/12691 [2:10:19<2:01:44,  1.19s/it, loss=0.661][A
 52%|█████▏    | 6567/12691 [2:10:19<2:01:54,  1.19s/it, loss=0.661][A
 52%|█████▏    | 6567/12691 [2:10:20<2:01:54,  1.19s/it, loss=0.656][A
 52%|█████▏    | 6568/12691 [2:10:20<2:05:00,  1.23s/it, loss=0.656][A
 52%|█████▏    | 6568/12691 [2:10:22<2:05:00,  1.23s/it, loss=0.658][A
 52%|█████▏    | 6569/12691 [2:10:22<2:06:36,  1.24s/it, loss=0.658][A
 52%|█████▏    | 6569/12691 [2:10:23<2:06:36,  1.24s/it, loss=0.658][A
 52%|█████▏    | 6570/12691 [2:10:23<2:04:32,  1.22s/it, loss=0.658][A
 52%|█████▏    | 6570/12691 [2:10:24<2:04:32,  1.22s/it, loss=0.658][A
 52%|█████▏    | 6571/12691 [2:10:24<2:02:12,  1.20s/it, loss=0.658][A
 52%|█████▏    | 6571/12691 [2:10:25<2:02:12,  1.20s/it, loss=0.66] [A
 52%|█████▏    | 6572/12691 [2:10:25<2:01:28,  1.19s/it, loss=0.66][A
 52%|█████▏    | 6572/12691 [2:10:26<2:01:28,  1.19s/it, loss=0.6

 52%|█████▏    | 6622/12691 [2:11:28<2:07:21,  1.26s/it, loss=0.66] [A
 52%|█████▏    | 6623/12691 [2:11:28<2:06:38,  1.25s/it, loss=0.66][A
 52%|█████▏    | 6623/12691 [2:11:29<2:06:38,  1.25s/it, loss=0.661][A
 52%|█████▏    | 6624/12691 [2:11:29<2:08:03,  1.27s/it, loss=0.661][A
 52%|█████▏    | 6624/12691 [2:11:30<2:08:03,  1.27s/it, loss=0.662][A
 52%|█████▏    | 6625/12691 [2:11:30<2:06:21,  1.25s/it, loss=0.662][A
 52%|█████▏    | 6625/12691 [2:11:31<2:06:21,  1.25s/it, loss=0.665][A
 52%|█████▏    | 6626/12691 [2:11:31<2:06:51,  1.26s/it, loss=0.665][A
 52%|█████▏    | 6626/12691 [2:11:33<2:06:51,  1.26s/it, loss=0.667][A
 52%|█████▏    | 6627/12691 [2:11:33<2:06:15,  1.25s/it, loss=0.667][A
 52%|█████▏    | 6627/12691 [2:11:34<2:06:15,  1.25s/it, loss=0.667][A
 52%|█████▏    | 6628/12691 [2:11:34<2:04:55,  1.24s/it, loss=0.667][A
 52%|█████▏    | 6628/12691 [2:11:35<2:04:55,  1.24s/it, loss=0.668][A
 52%|█████▏    | 6629/12691 [2:11:35<2:05:34,  1.24s/it, loss=0.6

 53%|█████▎    | 6679/12691 [2:12:36<1:57:53,  1.18s/it, loss=0.649][A
 53%|█████▎    | 6679/12691 [2:12:37<1:57:53,  1.18s/it, loss=0.649][A
 53%|█████▎    | 6680/12691 [2:12:37<1:55:05,  1.15s/it, loss=0.649][A
 53%|█████▎    | 6680/12691 [2:12:38<1:55:05,  1.15s/it, loss=0.651][A
 53%|█████▎    | 6681/12691 [2:12:38<1:55:43,  1.16s/it, loss=0.651][A
 53%|█████▎    | 6681/12691 [2:12:39<1:55:43,  1.16s/it, loss=0.648][A
 53%|█████▎    | 6682/12691 [2:12:39<1:55:54,  1.16s/it, loss=0.648][A
 53%|█████▎    | 6682/12691 [2:12:40<1:55:54,  1.16s/it, loss=0.649][A
 53%|█████▎    | 6683/12691 [2:12:40<1:55:58,  1.16s/it, loss=0.649][A
 53%|█████▎    | 6683/12691 [2:12:41<1:55:58,  1.16s/it, loss=0.647][A
 53%|█████▎    | 6684/12691 [2:12:41<1:55:11,  1.15s/it, loss=0.647][A
 53%|█████▎    | 6684/12691 [2:12:43<1:55:11,  1.15s/it, loss=0.648][A
 53%|█████▎    | 6685/12691 [2:12:43<1:56:30,  1.16s/it, loss=0.648][A
 53%|█████▎    | 6685/12691 [2:12:44<1:56:30,  1.16s/it, loss=0.

 53%|█████▎    | 6736/12691 [2:13:43<1:58:59,  1.20s/it, loss=0.643][A
 53%|█████▎    | 6736/12691 [2:13:44<1:58:59,  1.20s/it, loss=0.644][A
 53%|█████▎    | 6737/12691 [2:13:44<1:57:00,  1.18s/it, loss=0.644][A
 53%|█████▎    | 6737/12691 [2:13:45<1:57:00,  1.18s/it, loss=0.642][A
 53%|█████▎    | 6738/12691 [2:13:45<1:57:09,  1.18s/it, loss=0.642][A
 53%|█████▎    | 6738/12691 [2:13:46<1:57:09,  1.18s/it, loss=0.644][A
 53%|█████▎    | 6739/12691 [2:13:46<1:54:57,  1.16s/it, loss=0.644][A
 53%|█████▎    | 6739/12691 [2:13:47<1:54:57,  1.16s/it, loss=0.642][A
 53%|█████▎    | 6740/12691 [2:13:47<1:55:23,  1.16s/it, loss=0.642][A
 53%|█████▎    | 6740/12691 [2:13:48<1:55:23,  1.16s/it, loss=0.643][A
 53%|█████▎    | 6741/12691 [2:13:48<1:55:55,  1.17s/it, loss=0.643][A
 53%|█████▎    | 6741/12691 [2:13:50<1:55:55,  1.17s/it, loss=0.641][A
 53%|█████▎    | 6742/12691 [2:13:50<1:54:39,  1.16s/it, loss=0.641][A
 53%|█████▎    | 6742/12691 [2:13:51<1:54:39,  1.16s/it, loss=0.

 54%|█████▎    | 6792/12691 [2:14:50<2:01:18,  1.23s/it, loss=0.646][A
 54%|█████▎    | 6793/12691 [2:14:50<1:58:46,  1.21s/it, loss=0.646][A
 54%|█████▎    | 6793/12691 [2:14:51<1:58:46,  1.21s/it, loss=0.647][A
 54%|█████▎    | 6794/12691 [2:14:51<2:00:03,  1.22s/it, loss=0.647][A
 54%|█████▎    | 6794/12691 [2:14:53<2:00:03,  1.22s/it, loss=0.645][A
 54%|█████▎    | 6795/12691 [2:14:53<2:01:26,  1.24s/it, loss=0.645][A
 54%|█████▎    | 6795/12691 [2:14:54<2:01:26,  1.24s/it, loss=0.649][A
 54%|█████▎    | 6796/12691 [2:14:54<2:03:34,  1.26s/it, loss=0.649][A
 54%|█████▎    | 6796/12691 [2:14:55<2:03:34,  1.26s/it, loss=0.647][A
 54%|█████▎    | 6797/12691 [2:14:55<2:03:28,  1.26s/it, loss=0.647][A
 54%|█████▎    | 6797/12691 [2:14:56<2:03:28,  1.26s/it, loss=0.646][A
 54%|█████▎    | 6798/12691 [2:14:56<2:02:30,  1.25s/it, loss=0.646][A
 54%|█████▎    | 6798/12691 [2:14:58<2:02:30,  1.25s/it, loss=0.644][A
 54%|█████▎    | 6799/12691 [2:14:58<2:00:11,  1.22s/it, loss=0.

 54%|█████▍    | 6849/12691 [2:15:58<1:57:23,  1.21s/it, loss=0.663][A
 54%|█████▍    | 6849/12691 [2:15:59<1:57:23,  1.21s/it, loss=0.665][A
 54%|█████▍    | 6850/12691 [2:15:59<1:55:46,  1.19s/it, loss=0.665][A
 54%|█████▍    | 6850/12691 [2:16:01<1:55:46,  1.19s/it, loss=0.666][A
 54%|█████▍    | 6851/12691 [2:16:01<1:56:51,  1.20s/it, loss=0.666][A
 54%|█████▍    | 6851/12691 [2:16:02<1:56:51,  1.20s/it, loss=0.664][A
 54%|█████▍    | 6852/12691 [2:16:02<2:02:17,  1.26s/it, loss=0.664][A
 54%|█████▍    | 6852/12691 [2:16:03<2:02:17,  1.26s/it, loss=0.663][A
 54%|█████▍    | 6853/12691 [2:16:03<2:01:45,  1.25s/it, loss=0.663][A
 54%|█████▍    | 6853/12691 [2:16:04<2:01:45,  1.25s/it, loss=0.661][A
 54%|█████▍    | 6854/12691 [2:16:04<2:00:19,  1.24s/it, loss=0.661][A
 54%|█████▍    | 6854/12691 [2:16:06<2:00:19,  1.24s/it, loss=0.661][A
 54%|█████▍    | 6855/12691 [2:16:06<1:58:16,  1.22s/it, loss=0.661][A
 54%|█████▍    | 6855/12691 [2:16:07<1:58:16,  1.22s/it, loss=0.

 54%|█████▍    | 6905/12691 [2:17:08<1:53:42,  1.18s/it, loss=0.66] [A
 54%|█████▍    | 6906/12691 [2:17:08<1:53:53,  1.18s/it, loss=0.66][A
 54%|█████▍    | 6906/12691 [2:17:09<1:53:53,  1.18s/it, loss=0.658][A
 54%|█████▍    | 6907/12691 [2:17:09<1:55:45,  1.20s/it, loss=0.658][A
 54%|█████▍    | 6907/12691 [2:17:11<1:55:45,  1.20s/it, loss=0.656][A
 54%|█████▍    | 6908/12691 [2:17:11<1:57:57,  1.22s/it, loss=0.656][A
 54%|█████▍    | 6908/12691 [2:17:12<1:57:57,  1.22s/it, loss=0.658][A
 54%|█████▍    | 6909/12691 [2:17:12<1:57:59,  1.22s/it, loss=0.658][A
 54%|█████▍    | 6909/12691 [2:17:13<1:57:59,  1.22s/it, loss=0.657][A
 54%|█████▍    | 6910/12691 [2:17:13<1:58:09,  1.23s/it, loss=0.657][A
 54%|█████▍    | 6910/12691 [2:17:14<1:58:09,  1.23s/it, loss=0.658][A
 54%|█████▍    | 6911/12691 [2:17:14<1:56:42,  1.21s/it, loss=0.658][A
 54%|█████▍    | 6911/12691 [2:17:15<1:56:42,  1.21s/it, loss=0.657][A
 54%|█████▍    | 6912/12691 [2:17:15<1:54:53,  1.19s/it, loss=0.6

 55%|█████▍    | 6962/12691 [2:18:16<1:52:49,  1.18s/it, loss=0.654][A
 55%|█████▍    | 6962/12691 [2:18:17<1:52:49,  1.18s/it, loss=0.653][A
 55%|█████▍    | 6963/12691 [2:18:17<1:56:07,  1.22s/it, loss=0.653][A
 55%|█████▍    | 6963/12691 [2:18:18<1:56:07,  1.22s/it, loss=0.651][A
 55%|█████▍    | 6964/12691 [2:18:18<1:55:44,  1.21s/it, loss=0.651][A
 55%|█████▍    | 6964/12691 [2:18:19<1:55:44,  1.21s/it, loss=0.65] [A
 55%|█████▍    | 6965/12691 [2:18:19<1:53:47,  1.19s/it, loss=0.65][A
 55%|█████▍    | 6965/12691 [2:18:20<1:53:47,  1.19s/it, loss=0.652][A
 55%|█████▍    | 6966/12691 [2:18:20<1:52:39,  1.18s/it, loss=0.652][A
 55%|█████▍    | 6966/12691 [2:18:22<1:52:39,  1.18s/it, loss=0.653][A
 55%|█████▍    | 6967/12691 [2:18:22<1:52:11,  1.18s/it, loss=0.653][A
 55%|█████▍    | 6967/12691 [2:18:23<1:52:11,  1.18s/it, loss=0.651][A
 55%|█████▍    | 6968/12691 [2:18:23<1:51:32,  1.17s/it, loss=0.651][A
 55%|█████▍    | 6968/12691 [2:18:24<1:51:32,  1.17s/it, loss=0.6

saving model checkpoint at iteration=7000



 55%|█████▌    | 7001/12691 [2:19:03<2:24:18,  1.52s/it, loss=0.644][A
 55%|█████▌    | 7001/12691 [2:19:04<2:24:18,  1.52s/it, loss=0.64] [A
 55%|█████▌    | 7002/12691 [2:19:04<2:14:03,  1.41s/it, loss=0.64][A
 55%|█████▌    | 7002/12691 [2:19:05<2:14:03,  1.41s/it, loss=0.641][A
 55%|█████▌    | 7003/12691 [2:19:05<2:07:17,  1.34s/it, loss=0.641][A
 55%|█████▌    | 7003/12691 [2:19:06<2:07:17,  1.34s/it, loss=0.641][A
 55%|█████▌    | 7004/12691 [2:19:06<2:02:02,  1.29s/it, loss=0.641][A
 55%|█████▌    | 7004/12691 [2:19:07<2:02:02,  1.29s/it, loss=0.641][A
 55%|█████▌    | 7005/12691 [2:19:07<1:57:34,  1.24s/it, loss=0.641][A
 55%|█████▌    | 7005/12691 [2:19:08<1:57:34,  1.24s/it, loss=0.642][A
 55%|█████▌    | 7006/12691 [2:19:08<1:56:29,  1.23s/it, loss=0.642][A
 55%|█████▌    | 7006/12691 [2:19:09<1:56:29,  1.23s/it, loss=0.642][A
 55%|█████▌    | 7007/12691 [2:19:09<1:54:17,  1.21s/it, loss=0.642][A
 55%|█████▌    | 7007/12691 [2:19:11<1:54:17,  1.21s/it, loss=0.

 56%|█████▌    | 7057/12691 [2:20:09<1:49:33,  1.17s/it, loss=0.662][A
 56%|█████▌    | 7058/12691 [2:20:09<1:48:57,  1.16s/it, loss=0.662][A
 56%|█████▌    | 7058/12691 [2:20:11<1:48:57,  1.16s/it, loss=0.663][A
 56%|█████▌    | 7059/12691 [2:20:11<1:48:04,  1.15s/it, loss=0.663][A
 56%|█████▌    | 7059/12691 [2:20:12<1:48:04,  1.15s/it, loss=0.662][A
 56%|█████▌    | 7060/12691 [2:20:12<1:47:27,  1.15s/it, loss=0.662][A
 56%|█████▌    | 7060/12691 [2:20:13<1:47:27,  1.15s/it, loss=0.663][A
 56%|█████▌    | 7061/12691 [2:20:13<1:49:06,  1.16s/it, loss=0.663][A
 56%|█████▌    | 7061/12691 [2:20:14<1:49:06,  1.16s/it, loss=0.663][A
 56%|█████▌    | 7062/12691 [2:20:14<1:53:57,  1.21s/it, loss=0.663][A
 56%|█████▌    | 7062/12691 [2:20:15<1:53:57,  1.21s/it, loss=0.66] [A
 56%|█████▌    | 7063/12691 [2:20:15<1:51:56,  1.19s/it, loss=0.66][A
 56%|█████▌    | 7063/12691 [2:20:17<1:51:56,  1.19s/it, loss=0.661][A
 56%|█████▌    | 7064/12691 [2:20:17<1:50:24,  1.18s/it, loss=0.6

 56%|█████▌    | 7114/12691 [2:21:14<1:46:34,  1.15s/it, loss=0.662][A
 56%|█████▌    | 7114/12691 [2:21:16<1:46:34,  1.15s/it, loss=0.662][A
 56%|█████▌    | 7115/12691 [2:21:16<1:45:58,  1.14s/it, loss=0.662][A
 56%|█████▌    | 7115/12691 [2:21:17<1:45:58,  1.14s/it, loss=0.661][A
 56%|█████▌    | 7116/12691 [2:21:17<1:44:37,  1.13s/it, loss=0.661][A
 56%|█████▌    | 7116/12691 [2:21:18<1:44:37,  1.13s/it, loss=0.664][A
 56%|█████▌    | 7117/12691 [2:21:18<1:44:04,  1.12s/it, loss=0.664][A
 56%|█████▌    | 7117/12691 [2:21:19<1:44:04,  1.12s/it, loss=0.666][A
 56%|█████▌    | 7118/12691 [2:21:19<1:47:43,  1.16s/it, loss=0.666][A
 56%|█████▌    | 7118/12691 [2:21:20<1:47:43,  1.16s/it, loss=0.664][A
 56%|█████▌    | 7119/12691 [2:21:20<1:46:13,  1.14s/it, loss=0.664][A
 56%|█████▌    | 7119/12691 [2:21:21<1:46:13,  1.14s/it, loss=0.662][A
 56%|█████▌    | 7120/12691 [2:21:21<1:45:10,  1.13s/it, loss=0.662][A
 56%|█████▌    | 7120/12691 [2:21:22<1:45:10,  1.13s/it, loss=0.

 56%|█████▋    | 7170/12691 [2:22:19<1:44:13,  1.13s/it, loss=0.645][A
 57%|█████▋    | 7171/12691 [2:22:19<1:43:57,  1.13s/it, loss=0.645][A
 57%|█████▋    | 7171/12691 [2:22:20<1:43:57,  1.13s/it, loss=0.645][A
 57%|█████▋    | 7172/12691 [2:22:20<1:43:19,  1.12s/it, loss=0.645][A
 57%|█████▋    | 7172/12691 [2:22:22<1:43:19,  1.12s/it, loss=0.649][A
 57%|█████▋    | 7173/12691 [2:22:22<1:43:01,  1.12s/it, loss=0.649][A
 57%|█████▋    | 7173/12691 [2:22:23<1:43:01,  1.12s/it, loss=0.649][A
 57%|█████▋    | 7174/12691 [2:22:23<1:46:39,  1.16s/it, loss=0.649][A
 57%|█████▋    | 7174/12691 [2:22:24<1:46:39,  1.16s/it, loss=0.651][A
 57%|█████▋    | 7175/12691 [2:22:24<1:45:04,  1.14s/it, loss=0.651][A
 57%|█████▋    | 7175/12691 [2:22:25<1:45:04,  1.14s/it, loss=0.651][A
 57%|█████▋    | 7176/12691 [2:22:25<1:44:28,  1.14s/it, loss=0.651][A
 57%|█████▋    | 7176/12691 [2:22:26<1:44:28,  1.14s/it, loss=0.65] [A
 57%|█████▋    | 7177/12691 [2:22:26<1:44:38,  1.14s/it, loss=0.

 57%|█████▋    | 7227/12691 [2:23:24<1:47:06,  1.18s/it, loss=0.654][A
 57%|█████▋    | 7227/12691 [2:23:25<1:47:06,  1.18s/it, loss=0.653][A
 57%|█████▋    | 7228/12691 [2:23:25<1:45:36,  1.16s/it, loss=0.653][A
 57%|█████▋    | 7228/12691 [2:23:27<1:45:36,  1.16s/it, loss=0.654][A
 57%|█████▋    | 7229/12691 [2:23:27<1:47:30,  1.18s/it, loss=0.654][A
 57%|█████▋    | 7229/12691 [2:23:28<1:47:30,  1.18s/it, loss=0.656][A
 57%|█████▋    | 7230/12691 [2:23:28<1:46:04,  1.17s/it, loss=0.656][A
 57%|█████▋    | 7230/12691 [2:23:29<1:46:04,  1.17s/it, loss=0.655][A
 57%|█████▋    | 7231/12691 [2:23:29<1:44:59,  1.15s/it, loss=0.655][A
 57%|█████▋    | 7231/12691 [2:23:30<1:44:59,  1.15s/it, loss=0.654][A
 57%|█████▋    | 7232/12691 [2:23:30<1:44:31,  1.15s/it, loss=0.654][A
 57%|█████▋    | 7232/12691 [2:23:31<1:44:31,  1.15s/it, loss=0.655][A
 57%|█████▋    | 7233/12691 [2:23:31<1:46:06,  1.17s/it, loss=0.655][A
 57%|█████▋    | 7233/12691 [2:23:32<1:46:06,  1.17s/it, loss=0.

 57%|█████▋    | 7283/12691 [2:24:29<1:41:35,  1.13s/it, loss=0.648][A
 57%|█████▋    | 7284/12691 [2:24:29<1:40:51,  1.12s/it, loss=0.648][A
 57%|█████▋    | 7284/12691 [2:24:31<1:40:51,  1.12s/it, loss=0.649][A
 57%|█████▋    | 7285/12691 [2:24:31<1:43:24,  1.15s/it, loss=0.649][A
 57%|█████▋    | 7285/12691 [2:24:32<1:43:24,  1.15s/it, loss=0.649][A
 57%|█████▋    | 7286/12691 [2:24:32<1:43:25,  1.15s/it, loss=0.649][A
 57%|█████▋    | 7286/12691 [2:24:33<1:43:25,  1.15s/it, loss=0.649][A
 57%|█████▋    | 7287/12691 [2:24:33<1:42:59,  1.14s/it, loss=0.649][A
 57%|█████▋    | 7287/12691 [2:24:34<1:42:59,  1.14s/it, loss=0.649][A
 57%|█████▋    | 7288/12691 [2:24:34<1:42:53,  1.14s/it, loss=0.649][A
 57%|█████▋    | 7288/12691 [2:24:35<1:42:53,  1.14s/it, loss=0.65] [A
 57%|█████▋    | 7289/12691 [2:24:35<1:43:26,  1.15s/it, loss=0.65][A
 57%|█████▋    | 7289/12691 [2:24:36<1:43:26,  1.15s/it, loss=0.649][A
 57%|█████▋    | 7290/12691 [2:24:36<1:45:17,  1.17s/it, loss=0.6

 58%|█████▊    | 7340/12691 [2:25:35<1:46:51,  1.20s/it, loss=0.656][A
 58%|█████▊    | 7340/12691 [2:25:36<1:46:51,  1.20s/it, loss=0.653][A
 58%|█████▊    | 7341/12691 [2:25:36<1:47:52,  1.21s/it, loss=0.653][A
 58%|█████▊    | 7341/12691 [2:25:38<1:47:52,  1.21s/it, loss=0.652][A
 58%|█████▊    | 7342/12691 [2:25:38<1:48:53,  1.22s/it, loss=0.652][A
 58%|█████▊    | 7342/12691 [2:25:39<1:48:53,  1.22s/it, loss=0.651][A
 58%|█████▊    | 7343/12691 [2:25:39<1:47:55,  1.21s/it, loss=0.651][A
 58%|█████▊    | 7343/12691 [2:25:40<1:47:55,  1.21s/it, loss=0.653][A
 58%|█████▊    | 7344/12691 [2:25:40<1:45:57,  1.19s/it, loss=0.653][A
 58%|█████▊    | 7344/12691 [2:25:41<1:45:57,  1.19s/it, loss=0.653][A
 58%|█████▊    | 7345/12691 [2:25:41<1:45:02,  1.18s/it, loss=0.653][A
 58%|█████▊    | 7345/12691 [2:25:42<1:45:02,  1.18s/it, loss=0.651][A
 58%|█████▊    | 7346/12691 [2:25:42<1:47:19,  1.20s/it, loss=0.651][A
 58%|█████▊    | 7346/12691 [2:25:44<1:47:19,  1.20s/it, loss=0.

 58%|█████▊    | 7396/12691 [2:26:44<1:50:58,  1.26s/it, loss=0.648][A
 58%|█████▊    | 7397/12691 [2:26:44<1:51:11,  1.26s/it, loss=0.648][A
 58%|█████▊    | 7397/12691 [2:26:45<1:51:11,  1.26s/it, loss=0.651][A
 58%|█████▊    | 7398/12691 [2:26:45<1:50:35,  1.25s/it, loss=0.651][A
 58%|█████▊    | 7398/12691 [2:26:47<1:50:35,  1.25s/it, loss=0.65] [A
 58%|█████▊    | 7399/12691 [2:26:47<1:49:45,  1.24s/it, loss=0.65][A
 58%|█████▊    | 7399/12691 [2:26:48<1:49:45,  1.24s/it, loss=0.651][A
 58%|█████▊    | 7400/12691 [2:26:48<1:50:06,  1.25s/it, loss=0.651][A
 58%|█████▊    | 7400/12691 [2:26:49<1:50:06,  1.25s/it, loss=0.65] [A
 58%|█████▊    | 7401/12691 [2:26:49<1:49:44,  1.24s/it, loss=0.65][A
 58%|█████▊    | 7401/12691 [2:26:51<1:49:44,  1.24s/it, loss=0.649][A
 58%|█████▊    | 7402/12691 [2:26:51<1:52:57,  1.28s/it, loss=0.649][A
 58%|█████▊    | 7402/12691 [2:26:52<1:52:57,  1.28s/it, loss=0.648][A
 58%|█████▊    | 7403/12691 [2:26:52<1:51:16,  1.26s/it, loss=0.64

 59%|█████▊    | 7453/12691 [2:27:54<1:47:17,  1.23s/it, loss=0.651][A
 59%|█████▊    | 7453/12691 [2:27:55<1:47:17,  1.23s/it, loss=0.648][A
 59%|█████▊    | 7454/12691 [2:27:55<1:45:24,  1.21s/it, loss=0.648][A
 59%|█████▊    | 7454/12691 [2:27:56<1:45:24,  1.21s/it, loss=0.652][A
 59%|█████▊    | 7455/12691 [2:27:56<1:44:55,  1.20s/it, loss=0.652][A
 59%|█████▊    | 7455/12691 [2:27:58<1:44:55,  1.20s/it, loss=0.649][A
 59%|█████▉    | 7456/12691 [2:27:58<1:44:43,  1.20s/it, loss=0.649][A
 59%|█████▉    | 7456/12691 [2:27:59<1:44:43,  1.20s/it, loss=0.651][A
 59%|█████▉    | 7457/12691 [2:27:59<1:43:57,  1.19s/it, loss=0.651][A
 59%|█████▉    | 7457/12691 [2:28:00<1:43:57,  1.19s/it, loss=0.651][A
 59%|█████▉    | 7458/12691 [2:28:00<1:46:02,  1.22s/it, loss=0.651][A
 59%|█████▉    | 7458/12691 [2:28:01<1:46:02,  1.22s/it, loss=0.65] [A
 59%|█████▉    | 7459/12691 [2:28:01<1:44:36,  1.20s/it, loss=0.65][A
 59%|█████▉    | 7459/12691 [2:28:02<1:44:36,  1.20s/it, loss=0.6

 59%|█████▉    | 7509/12691 [2:29:03<1:43:30,  1.20s/it, loss=0.64] [A
 59%|█████▉    | 7510/12691 [2:29:03<1:42:38,  1.19s/it, loss=0.64][A
 59%|█████▉    | 7510/12691 [2:29:04<1:42:38,  1.19s/it, loss=0.643][A
 59%|█████▉    | 7511/12691 [2:29:04<1:40:53,  1.17s/it, loss=0.643][A
 59%|█████▉    | 7511/12691 [2:29:05<1:40:53,  1.17s/it, loss=0.643][A
 59%|█████▉    | 7512/12691 [2:29:05<1:40:12,  1.16s/it, loss=0.643][A
 59%|█████▉    | 7512/12691 [2:29:06<1:40:12,  1.16s/it, loss=0.645][A
 59%|█████▉    | 7513/12691 [2:29:06<1:44:22,  1.21s/it, loss=0.645][A
 59%|█████▉    | 7513/12691 [2:29:07<1:44:22,  1.21s/it, loss=0.643][A
 59%|█████▉    | 7514/12691 [2:29:07<1:41:02,  1.17s/it, loss=0.643][A
 59%|█████▉    | 7514/12691 [2:29:08<1:41:02,  1.17s/it, loss=0.641][A
 59%|█████▉    | 7515/12691 [2:29:08<1:40:19,  1.16s/it, loss=0.641][A
 59%|█████▉    | 7515/12691 [2:29:10<1:40:19,  1.16s/it, loss=0.644][A
 59%|█████▉    | 7516/12691 [2:29:10<1:40:43,  1.17s/it, loss=0.6

 60%|█████▉    | 7566/12691 [2:30:10<1:47:47,  1.26s/it, loss=0.654][A
 60%|█████▉    | 7566/12691 [2:30:12<1:47:47,  1.26s/it, loss=0.65] [A
 60%|█████▉    | 7567/12691 [2:30:12<1:47:22,  1.26s/it, loss=0.65][A
 60%|█████▉    | 7567/12691 [2:30:13<1:47:22,  1.26s/it, loss=0.649][A
 60%|█████▉    | 7568/12691 [2:30:13<1:47:28,  1.26s/it, loss=0.649][A
 60%|█████▉    | 7568/12691 [2:30:14<1:47:28,  1.26s/it, loss=0.649][A
 60%|█████▉    | 7569/12691 [2:30:14<1:49:39,  1.28s/it, loss=0.649][A
 60%|█████▉    | 7569/12691 [2:30:15<1:49:39,  1.28s/it, loss=0.647][A
 60%|█████▉    | 7570/12691 [2:30:15<1:49:50,  1.29s/it, loss=0.647][A
 60%|█████▉    | 7570/12691 [2:30:17<1:49:50,  1.29s/it, loss=0.648][A
 60%|█████▉    | 7571/12691 [2:30:17<1:46:08,  1.24s/it, loss=0.648][A
 60%|█████▉    | 7571/12691 [2:30:18<1:46:08,  1.24s/it, loss=0.646][A
 60%|█████▉    | 7572/12691 [2:30:18<1:46:46,  1.25s/it, loss=0.646][A
 60%|█████▉    | 7572/12691 [2:30:19<1:46:46,  1.25s/it, loss=0.6

 60%|██████    | 7622/12691 [2:31:22<1:46:30,  1.26s/it, loss=0.654][A
 60%|██████    | 7623/12691 [2:31:22<1:45:21,  1.25s/it, loss=0.654][A
 60%|██████    | 7623/12691 [2:31:23<1:45:21,  1.25s/it, loss=0.652][A
 60%|██████    | 7624/12691 [2:31:23<1:48:05,  1.28s/it, loss=0.652][A
 60%|██████    | 7624/12691 [2:31:24<1:48:05,  1.28s/it, loss=0.651][A
 60%|██████    | 7625/12691 [2:31:24<1:46:17,  1.26s/it, loss=0.651][A
 60%|██████    | 7625/12691 [2:31:25<1:46:17,  1.26s/it, loss=0.652][A
 60%|██████    | 7626/12691 [2:31:25<1:45:43,  1.25s/it, loss=0.652][A
 60%|██████    | 7626/12691 [2:31:27<1:45:43,  1.25s/it, loss=0.655][A
 60%|██████    | 7627/12691 [2:31:27<1:45:52,  1.25s/it, loss=0.655][A
 60%|██████    | 7627/12691 [2:31:28<1:45:52,  1.25s/it, loss=0.655][A
 60%|██████    | 7628/12691 [2:31:28<1:45:50,  1.25s/it, loss=0.655][A
 60%|██████    | 7628/12691 [2:31:29<1:45:50,  1.25s/it, loss=0.654][A
 60%|██████    | 7629/12691 [2:31:29<1:44:37,  1.24s/it, loss=0.

 61%|██████    | 7679/12691 [2:32:31<1:44:43,  1.25s/it, loss=0.647][A
 61%|██████    | 7679/12691 [2:32:32<1:44:43,  1.25s/it, loss=0.646][A
 61%|██████    | 7680/12691 [2:32:32<1:44:13,  1.25s/it, loss=0.646][A
 61%|██████    | 7680/12691 [2:32:33<1:44:13,  1.25s/it, loss=0.646][A
 61%|██████    | 7681/12691 [2:32:33<1:41:44,  1.22s/it, loss=0.646][A
 61%|██████    | 7681/12691 [2:32:34<1:41:44,  1.22s/it, loss=0.649][A
 61%|██████    | 7682/12691 [2:32:34<1:40:00,  1.20s/it, loss=0.649][A
 61%|██████    | 7682/12691 [2:32:36<1:40:00,  1.20s/it, loss=0.647][A
 61%|██████    | 7683/12691 [2:32:36<1:38:16,  1.18s/it, loss=0.647][A
 61%|██████    | 7683/12691 [2:32:37<1:38:16,  1.18s/it, loss=0.647][A
 61%|██████    | 7684/12691 [2:32:37<1:37:22,  1.17s/it, loss=0.647][A
 61%|██████    | 7684/12691 [2:32:38<1:37:22,  1.17s/it, loss=0.648][A
 61%|██████    | 7685/12691 [2:32:38<1:38:49,  1.18s/it, loss=0.648][A
 61%|██████    | 7685/12691 [2:32:39<1:38:49,  1.18s/it, loss=0.

 61%|██████    | 7735/12691 [2:33:37<1:37:24,  1.18s/it, loss=0.656][A
 61%|██████    | 7736/12691 [2:33:37<1:38:19,  1.19s/it, loss=0.656][A
 61%|██████    | 7736/12691 [2:33:38<1:38:19,  1.19s/it, loss=0.657][A
 61%|██████    | 7737/12691 [2:33:38<1:37:17,  1.18s/it, loss=0.657][A
 61%|██████    | 7737/12691 [2:33:39<1:37:17,  1.18s/it, loss=0.656][A
 61%|██████    | 7738/12691 [2:33:39<1:37:12,  1.18s/it, loss=0.656][A
 61%|██████    | 7738/12691 [2:33:40<1:37:12,  1.18s/it, loss=0.655][A
 61%|██████    | 7739/12691 [2:33:40<1:35:47,  1.16s/it, loss=0.655][A
 61%|██████    | 7739/12691 [2:33:42<1:35:47,  1.16s/it, loss=0.653][A
 61%|██████    | 7740/12691 [2:33:42<1:35:03,  1.15s/it, loss=0.653][A
 61%|██████    | 7740/12691 [2:33:43<1:35:03,  1.15s/it, loss=0.651][A
 61%|██████    | 7741/12691 [2:33:43<1:41:13,  1.23s/it, loss=0.651][A
 61%|██████    | 7741/12691 [2:33:44<1:41:13,  1.23s/it, loss=0.654][A
 61%|██████    | 7742/12691 [2:33:44<1:39:05,  1.20s/it, loss=0.

 61%|██████▏   | 7792/12691 [2:34:42<1:35:03,  1.16s/it, loss=0.646][A
 61%|██████▏   | 7792/12691 [2:34:43<1:35:03,  1.16s/it, loss=0.646][A
 61%|██████▏   | 7793/12691 [2:34:43<1:34:32,  1.16s/it, loss=0.646][A
 61%|██████▏   | 7793/12691 [2:34:44<1:34:32,  1.16s/it, loss=0.649][A
 61%|██████▏   | 7794/12691 [2:34:44<1:36:43,  1.19s/it, loss=0.649][A
 61%|██████▏   | 7794/12691 [2:34:46<1:36:43,  1.19s/it, loss=0.648][A
 61%|██████▏   | 7795/12691 [2:34:46<1:36:27,  1.18s/it, loss=0.648][A
 61%|██████▏   | 7795/12691 [2:34:47<1:36:27,  1.18s/it, loss=0.648][A
 61%|██████▏   | 7796/12691 [2:34:47<1:34:36,  1.16s/it, loss=0.648][A
 61%|██████▏   | 7796/12691 [2:34:48<1:34:36,  1.16s/it, loss=0.648][A
 61%|██████▏   | 7797/12691 [2:34:48<1:36:11,  1.18s/it, loss=0.648][A
 61%|██████▏   | 7797/12691 [2:34:49<1:36:11,  1.18s/it, loss=0.647][A
 61%|██████▏   | 7798/12691 [2:34:49<1:34:52,  1.16s/it, loss=0.647][A
 61%|██████▏   | 7798/12691 [2:34:50<1:34:52,  1.16s/it, loss=0.

 62%|██████▏   | 7848/12691 [2:35:51<1:38:37,  1.22s/it, loss=0.666][A
 62%|██████▏   | 7849/12691 [2:35:51<1:38:51,  1.22s/it, loss=0.666][A
 62%|██████▏   | 7849/12691 [2:35:52<1:38:51,  1.22s/it, loss=0.665][A
 62%|██████▏   | 7850/12691 [2:35:52<1:38:30,  1.22s/it, loss=0.665][A
 62%|██████▏   | 7850/12691 [2:35:53<1:38:30,  1.22s/it, loss=0.665][A
 62%|██████▏   | 7851/12691 [2:35:53<1:39:07,  1.23s/it, loss=0.665][A
 62%|██████▏   | 7851/12691 [2:35:54<1:39:07,  1.23s/it, loss=0.662][A
 62%|██████▏   | 7852/12691 [2:35:54<1:42:00,  1.26s/it, loss=0.662][A
 62%|██████▏   | 7852/12691 [2:35:56<1:42:00,  1.26s/it, loss=0.662][A
 62%|██████▏   | 7853/12691 [2:35:56<1:40:04,  1.24s/it, loss=0.662][A
 62%|██████▏   | 7853/12691 [2:35:57<1:40:04,  1.24s/it, loss=0.664][A
 62%|██████▏   | 7854/12691 [2:35:57<1:38:55,  1.23s/it, loss=0.664][A
 62%|██████▏   | 7854/12691 [2:35:58<1:38:55,  1.23s/it, loss=0.662][A
 62%|██████▏   | 7855/12691 [2:35:58<1:38:11,  1.22s/it, loss=0.

 62%|██████▏   | 7905/12691 [2:36:57<1:31:22,  1.15s/it, loss=0.65][A
 62%|██████▏   | 7905/12691 [2:36:58<1:31:22,  1.15s/it, loss=0.652][A
 62%|██████▏   | 7906/12691 [2:36:58<1:31:02,  1.14s/it, loss=0.652][A
 62%|██████▏   | 7906/12691 [2:36:59<1:31:02,  1.14s/it, loss=0.65] [A
 62%|██████▏   | 7907/12691 [2:36:59<1:30:27,  1.13s/it, loss=0.65][A
 62%|██████▏   | 7907/12691 [2:37:01<1:30:27,  1.13s/it, loss=0.655][A
 62%|██████▏   | 7908/12691 [2:37:01<1:33:59,  1.18s/it, loss=0.655][A
 62%|██████▏   | 7908/12691 [2:37:02<1:33:59,  1.18s/it, loss=0.652][A
 62%|██████▏   | 7909/12691 [2:37:02<1:32:25,  1.16s/it, loss=0.652][A
 62%|██████▏   | 7909/12691 [2:37:03<1:32:25,  1.16s/it, loss=0.652][A
 62%|██████▏   | 7910/12691 [2:37:03<1:32:09,  1.16s/it, loss=0.652][A
 62%|██████▏   | 7910/12691 [2:37:04<1:32:09,  1.16s/it, loss=0.651][A
 62%|██████▏   | 7911/12691 [2:37:04<1:31:47,  1.15s/it, loss=0.651][A
 62%|██████▏   | 7911/12691 [2:37:05<1:31:47,  1.15s/it, loss=0.65

 63%|██████▎   | 7961/12691 [2:38:05<1:34:29,  1.20s/it, loss=0.654][A
 63%|██████▎   | 7962/12691 [2:38:05<1:33:08,  1.18s/it, loss=0.654][A
 63%|██████▎   | 7962/12691 [2:38:07<1:33:08,  1.18s/it, loss=0.657][A
 63%|██████▎   | 7963/12691 [2:38:07<1:34:51,  1.20s/it, loss=0.657][A
 63%|██████▎   | 7963/12691 [2:38:08<1:34:51,  1.20s/it, loss=0.658][A
 63%|██████▎   | 7964/12691 [2:38:08<1:33:29,  1.19s/it, loss=0.658][A
 63%|██████▎   | 7964/12691 [2:38:09<1:33:29,  1.19s/it, loss=0.659][A
 63%|██████▎   | 7965/12691 [2:38:09<1:32:46,  1.18s/it, loss=0.659][A
 63%|██████▎   | 7965/12691 [2:38:10<1:32:46,  1.18s/it, loss=0.659][A
 63%|██████▎   | 7966/12691 [2:38:10<1:31:53,  1.17s/it, loss=0.659][A
 63%|██████▎   | 7966/12691 [2:38:11<1:31:53,  1.17s/it, loss=0.658][A
 63%|██████▎   | 7967/12691 [2:38:11<1:30:58,  1.16s/it, loss=0.658][A
 63%|██████▎   | 7967/12691 [2:38:12<1:30:58,  1.16s/it, loss=0.658][A
 63%|██████▎   | 7968/12691 [2:38:12<1:30:19,  1.15s/it, loss=0.

saving model checkpoint at iteration=8000



 63%|██████▎   | 8001/12691 [2:38:53<2:00:02,  1.54s/it, loss=0.665][A
 63%|██████▎   | 8001/12691 [2:38:54<2:00:02,  1.54s/it, loss=0.663][A
 63%|██████▎   | 8002/12691 [2:38:54<1:50:53,  1.42s/it, loss=0.663][A
 63%|██████▎   | 8002/12691 [2:38:55<1:50:53,  1.42s/it, loss=0.661][A
 63%|██████▎   | 8003/12691 [2:38:55<1:45:32,  1.35s/it, loss=0.661][A
 63%|██████▎   | 8003/12691 [2:38:56<1:45:32,  1.35s/it, loss=0.661][A
 63%|██████▎   | 8004/12691 [2:38:56<1:40:47,  1.29s/it, loss=0.661][A
 63%|██████▎   | 8004/12691 [2:38:57<1:40:47,  1.29s/it, loss=0.66] [A
 63%|██████▎   | 8005/12691 [2:38:57<1:36:39,  1.24s/it, loss=0.66][A
 63%|██████▎   | 8005/12691 [2:38:59<1:36:39,  1.24s/it, loss=0.661][A
 63%|██████▎   | 8006/12691 [2:38:59<1:35:49,  1.23s/it, loss=0.661][A
 63%|██████▎   | 8006/12691 [2:39:00<1:35:49,  1.23s/it, loss=0.659][A
 63%|██████▎   | 8007/12691 [2:39:00<1:38:44,  1.26s/it, loss=0.659][A
 63%|██████▎   | 8007/12691 [2:39:01<1:38:44,  1.26s/it, loss=0.

 63%|██████▎   | 8057/12691 [2:39:59<1:27:58,  1.14s/it, loss=0.651][A
 63%|██████▎   | 8058/12691 [2:39:59<1:29:52,  1.16s/it, loss=0.651][A
 63%|██████▎   | 8058/12691 [2:40:00<1:29:52,  1.16s/it, loss=0.65] [A
 64%|██████▎   | 8059/12691 [2:40:00<1:29:51,  1.16s/it, loss=0.65][A
 64%|██████▎   | 8059/12691 [2:40:02<1:29:51,  1.16s/it, loss=0.649][A
 64%|██████▎   | 8060/12691 [2:40:02<1:30:14,  1.17s/it, loss=0.649][A
 64%|██████▎   | 8060/12691 [2:40:03<1:30:14,  1.17s/it, loss=0.648][A
 64%|██████▎   | 8061/12691 [2:40:03<1:29:50,  1.16s/it, loss=0.648][A
 64%|██████▎   | 8061/12691 [2:40:04<1:29:50,  1.16s/it, loss=0.648][A
 64%|██████▎   | 8062/12691 [2:40:04<1:29:54,  1.17s/it, loss=0.648][A
 64%|██████▎   | 8062/12691 [2:40:05<1:29:54,  1.17s/it, loss=0.651][A
 64%|██████▎   | 8063/12691 [2:40:05<1:29:29,  1.16s/it, loss=0.651][A
 64%|██████▎   | 8063/12691 [2:40:06<1:29:29,  1.16s/it, loss=0.65] [A
 64%|██████▎   | 8064/12691 [2:40:06<1:27:43,  1.14s/it, loss=0.6

 64%|██████▍   | 8114/12691 [2:41:04<1:27:27,  1.15s/it, loss=0.649][A
 64%|██████▍   | 8114/12691 [2:41:05<1:27:27,  1.15s/it, loss=0.648][A
 64%|██████▍   | 8115/12691 [2:41:05<1:27:19,  1.14s/it, loss=0.648][A
 64%|██████▍   | 8115/12691 [2:41:06<1:27:19,  1.14s/it, loss=0.647][A
 64%|██████▍   | 8116/12691 [2:41:06<1:28:53,  1.17s/it, loss=0.647][A
 64%|██████▍   | 8116/12691 [2:41:07<1:28:53,  1.17s/it, loss=0.648][A
 64%|██████▍   | 8117/12691 [2:41:07<1:27:11,  1.14s/it, loss=0.648][A
 64%|██████▍   | 8117/12691 [2:41:09<1:27:11,  1.14s/it, loss=0.649][A
 64%|██████▍   | 8118/12691 [2:41:09<1:28:45,  1.16s/it, loss=0.649][A
 64%|██████▍   | 8118/12691 [2:41:10<1:28:45,  1.16s/it, loss=0.647][A
 64%|██████▍   | 8119/12691 [2:41:10<1:28:51,  1.17s/it, loss=0.647][A
 64%|██████▍   | 8119/12691 [2:41:11<1:28:51,  1.17s/it, loss=0.648][A
 64%|██████▍   | 8120/12691 [2:41:11<1:28:02,  1.16s/it, loss=0.648][A
 64%|██████▍   | 8120/12691 [2:41:12<1:28:02,  1.16s/it, loss=0.

 64%|██████▍   | 8170/12691 [2:42:09<1:26:27,  1.15s/it, loss=0.632][A
 64%|██████▍   | 8171/12691 [2:42:09<1:25:31,  1.14s/it, loss=0.632][A
 64%|██████▍   | 8171/12691 [2:42:10<1:25:31,  1.14s/it, loss=0.633][A
 64%|██████▍   | 8172/12691 [2:42:10<1:24:51,  1.13s/it, loss=0.633][A
 64%|██████▍   | 8172/12691 [2:42:12<1:24:51,  1.13s/it, loss=0.63] [A
 64%|██████▍   | 8173/12691 [2:42:12<1:24:43,  1.13s/it, loss=0.63][A
 64%|██████▍   | 8173/12691 [2:42:13<1:24:43,  1.13s/it, loss=0.631][A
 64%|██████▍   | 8174/12691 [2:42:13<1:26:17,  1.15s/it, loss=0.631][A
 64%|██████▍   | 8174/12691 [2:42:14<1:26:17,  1.15s/it, loss=0.632][A
 64%|██████▍   | 8175/12691 [2:42:14<1:26:46,  1.15s/it, loss=0.632][A
 64%|██████▍   | 8175/12691 [2:42:15<1:26:46,  1.15s/it, loss=0.632][A
 64%|██████▍   | 8176/12691 [2:42:15<1:28:12,  1.17s/it, loss=0.632][A
 64%|██████▍   | 8176/12691 [2:42:16<1:28:12,  1.17s/it, loss=0.631][A
 64%|██████▍   | 8177/12691 [2:42:16<1:30:08,  1.20s/it, loss=0.6

 65%|██████▍   | 8227/12691 [2:43:15<1:26:18,  1.16s/it, loss=0.637][A
 65%|██████▍   | 8227/12691 [2:43:16<1:26:18,  1.16s/it, loss=0.637][A
 65%|██████▍   | 8228/12691 [2:43:16<1:25:22,  1.15s/it, loss=0.637][A
 65%|██████▍   | 8228/12691 [2:43:17<1:25:22,  1.15s/it, loss=0.637][A
 65%|██████▍   | 8229/12691 [2:43:17<1:28:09,  1.19s/it, loss=0.637][A
 65%|██████▍   | 8229/12691 [2:43:18<1:28:09,  1.19s/it, loss=0.637][A
 65%|██████▍   | 8230/12691 [2:43:18<1:26:32,  1.16s/it, loss=0.637][A
 65%|██████▍   | 8230/12691 [2:43:19<1:26:32,  1.16s/it, loss=0.636][A
 65%|██████▍   | 8231/12691 [2:43:19<1:26:40,  1.17s/it, loss=0.636][A
 65%|██████▍   | 8231/12691 [2:43:20<1:26:40,  1.17s/it, loss=0.635][A
 65%|██████▍   | 8232/12691 [2:43:20<1:25:14,  1.15s/it, loss=0.635][A
 65%|██████▍   | 8232/12691 [2:43:22<1:25:14,  1.15s/it, loss=0.634][A
 65%|██████▍   | 8233/12691 [2:43:22<1:24:33,  1.14s/it, loss=0.634][A
 65%|██████▍   | 8233/12691 [2:43:23<1:24:33,  1.14s/it, loss=0.

 65%|██████▌   | 8283/12691 [2:44:21<1:24:36,  1.15s/it, loss=0.649][A
 65%|██████▌   | 8284/12691 [2:44:21<1:25:33,  1.16s/it, loss=0.649][A
 65%|██████▌   | 8284/12691 [2:44:22<1:25:33,  1.16s/it, loss=0.649][A
 65%|██████▌   | 8285/12691 [2:44:22<1:29:54,  1.22s/it, loss=0.649][A
 65%|██████▌   | 8285/12691 [2:44:24<1:29:54,  1.22s/it, loss=0.648][A
 65%|██████▌   | 8286/12691 [2:44:24<1:27:20,  1.19s/it, loss=0.648][A
 65%|██████▌   | 8286/12691 [2:44:25<1:27:20,  1.19s/it, loss=0.646][A
 65%|██████▌   | 8287/12691 [2:44:25<1:27:36,  1.19s/it, loss=0.646][A
 65%|██████▌   | 8287/12691 [2:44:26<1:27:36,  1.19s/it, loss=0.647][A
 65%|██████▌   | 8288/12691 [2:44:26<1:26:39,  1.18s/it, loss=0.647][A
 65%|██████▌   | 8288/12691 [2:44:27<1:26:39,  1.18s/it, loss=0.646][A
 65%|██████▌   | 8289/12691 [2:44:27<1:28:08,  1.20s/it, loss=0.646][A
 65%|██████▌   | 8289/12691 [2:44:28<1:28:08,  1.20s/it, loss=0.648][A
 65%|██████▌   | 8290/12691 [2:44:28<1:26:26,  1.18s/it, loss=0.

 66%|██████▌   | 8340/12691 [2:45:30<1:29:49,  1.24s/it, loss=0.654][A
 66%|██████▌   | 8340/12691 [2:45:31<1:29:49,  1.24s/it, loss=0.654][A
 66%|██████▌   | 8341/12691 [2:45:31<1:32:21,  1.27s/it, loss=0.654][A
 66%|██████▌   | 8341/12691 [2:45:32<1:32:21,  1.27s/it, loss=0.656][A
 66%|██████▌   | 8342/12691 [2:45:32<1:31:22,  1.26s/it, loss=0.656][A
 66%|██████▌   | 8342/12691 [2:45:34<1:31:22,  1.26s/it, loss=0.656][A
 66%|██████▌   | 8343/12691 [2:45:34<1:31:14,  1.26s/it, loss=0.656][A
 66%|██████▌   | 8343/12691 [2:45:35<1:31:14,  1.26s/it, loss=0.657][A
 66%|██████▌   | 8344/12691 [2:45:35<1:30:47,  1.25s/it, loss=0.657][A
 66%|██████▌   | 8344/12691 [2:45:36<1:30:47,  1.25s/it, loss=0.656][A
 66%|██████▌   | 8345/12691 [2:45:36<1:29:33,  1.24s/it, loss=0.656][A
 66%|██████▌   | 8345/12691 [2:45:37<1:29:33,  1.24s/it, loss=0.656][A
 66%|██████▌   | 8346/12691 [2:45:37<1:29:35,  1.24s/it, loss=0.656][A
 66%|██████▌   | 8346/12691 [2:45:39<1:29:35,  1.24s/it, loss=0.

 66%|██████▌   | 8396/12691 [2:46:37<1:26:43,  1.21s/it, loss=0.636][A
 66%|██████▌   | 8397/12691 [2:46:37<1:24:26,  1.18s/it, loss=0.636][A
 66%|██████▌   | 8397/12691 [2:46:39<1:24:26,  1.18s/it, loss=0.634][A
 66%|██████▌   | 8398/12691 [2:46:39<1:25:47,  1.20s/it, loss=0.634][A
 66%|██████▌   | 8398/12691 [2:46:40<1:25:47,  1.20s/it, loss=0.635][A
 66%|██████▌   | 8399/12691 [2:46:40<1:24:37,  1.18s/it, loss=0.635][A
 66%|██████▌   | 8399/12691 [2:46:41<1:24:37,  1.18s/it, loss=0.635][A
 66%|██████▌   | 8400/12691 [2:46:41<1:23:14,  1.16s/it, loss=0.635][A
 66%|██████▌   | 8400/12691 [2:46:42<1:23:14,  1.16s/it, loss=0.636][A
 66%|██████▌   | 8401/12691 [2:46:42<1:22:04,  1.15s/it, loss=0.636][A
 66%|██████▌   | 8401/12691 [2:46:43<1:22:04,  1.15s/it, loss=0.639][A
 66%|██████▌   | 8402/12691 [2:46:43<1:23:05,  1.16s/it, loss=0.639][A
 66%|██████▌   | 8402/12691 [2:46:44<1:23:05,  1.16s/it, loss=0.638][A
 66%|██████▌   | 8403/12691 [2:46:44<1:22:23,  1.15s/it, loss=0.

 67%|██████▋   | 8453/12691 [2:47:43<1:24:29,  1.20s/it, loss=0.642][A
 67%|██████▋   | 8453/12691 [2:47:44<1:24:29,  1.20s/it, loss=0.642][A
 67%|██████▋   | 8454/12691 [2:47:44<1:22:54,  1.17s/it, loss=0.642][A
 67%|██████▋   | 8454/12691 [2:47:46<1:22:54,  1.17s/it, loss=0.642][A
 67%|██████▋   | 8455/12691 [2:47:46<1:22:24,  1.17s/it, loss=0.642][A
 67%|██████▋   | 8455/12691 [2:47:47<1:22:24,  1.17s/it, loss=0.64] [A
 67%|██████▋   | 8456/12691 [2:47:47<1:22:55,  1.17s/it, loss=0.64][A
 67%|██████▋   | 8456/12691 [2:47:48<1:22:55,  1.17s/it, loss=0.64][A
 67%|██████▋   | 8457/12691 [2:47:48<1:24:14,  1.19s/it, loss=0.64][A
 67%|██████▋   | 8457/12691 [2:47:49<1:24:14,  1.19s/it, loss=0.642][A
 67%|██████▋   | 8458/12691 [2:47:49<1:26:37,  1.23s/it, loss=0.642][A
 67%|██████▋   | 8458/12691 [2:47:51<1:26:37,  1.23s/it, loss=0.644][A
 67%|██████▋   | 8459/12691 [2:47:51<1:24:30,  1.20s/it, loss=0.644][A
 67%|██████▋   | 8459/12691 [2:47:52<1:24:30,  1.20s/it, loss=0.642

 67%|██████▋   | 8509/12691 [2:48:51<1:21:46,  1.17s/it, loss=0.64] [A
 67%|██████▋   | 8510/12691 [2:48:51<1:21:15,  1.17s/it, loss=0.64][A
 67%|██████▋   | 8510/12691 [2:48:52<1:21:15,  1.17s/it, loss=0.641][A
 67%|██████▋   | 8511/12691 [2:48:52<1:20:51,  1.16s/it, loss=0.641][A
 67%|██████▋   | 8511/12691 [2:48:53<1:20:51,  1.16s/it, loss=0.639][A
 67%|██████▋   | 8512/12691 [2:48:53<1:20:46,  1.16s/it, loss=0.639][A
 67%|██████▋   | 8512/12691 [2:48:55<1:20:46,  1.16s/it, loss=0.637][A
 67%|██████▋   | 8513/12691 [2:48:55<1:24:07,  1.21s/it, loss=0.637][A
 67%|██████▋   | 8513/12691 [2:48:56<1:24:07,  1.21s/it, loss=0.634][A
 67%|██████▋   | 8514/12691 [2:48:56<1:23:00,  1.19s/it, loss=0.634][A
 67%|██████▋   | 8514/12691 [2:48:57<1:23:00,  1.19s/it, loss=0.636][A
 67%|██████▋   | 8515/12691 [2:48:57<1:21:30,  1.17s/it, loss=0.636][A
 67%|██████▋   | 8515/12691 [2:48:58<1:21:30,  1.17s/it, loss=0.635][A
 67%|██████▋   | 8516/12691 [2:48:58<1:21:57,  1.18s/it, loss=0.6

 67%|██████▋   | 8566/12691 [2:49:59<1:18:10,  1.14s/it, loss=0.627][A
 68%|██████▊   | 8567/12691 [2:49:59<1:19:31,  1.16s/it, loss=0.627][A
 68%|██████▊   | 8567/12691 [2:50:00<1:19:31,  1.16s/it, loss=0.628][A
 68%|██████▊   | 8568/12691 [2:50:00<1:20:23,  1.17s/it, loss=0.628][A
 68%|██████▊   | 8568/12691 [2:50:01<1:20:23,  1.17s/it, loss=0.628][A
 68%|██████▊   | 8569/12691 [2:50:01<1:21:53,  1.19s/it, loss=0.628][A
 68%|██████▊   | 8569/12691 [2:50:02<1:21:53,  1.19s/it, loss=0.629][A
 68%|██████▊   | 8570/12691 [2:50:02<1:21:01,  1.18s/it, loss=0.629][A
 68%|██████▊   | 8570/12691 [2:50:03<1:21:01,  1.18s/it, loss=0.629][A
 68%|██████▊   | 8571/12691 [2:50:03<1:19:33,  1.16s/it, loss=0.629][A
 68%|██████▊   | 8571/12691 [2:50:04<1:19:33,  1.16s/it, loss=0.628][A
 68%|██████▊   | 8572/12691 [2:50:04<1:18:45,  1.15s/it, loss=0.628][A
 68%|██████▊   | 8572/12691 [2:50:06<1:18:45,  1.15s/it, loss=0.627][A
 68%|██████▊   | 8573/12691 [2:50:06<1:18:45,  1.15s/it, loss=0.

 68%|██████▊   | 8623/12691 [2:51:06<1:22:16,  1.21s/it, loss=0.634][A
 68%|██████▊   | 8623/12691 [2:51:07<1:22:16,  1.21s/it, loss=0.632][A
 68%|██████▊   | 8624/12691 [2:51:07<1:22:45,  1.22s/it, loss=0.632][A
 68%|██████▊   | 8624/12691 [2:51:09<1:22:45,  1.22s/it, loss=0.635][A
 68%|██████▊   | 8625/12691 [2:51:09<1:23:46,  1.24s/it, loss=0.635][A
 68%|██████▊   | 8625/12691 [2:51:10<1:23:46,  1.24s/it, loss=0.637][A
 68%|██████▊   | 8626/12691 [2:51:10<1:23:06,  1.23s/it, loss=0.637][A
 68%|██████▊   | 8626/12691 [2:51:11<1:23:06,  1.23s/it, loss=0.637][A
 68%|██████▊   | 8627/12691 [2:51:11<1:23:56,  1.24s/it, loss=0.637][A
 68%|██████▊   | 8627/12691 [2:51:12<1:23:56,  1.24s/it, loss=0.638][A
 68%|██████▊   | 8628/12691 [2:51:12<1:24:02,  1.24s/it, loss=0.638][A
 68%|██████▊   | 8628/12691 [2:51:13<1:24:02,  1.24s/it, loss=0.638][A
 68%|██████▊   | 8629/12691 [2:51:13<1:23:56,  1.24s/it, loss=0.638][A
 68%|██████▊   | 8629/12691 [2:51:15<1:23:56,  1.24s/it, loss=0.

 68%|██████▊   | 8679/12691 [2:52:17<1:20:53,  1.21s/it, loss=0.645][A
 68%|██████▊   | 8680/12691 [2:52:17<1:24:10,  1.26s/it, loss=0.645][A
 68%|██████▊   | 8680/12691 [2:52:18<1:24:10,  1.26s/it, loss=0.649][A
 68%|██████▊   | 8681/12691 [2:52:18<1:22:25,  1.23s/it, loss=0.649][A
 68%|██████▊   | 8681/12691 [2:52:19<1:22:25,  1.23s/it, loss=0.653][A
 68%|██████▊   | 8682/12691 [2:52:19<1:22:42,  1.24s/it, loss=0.653][A
 68%|██████▊   | 8682/12691 [2:52:20<1:22:42,  1.24s/it, loss=0.65] [A
 68%|██████▊   | 8683/12691 [2:52:20<1:22:43,  1.24s/it, loss=0.65][A
 68%|██████▊   | 8683/12691 [2:52:22<1:22:43,  1.24s/it, loss=0.65][A
 68%|██████▊   | 8684/12691 [2:52:22<1:23:20,  1.25s/it, loss=0.65][A
 68%|██████▊   | 8684/12691 [2:52:23<1:23:20,  1.25s/it, loss=0.652][A
 68%|██████▊   | 8685/12691 [2:52:23<1:23:22,  1.25s/it, loss=0.652][A
 68%|██████▊   | 8685/12691 [2:52:24<1:23:22,  1.25s/it, loss=0.654][A
 68%|██████▊   | 8686/12691 [2:52:24<1:26:24,  1.29s/it, loss=0.654

 69%|██████▉   | 8736/12691 [2:53:27<1:23:09,  1.26s/it, loss=0.651][A
 69%|██████▉   | 8737/12691 [2:53:27<1:22:32,  1.25s/it, loss=0.651][A
 69%|██████▉   | 8737/12691 [2:53:28<1:22:32,  1.25s/it, loss=0.652][A
 69%|██████▉   | 8738/12691 [2:53:28<1:21:57,  1.24s/it, loss=0.652][A
 69%|██████▉   | 8738/12691 [2:53:30<1:21:57,  1.24s/it, loss=0.651][A
 69%|██████▉   | 8739/12691 [2:53:30<1:21:55,  1.24s/it, loss=0.651][A
 69%|██████▉   | 8739/12691 [2:53:31<1:21:55,  1.24s/it, loss=0.652][A
 69%|██████▉   | 8740/12691 [2:53:31<1:21:47,  1.24s/it, loss=0.652][A
 69%|██████▉   | 8740/12691 [2:53:32<1:21:47,  1.24s/it, loss=0.657][A
 69%|██████▉   | 8741/12691 [2:53:32<1:20:39,  1.23s/it, loss=0.657][A
 69%|██████▉   | 8741/12691 [2:53:33<1:20:39,  1.23s/it, loss=0.658][A
 69%|██████▉   | 8742/12691 [2:53:33<1:24:10,  1.28s/it, loss=0.658][A
 69%|██████▉   | 8742/12691 [2:53:35<1:24:10,  1.28s/it, loss=0.654][A
 69%|██████▉   | 8743/12691 [2:53:35<1:23:32,  1.27s/it, loss=0.

 69%|██████▉   | 8793/12691 [2:54:36<1:19:20,  1.22s/it, loss=0.65][A
 69%|██████▉   | 8793/12691 [2:54:37<1:19:20,  1.22s/it, loss=0.652][A
 69%|██████▉   | 8794/12691 [2:54:37<1:17:15,  1.19s/it, loss=0.652][A
 69%|██████▉   | 8794/12691 [2:54:38<1:17:15,  1.19s/it, loss=0.652][A
 69%|██████▉   | 8795/12691 [2:54:38<1:15:57,  1.17s/it, loss=0.652][A
 69%|██████▉   | 8795/12691 [2:54:39<1:15:57,  1.17s/it, loss=0.65] [A
 69%|██████▉   | 8796/12691 [2:54:39<1:15:19,  1.16s/it, loss=0.65][A
 69%|██████▉   | 8796/12691 [2:54:41<1:15:19,  1.16s/it, loss=0.651][A
 69%|██████▉   | 8797/12691 [2:54:41<1:17:13,  1.19s/it, loss=0.651][A
 69%|██████▉   | 8797/12691 [2:54:42<1:17:13,  1.19s/it, loss=0.653][A
 69%|██████▉   | 8798/12691 [2:54:42<1:15:50,  1.17s/it, loss=0.653][A
 69%|██████▉   | 8798/12691 [2:54:43<1:15:50,  1.17s/it, loss=0.652][A
 69%|██████▉   | 8799/12691 [2:54:43<1:15:49,  1.17s/it, loss=0.652][A
 69%|██████▉   | 8799/12691 [2:54:44<1:15:49,  1.17s/it, loss=0.65

 70%|██████▉   | 8849/12691 [2:55:43<1:13:08,  1.14s/it, loss=0.642][A
 70%|██████▉   | 8850/12691 [2:55:43<1:12:07,  1.13s/it, loss=0.642][A
 70%|██████▉   | 8850/12691 [2:55:44<1:12:07,  1.13s/it, loss=0.642][A
 70%|██████▉   | 8851/12691 [2:55:44<1:12:34,  1.13s/it, loss=0.642][A
 70%|██████▉   | 8851/12691 [2:55:45<1:12:34,  1.13s/it, loss=0.644][A
 70%|██████▉   | 8852/12691 [2:55:45<1:12:16,  1.13s/it, loss=0.644][A
 70%|██████▉   | 8852/12691 [2:55:47<1:12:16,  1.13s/it, loss=0.641][A
 70%|██████▉   | 8853/12691 [2:55:47<1:14:17,  1.16s/it, loss=0.641][A
 70%|██████▉   | 8853/12691 [2:55:48<1:14:17,  1.16s/it, loss=0.643][A
 70%|██████▉   | 8854/12691 [2:55:48<1:15:58,  1.19s/it, loss=0.643][A
 70%|██████▉   | 8854/12691 [2:55:49<1:15:58,  1.19s/it, loss=0.643][A
 70%|██████▉   | 8855/12691 [2:55:49<1:16:23,  1.19s/it, loss=0.643][A
 70%|██████▉   | 8855/12691 [2:55:50<1:16:23,  1.19s/it, loss=0.642][A
 70%|██████▉   | 8856/12691 [2:55:50<1:15:16,  1.18s/it, loss=0.

 70%|███████   | 8906/12691 [2:56:50<1:12:45,  1.15s/it, loss=0.649][A
 70%|███████   | 8906/12691 [2:56:51<1:12:45,  1.15s/it, loss=0.648][A
 70%|███████   | 8907/12691 [2:56:51<1:12:15,  1.15s/it, loss=0.648][A
 70%|███████   | 8907/12691 [2:56:52<1:12:15,  1.15s/it, loss=0.645][A
 70%|███████   | 8908/12691 [2:56:52<1:12:07,  1.14s/it, loss=0.645][A
 70%|███████   | 8908/12691 [2:56:54<1:12:07,  1.14s/it, loss=0.646][A
 70%|███████   | 8909/12691 [2:56:54<1:15:57,  1.21s/it, loss=0.646][A
 70%|███████   | 8909/12691 [2:56:55<1:15:57,  1.21s/it, loss=0.646][A
 70%|███████   | 8910/12691 [2:56:55<1:16:51,  1.22s/it, loss=0.646][A
 70%|███████   | 8910/12691 [2:56:56<1:16:51,  1.22s/it, loss=0.645][A
 70%|███████   | 8911/12691 [2:56:56<1:17:38,  1.23s/it, loss=0.645][A
 70%|███████   | 8911/12691 [2:56:58<1:17:38,  1.23s/it, loss=0.644][A
 70%|███████   | 8912/12691 [2:56:58<1:18:07,  1.24s/it, loss=0.644][A
 70%|███████   | 8912/12691 [2:56:59<1:18:07,  1.24s/it, loss=0.

 71%|███████   | 8962/12691 [2:57:57<1:12:12,  1.16s/it, loss=0.642][A
 71%|███████   | 8963/12691 [2:57:57<1:11:37,  1.15s/it, loss=0.642][A
 71%|███████   | 8963/12691 [2:57:58<1:11:37,  1.15s/it, loss=0.644][A
 71%|███████   | 8964/12691 [2:57:58<1:15:16,  1.21s/it, loss=0.644][A
 71%|███████   | 8964/12691 [2:58:00<1:15:16,  1.21s/it, loss=0.644][A
 71%|███████   | 8965/12691 [2:58:00<1:13:31,  1.18s/it, loss=0.644][A
 71%|███████   | 8965/12691 [2:58:01<1:13:31,  1.18s/it, loss=0.645][A
 71%|███████   | 8966/12691 [2:58:01<1:12:04,  1.16s/it, loss=0.645][A
 71%|███████   | 8966/12691 [2:58:02<1:12:04,  1.16s/it, loss=0.648][A
 71%|███████   | 8967/12691 [2:58:02<1:11:59,  1.16s/it, loss=0.648][A
 71%|███████   | 8967/12691 [2:58:03<1:11:59,  1.16s/it, loss=0.65] [A
 71%|███████   | 8968/12691 [2:58:03<1:11:41,  1.16s/it, loss=0.65][A
 71%|███████   | 8968/12691 [2:58:04<1:11:41,  1.16s/it, loss=0.652][A
 71%|███████   | 8969/12691 [2:58:04<1:11:35,  1.15s/it, loss=0.6

saving model checkpoint at iteration=9000



 71%|███████   | 9001/12691 [2:58:42<1:29:29,  1.46s/it, loss=0.635][A
 71%|███████   | 9001/12691 [2:58:43<1:29:29,  1.46s/it, loss=0.635][A
 71%|███████   | 9002/12691 [2:58:43<1:24:18,  1.37s/it, loss=0.635][A
 71%|███████   | 9002/12691 [2:58:44<1:24:18,  1.37s/it, loss=0.636][A
 71%|███████   | 9003/12691 [2:58:44<1:19:50,  1.30s/it, loss=0.636][A
 71%|███████   | 9003/12691 [2:58:45<1:19:50,  1.30s/it, loss=0.637][A
 71%|███████   | 9004/12691 [2:58:45<1:16:31,  1.25s/it, loss=0.637][A
 71%|███████   | 9004/12691 [2:58:47<1:16:31,  1.25s/it, loss=0.638][A
 71%|███████   | 9005/12691 [2:58:47<1:14:48,  1.22s/it, loss=0.638][A
 71%|███████   | 9005/12691 [2:58:48<1:14:48,  1.22s/it, loss=0.642][A
 71%|███████   | 9006/12691 [2:58:48<1:13:46,  1.20s/it, loss=0.642][A
 71%|███████   | 9006/12691 [2:58:49<1:13:46,  1.20s/it, loss=0.642][A
 71%|███████   | 9007/12691 [2:58:49<1:12:47,  1.19s/it, loss=0.642][A
 71%|███████   | 9007/12691 [2:58:50<1:12:47,  1.19s/it, loss=0

 71%|███████▏  | 9057/12691 [2:59:49<1:11:41,  1.18s/it, loss=0.656][A
 71%|███████▏  | 9058/12691 [2:59:49<1:11:12,  1.18s/it, loss=0.656][A
 71%|███████▏  | 9058/12691 [2:59:50<1:11:12,  1.18s/it, loss=0.659][A
 71%|███████▏  | 9059/12691 [2:59:50<1:10:43,  1.17s/it, loss=0.659][A
 71%|███████▏  | 9059/12691 [2:59:51<1:10:43,  1.17s/it, loss=0.658][A
 71%|███████▏  | 9060/12691 [2:59:51<1:12:21,  1.20s/it, loss=0.658][A
 71%|███████▏  | 9060/12691 [2:59:52<1:12:21,  1.20s/it, loss=0.655][A
 71%|███████▏  | 9061/12691 [2:59:52<1:11:38,  1.18s/it, loss=0.655][A
 71%|███████▏  | 9061/12691 [2:59:54<1:11:38,  1.18s/it, loss=0.652][A
 71%|███████▏  | 9062/12691 [2:59:54<1:12:17,  1.20s/it, loss=0.652][A
 71%|███████▏  | 9062/12691 [2:59:55<1:12:17,  1.20s/it, loss=0.653][A
 71%|███████▏  | 9063/12691 [2:59:55<1:14:29,  1.23s/it, loss=0.653][A
 71%|███████▏  | 9063/12691 [2:59:56<1:14:29,  1.23s/it, loss=0.653][A
 71%|███████▏  | 9064/12691 [2:59:56<1:13:06,  1.21s/it, loss=0.

 72%|███████▏  | 9114/12691 [3:00:55<1:16:45,  1.29s/it, loss=0.655][A
 72%|███████▏  | 9114/12691 [3:00:56<1:16:45,  1.29s/it, loss=0.654][A
 72%|███████▏  | 9115/12691 [3:00:56<1:16:33,  1.28s/it, loss=0.654][A
 72%|███████▏  | 9115/12691 [3:00:58<1:16:33,  1.28s/it, loss=0.654][A
 72%|███████▏  | 9116/12691 [3:00:58<1:16:28,  1.28s/it, loss=0.654][A
 72%|███████▏  | 9116/12691 [3:00:59<1:16:28,  1.28s/it, loss=0.654][A
 72%|███████▏  | 9117/12691 [3:00:59<1:16:22,  1.28s/it, loss=0.654][A
 72%|███████▏  | 9117/12691 [3:01:00<1:16:22,  1.28s/it, loss=0.653][A
 72%|███████▏  | 9118/12691 [3:01:00<1:16:13,  1.28s/it, loss=0.653][A
 72%|███████▏  | 9118/12691 [3:01:02<1:16:13,  1.28s/it, loss=0.653][A
 72%|███████▏  | 9119/12691 [3:01:02<1:18:01,  1.31s/it, loss=0.653][A
 72%|███████▏  | 9119/12691 [3:01:03<1:18:01,  1.31s/it, loss=0.654][A
 72%|███████▏  | 9120/12691 [3:01:03<1:16:55,  1.29s/it, loss=0.654][A
 72%|███████▏  | 9120/12691 [3:01:04<1:16:55,  1.29s/it, loss=0.

 72%|███████▏  | 9170/12691 [3:02:04<1:08:47,  1.17s/it, loss=0.648][A
 72%|███████▏  | 9171/12691 [3:02:04<1:08:54,  1.17s/it, loss=0.648][A
 72%|███████▏  | 9171/12691 [3:02:05<1:08:54,  1.17s/it, loss=0.646][A
 72%|███████▏  | 9172/12691 [3:02:05<1:08:33,  1.17s/it, loss=0.646][A
 72%|███████▏  | 9172/12691 [3:02:06<1:08:33,  1.17s/it, loss=0.643][A
 72%|███████▏  | 9173/12691 [3:02:06<1:08:32,  1.17s/it, loss=0.643][A
 72%|███████▏  | 9173/12691 [3:02:07<1:08:32,  1.17s/it, loss=0.644][A
 72%|███████▏  | 9174/12691 [3:02:07<1:08:46,  1.17s/it, loss=0.644][A
 72%|███████▏  | 9174/12691 [3:02:09<1:08:46,  1.17s/it, loss=0.641][A
 72%|███████▏  | 9175/12691 [3:02:09<1:11:25,  1.22s/it, loss=0.641][A
 72%|███████▏  | 9175/12691 [3:02:10<1:11:25,  1.22s/it, loss=0.637][A
 72%|███████▏  | 9176/12691 [3:02:10<1:10:03,  1.20s/it, loss=0.637][A
 72%|███████▏  | 9176/12691 [3:02:11<1:10:03,  1.20s/it, loss=0.639][A
 72%|███████▏  | 9177/12691 [3:02:11<1:11:13,  1.22s/it, loss=0.

 73%|███████▎  | 9227/12691 [3:03:12<1:12:14,  1.25s/it, loss=0.644][A
 73%|███████▎  | 9227/12691 [3:03:13<1:12:14,  1.25s/it, loss=0.644][A
 73%|███████▎  | 9228/12691 [3:03:13<1:10:36,  1.22s/it, loss=0.644][A
 73%|███████▎  | 9228/12691 [3:03:14<1:10:36,  1.22s/it, loss=0.647][A
 73%|███████▎  | 9229/12691 [3:03:14<1:11:20,  1.24s/it, loss=0.647][A
 73%|███████▎  | 9229/12691 [3:03:15<1:11:20,  1.24s/it, loss=0.646][A
 73%|███████▎  | 9230/12691 [3:03:15<1:13:17,  1.27s/it, loss=0.646][A
 73%|███████▎  | 9230/12691 [3:03:17<1:13:17,  1.27s/it, loss=0.647][A
 73%|███████▎  | 9231/12691 [3:03:17<1:11:26,  1.24s/it, loss=0.647][A
 73%|███████▎  | 9231/12691 [3:03:18<1:11:26,  1.24s/it, loss=0.649][A
 73%|███████▎  | 9232/12691 [3:03:18<1:11:07,  1.23s/it, loss=0.649][A
 73%|███████▎  | 9232/12691 [3:03:19<1:11:07,  1.23s/it, loss=0.649][A
 73%|███████▎  | 9233/12691 [3:03:19<1:09:35,  1.21s/it, loss=0.649][A
 73%|███████▎  | 9233/12691 [3:03:20<1:09:35,  1.21s/it, loss=0.

 73%|███████▎  | 9283/12691 [3:04:18<1:04:31,  1.14s/it, loss=0.655][A
 73%|███████▎  | 9284/12691 [3:04:18<1:04:22,  1.13s/it, loss=0.655][A
 73%|███████▎  | 9284/12691 [3:04:19<1:04:22,  1.13s/it, loss=0.654][A
 73%|███████▎  | 9285/12691 [3:04:19<1:04:30,  1.14s/it, loss=0.654][A
 73%|███████▎  | 9285/12691 [3:04:20<1:04:30,  1.14s/it, loss=0.654][A
 73%|███████▎  | 9286/12691 [3:04:20<1:05:30,  1.15s/it, loss=0.654][A
 73%|███████▎  | 9286/12691 [3:04:22<1:05:30,  1.15s/it, loss=0.655][A
 73%|███████▎  | 9287/12691 [3:04:22<1:05:10,  1.15s/it, loss=0.655][A
 73%|███████▎  | 9287/12691 [3:04:23<1:05:10,  1.15s/it, loss=0.655][A
 73%|███████▎  | 9288/12691 [3:04:23<1:04:49,  1.14s/it, loss=0.655][A
 73%|███████▎  | 9288/12691 [3:04:24<1:04:49,  1.14s/it, loss=0.657][A
 73%|███████▎  | 9289/12691 [3:04:24<1:05:27,  1.15s/it, loss=0.657][A
 73%|███████▎  | 9289/12691 [3:04:25<1:05:27,  1.15s/it, loss=0.659][A
 73%|███████▎  | 9290/12691 [3:04:25<1:06:06,  1.17s/it, loss=0.

 74%|███████▎  | 9340/12691 [3:05:23<1:03:25,  1.14s/it, loss=0.654][A
 74%|███████▎  | 9340/12691 [3:05:24<1:03:25,  1.14s/it, loss=0.654][A
 74%|███████▎  | 9341/12691 [3:05:24<1:04:24,  1.15s/it, loss=0.654][A
 74%|███████▎  | 9341/12691 [3:05:25<1:04:24,  1.15s/it, loss=0.655][A
 74%|███████▎  | 9342/12691 [3:05:25<1:03:41,  1.14s/it, loss=0.655][A
 74%|███████▎  | 9342/12691 [3:05:27<1:03:41,  1.14s/it, loss=0.654][A
 74%|███████▎  | 9343/12691 [3:05:27<1:03:13,  1.13s/it, loss=0.654][A
 74%|███████▎  | 9343/12691 [3:05:28<1:03:13,  1.13s/it, loss=0.655][A
 74%|███████▎  | 9344/12691 [3:05:28<1:02:36,  1.12s/it, loss=0.655][A
 74%|███████▎  | 9344/12691 [3:05:29<1:02:36,  1.12s/it, loss=0.656][A
 74%|███████▎  | 9345/12691 [3:05:29<1:02:27,  1.12s/it, loss=0.656][A
 74%|███████▎  | 9345/12691 [3:05:30<1:02:27,  1.12s/it, loss=0.654][A
 74%|███████▎  | 9346/12691 [3:05:30<1:02:09,  1.11s/it, loss=0.654][A
 74%|███████▎  | 9346/12691 [3:05:31<1:02:09,  1.11s/it, loss=0.

 74%|███████▍  | 9396/12691 [3:06:31<1:05:53,  1.20s/it, loss=0.662][A
 74%|███████▍  | 9397/12691 [3:06:31<1:06:35,  1.21s/it, loss=0.662][A
 74%|███████▍  | 9397/12691 [3:06:32<1:06:35,  1.21s/it, loss=0.663][A
 74%|███████▍  | 9398/12691 [3:06:32<1:05:35,  1.20s/it, loss=0.663][A
 74%|███████▍  | 9398/12691 [3:06:33<1:05:35,  1.20s/it, loss=0.664][A
 74%|███████▍  | 9399/12691 [3:06:33<1:04:12,  1.17s/it, loss=0.664][A
 74%|███████▍  | 9399/12691 [3:06:34<1:04:12,  1.17s/it, loss=0.662][A
 74%|███████▍  | 9400/12691 [3:06:34<1:03:08,  1.15s/it, loss=0.662][A
 74%|███████▍  | 9400/12691 [3:06:35<1:03:08,  1.15s/it, loss=0.663][A
 74%|███████▍  | 9401/12691 [3:06:35<1:02:39,  1.14s/it, loss=0.663][A
 74%|███████▍  | 9401/12691 [3:06:36<1:02:39,  1.14s/it, loss=0.659][A
 74%|███████▍  | 9402/12691 [3:06:36<1:02:22,  1.14s/it, loss=0.659][A
 74%|███████▍  | 9402/12691 [3:06:38<1:02:22,  1.14s/it, loss=0.66] [A
 74%|███████▍  | 9403/12691 [3:06:38<1:03:36,  1.16s/it, loss=0.

 74%|███████▍  | 9453/12691 [3:07:36<1:04:43,  1.20s/it, loss=0.633][A
 74%|███████▍  | 9453/12691 [3:07:37<1:04:43,  1.20s/it, loss=0.632][A
 74%|███████▍  | 9454/12691 [3:07:37<1:03:35,  1.18s/it, loss=0.632][A
 74%|███████▍  | 9454/12691 [3:07:38<1:03:35,  1.18s/it, loss=0.633][A
 75%|███████▍  | 9455/12691 [3:07:38<1:02:14,  1.15s/it, loss=0.633][A
 75%|███████▍  | 9455/12691 [3:07:39<1:02:14,  1.15s/it, loss=0.634][A
 75%|███████▍  | 9456/12691 [3:07:39<1:01:57,  1.15s/it, loss=0.634][A
 75%|███████▍  | 9456/12691 [3:07:40<1:01:57,  1.15s/it, loss=0.634][A
 75%|███████▍  | 9457/12691 [3:07:40<1:03:09,  1.17s/it, loss=0.634][A
 75%|███████▍  | 9457/12691 [3:07:41<1:03:09,  1.17s/it, loss=0.637][A
 75%|███████▍  | 9458/12691 [3:07:41<1:03:55,  1.19s/it, loss=0.637][A
 75%|███████▍  | 9458/12691 [3:07:43<1:03:55,  1.19s/it, loss=0.641][A
 75%|███████▍  | 9459/12691 [3:07:43<1:06:12,  1.23s/it, loss=0.641][A
 75%|███████▍  | 9459/12691 [3:07:44<1:06:12,  1.23s/it, loss=0.

 75%|███████▍  | 9509/12691 [3:08:43<1:01:14,  1.15s/it, loss=0.644][A
 75%|███████▍  | 9510/12691 [3:08:43<1:00:42,  1.15s/it, loss=0.644][A
 75%|███████▍  | 9510/12691 [3:08:44<1:00:42,  1.15s/it, loss=0.646][A
 75%|███████▍  | 9511/12691 [3:08:44<1:00:35,  1.14s/it, loss=0.646][A
 75%|███████▍  | 9511/12691 [3:08:45<1:00:35,  1.14s/it, loss=0.646][A
 75%|███████▍  | 9512/12691 [3:08:45<1:00:06,  1.13s/it, loss=0.646][A
 75%|███████▍  | 9512/12691 [3:08:46<1:00:06,  1.13s/it, loss=0.646][A
 75%|███████▍  | 9513/12691 [3:08:46<59:44,  1.13s/it, loss=0.646]  [A
 75%|███████▍  | 9513/12691 [3:08:47<59:44,  1.13s/it, loss=0.644][A
 75%|███████▍  | 9514/12691 [3:08:47<1:02:00,  1.17s/it, loss=0.644][A
 75%|███████▍  | 9514/12691 [3:08:48<1:02:00,  1.17s/it, loss=0.644][A
 75%|███████▍  | 9515/12691 [3:08:48<1:01:04,  1.15s/it, loss=0.644][A
 75%|███████▍  | 9515/12691 [3:08:49<1:01:04,  1.15s/it, loss=0.646][A
 75%|███████▍  | 9516/12691 [3:08:49<1:00:39,  1.15s/it, loss=0.64

 75%|███████▌  | 9566/12691 [3:09:48<1:00:22,  1.16s/it, loss=0.648][A
 75%|███████▌  | 9567/12691 [3:09:48<59:56,  1.15s/it, loss=0.648]  [A
 75%|███████▌  | 9567/12691 [3:09:49<59:56,  1.15s/it, loss=0.65] [A
 75%|███████▌  | 9568/12691 [3:09:49<59:33,  1.14s/it, loss=0.65][A
 75%|███████▌  | 9568/12691 [3:09:50<59:33,  1.14s/it, loss=0.651][A
 75%|███████▌  | 9569/12691 [3:09:50<59:14,  1.14s/it, loss=0.651][A
 75%|███████▌  | 9569/12691 [3:09:52<59:14,  1.14s/it, loss=0.651][A
 75%|███████▌  | 9570/12691 [3:09:52<1:00:31,  1.16s/it, loss=0.651][A
 75%|███████▌  | 9570/12691 [3:09:53<1:00:31,  1.16s/it, loss=0.652][A
 75%|███████▌  | 9571/12691 [3:09:53<59:58,  1.15s/it, loss=0.652]  [A
 75%|███████▌  | 9571/12691 [3:09:54<59:58,  1.15s/it, loss=0.653][A
 75%|███████▌  | 9572/12691 [3:09:54<59:37,  1.15s/it, loss=0.653][A
 75%|███████▌  | 9572/12691 [3:09:55<59:37,  1.15s/it, loss=0.652][A
 75%|███████▌  | 9573/12691 [3:09:55<59:13,  1.14s/it, loss=0.652][A
 75%|██████

 76%|███████▌  | 9624/12691 [3:10:54<58:49,  1.15s/it, loss=0.647][A
 76%|███████▌  | 9624/12691 [3:10:55<58:49,  1.15s/it, loss=0.647][A
 76%|███████▌  | 9625/12691 [3:10:55<59:42,  1.17s/it, loss=0.647][A
 76%|███████▌  | 9625/12691 [3:10:56<59:42,  1.17s/it, loss=0.644][A
 76%|███████▌  | 9626/12691 [3:10:56<1:00:48,  1.19s/it, loss=0.644][A
 76%|███████▌  | 9626/12691 [3:10:58<1:00:48,  1.19s/it, loss=0.644][A
 76%|███████▌  | 9627/12691 [3:10:58<1:00:52,  1.19s/it, loss=0.644][A
 76%|███████▌  | 9627/12691 [3:10:59<1:00:52,  1.19s/it, loss=0.648][A
 76%|███████▌  | 9628/12691 [3:10:59<1:01:40,  1.21s/it, loss=0.648][A
 76%|███████▌  | 9628/12691 [3:11:00<1:01:40,  1.21s/it, loss=0.647][A
 76%|███████▌  | 9629/12691 [3:11:00<1:01:01,  1.20s/it, loss=0.647][A
 76%|███████▌  | 9629/12691 [3:11:01<1:01:01,  1.20s/it, loss=0.647][A
 76%|███████▌  | 9630/12691 [3:11:01<1:00:09,  1.18s/it, loss=0.647][A
 76%|███████▌  | 9630/12691 [3:11:02<1:00:09,  1.18s/it, loss=0.647][A


 76%|███████▋  | 9682/12691 [3:12:02<58:03,  1.16s/it, loss=0.644][A
 76%|███████▋  | 9682/12691 [3:12:03<58:03,  1.16s/it, loss=0.642][A
 76%|███████▋  | 9683/12691 [3:12:03<57:55,  1.16s/it, loss=0.642][A
 76%|███████▋  | 9683/12691 [3:12:04<57:55,  1.16s/it, loss=0.643][A
 76%|███████▋  | 9684/12691 [3:12:04<57:32,  1.15s/it, loss=0.643][A
 76%|███████▋  | 9684/12691 [3:12:05<57:32,  1.15s/it, loss=0.646][A
 76%|███████▋  | 9685/12691 [3:12:05<57:15,  1.14s/it, loss=0.646][A
 76%|███████▋  | 9685/12691 [3:12:06<57:15,  1.14s/it, loss=0.643][A
 76%|███████▋  | 9686/12691 [3:12:06<57:05,  1.14s/it, loss=0.643][A
 76%|███████▋  | 9686/12691 [3:12:07<57:05,  1.14s/it, loss=0.642][A
 76%|███████▋  | 9687/12691 [3:12:07<58:34,  1.17s/it, loss=0.642][A
 76%|███████▋  | 9687/12691 [3:12:09<58:34,  1.17s/it, loss=0.642][A
 76%|███████▋  | 9688/12691 [3:12:09<57:58,  1.16s/it, loss=0.642][A
 76%|███████▋  | 9688/12691 [3:12:10<57:58,  1.16s/it, loss=0.645][A
 76%|███████▋  | 968

 77%|███████▋  | 9740/12691 [3:13:09<56:05,  1.14s/it, loss=0.646][A
 77%|███████▋  | 9741/12691 [3:13:09<55:44,  1.13s/it, loss=0.646][A
 77%|███████▋  | 9741/12691 [3:13:11<55:44,  1.13s/it, loss=0.646][A
 77%|███████▋  | 9742/12691 [3:13:11<55:52,  1.14s/it, loss=0.646][A
 77%|███████▋  | 9742/12691 [3:13:12<55:52,  1.14s/it, loss=0.644][A
 77%|███████▋  | 9743/12691 [3:13:12<57:10,  1.16s/it, loss=0.644][A
 77%|███████▋  | 9743/12691 [3:13:13<57:10,  1.16s/it, loss=0.642][A
 77%|███████▋  | 9744/12691 [3:13:13<56:36,  1.15s/it, loss=0.642][A
 77%|███████▋  | 9744/12691 [3:13:14<56:36,  1.15s/it, loss=0.642][A
 77%|███████▋  | 9745/12691 [3:13:14<56:12,  1.14s/it, loss=0.642][A
 77%|███████▋  | 9745/12691 [3:13:15<56:12,  1.14s/it, loss=0.643][A
 77%|███████▋  | 9746/12691 [3:13:15<55:57,  1.14s/it, loss=0.643][A
 77%|███████▋  | 9746/12691 [3:13:16<55:57,  1.14s/it, loss=0.641][A
 77%|███████▋  | 9747/12691 [3:13:16<56:09,  1.14s/it, loss=0.641][A
 77%|███████▋  | 974

 77%|███████▋  | 9799/12691 [3:14:17<57:33,  1.19s/it, loss=0.639][A
 77%|███████▋  | 9799/12691 [3:14:18<57:33,  1.19s/it, loss=0.642][A
 77%|███████▋  | 9800/12691 [3:14:18<56:48,  1.18s/it, loss=0.642][A
 77%|███████▋  | 9800/12691 [3:14:19<56:48,  1.18s/it, loss=0.643][A
 77%|███████▋  | 9801/12691 [3:14:19<56:44,  1.18s/it, loss=0.643][A
 77%|███████▋  | 9801/12691 [3:14:20<56:44,  1.18s/it, loss=0.641][A
 77%|███████▋  | 9802/12691 [3:14:20<56:07,  1.17s/it, loss=0.641][A
 77%|███████▋  | 9802/12691 [3:14:21<56:07,  1.17s/it, loss=0.641][A
 77%|███████▋  | 9803/12691 [3:14:21<55:38,  1.16s/it, loss=0.641][A
 77%|███████▋  | 9803/12691 [3:14:23<55:38,  1.16s/it, loss=0.641][A
 77%|███████▋  | 9804/12691 [3:14:23<55:57,  1.16s/it, loss=0.641][A
 77%|███████▋  | 9804/12691 [3:14:24<55:57,  1.16s/it, loss=0.642][A
 77%|███████▋  | 9805/12691 [3:14:24<56:55,  1.18s/it, loss=0.642][A
 77%|███████▋  | 9805/12691 [3:14:25<56:55,  1.18s/it, loss=0.641][A
 77%|███████▋  | 980

 78%|███████▊  | 9857/12691 [3:15:26<54:01,  1.14s/it, loss=0.659][A
 78%|███████▊  | 9858/12691 [3:15:26<53:36,  1.14s/it, loss=0.659][A
 78%|███████▊  | 9858/12691 [3:15:27<53:36,  1.14s/it, loss=0.658][A
 78%|███████▊  | 9859/12691 [3:15:27<54:05,  1.15s/it, loss=0.658][A
 78%|███████▊  | 9859/12691 [3:15:28<54:05,  1.15s/it, loss=0.654][A
 78%|███████▊  | 9860/12691 [3:15:28<55:07,  1.17s/it, loss=0.654][A
 78%|███████▊  | 9860/12691 [3:15:29<55:07,  1.17s/it, loss=0.653][A
 78%|███████▊  | 9861/12691 [3:15:29<56:21,  1.19s/it, loss=0.653][A
 78%|███████▊  | 9861/12691 [3:15:30<56:21,  1.19s/it, loss=0.655][A
 78%|███████▊  | 9862/12691 [3:15:30<55:49,  1.18s/it, loss=0.655][A
 78%|███████▊  | 9862/12691 [3:15:32<55:49,  1.18s/it, loss=0.655][A
 78%|███████▊  | 9863/12691 [3:15:32<56:14,  1.19s/it, loss=0.655][A
 78%|███████▊  | 9863/12691 [3:15:33<56:14,  1.19s/it, loss=0.656][A
 78%|███████▊  | 9864/12691 [3:15:33<57:09,  1.21s/it, loss=0.656][A
 78%|███████▊  | 986

 78%|███████▊  | 9916/12691 [3:16:34<55:13,  1.19s/it, loss=0.64][A
 78%|███████▊  | 9916/12691 [3:16:35<55:13,  1.19s/it, loss=0.637][A
 78%|███████▊  | 9917/12691 [3:16:35<54:34,  1.18s/it, loss=0.637][A
 78%|███████▊  | 9917/12691 [3:16:37<54:34,  1.18s/it, loss=0.635][A
 78%|███████▊  | 9918/12691 [3:16:37<54:10,  1.17s/it, loss=0.635][A
 78%|███████▊  | 9918/12691 [3:16:38<54:10,  1.17s/it, loss=0.639][A
 78%|███████▊  | 9919/12691 [3:16:38<53:51,  1.17s/it, loss=0.639][A
 78%|███████▊  | 9919/12691 [3:16:39<53:51,  1.17s/it, loss=0.636][A
 78%|███████▊  | 9920/12691 [3:16:39<54:22,  1.18s/it, loss=0.636][A
 78%|███████▊  | 9920/12691 [3:16:40<54:22,  1.18s/it, loss=0.636][A
 78%|███████▊  | 9921/12691 [3:16:40<53:55,  1.17s/it, loss=0.636][A
 78%|███████▊  | 9921/12691 [3:16:41<53:55,  1.17s/it, loss=0.636][A
 78%|███████▊  | 9922/12691 [3:16:41<55:37,  1.21s/it, loss=0.636][A
 78%|███████▊  | 9922/12691 [3:16:42<55:37,  1.21s/it, loss=0.635][A
 78%|███████▊  | 9923

 79%|███████▊  | 9974/12691 [3:17:46<54:30,  1.20s/it, loss=0.642][A
 79%|███████▊  | 9975/12691 [3:17:46<54:37,  1.21s/it, loss=0.642][A
 79%|███████▊  | 9975/12691 [3:17:47<54:37,  1.21s/it, loss=0.644][A
 79%|███████▊  | 9976/12691 [3:17:47<53:40,  1.19s/it, loss=0.644][A
 79%|███████▊  | 9976/12691 [3:17:48<53:40,  1.19s/it, loss=0.646][A
 79%|███████▊  | 9977/12691 [3:17:48<53:25,  1.18s/it, loss=0.646][A
 79%|███████▊  | 9977/12691 [3:17:49<53:25,  1.18s/it, loss=0.648][A
 79%|███████▊  | 9978/12691 [3:17:49<55:18,  1.22s/it, loss=0.648][A
 79%|███████▊  | 9978/12691 [3:17:50<55:18,  1.22s/it, loss=0.65] [A
 79%|███████▊  | 9979/12691 [3:17:50<54:21,  1.20s/it, loss=0.65][A
 79%|███████▊  | 9979/12691 [3:17:52<54:21,  1.20s/it, loss=0.65][A
 79%|███████▊  | 9980/12691 [3:17:52<53:58,  1.19s/it, loss=0.65][A
 79%|███████▊  | 9980/12691 [3:17:53<53:58,  1.19s/it, loss=0.65][A
 79%|███████▊  | 9981/12691 [3:17:53<53:35,  1.19s/it, loss=0.65][A
 79%|███████▊  | 9981/126

saving model checkpoint at iteration=10000



 79%|███████▉  | 10001/12691 [3:18:18<1:06:25,  1.48s/it, loss=0.652][A
 79%|███████▉  | 10001/12691 [3:18:19<1:06:25,  1.48s/it, loss=0.65] [A
 79%|███████▉  | 10002/12691 [3:18:19<1:05:06,  1.45s/it, loss=0.65][A
 79%|███████▉  | 10002/12691 [3:18:20<1:05:06,  1.45s/it, loss=0.651][A
 79%|███████▉  | 10003/12691 [3:18:20<1:01:16,  1.37s/it, loss=0.651][A
 79%|███████▉  | 10003/12691 [3:18:22<1:01:16,  1.37s/it, loss=0.653][A
 79%|███████▉  | 10004/12691 [3:18:22<58:20,  1.30s/it, loss=0.653]  [A
 79%|███████▉  | 10004/12691 [3:18:23<58:20,  1.30s/it, loss=0.653][A
 79%|███████▉  | 10005/12691 [3:18:23<56:29,  1.26s/it, loss=0.653][A
 79%|███████▉  | 10005/12691 [3:18:24<56:29,  1.26s/it, loss=0.654][A
 79%|███████▉  | 10006/12691 [3:18:24<55:58,  1.25s/it, loss=0.654][A
 79%|███████▉  | 10006/12691 [3:18:25<55:58,  1.25s/it, loss=0.651][A
 79%|███████▉  | 10007/12691 [3:18:25<56:15,  1.26s/it, loss=0.651][A
 79%|███████▉  | 10007/12691 [3:18:27<56:15,  1.26s/it, loss=0.

 79%|███████▉  | 10058/12691 [3:19:30<54:39,  1.25s/it, loss=0.654][A
 79%|███████▉  | 10059/12691 [3:19:30<53:39,  1.22s/it, loss=0.654][A
 79%|███████▉  | 10059/12691 [3:19:31<53:39,  1.22s/it, loss=0.653][A
 79%|███████▉  | 10060/12691 [3:19:31<52:45,  1.20s/it, loss=0.653][A
 79%|███████▉  | 10060/12691 [3:19:33<52:45,  1.20s/it, loss=0.655][A
 79%|███████▉  | 10061/12691 [3:19:33<53:09,  1.21s/it, loss=0.655][A
 79%|███████▉  | 10061/12691 [3:19:34<53:09,  1.21s/it, loss=0.652][A
 79%|███████▉  | 10062/12691 [3:19:34<52:33,  1.20s/it, loss=0.652][A
 79%|███████▉  | 10062/12691 [3:19:35<52:33,  1.20s/it, loss=0.65] [A
 79%|███████▉  | 10063/12691 [3:19:35<52:31,  1.20s/it, loss=0.65][A
 79%|███████▉  | 10063/12691 [3:19:36<52:31,  1.20s/it, loss=0.65][A
 79%|███████▉  | 10064/12691 [3:19:36<55:29,  1.27s/it, loss=0.65][A
 79%|███████▉  | 10064/12691 [3:19:38<55:29,  1.27s/it, loss=0.65][A
 79%|███████▉  | 10065/12691 [3:19:38<54:17,  1.24s/it, loss=0.65][A
 79%|██████

 80%|███████▉  | 10116/12691 [3:20:39<51:53,  1.21s/it, loss=0.658][A
 80%|███████▉  | 10116/12691 [3:20:40<51:53,  1.21s/it, loss=0.657][A
 80%|███████▉  | 10117/12691 [3:20:40<50:57,  1.19s/it, loss=0.657][A
 80%|███████▉  | 10117/12691 [3:20:41<50:57,  1.19s/it, loss=0.658][A
 80%|███████▉  | 10118/12691 [3:20:41<50:11,  1.17s/it, loss=0.658][A
 80%|███████▉  | 10118/12691 [3:20:42<50:11,  1.17s/it, loss=0.657][A
 80%|███████▉  | 10119/12691 [3:20:42<49:54,  1.16s/it, loss=0.657][A
 80%|███████▉  | 10119/12691 [3:20:44<49:54,  1.16s/it, loss=0.654][A
 80%|███████▉  | 10120/12691 [3:20:44<50:54,  1.19s/it, loss=0.654][A
 80%|███████▉  | 10120/12691 [3:20:45<50:54,  1.19s/it, loss=0.657][A
 80%|███████▉  | 10121/12691 [3:20:45<50:57,  1.19s/it, loss=0.657][A
 80%|███████▉  | 10121/12691 [3:20:46<50:57,  1.19s/it, loss=0.659][A
 80%|███████▉  | 10122/12691 [3:20:46<50:34,  1.18s/it, loss=0.659][A
 80%|███████▉  | 10122/12691 [3:20:47<50:34,  1.18s/it, loss=0.658][A
 80%|█

 80%|████████  | 10173/12691 [3:21:49<48:29,  1.16s/it, loss=0.653][A
 80%|████████  | 10174/12691 [3:21:49<48:13,  1.15s/it, loss=0.653][A
 80%|████████  | 10174/12691 [3:21:50<48:13,  1.15s/it, loss=0.655][A
 80%|████████  | 10175/12691 [3:21:50<49:32,  1.18s/it, loss=0.655][A
 80%|████████  | 10175/12691 [3:21:51<49:32,  1.18s/it, loss=0.656][A
 80%|████████  | 10176/12691 [3:21:51<49:01,  1.17s/it, loss=0.656][A
 80%|████████  | 10176/12691 [3:21:53<49:01,  1.17s/it, loss=0.656][A
 80%|████████  | 10177/12691 [3:21:53<48:35,  1.16s/it, loss=0.656][A
 80%|████████  | 10177/12691 [3:21:54<48:35,  1.16s/it, loss=0.657][A
 80%|████████  | 10178/12691 [3:21:54<49:10,  1.17s/it, loss=0.657][A
 80%|████████  | 10178/12691 [3:21:55<49:10,  1.17s/it, loss=0.655][A
 80%|████████  | 10179/12691 [3:21:55<48:56,  1.17s/it, loss=0.655][A
 80%|████████  | 10179/12691 [3:21:56<48:56,  1.17s/it, loss=0.655][A
 80%|████████  | 10180/12691 [3:21:56<48:40,  1.16s/it, loss=0.655][A
 80%|█

 81%|████████  | 10231/12691 [3:22:58<51:47,  1.26s/it, loss=0.65][A
 81%|████████  | 10231/12691 [3:22:59<51:47,  1.26s/it, loss=0.65][A
 81%|████████  | 10232/12691 [3:22:59<51:39,  1.26s/it, loss=0.65][A
 81%|████████  | 10232/12691 [3:23:01<51:39,  1.26s/it, loss=0.647][A
 81%|████████  | 10233/12691 [3:23:01<50:30,  1.23s/it, loss=0.647][A
 81%|████████  | 10233/12691 [3:23:02<50:30,  1.23s/it, loss=0.647][A
 81%|████████  | 10234/12691 [3:23:02<50:10,  1.23s/it, loss=0.647][A
 81%|████████  | 10234/12691 [3:23:03<50:10,  1.23s/it, loss=0.645][A
 81%|████████  | 10235/12691 [3:23:03<49:44,  1.22s/it, loss=0.645][A
 81%|████████  | 10235/12691 [3:23:04<49:44,  1.22s/it, loss=0.645][A
 81%|████████  | 10236/12691 [3:23:04<50:13,  1.23s/it, loss=0.645][A
 81%|████████  | 10236/12691 [3:23:06<50:13,  1.23s/it, loss=0.649][A
 81%|████████  | 10237/12691 [3:23:06<51:32,  1.26s/it, loss=0.649][A
 81%|████████  | 10237/12691 [3:23:07<51:32,  1.26s/it, loss=0.648][A
 81%|████

 81%|████████  | 10288/12691 [3:24:08<48:14,  1.20s/it, loss=0.658][A
 81%|████████  | 10289/12691 [3:24:08<48:58,  1.22s/it, loss=0.658][A
 81%|████████  | 10289/12691 [3:24:09<48:58,  1.22s/it, loss=0.658][A
 81%|████████  | 10290/12691 [3:24:09<49:12,  1.23s/it, loss=0.658][A
 81%|████████  | 10290/12691 [3:24:11<49:12,  1.23s/it, loss=0.657][A
 81%|████████  | 10291/12691 [3:24:11<49:00,  1.23s/it, loss=0.657][A
 81%|████████  | 10291/12691 [3:24:12<49:00,  1.23s/it, loss=0.658][A
 81%|████████  | 10292/12691 [3:24:12<48:34,  1.21s/it, loss=0.658][A
 81%|████████  | 10292/12691 [3:24:13<48:34,  1.21s/it, loss=0.658][A
 81%|████████  | 10293/12691 [3:24:13<50:12,  1.26s/it, loss=0.658][A
 81%|████████  | 10293/12691 [3:24:14<50:12,  1.26s/it, loss=0.655][A
 81%|████████  | 10294/12691 [3:24:14<49:28,  1.24s/it, loss=0.655][A
 81%|████████  | 10294/12691 [3:24:15<49:28,  1.24s/it, loss=0.657][A
 81%|████████  | 10295/12691 [3:24:15<48:43,  1.22s/it, loss=0.657][A
 81%|█

 82%|████████▏ | 10346/12691 [3:25:16<45:26,  1.16s/it, loss=0.639][A
 82%|████████▏ | 10346/12691 [3:25:17<45:26,  1.16s/it, loss=0.641][A
 82%|████████▏ | 10347/12691 [3:25:17<45:35,  1.17s/it, loss=0.641][A
 82%|████████▏ | 10347/12691 [3:25:19<45:35,  1.17s/it, loss=0.64] [A
 82%|████████▏ | 10348/12691 [3:25:19<46:21,  1.19s/it, loss=0.64][A
 82%|████████▏ | 10348/12691 [3:25:20<46:21,  1.19s/it, loss=0.639][A
 82%|████████▏ | 10349/12691 [3:25:20<47:18,  1.21s/it, loss=0.639][A
 82%|████████▏ | 10349/12691 [3:25:21<47:18,  1.21s/it, loss=0.637][A
 82%|████████▏ | 10350/12691 [3:25:21<46:19,  1.19s/it, loss=0.637][A
 82%|████████▏ | 10350/12691 [3:25:22<46:19,  1.19s/it, loss=0.635][A
 82%|████████▏ | 10351/12691 [3:25:22<45:30,  1.17s/it, loss=0.635][A
 82%|████████▏ | 10351/12691 [3:25:23<45:30,  1.17s/it, loss=0.638][A
 82%|████████▏ | 10352/12691 [3:25:23<45:19,  1.16s/it, loss=0.638][A
 82%|████████▏ | 10352/12691 [3:25:24<45:19,  1.16s/it, loss=0.637][A
 82%|██

 82%|████████▏ | 10403/12691 [3:26:26<44:59,  1.18s/it, loss=0.63][A
 82%|████████▏ | 10404/12691 [3:26:26<46:27,  1.22s/it, loss=0.63][A
 82%|████████▏ | 10404/12691 [3:26:27<46:27,  1.22s/it, loss=0.631][A
 82%|████████▏ | 10405/12691 [3:26:27<45:41,  1.20s/it, loss=0.631][A
 82%|████████▏ | 10405/12691 [3:26:28<45:41,  1.20s/it, loss=0.633][A
 82%|████████▏ | 10406/12691 [3:26:28<45:29,  1.19s/it, loss=0.633][A
 82%|████████▏ | 10406/12691 [3:26:29<45:29,  1.19s/it, loss=0.632][A
 82%|████████▏ | 10407/12691 [3:26:29<45:13,  1.19s/it, loss=0.632][A
 82%|████████▏ | 10407/12691 [3:26:30<45:13,  1.19s/it, loss=0.632][A
 82%|████████▏ | 10408/12691 [3:26:30<45:56,  1.21s/it, loss=0.632][A
 82%|████████▏ | 10408/12691 [3:26:32<45:56,  1.21s/it, loss=0.634][A
 82%|████████▏ | 10409/12691 [3:26:32<45:22,  1.19s/it, loss=0.634][A
 82%|████████▏ | 10409/12691 [3:26:33<45:22,  1.19s/it, loss=0.633][A
 82%|████████▏ | 10410/12691 [3:26:33<46:45,  1.23s/it, loss=0.633][A
 82%|███

 82%|████████▏ | 10461/12691 [3:27:34<42:39,  1.15s/it, loss=0.633][A
 82%|████████▏ | 10461/12691 [3:27:35<42:39,  1.15s/it, loss=0.63] [A
 82%|████████▏ | 10462/12691 [3:27:35<42:47,  1.15s/it, loss=0.63][A
 82%|████████▏ | 10462/12691 [3:27:36<42:47,  1.15s/it, loss=0.631][A
 82%|████████▏ | 10463/12691 [3:27:36<43:44,  1.18s/it, loss=0.631][A
 82%|████████▏ | 10463/12691 [3:27:38<43:44,  1.18s/it, loss=0.632][A
 82%|████████▏ | 10464/12691 [3:27:38<43:45,  1.18s/it, loss=0.632][A
 82%|████████▏ | 10464/12691 [3:27:39<43:45,  1.18s/it, loss=0.634][A
 82%|████████▏ | 10465/12691 [3:27:39<43:15,  1.17s/it, loss=0.634][A
 82%|████████▏ | 10465/12691 [3:27:40<43:15,  1.17s/it, loss=0.634][A
 82%|████████▏ | 10466/12691 [3:27:40<43:54,  1.18s/it, loss=0.634][A
 82%|████████▏ | 10466/12691 [3:27:41<43:54,  1.18s/it, loss=0.635][A
 82%|████████▏ | 10467/12691 [3:27:41<43:30,  1.17s/it, loss=0.635][A
 82%|████████▏ | 10467/12691 [3:27:42<43:30,  1.17s/it, loss=0.638][A
 82%|██

 83%|████████▎ | 10518/12691 [3:28:44<43:55,  1.21s/it, loss=0.654][A
 83%|████████▎ | 10519/12691 [3:28:44<43:22,  1.20s/it, loss=0.654][A
 83%|████████▎ | 10519/12691 [3:28:45<43:22,  1.20s/it, loss=0.652][A
 83%|████████▎ | 10520/12691 [3:28:45<42:40,  1.18s/it, loss=0.652][A
 83%|████████▎ | 10520/12691 [3:28:46<42:40,  1.18s/it, loss=0.653][A
 83%|████████▎ | 10521/12691 [3:28:46<41:52,  1.16s/it, loss=0.653][A
 83%|████████▎ | 10521/12691 [3:28:47<41:52,  1.16s/it, loss=0.654][A
 83%|████████▎ | 10522/12691 [3:28:47<43:13,  1.20s/it, loss=0.654][A
 83%|████████▎ | 10522/12691 [3:28:49<43:13,  1.20s/it, loss=0.658][A
 83%|████████▎ | 10523/12691 [3:28:49<43:11,  1.20s/it, loss=0.658][A
 83%|████████▎ | 10523/12691 [3:28:50<43:11,  1.20s/it, loss=0.657][A
 83%|████████▎ | 10524/12691 [3:28:50<42:25,  1.17s/it, loss=0.657][A
 83%|████████▎ | 10524/12691 [3:28:51<42:25,  1.17s/it, loss=0.656][A
 83%|████████▎ | 10525/12691 [3:28:51<42:25,  1.18s/it, loss=0.656][A
 83%|█

 83%|████████▎ | 10576/12691 [3:29:51<40:38,  1.15s/it, loss=0.649][A
 83%|████████▎ | 10576/12691 [3:29:52<40:38,  1.15s/it, loss=0.648][A
 83%|████████▎ | 10577/12691 [3:29:52<42:04,  1.19s/it, loss=0.648][A
 83%|████████▎ | 10577/12691 [3:29:53<42:04,  1.19s/it, loss=0.65] [A
 83%|████████▎ | 10578/12691 [3:29:53<41:44,  1.19s/it, loss=0.65][A
 83%|████████▎ | 10578/12691 [3:29:54<41:44,  1.19s/it, loss=0.648][A
 83%|████████▎ | 10579/12691 [3:29:54<41:13,  1.17s/it, loss=0.648][A
 83%|████████▎ | 10579/12691 [3:29:56<41:13,  1.17s/it, loss=0.646][A
 83%|████████▎ | 10580/12691 [3:29:56<40:58,  1.16s/it, loss=0.646][A
 83%|████████▎ | 10580/12691 [3:29:57<40:58,  1.16s/it, loss=0.647][A
 83%|████████▎ | 10581/12691 [3:29:57<40:58,  1.17s/it, loss=0.647][A
 83%|████████▎ | 10581/12691 [3:29:58<40:58,  1.17s/it, loss=0.648][A
 83%|████████▎ | 10582/12691 [3:29:58<41:00,  1.17s/it, loss=0.648][A
 83%|████████▎ | 10582/12691 [3:29:59<41:00,  1.17s/it, loss=0.649][A
 83%|██

 84%|████████▍ | 10633/12691 [3:31:00<43:33,  1.27s/it, loss=0.658][A
 84%|████████▍ | 10634/12691 [3:31:00<43:15,  1.26s/it, loss=0.658][A
 84%|████████▍ | 10634/12691 [3:31:01<43:15,  1.26s/it, loss=0.657][A
 84%|████████▍ | 10635/12691 [3:31:01<43:02,  1.26s/it, loss=0.657][A
 84%|████████▍ | 10635/12691 [3:31:02<43:02,  1.26s/it, loss=0.658][A
 84%|████████▍ | 10636/12691 [3:31:02<42:57,  1.25s/it, loss=0.658][A
 84%|████████▍ | 10636/12691 [3:31:04<42:57,  1.25s/it, loss=0.66] [A
 84%|████████▍ | 10637/12691 [3:31:04<42:37,  1.25s/it, loss=0.66][A
 84%|████████▍ | 10637/12691 [3:31:05<42:37,  1.25s/it, loss=0.659][A
 84%|████████▍ | 10638/12691 [3:31:05<42:34,  1.24s/it, loss=0.659][A
 84%|████████▍ | 10638/12691 [3:31:06<42:34,  1.24s/it, loss=0.66] [A
 84%|████████▍ | 10639/12691 [3:31:06<43:47,  1.28s/it, loss=0.66][A
 84%|████████▍ | 10639/12691 [3:31:07<43:47,  1.28s/it, loss=0.657][A
 84%|████████▍ | 10640/12691 [3:31:07<42:21,  1.24s/it, loss=0.657][A
 84%|███

 84%|████████▍ | 10691/12691 [3:32:09<40:56,  1.23s/it, loss=0.654][A
 84%|████████▍ | 10691/12691 [3:32:11<40:56,  1.23s/it, loss=0.65] [A
 84%|████████▍ | 10692/12691 [3:32:11<40:48,  1.23s/it, loss=0.65][A
 84%|████████▍ | 10692/12691 [3:32:12<40:48,  1.23s/it, loss=0.65][A
 84%|████████▍ | 10693/12691 [3:32:12<40:51,  1.23s/it, loss=0.65][A
 84%|████████▍ | 10693/12691 [3:32:13<40:51,  1.23s/it, loss=0.651][A
 84%|████████▍ | 10694/12691 [3:32:13<41:09,  1.24s/it, loss=0.651][A
 84%|████████▍ | 10694/12691 [3:32:14<41:09,  1.24s/it, loss=0.652][A
 84%|████████▍ | 10695/12691 [3:32:14<40:32,  1.22s/it, loss=0.652][A
 84%|████████▍ | 10695/12691 [3:32:15<40:32,  1.22s/it, loss=0.652][A
 84%|████████▍ | 10696/12691 [3:32:15<40:46,  1.23s/it, loss=0.652][A
 84%|████████▍ | 10696/12691 [3:32:17<40:46,  1.23s/it, loss=0.652][A
 84%|████████▍ | 10697/12691 [3:32:17<41:02,  1.24s/it, loss=0.652][A
 84%|████████▍ | 10697/12691 [3:32:18<41:02,  1.24s/it, loss=0.653][A
 84%|████

 85%|████████▍ | 10748/12691 [3:33:17<37:07,  1.15s/it, loss=0.662][A
 85%|████████▍ | 10749/12691 [3:33:17<36:50,  1.14s/it, loss=0.662][A
 85%|████████▍ | 10749/12691 [3:33:18<36:50,  1.14s/it, loss=0.662][A
 85%|████████▍ | 10750/12691 [3:33:18<37:31,  1.16s/it, loss=0.662][A
 85%|████████▍ | 10750/12691 [3:33:19<37:31,  1.16s/it, loss=0.659][A
 85%|████████▍ | 10751/12691 [3:33:19<38:06,  1.18s/it, loss=0.659][A
 85%|████████▍ | 10751/12691 [3:33:21<38:06,  1.18s/it, loss=0.658][A
 85%|████████▍ | 10752/12691 [3:33:21<37:36,  1.16s/it, loss=0.658][A
 85%|████████▍ | 10752/12691 [3:33:22<37:36,  1.16s/it, loss=0.658][A
 85%|████████▍ | 10753/12691 [3:33:22<37:14,  1.15s/it, loss=0.658][A
 85%|████████▍ | 10753/12691 [3:33:23<37:14,  1.15s/it, loss=0.656][A
 85%|████████▍ | 10754/12691 [3:33:23<37:11,  1.15s/it, loss=0.656][A
 85%|████████▍ | 10754/12691 [3:33:24<37:11,  1.15s/it, loss=0.658][A
 85%|████████▍ | 10755/12691 [3:33:24<37:23,  1.16s/it, loss=0.658][A
 85%|█

 85%|████████▌ | 10806/12691 [3:34:24<37:25,  1.19s/it, loss=0.659][A
 85%|████████▌ | 10806/12691 [3:34:25<37:25,  1.19s/it, loss=0.662][A
 85%|████████▌ | 10807/12691 [3:34:26<37:14,  1.19s/it, loss=0.662][A
 85%|████████▌ | 10807/12691 [3:34:27<37:14,  1.19s/it, loss=0.663][A
 85%|████████▌ | 10808/12691 [3:34:27<36:46,  1.17s/it, loss=0.663][A
 85%|████████▌ | 10808/12691 [3:34:28<36:46,  1.17s/it, loss=0.664][A
 85%|████████▌ | 10809/12691 [3:34:28<35:55,  1.15s/it, loss=0.664][A
 85%|████████▌ | 10809/12691 [3:34:29<35:55,  1.15s/it, loss=0.664][A
 85%|████████▌ | 10810/12691 [3:34:29<36:20,  1.16s/it, loss=0.664][A
 85%|████████▌ | 10810/12691 [3:34:30<36:20,  1.16s/it, loss=0.664][A
 85%|████████▌ | 10811/12691 [3:34:30<35:56,  1.15s/it, loss=0.664][A
 85%|████████▌ | 10811/12691 [3:34:31<35:56,  1.15s/it, loss=0.665][A
 85%|████████▌ | 10812/12691 [3:34:31<37:50,  1.21s/it, loss=0.665][A
 85%|████████▌ | 10812/12691 [3:34:33<37:50,  1.21s/it, loss=0.668][A
 85%|█

 86%|████████▌ | 10863/12691 [3:35:32<35:37,  1.17s/it, loss=0.652][A
 86%|████████▌ | 10864/12691 [3:35:32<35:44,  1.17s/it, loss=0.652][A
 86%|████████▌ | 10864/12691 [3:35:33<35:44,  1.17s/it, loss=0.651][A
 86%|████████▌ | 10865/12691 [3:35:33<35:16,  1.16s/it, loss=0.651][A
 86%|████████▌ | 10865/12691 [3:35:34<35:16,  1.16s/it, loss=0.647][A
 86%|████████▌ | 10866/12691 [3:35:34<35:05,  1.15s/it, loss=0.647][A
 86%|████████▌ | 10866/12691 [3:35:35<35:05,  1.15s/it, loss=0.649][A
 86%|████████▌ | 10867/12691 [3:35:35<35:06,  1.16s/it, loss=0.649][A
 86%|████████▌ | 10867/12691 [3:35:37<35:06,  1.16s/it, loss=0.647][A
 86%|████████▌ | 10868/12691 [3:35:37<36:02,  1.19s/it, loss=0.647][A
 86%|████████▌ | 10868/12691 [3:35:38<36:02,  1.19s/it, loss=0.647][A
 86%|████████▌ | 10869/12691 [3:35:38<35:39,  1.17s/it, loss=0.647][A
 86%|████████▌ | 10869/12691 [3:35:39<35:39,  1.17s/it, loss=0.646][A
 86%|████████▌ | 10870/12691 [3:35:39<35:23,  1.17s/it, loss=0.646][A
 86%|█

 86%|████████▌ | 10921/12691 [3:36:39<34:48,  1.18s/it, loss=0.647][A
 86%|████████▌ | 10921/12691 [3:36:40<34:48,  1.18s/it, loss=0.647][A
 86%|████████▌ | 10922/12691 [3:36:40<34:42,  1.18s/it, loss=0.647][A
 86%|████████▌ | 10922/12691 [3:36:42<34:42,  1.18s/it, loss=0.649][A
 86%|████████▌ | 10923/12691 [3:36:42<35:05,  1.19s/it, loss=0.649][A
 86%|████████▌ | 10923/12691 [3:36:43<35:05,  1.19s/it, loss=0.65] [A
 86%|████████▌ | 10924/12691 [3:36:43<34:40,  1.18s/it, loss=0.65][A
 86%|████████▌ | 10924/12691 [3:36:44<34:40,  1.18s/it, loss=0.649][A
 86%|████████▌ | 10925/12691 [3:36:44<34:34,  1.17s/it, loss=0.649][A
 86%|████████▌ | 10925/12691 [3:36:45<34:34,  1.17s/it, loss=0.656][A
 86%|████████▌ | 10926/12691 [3:36:45<34:19,  1.17s/it, loss=0.656][A
 86%|████████▌ | 10926/12691 [3:36:46<34:19,  1.17s/it, loss=0.656][A
 86%|████████▌ | 10927/12691 [3:36:46<34:59,  1.19s/it, loss=0.656][A
 86%|████████▌ | 10927/12691 [3:36:48<34:59,  1.19s/it, loss=0.658][A
 86%|██

 87%|████████▋ | 10978/12691 [3:37:48<32:46,  1.15s/it, loss=0.649][A
 87%|████████▋ | 10979/12691 [3:37:48<33:43,  1.18s/it, loss=0.649][A
 87%|████████▋ | 10979/12691 [3:37:50<33:43,  1.18s/it, loss=0.649][A
 87%|████████▋ | 10980/12691 [3:37:50<33:54,  1.19s/it, loss=0.649][A
 87%|████████▋ | 10980/12691 [3:37:51<33:54,  1.19s/it, loss=0.649][A
 87%|████████▋ | 10981/12691 [3:37:51<34:16,  1.20s/it, loss=0.649][A
 87%|████████▋ | 10981/12691 [3:37:52<34:16,  1.20s/it, loss=0.65] [A
 87%|████████▋ | 10982/12691 [3:37:52<33:37,  1.18s/it, loss=0.65][A
 87%|████████▋ | 10982/12691 [3:37:53<33:37,  1.18s/it, loss=0.651][A
 87%|████████▋ | 10983/12691 [3:37:53<33:14,  1.17s/it, loss=0.651][A
 87%|████████▋ | 10983/12691 [3:37:54<33:14,  1.17s/it, loss=0.652][A
 87%|████████▋ | 10984/12691 [3:37:54<33:16,  1.17s/it, loss=0.652][A
 87%|████████▋ | 10984/12691 [3:37:56<33:16,  1.17s/it, loss=0.652][A
 87%|████████▋ | 10985/12691 [3:37:56<33:52,  1.19s/it, loss=0.652][A
 87%|██

saving model checkpoint at iteration=11000



 87%|████████▋ | 11001/12691 [3:38:16<40:55,  1.45s/it, loss=0.66][A
 87%|████████▋ | 11001/12691 [3:38:17<40:55,  1.45s/it, loss=0.658][A
 87%|████████▋ | 11002/12691 [3:38:17<38:55,  1.38s/it, loss=0.658][A
 87%|████████▋ | 11002/12691 [3:38:18<38:55,  1.38s/it, loss=0.656][A
 87%|████████▋ | 11003/12691 [3:38:18<37:40,  1.34s/it, loss=0.656][A
 87%|████████▋ | 11003/12691 [3:38:19<37:40,  1.34s/it, loss=0.654][A
 87%|████████▋ | 11004/12691 [3:38:19<35:55,  1.28s/it, loss=0.654][A
 87%|████████▋ | 11004/12691 [3:38:20<35:55,  1.28s/it, loss=0.653][A
 87%|████████▋ | 11005/12691 [3:38:20<34:49,  1.24s/it, loss=0.653][A
 87%|████████▋ | 11005/12691 [3:38:21<34:49,  1.24s/it, loss=0.651][A
 87%|████████▋ | 11006/12691 [3:38:21<33:51,  1.21s/it, loss=0.651][A
 87%|████████▋ | 11006/12691 [3:38:23<33:51,  1.21s/it, loss=0.65] [A
 87%|████████▋ | 11007/12691 [3:38:23<33:22,  1.19s/it, loss=0.65][A
 87%|████████▋ | 11007/12691 [3:38:24<33:22,  1.19s/it, loss=0.656][A
 87%|██

 87%|████████▋ | 11058/12691 [3:39:23<31:28,  1.16s/it, loss=0.658][A
 87%|████████▋ | 11059/12691 [3:39:23<32:06,  1.18s/it, loss=0.658][A
 87%|████████▋ | 11059/12691 [3:39:24<32:06,  1.18s/it, loss=0.657][A
 87%|████████▋ | 11060/12691 [3:39:24<31:38,  1.16s/it, loss=0.657][A
 87%|████████▋ | 11060/12691 [3:39:26<31:38,  1.16s/it, loss=0.657][A
 87%|████████▋ | 11061/12691 [3:39:26<31:30,  1.16s/it, loss=0.657][A
 87%|████████▋ | 11061/12691 [3:39:27<31:30,  1.16s/it, loss=0.657][A
 87%|████████▋ | 11062/12691 [3:39:27<31:08,  1.15s/it, loss=0.657][A
 87%|████████▋ | 11062/12691 [3:39:28<31:08,  1.15s/it, loss=0.655][A
 87%|████████▋ | 11063/12691 [3:39:28<31:07,  1.15s/it, loss=0.655][A
 87%|████████▋ | 11063/12691 [3:39:29<31:07,  1.15s/it, loss=0.654][A
 87%|████████▋ | 11064/12691 [3:39:29<31:05,  1.15s/it, loss=0.654][A
 87%|████████▋ | 11064/12691 [3:39:30<31:05,  1.15s/it, loss=0.654][A
 87%|████████▋ | 11065/12691 [3:39:30<32:04,  1.18s/it, loss=0.654][A
 87%|█

 88%|████████▊ | 11116/12691 [3:40:32<32:15,  1.23s/it, loss=0.662][A
 88%|████████▊ | 11116/12691 [3:40:33<32:15,  1.23s/it, loss=0.659][A
 88%|████████▊ | 11117/12691 [3:40:33<31:35,  1.20s/it, loss=0.659][A
 88%|████████▊ | 11117/12691 [3:40:34<31:35,  1.20s/it, loss=0.659][A
 88%|████████▊ | 11118/12691 [3:40:34<31:00,  1.18s/it, loss=0.659][A
 88%|████████▊ | 11118/12691 [3:40:36<31:00,  1.18s/it, loss=0.661][A
 88%|████████▊ | 11119/12691 [3:40:36<31:22,  1.20s/it, loss=0.661][A
 88%|████████▊ | 11119/12691 [3:40:37<31:22,  1.20s/it, loss=0.659][A
 88%|████████▊ | 11120/12691 [3:40:37<32:03,  1.22s/it, loss=0.659][A
 88%|████████▊ | 11120/12691 [3:40:38<32:03,  1.22s/it, loss=0.66] [A
 88%|████████▊ | 11121/12691 [3:40:38<31:40,  1.21s/it, loss=0.66][A
 88%|████████▊ | 11121/12691 [3:40:39<31:40,  1.21s/it, loss=0.66][A
 88%|████████▊ | 11122/12691 [3:40:39<31:49,  1.22s/it, loss=0.66][A
 88%|████████▊ | 11122/12691 [3:40:40<31:49,  1.22s/it, loss=0.657][A
 88%|████

 88%|████████▊ | 11173/12691 [3:41:44<31:44,  1.25s/it, loss=0.647][A
 88%|████████▊ | 11174/12691 [3:41:44<31:27,  1.24s/it, loss=0.647][A
 88%|████████▊ | 11174/12691 [3:41:45<31:27,  1.24s/it, loss=0.649][A
 88%|████████▊ | 11175/12691 [3:41:45<31:05,  1.23s/it, loss=0.649][A
 88%|████████▊ | 11175/12691 [3:41:47<31:05,  1.23s/it, loss=0.651][A
 88%|████████▊ | 11176/12691 [3:41:47<32:24,  1.28s/it, loss=0.651][A
 88%|████████▊ | 11176/12691 [3:41:48<32:24,  1.28s/it, loss=0.648][A
 88%|████████▊ | 11177/12691 [3:41:48<32:04,  1.27s/it, loss=0.648][A
 88%|████████▊ | 11177/12691 [3:41:49<32:04,  1.27s/it, loss=0.647][A
 88%|████████▊ | 11178/12691 [3:41:49<31:48,  1.26s/it, loss=0.647][A
 88%|████████▊ | 11178/12691 [3:41:50<31:48,  1.26s/it, loss=0.649][A
 88%|████████▊ | 11179/12691 [3:41:50<31:51,  1.26s/it, loss=0.649][A
 88%|████████▊ | 11179/12691 [3:41:52<31:51,  1.26s/it, loss=0.649][A
 88%|████████▊ | 11180/12691 [3:41:52<31:44,  1.26s/it, loss=0.649][A
 88%|█

 88%|████████▊ | 11231/12691 [3:42:55<30:25,  1.25s/it, loss=0.651][A
 88%|████████▊ | 11231/12691 [3:42:56<30:25,  1.25s/it, loss=0.656][A
 89%|████████▊ | 11232/12691 [3:42:56<29:57,  1.23s/it, loss=0.656][A
 89%|████████▊ | 11232/12691 [3:42:57<29:57,  1.23s/it, loss=0.654][A
 89%|████████▊ | 11233/12691 [3:42:57<29:18,  1.21s/it, loss=0.654][A
 89%|████████▊ | 11233/12691 [3:42:58<29:18,  1.21s/it, loss=0.654][A
 89%|████████▊ | 11234/12691 [3:42:58<29:17,  1.21s/it, loss=0.654][A
 89%|████████▊ | 11234/12691 [3:43:00<29:17,  1.21s/it, loss=0.651][A
 89%|████████▊ | 11235/12691 [3:43:00<28:46,  1.19s/it, loss=0.651][A
 89%|████████▊ | 11235/12691 [3:43:01<28:46,  1.19s/it, loss=0.651][A
 89%|████████▊ | 11236/12691 [3:43:01<29:02,  1.20s/it, loss=0.651][A
 89%|████████▊ | 11236/12691 [3:43:02<29:02,  1.20s/it, loss=0.651][A
 89%|████████▊ | 11237/12691 [3:43:02<29:51,  1.23s/it, loss=0.651][A
 89%|████████▊ | 11237/12691 [3:43:03<29:51,  1.23s/it, loss=0.648][A
 89%|█

 89%|████████▉ | 11288/12691 [3:44:03<26:39,  1.14s/it, loss=0.643][A
 89%|████████▉ | 11289/12691 [3:44:03<26:28,  1.13s/it, loss=0.643][A
 89%|████████▉ | 11289/12691 [3:44:05<26:28,  1.13s/it, loss=0.645][A
 89%|████████▉ | 11290/12691 [3:44:05<26:47,  1.15s/it, loss=0.645][A
 89%|████████▉ | 11290/12691 [3:44:06<26:47,  1.15s/it, loss=0.645][A
 89%|████████▉ | 11291/12691 [3:44:06<26:31,  1.14s/it, loss=0.645][A
 89%|████████▉ | 11291/12691 [3:44:07<26:31,  1.14s/it, loss=0.645][A
 89%|████████▉ | 11292/12691 [3:44:07<26:39,  1.14s/it, loss=0.645][A
 89%|████████▉ | 11292/12691 [3:44:08<26:39,  1.14s/it, loss=0.644][A
 89%|████████▉ | 11293/12691 [3:44:08<27:55,  1.20s/it, loss=0.644][A
 89%|████████▉ | 11293/12691 [3:44:09<27:55,  1.20s/it, loss=0.642][A
 89%|████████▉ | 11294/12691 [3:44:09<27:26,  1.18s/it, loss=0.642][A
 89%|████████▉ | 11294/12691 [3:44:11<27:26,  1.18s/it, loss=0.642][A
 89%|████████▉ | 11295/12691 [3:44:11<27:31,  1.18s/it, loss=0.642][A
 89%|█

 89%|████████▉ | 11346/12691 [3:45:10<26:21,  1.18s/it, loss=0.643][A
 89%|████████▉ | 11346/12691 [3:45:11<26:21,  1.18s/it, loss=0.646][A
 89%|████████▉ | 11347/12691 [3:45:11<26:07,  1.17s/it, loss=0.646][A
 89%|████████▉ | 11347/12691 [3:45:12<26:07,  1.17s/it, loss=0.644][A
 89%|████████▉ | 11348/12691 [3:45:12<27:26,  1.23s/it, loss=0.644][A
 89%|████████▉ | 11348/12691 [3:45:13<27:26,  1.23s/it, loss=0.641][A
 89%|████████▉ | 11349/12691 [3:45:13<26:57,  1.21s/it, loss=0.641][A
 89%|████████▉ | 11349/12691 [3:45:14<26:57,  1.21s/it, loss=0.638][A
 89%|████████▉ | 11350/12691 [3:45:14<26:22,  1.18s/it, loss=0.638][A
 89%|████████▉ | 11350/12691 [3:45:15<26:22,  1.18s/it, loss=0.639][A
 89%|████████▉ | 11351/12691 [3:45:15<26:00,  1.16s/it, loss=0.639][A
 89%|████████▉ | 11351/12691 [3:45:17<26:00,  1.16s/it, loss=0.638][A
 89%|████████▉ | 11352/12691 [3:45:17<25:43,  1.15s/it, loss=0.638][A
 89%|████████▉ | 11352/12691 [3:45:18<25:43,  1.15s/it, loss=0.637][A
 89%|█

 90%|████████▉ | 11403/12691 [3:46:19<25:33,  1.19s/it, loss=0.637][A
 90%|████████▉ | 11404/12691 [3:46:19<26:29,  1.23s/it, loss=0.637][A
 90%|████████▉ | 11404/12691 [3:46:20<26:29,  1.23s/it, loss=0.639][A
 90%|████████▉ | 11405/12691 [3:46:20<26:18,  1.23s/it, loss=0.639][A
 90%|████████▉ | 11405/12691 [3:46:21<26:18,  1.23s/it, loss=0.639][A
 90%|████████▉ | 11406/12691 [3:46:21<25:47,  1.20s/it, loss=0.639][A
 90%|████████▉ | 11406/12691 [3:46:22<25:47,  1.20s/it, loss=0.638][A
 90%|████████▉ | 11407/12691 [3:46:22<26:02,  1.22s/it, loss=0.638][A
 90%|████████▉ | 11407/12691 [3:46:24<26:02,  1.22s/it, loss=0.638][A
 90%|████████▉ | 11408/12691 [3:46:24<26:08,  1.22s/it, loss=0.638][A
 90%|████████▉ | 11408/12691 [3:46:25<26:08,  1.22s/it, loss=0.637][A
 90%|████████▉ | 11409/12691 [3:46:25<25:47,  1.21s/it, loss=0.637][A
 90%|████████▉ | 11409/12691 [3:46:26<25:47,  1.21s/it, loss=0.638][A
 90%|████████▉ | 11410/12691 [3:46:26<26:07,  1.22s/it, loss=0.638][A
 90%|█

 90%|█████████ | 11461/12691 [3:47:28<25:15,  1.23s/it, loss=0.665][A
 90%|█████████ | 11461/12691 [3:47:29<25:15,  1.23s/it, loss=0.664][A
 90%|█████████ | 11462/12691 [3:47:29<25:28,  1.24s/it, loss=0.664][A
 90%|█████████ | 11462/12691 [3:47:30<25:28,  1.24s/it, loss=0.667][A
 90%|█████████ | 11463/12691 [3:47:30<25:19,  1.24s/it, loss=0.667][A
 90%|█████████ | 11463/12691 [3:47:31<25:19,  1.24s/it, loss=0.665][A
 90%|█████████ | 11464/12691 [3:47:31<25:22,  1.24s/it, loss=0.665][A
 90%|█████████ | 11464/12691 [3:47:33<25:22,  1.24s/it, loss=0.665][A
 90%|█████████ | 11465/12691 [3:47:33<26:11,  1.28s/it, loss=0.665][A
 90%|█████████ | 11465/12691 [3:47:34<26:11,  1.28s/it, loss=0.665][A
 90%|█████████ | 11466/12691 [3:47:34<25:52,  1.27s/it, loss=0.665][A
 90%|█████████ | 11466/12691 [3:47:35<25:52,  1.27s/it, loss=0.663][A
 90%|█████████ | 11467/12691 [3:47:35<25:52,  1.27s/it, loss=0.663][A
 90%|█████████ | 11467/12691 [3:47:37<25:52,  1.27s/it, loss=0.663][A
 90%|█

 91%|█████████ | 11518/12691 [3:48:38<22:39,  1.16s/it, loss=0.64][A
 91%|█████████ | 11519/12691 [3:48:38<22:31,  1.15s/it, loss=0.64][A
 91%|█████████ | 11519/12691 [3:48:39<22:31,  1.15s/it, loss=0.641][A
 91%|█████████ | 11520/12691 [3:48:39<22:59,  1.18s/it, loss=0.641][A
 91%|█████████ | 11520/12691 [3:48:41<22:59,  1.18s/it, loss=0.64] [A
 91%|█████████ | 11521/12691 [3:48:41<23:57,  1.23s/it, loss=0.64][A
 91%|█████████ | 11521/12691 [3:48:42<23:57,  1.23s/it, loss=0.638][A
 91%|█████████ | 11522/12691 [3:48:42<23:56,  1.23s/it, loss=0.638][A
 91%|█████████ | 11522/12691 [3:48:43<23:56,  1.23s/it, loss=0.635][A
 91%|█████████ | 11523/12691 [3:48:43<24:03,  1.24s/it, loss=0.635][A
 91%|█████████ | 11523/12691 [3:48:44<24:03,  1.24s/it, loss=0.634][A
 91%|█████████ | 11524/12691 [3:48:44<23:33,  1.21s/it, loss=0.634][A
 91%|█████████ | 11524/12691 [3:48:46<23:33,  1.21s/it, loss=0.634][A
 91%|█████████ | 11525/12691 [3:48:46<23:25,  1.21s/it, loss=0.634][A
 91%|████

 91%|█████████ | 11576/12691 [3:49:48<23:20,  1.26s/it, loss=0.646][A
 91%|█████████ | 11576/12691 [3:49:49<23:20,  1.26s/it, loss=0.644][A
 91%|█████████ | 11577/12691 [3:49:49<22:35,  1.22s/it, loss=0.644][A
 91%|█████████ | 11577/12691 [3:49:50<22:35,  1.22s/it, loss=0.645][A
 91%|█████████ | 11578/12691 [3:49:50<22:31,  1.21s/it, loss=0.645][A
 91%|█████████ | 11578/12691 [3:49:51<22:31,  1.21s/it, loss=0.645][A
 91%|█████████ | 11579/12691 [3:49:51<22:16,  1.20s/it, loss=0.645][A
 91%|█████████ | 11579/12691 [3:49:53<22:16,  1.20s/it, loss=0.644][A
 91%|█████████ | 11580/12691 [3:49:53<22:08,  1.20s/it, loss=0.644][A
 91%|█████████ | 11580/12691 [3:49:54<22:08,  1.20s/it, loss=0.647][A
 91%|█████████▏| 11581/12691 [3:49:54<22:06,  1.19s/it, loss=0.647][A
 91%|█████████▏| 11581/12691 [3:49:55<22:06,  1.19s/it, loss=0.648][A
 91%|█████████▏| 11582/12691 [3:49:55<23:19,  1.26s/it, loss=0.648][A
 91%|█████████▏| 11582/12691 [3:49:56<23:19,  1.26s/it, loss=0.651][A
 91%|█

 92%|█████████▏| 11633/12691 [3:50:58<22:03,  1.25s/it, loss=0.647][A
 92%|█████████▏| 11634/12691 [3:50:58<22:07,  1.26s/it, loss=0.647][A
 92%|█████████▏| 11634/12691 [3:50:59<22:07,  1.26s/it, loss=0.643][A
 92%|█████████▏| 11635/12691 [3:50:59<21:55,  1.25s/it, loss=0.643][A
 92%|█████████▏| 11635/12691 [3:51:00<21:55,  1.25s/it, loss=0.642][A
 92%|█████████▏| 11636/12691 [3:51:00<21:54,  1.25s/it, loss=0.642][A
 92%|█████████▏| 11636/12691 [3:51:01<21:54,  1.25s/it, loss=0.643][A
 92%|█████████▏| 11637/12691 [3:51:01<21:35,  1.23s/it, loss=0.643][A
 92%|█████████▏| 11637/12691 [3:51:02<21:35,  1.23s/it, loss=0.643][A
 92%|█████████▏| 11638/12691 [3:51:02<21:47,  1.24s/it, loss=0.643][A
 92%|█████████▏| 11638/12691 [3:51:04<21:47,  1.24s/it, loss=0.642][A
 92%|█████████▏| 11639/12691 [3:51:04<21:18,  1.21s/it, loss=0.642][A
 92%|█████████▏| 11639/12691 [3:51:05<21:18,  1.21s/it, loss=0.641][A
 92%|█████████▏| 11640/12691 [3:51:05<20:57,  1.20s/it, loss=0.641][A
 92%|█

 92%|█████████▏| 11691/12691 [3:52:05<19:25,  1.17s/it, loss=0.642][A
 92%|█████████▏| 11691/12691 [3:52:06<19:25,  1.17s/it, loss=0.644][A
 92%|█████████▏| 11692/12691 [3:52:06<19:16,  1.16s/it, loss=0.644][A
 92%|█████████▏| 11692/12691 [3:52:07<19:16,  1.16s/it, loss=0.646][A
 92%|█████████▏| 11693/12691 [3:52:07<19:01,  1.14s/it, loss=0.646][A
 92%|█████████▏| 11693/12691 [3:52:08<19:01,  1.14s/it, loss=0.647][A
 92%|█████████▏| 11694/12691 [3:52:08<19:29,  1.17s/it, loss=0.647][A
 92%|█████████▏| 11694/12691 [3:52:09<19:29,  1.17s/it, loss=0.644][A
 92%|█████████▏| 11695/12691 [3:52:09<19:24,  1.17s/it, loss=0.644][A
 92%|█████████▏| 11695/12691 [3:52:10<19:24,  1.17s/it, loss=0.643][A
 92%|█████████▏| 11696/12691 [3:52:10<19:24,  1.17s/it, loss=0.643][A
 92%|█████████▏| 11696/12691 [3:52:12<19:24,  1.17s/it, loss=0.644][A
 92%|█████████▏| 11697/12691 [3:52:12<19:10,  1.16s/it, loss=0.644][A
 92%|█████████▏| 11697/12691 [3:52:13<19:10,  1.16s/it, loss=0.643][A
 92%|█

 93%|█████████▎| 11748/12691 [3:53:13<18:15,  1.16s/it, loss=0.64] [A
 93%|█████████▎| 11749/12691 [3:53:13<18:49,  1.20s/it, loss=0.64][A
 93%|█████████▎| 11749/12691 [3:53:14<18:49,  1.20s/it, loss=0.639][A
 93%|█████████▎| 11750/12691 [3:53:14<19:00,  1.21s/it, loss=0.639][A
 93%|█████████▎| 11750/12691 [3:53:16<19:00,  1.21s/it, loss=0.636][A
 93%|█████████▎| 11751/12691 [3:53:16<19:05,  1.22s/it, loss=0.636][A
 93%|█████████▎| 11751/12691 [3:53:17<19:05,  1.22s/it, loss=0.635][A
 93%|█████████▎| 11752/12691 [3:53:17<19:05,  1.22s/it, loss=0.635][A
 93%|█████████▎| 11752/12691 [3:53:18<19:05,  1.22s/it, loss=0.635][A
 93%|█████████▎| 11753/12691 [3:53:18<19:15,  1.23s/it, loss=0.635][A
 93%|█████████▎| 11753/12691 [3:53:19<19:15,  1.23s/it, loss=0.633][A
 93%|█████████▎| 11754/12691 [3:53:19<19:10,  1.23s/it, loss=0.633][A
 93%|█████████▎| 11754/12691 [3:53:21<19:10,  1.23s/it, loss=0.63] [A
 93%|█████████▎| 11755/12691 [3:53:21<19:43,  1.26s/it, loss=0.63][A
 93%|███

 93%|█████████▎| 11806/12691 [3:54:24<18:50,  1.28s/it, loss=0.642][A
 93%|█████████▎| 11806/12691 [3:54:25<18:50,  1.28s/it, loss=0.643][A
 93%|█████████▎| 11807/12691 [3:54:25<18:17,  1.24s/it, loss=0.643][A
 93%|█████████▎| 11807/12691 [3:54:26<18:17,  1.24s/it, loss=0.649][A
 93%|█████████▎| 11808/12691 [3:54:26<18:24,  1.25s/it, loss=0.649][A
 93%|█████████▎| 11808/12691 [3:54:28<18:24,  1.25s/it, loss=0.648][A
 93%|█████████▎| 11809/12691 [3:54:28<18:28,  1.26s/it, loss=0.648][A
 93%|█████████▎| 11809/12691 [3:54:29<18:28,  1.26s/it, loss=0.646][A
 93%|█████████▎| 11810/12691 [3:54:29<18:24,  1.25s/it, loss=0.646][A
 93%|█████████▎| 11810/12691 [3:54:30<18:24,  1.25s/it, loss=0.65] [A
 93%|█████████▎| 11811/12691 [3:54:30<18:42,  1.28s/it, loss=0.65][A
 93%|█████████▎| 11811/12691 [3:54:31<18:42,  1.28s/it, loss=0.648][A
 93%|█████████▎| 11812/12691 [3:54:31<18:31,  1.26s/it, loss=0.648][A
 93%|█████████▎| 11812/12691 [3:54:33<18:31,  1.26s/it, loss=0.649][A
 93%|██

 93%|█████████▎| 11863/12691 [3:55:32<15:56,  1.16s/it, loss=0.639][A
 93%|█████████▎| 11864/12691 [3:55:32<15:50,  1.15s/it, loss=0.639][A
 93%|█████████▎| 11864/12691 [3:55:33<15:50,  1.15s/it, loss=0.639][A
 93%|█████████▎| 11865/12691 [3:55:33<15:44,  1.14s/it, loss=0.639][A
 93%|█████████▎| 11865/12691 [3:55:34<15:44,  1.14s/it, loss=0.638][A
 93%|█████████▎| 11866/12691 [3:55:34<16:07,  1.17s/it, loss=0.638][A
 93%|█████████▎| 11866/12691 [3:55:35<16:07,  1.17s/it, loss=0.639][A
 94%|█████████▎| 11867/12691 [3:55:35<16:06,  1.17s/it, loss=0.639][A
 94%|█████████▎| 11867/12691 [3:55:36<16:06,  1.17s/it, loss=0.641][A
 94%|█████████▎| 11868/12691 [3:55:36<15:57,  1.16s/it, loss=0.641][A
 94%|█████████▎| 11868/12691 [3:55:38<15:57,  1.16s/it, loss=0.64] [A
 94%|█████████▎| 11869/12691 [3:55:38<15:46,  1.15s/it, loss=0.64][A
 94%|█████████▎| 11869/12691 [3:55:39<15:46,  1.15s/it, loss=0.639][A
 94%|█████████▎| 11870/12691 [3:55:39<16:03,  1.17s/it, loss=0.639][A
 94%|██

 94%|█████████▍| 11921/12691 [3:56:39<14:30,  1.13s/it, loss=0.638][A
 94%|█████████▍| 11921/12691 [3:56:40<14:30,  1.13s/it, loss=0.635][A
 94%|█████████▍| 11922/12691 [3:56:40<14:43,  1.15s/it, loss=0.635][A
 94%|█████████▍| 11922/12691 [3:56:41<14:43,  1.15s/it, loss=0.636][A
 94%|█████████▍| 11923/12691 [3:56:41<14:33,  1.14s/it, loss=0.636][A
 94%|█████████▍| 11923/12691 [3:56:42<14:33,  1.14s/it, loss=0.636][A
 94%|█████████▍| 11924/12691 [3:56:42<14:24,  1.13s/it, loss=0.636][A
 94%|█████████▍| 11924/12691 [3:56:43<14:24,  1.13s/it, loss=0.638][A
 94%|█████████▍| 11925/12691 [3:56:43<14:26,  1.13s/it, loss=0.638][A
 94%|█████████▍| 11925/12691 [3:56:44<14:26,  1.13s/it, loss=0.639][A
 94%|█████████▍| 11926/12691 [3:56:44<14:42,  1.15s/it, loss=0.639][A
 94%|█████████▍| 11926/12691 [3:56:45<14:42,  1.15s/it, loss=0.639][A
 94%|█████████▍| 11927/12691 [3:56:45<14:27,  1.14s/it, loss=0.639][A
 94%|█████████▍| 11927/12691 [3:56:47<14:27,  1.14s/it, loss=0.639][A
 94%|█

 94%|█████████▍| 11978/12691 [3:57:45<14:00,  1.18s/it, loss=0.646][A
 94%|█████████▍| 11979/12691 [3:57:45<13:46,  1.16s/it, loss=0.646][A
 94%|█████████▍| 11979/12691 [3:57:46<13:46,  1.16s/it, loss=0.647][A
 94%|█████████▍| 11980/12691 [3:57:46<13:46,  1.16s/it, loss=0.647][A
 94%|█████████▍| 11980/12691 [3:57:47<13:46,  1.16s/it, loss=0.645][A
 94%|█████████▍| 11981/12691 [3:57:47<13:41,  1.16s/it, loss=0.645][A
 94%|█████████▍| 11981/12691 [3:57:48<13:41,  1.16s/it, loss=0.644][A
 94%|█████████▍| 11982/12691 [3:57:48<13:35,  1.15s/it, loss=0.644][A
 94%|█████████▍| 11982/12691 [3:57:50<13:35,  1.15s/it, loss=0.641][A
 94%|█████████▍| 11983/12691 [3:57:50<13:32,  1.15s/it, loss=0.641][A
 94%|█████████▍| 11983/12691 [3:57:51<13:32,  1.15s/it, loss=0.639][A
 94%|█████████▍| 11984/12691 [3:57:51<14:02,  1.19s/it, loss=0.639][A
 94%|█████████▍| 11984/12691 [3:57:52<14:02,  1.19s/it, loss=0.639][A
 94%|█████████▍| 11985/12691 [3:57:52<14:13,  1.21s/it, loss=0.639][A
 94%|█

saving model checkpoint at iteration=12000



 95%|█████████▍| 12001/12691 [3:58:12<16:34,  1.44s/it, loss=0.641][A
 95%|█████████▍| 12001/12691 [3:58:13<16:34,  1.44s/it, loss=0.642][A
 95%|█████████▍| 12002/12691 [3:58:13<15:29,  1.35s/it, loss=0.642][A
 95%|█████████▍| 12002/12691 [3:58:14<15:29,  1.35s/it, loss=0.641][A
 95%|█████████▍| 12003/12691 [3:58:14<14:56,  1.30s/it, loss=0.641][A
 95%|█████████▍| 12003/12691 [3:58:15<14:56,  1.30s/it, loss=0.639][A
 95%|█████████▍| 12004/12691 [3:58:15<14:17,  1.25s/it, loss=0.639][A
 95%|█████████▍| 12004/12691 [3:58:17<14:17,  1.25s/it, loss=0.642][A
 95%|█████████▍| 12005/12691 [3:58:17<13:49,  1.21s/it, loss=0.642][A
 95%|█████████▍| 12005/12691 [3:58:18<13:49,  1.21s/it, loss=0.643][A
 95%|█████████▍| 12006/12691 [3:58:18<13:28,  1.18s/it, loss=0.643][A
 95%|█████████▍| 12006/12691 [3:58:19<13:28,  1.18s/it, loss=0.643][A
 95%|█████████▍| 12007/12691 [3:58:19<13:19,  1.17s/it, loss=0.643][A
 95%|█████████▍| 12007/12691 [3:58:20<13:19,  1.17s/it, loss=0.643][A
 95%|

 95%|█████████▌| 12058/12691 [3:59:19<12:28,  1.18s/it, loss=0.65][A
 95%|█████████▌| 12059/12691 [3:59:19<12:42,  1.21s/it, loss=0.65][A
 95%|█████████▌| 12059/12691 [3:59:20<12:42,  1.21s/it, loss=0.649][A
 95%|█████████▌| 12060/12691 [3:59:20<12:30,  1.19s/it, loss=0.649][A
 95%|█████████▌| 12060/12691 [3:59:22<12:30,  1.19s/it, loss=0.648][A
 95%|█████████▌| 12061/12691 [3:59:22<12:19,  1.17s/it, loss=0.648][A
 95%|█████████▌| 12061/12691 [3:59:23<12:19,  1.17s/it, loss=0.648][A
 95%|█████████▌| 12062/12691 [3:59:23<12:26,  1.19s/it, loss=0.648][A
 95%|█████████▌| 12062/12691 [3:59:24<12:26,  1.19s/it, loss=0.648][A
 95%|█████████▌| 12063/12691 [3:59:24<12:20,  1.18s/it, loss=0.648][A
 95%|█████████▌| 12063/12691 [3:59:25<12:20,  1.18s/it, loss=0.647][A
 95%|█████████▌| 12064/12691 [3:59:25<12:32,  1.20s/it, loss=0.647][A
 95%|█████████▌| 12064/12691 [3:59:26<12:32,  1.20s/it, loss=0.647][A
 95%|█████████▌| 12065/12691 [3:59:26<12:18,  1.18s/it, loss=0.647][A
 95%|███

 95%|█████████▌| 12116/12691 [4:00:26<10:50,  1.13s/it, loss=0.641][A
 95%|█████████▌| 12116/12691 [4:00:27<10:50,  1.13s/it, loss=0.64] [A
 95%|█████████▌| 12117/12691 [4:00:27<10:44,  1.12s/it, loss=0.64][A
 95%|█████████▌| 12117/12691 [4:00:28<10:44,  1.12s/it, loss=0.638][A
 95%|█████████▌| 12118/12691 [4:00:28<10:44,  1.13s/it, loss=0.638][A
 95%|█████████▌| 12118/12691 [4:00:30<10:44,  1.13s/it, loss=0.638][A
 95%|█████████▌| 12119/12691 [4:00:30<10:44,  1.13s/it, loss=0.638][A
 95%|█████████▌| 12119/12691 [4:00:31<10:44,  1.13s/it, loss=0.637][A
 96%|█████████▌| 12120/12691 [4:00:31<11:01,  1.16s/it, loss=0.637][A
 96%|█████████▌| 12120/12691 [4:00:32<11:01,  1.16s/it, loss=0.638][A
 96%|█████████▌| 12121/12691 [4:00:32<10:53,  1.15s/it, loss=0.638][A
 96%|█████████▌| 12121/12691 [4:00:33<10:53,  1.15s/it, loss=0.638][A
 96%|█████████▌| 12122/12691 [4:00:33<10:44,  1.13s/it, loss=0.638][A
 96%|█████████▌| 12122/12691 [4:00:34<10:44,  1.13s/it, loss=0.636][A
 96%|██

 96%|█████████▌| 12173/12691 [4:01:35<10:02,  1.16s/it, loss=0.632][A
 96%|█████████▌| 12174/12691 [4:01:35<09:57,  1.16s/it, loss=0.632][A
 96%|█████████▌| 12174/12691 [4:01:37<09:57,  1.16s/it, loss=0.634][A
 96%|█████████▌| 12175/12691 [4:01:37<10:17,  1.20s/it, loss=0.634][A
 96%|█████████▌| 12175/12691 [4:01:38<10:17,  1.20s/it, loss=0.635][A
 96%|█████████▌| 12176/12691 [4:01:38<10:08,  1.18s/it, loss=0.635][A
 96%|█████████▌| 12176/12691 [4:01:39<10:08,  1.18s/it, loss=0.637][A
 96%|█████████▌| 12177/12691 [4:01:39<10:03,  1.17s/it, loss=0.637][A
 96%|█████████▌| 12177/12691 [4:01:40<10:03,  1.17s/it, loss=0.636][A
 96%|█████████▌| 12178/12691 [4:01:40<09:56,  1.16s/it, loss=0.636][A
 96%|█████████▌| 12178/12691 [4:01:41<09:56,  1.16s/it, loss=0.635][A
 96%|█████████▌| 12179/12691 [4:01:41<09:52,  1.16s/it, loss=0.635][A
 96%|█████████▌| 12179/12691 [4:01:42<09:52,  1.16s/it, loss=0.636][A
 96%|█████████▌| 12180/12691 [4:01:42<09:48,  1.15s/it, loss=0.636][A
 96%|█

 96%|█████████▋| 12231/12691 [4:02:43<09:00,  1.18s/it, loss=0.622][A
 96%|█████████▋| 12231/12691 [4:02:44<09:00,  1.18s/it, loss=0.623][A
 96%|█████████▋| 12232/12691 [4:02:44<08:50,  1.16s/it, loss=0.623][A
 96%|█████████▋| 12232/12691 [4:02:45<08:50,  1.16s/it, loss=0.622][A
 96%|█████████▋| 12233/12691 [4:02:45<08:48,  1.15s/it, loss=0.622][A
 96%|█████████▋| 12233/12691 [4:02:46<08:48,  1.15s/it, loss=0.62] [A
 96%|█████████▋| 12234/12691 [4:02:46<08:42,  1.14s/it, loss=0.62][A
 96%|█████████▋| 12234/12691 [4:02:47<08:42,  1.14s/it, loss=0.623][A
 96%|█████████▋| 12235/12691 [4:02:47<08:33,  1.13s/it, loss=0.623][A
 96%|█████████▋| 12235/12691 [4:02:48<08:33,  1.13s/it, loss=0.623][A
 96%|█████████▋| 12236/12691 [4:02:48<08:32,  1.13s/it, loss=0.623][A
 96%|█████████▋| 12236/12691 [4:02:50<08:32,  1.13s/it, loss=0.619][A
 96%|█████████▋| 12237/12691 [4:02:50<08:44,  1.16s/it, loss=0.619][A
 96%|█████████▋| 12237/12691 [4:02:51<08:44,  1.16s/it, loss=0.619][A
 96%|██

 97%|█████████▋| 12288/12691 [4:03:50<07:51,  1.17s/it, loss=0.647][A
 97%|█████████▋| 12289/12691 [4:03:50<07:52,  1.18s/it, loss=0.647][A
 97%|█████████▋| 12289/12691 [4:03:51<07:52,  1.18s/it, loss=0.646][A
 97%|█████████▋| 12290/12691 [4:03:51<07:48,  1.17s/it, loss=0.646][A
 97%|█████████▋| 12290/12691 [4:03:52<07:48,  1.17s/it, loss=0.646][A
 97%|█████████▋| 12291/12691 [4:03:52<07:46,  1.17s/it, loss=0.646][A
 97%|█████████▋| 12291/12691 [4:03:53<07:46,  1.17s/it, loss=0.645][A
 97%|█████████▋| 12292/12691 [4:03:53<07:42,  1.16s/it, loss=0.645][A
 97%|█████████▋| 12292/12691 [4:03:54<07:42,  1.16s/it, loss=0.643][A
 97%|█████████▋| 12293/12691 [4:03:54<07:44,  1.17s/it, loss=0.643][A
 97%|█████████▋| 12293/12691 [4:03:55<07:44,  1.17s/it, loss=0.644][A
 97%|█████████▋| 12294/12691 [4:03:55<07:44,  1.17s/it, loss=0.644][A
 97%|█████████▋| 12294/12691 [4:03:57<07:44,  1.17s/it, loss=0.638][A
 97%|█████████▋| 12295/12691 [4:03:57<07:52,  1.19s/it, loss=0.638][A
 97%|█

 97%|█████████▋| 12346/12691 [4:04:57<06:47,  1.18s/it, loss=0.629][A
 97%|█████████▋| 12346/12691 [4:04:58<06:47,  1.18s/it, loss=0.632][A
 97%|█████████▋| 12347/12691 [4:04:58<06:50,  1.19s/it, loss=0.632][A
 97%|█████████▋| 12347/12691 [4:04:59<06:50,  1.19s/it, loss=0.631][A
 97%|█████████▋| 12348/12691 [4:04:59<06:43,  1.18s/it, loss=0.631][A
 97%|█████████▋| 12348/12691 [4:05:01<06:43,  1.18s/it, loss=0.627][A
 97%|█████████▋| 12349/12691 [4:05:01<07:00,  1.23s/it, loss=0.627][A
 97%|█████████▋| 12349/12691 [4:05:02<07:00,  1.23s/it, loss=0.628][A
 97%|█████████▋| 12350/12691 [4:05:02<06:50,  1.20s/it, loss=0.628][A
 97%|█████████▋| 12350/12691 [4:05:03<06:50,  1.20s/it, loss=0.628][A
 97%|█████████▋| 12351/12691 [4:05:03<06:44,  1.19s/it, loss=0.628][A
 97%|█████████▋| 12351/12691 [4:05:04<06:44,  1.19s/it, loss=0.629][A
 97%|█████████▋| 12352/12691 [4:05:04<06:45,  1.19s/it, loss=0.629][A
 97%|█████████▋| 12352/12691 [4:05:06<06:45,  1.19s/it, loss=0.628][A
 97%|█

 98%|█████████▊| 12403/12691 [4:06:07<05:47,  1.21s/it, loss=0.643][A
 98%|█████████▊| 12404/12691 [4:06:07<05:58,  1.25s/it, loss=0.643][A
 98%|█████████▊| 12404/12691 [4:06:08<05:58,  1.25s/it, loss=0.643][A
 98%|█████████▊| 12405/12691 [4:06:08<05:48,  1.22s/it, loss=0.643][A
 98%|█████████▊| 12405/12691 [4:06:10<05:48,  1.22s/it, loss=0.644][A
 98%|█████████▊| 12406/12691 [4:06:10<05:50,  1.23s/it, loss=0.644][A
 98%|█████████▊| 12406/12691 [4:06:11<05:50,  1.23s/it, loss=0.644][A
 98%|█████████▊| 12407/12691 [4:06:11<05:43,  1.21s/it, loss=0.644][A
 98%|█████████▊| 12407/12691 [4:06:12<05:43,  1.21s/it, loss=0.643][A
 98%|█████████▊| 12408/12691 [4:06:12<05:44,  1.22s/it, loss=0.643][A
 98%|█████████▊| 12408/12691 [4:06:13<05:44,  1.22s/it, loss=0.643][A
 98%|█████████▊| 12409/12691 [4:06:13<05:43,  1.22s/it, loss=0.643][A
 98%|█████████▊| 12409/12691 [4:06:15<05:43,  1.22s/it, loss=0.644][A
 98%|█████████▊| 12410/12691 [4:06:15<05:46,  1.23s/it, loss=0.644][A
 98%|█

 98%|█████████▊| 12461/12691 [4:07:16<04:31,  1.18s/it, loss=0.649][A
 98%|█████████▊| 12461/12691 [4:07:17<04:31,  1.18s/it, loss=0.65] [A
 98%|█████████▊| 12462/12691 [4:07:17<04:27,  1.17s/it, loss=0.65][A
 98%|█████████▊| 12462/12691 [4:07:18<04:27,  1.17s/it, loss=0.65][A
 98%|█████████▊| 12463/12691 [4:07:18<04:23,  1.16s/it, loss=0.65][A
 98%|█████████▊| 12463/12691 [4:07:19<04:23,  1.16s/it, loss=0.647][A
 98%|█████████▊| 12464/12691 [4:07:19<04:23,  1.16s/it, loss=0.647][A
 98%|█████████▊| 12464/12691 [4:07:20<04:23,  1.16s/it, loss=0.646][A
 98%|█████████▊| 12465/12691 [4:07:20<04:21,  1.16s/it, loss=0.646][A
 98%|█████████▊| 12465/12691 [4:07:22<04:21,  1.16s/it, loss=0.646][A
 98%|█████████▊| 12466/12691 [4:07:22<04:34,  1.22s/it, loss=0.646][A
 98%|█████████▊| 12466/12691 [4:07:23<04:34,  1.22s/it, loss=0.647][A
 98%|█████████▊| 12467/12691 [4:07:23<04:34,  1.23s/it, loss=0.647][A
 98%|█████████▊| 12467/12691 [4:07:24<04:34,  1.23s/it, loss=0.65] [A
 98%|████

 99%|█████████▊| 12518/12691 [4:08:23<03:18,  1.14s/it, loss=0.643][A
 99%|█████████▊| 12519/12691 [4:08:23<03:16,  1.14s/it, loss=0.643][A
 99%|█████████▊| 12519/12691 [4:08:24<03:16,  1.14s/it, loss=0.643][A
 99%|█████████▊| 12520/12691 [4:08:24<03:13,  1.13s/it, loss=0.643][A
 99%|█████████▊| 12520/12691 [4:08:25<03:13,  1.13s/it, loss=0.641][A
 99%|█████████▊| 12521/12691 [4:08:25<03:11,  1.13s/it, loss=0.641][A
 99%|█████████▊| 12521/12691 [4:08:27<03:11,  1.13s/it, loss=0.64] [A
 99%|█████████▊| 12522/12691 [4:08:27<03:14,  1.15s/it, loss=0.64][A
 99%|█████████▊| 12522/12691 [4:08:28<03:14,  1.15s/it, loss=0.638][A
 99%|█████████▊| 12523/12691 [4:08:28<03:11,  1.14s/it, loss=0.638][A
 99%|█████████▊| 12523/12691 [4:08:29<03:11,  1.14s/it, loss=0.637][A
 99%|█████████▊| 12524/12691 [4:08:29<03:08,  1.13s/it, loss=0.637][A
 99%|█████████▊| 12524/12691 [4:08:30<03:08,  1.13s/it, loss=0.639][A
 99%|█████████▊| 12525/12691 [4:08:30<03:04,  1.11s/it, loss=0.639][A
 99%|██

 99%|█████████▉| 12576/12691 [4:09:30<02:13,  1.16s/it, loss=0.641][A
 99%|█████████▉| 12576/12691 [4:09:31<02:13,  1.16s/it, loss=0.641][A
 99%|█████████▉| 12577/12691 [4:09:31<02:16,  1.19s/it, loss=0.641][A
 99%|█████████▉| 12577/12691 [4:09:33<02:16,  1.19s/it, loss=0.641][A
 99%|█████████▉| 12578/12691 [4:09:33<02:12,  1.18s/it, loss=0.641][A
 99%|█████████▉| 12578/12691 [4:09:34<02:12,  1.18s/it, loss=0.645][A
 99%|█████████▉| 12579/12691 [4:09:34<02:11,  1.17s/it, loss=0.645][A
 99%|█████████▉| 12579/12691 [4:09:35<02:11,  1.17s/it, loss=0.646][A
 99%|█████████▉| 12580/12691 [4:09:35<02:09,  1.17s/it, loss=0.646][A
 99%|█████████▉| 12580/12691 [4:09:36<02:09,  1.17s/it, loss=0.646][A
 99%|█████████▉| 12581/12691 [4:09:36<02:07,  1.16s/it, loss=0.646][A
 99%|█████████▉| 12581/12691 [4:09:37<02:07,  1.16s/it, loss=0.646][A
 99%|█████████▉| 12582/12691 [4:09:37<02:08,  1.18s/it, loss=0.646][A
 99%|█████████▉| 12582/12691 [4:09:38<02:08,  1.18s/it, loss=0.647][A
 99%|█

100%|█████████▉| 12633/12691 [4:10:39<01:08,  1.18s/it, loss=0.642][A
100%|█████████▉| 12634/12691 [4:10:39<01:08,  1.21s/it, loss=0.642][A
100%|█████████▉| 12634/12691 [4:10:40<01:08,  1.21s/it, loss=0.643][A
100%|█████████▉| 12635/12691 [4:10:40<01:06,  1.19s/it, loss=0.643][A
100%|█████████▉| 12635/12691 [4:10:42<01:06,  1.19s/it, loss=0.645][A
100%|█████████▉| 12636/12691 [4:10:42<01:06,  1.20s/it, loss=0.645][A
100%|█████████▉| 12636/12691 [4:10:43<01:06,  1.20s/it, loss=0.643][A
100%|█████████▉| 12637/12691 [4:10:43<01:05,  1.21s/it, loss=0.643][A
100%|█████████▉| 12637/12691 [4:10:44<01:05,  1.21s/it, loss=0.642][A
100%|█████████▉| 12638/12691 [4:10:44<01:03,  1.20s/it, loss=0.642][A
100%|█████████▉| 12638/12691 [4:10:45<01:03,  1.20s/it, loss=0.643][A
100%|█████████▉| 12639/12691 [4:10:45<01:02,  1.21s/it, loss=0.643][A
100%|█████████▉| 12639/12691 [4:10:46<01:02,  1.21s/it, loss=0.643][A
100%|█████████▉| 12640/12691 [4:10:46<01:00,  1.19s/it, loss=0.643][A
100%|█

100%|██████████| 12691/12691 [4:11:48<00:00,  1.19s/it, loss=0.642][A
  0%|          | 0/1 [4:11:48<?, ?it/s, avg_loss=0.668]            [A
  0%|          | 0/1411 [00:00<?, ?it/s][A
  0%|          | 1/1411 [00:00<14:42,  1.60it/s][A
  0%|          | 2/1411 [00:01<14:31,  1.62it/s][A
  0%|          | 3/1411 [00:01<14:31,  1.62it/s][A
  0%|          | 4/1411 [00:02<15:18,  1.53it/s][A
  0%|          | 5/1411 [00:03<15:21,  1.53it/s][A
  0%|          | 6/1411 [00:03<15:24,  1.52it/s][A
  0%|          | 7/1411 [00:04<15:02,  1.56it/s][A
  1%|          | 8/1411 [00:05<15:01,  1.56it/s][A
  1%|          | 9/1411 [00:05<14:52,  1.57it/s][A
  1%|          | 10/1411 [00:06<14:54,  1.57it/s][A
  1%|          | 11/1411 [00:07<15:34,  1.50it/s][A
  1%|          | 12/1411 [00:07<15:14,  1.53it/s][A
  1%|          | 13/1411 [00:08<15:28,  1.50it/s][A
  1%|          | 14/1411 [00:09<15:38,  1.49it/s][A
  1%|          | 15/1411 [00:09<15:32,  1.50it/s][A
  1%|          | 16/1411 [00

 11%|█         | 151/1411 [01:40<13:38,  1.54it/s][A
 11%|█         | 152/1411 [01:41<13:32,  1.55it/s][A
 11%|█         | 153/1411 [01:41<13:51,  1.51it/s][A
 11%|█         | 154/1411 [01:42<14:43,  1.42it/s][A
 11%|█         | 155/1411 [01:43<14:15,  1.47it/s][A
 11%|█         | 156/1411 [01:43<13:45,  1.52it/s][A
 11%|█         | 157/1411 [01:44<13:35,  1.54it/s][A
 11%|█         | 158/1411 [01:45<13:35,  1.54it/s][A
 11%|█▏        | 159/1411 [01:45<13:37,  1.53it/s][A
 11%|█▏        | 160/1411 [01:46<13:55,  1.50it/s][A
 11%|█▏        | 161/1411 [01:47<14:19,  1.45it/s][A
 11%|█▏        | 162/1411 [01:47<13:38,  1.53it/s][A
 12%|█▏        | 163/1411 [01:48<13:10,  1.58it/s][A
 12%|█▏        | 164/1411 [01:49<13:50,  1.50it/s][A
 12%|█▏        | 165/1411 [01:49<14:04,  1.48it/s][A
 12%|█▏        | 166/1411 [01:50<14:11,  1.46it/s][A
 12%|█▏        | 167/1411 [01:51<13:53,  1.49it/s][A
 12%|█▏        | 168/1411 [01:52<14:50,  1.40it/s][A
 12%|█▏        | 169/1411 [0

 21%|██▏       | 302/1411 [03:19<12:15,  1.51it/s][A
 21%|██▏       | 303/1411 [03:20<12:19,  1.50it/s][A
 22%|██▏       | 304/1411 [03:21<12:58,  1.42it/s][A
 22%|██▏       | 305/1411 [03:21<12:28,  1.48it/s][A
 22%|██▏       | 306/1411 [03:22<12:13,  1.51it/s][A
 22%|██▏       | 307/1411 [03:22<12:18,  1.49it/s][A
 22%|██▏       | 308/1411 [03:23<12:25,  1.48it/s][A
 22%|██▏       | 309/1411 [03:24<12:28,  1.47it/s][A
 22%|██▏       | 310/1411 [03:24<12:25,  1.48it/s][A
 22%|██▏       | 311/1411 [03:25<12:29,  1.47it/s][A
 22%|██▏       | 312/1411 [03:26<12:03,  1.52it/s][A
 22%|██▏       | 313/1411 [03:26<12:00,  1.52it/s][A
 22%|██▏       | 314/1411 [03:27<12:07,  1.51it/s][A
 22%|██▏       | 315/1411 [03:28<11:39,  1.57it/s][A
 22%|██▏       | 316/1411 [03:28<12:06,  1.51it/s][A
 22%|██▏       | 317/1411 [03:29<12:13,  1.49it/s][A
 23%|██▎       | 318/1411 [03:30<12:46,  1.43it/s][A
 23%|██▎       | 319/1411 [03:31<12:41,  1.43it/s][A
 23%|██▎       | 320/1411 [0

 32%|███▏      | 453/1411 [05:01<10:17,  1.55it/s][A
 32%|███▏      | 454/1411 [05:02<10:47,  1.48it/s][A
 32%|███▏      | 455/1411 [05:03<10:56,  1.46it/s][A
 32%|███▏      | 456/1411 [05:04<10:54,  1.46it/s][A
 32%|███▏      | 457/1411 [05:04<10:41,  1.49it/s][A
 32%|███▏      | 458/1411 [05:05<10:26,  1.52it/s][A
 33%|███▎      | 459/1411 [05:05<10:39,  1.49it/s][A
 33%|███▎      | 460/1411 [05:06<10:23,  1.52it/s][A
 33%|███▎      | 461/1411 [05:07<11:00,  1.44it/s][A
 33%|███▎      | 462/1411 [05:07<10:19,  1.53it/s][A
 33%|███▎      | 463/1411 [05:08<10:19,  1.53it/s][A
 33%|███▎      | 464/1411 [05:09<10:24,  1.52it/s][A
 33%|███▎      | 465/1411 [05:09<10:13,  1.54it/s][A
 33%|███▎      | 466/1411 [05:10<10:09,  1.55it/s][A
 33%|███▎      | 467/1411 [05:11<09:59,  1.58it/s][A
 33%|███▎      | 468/1411 [05:11<10:10,  1.54it/s][A
 33%|███▎      | 469/1411 [05:12<09:56,  1.58it/s][A
 33%|███▎      | 470/1411 [05:13<10:05,  1.55it/s][A
 33%|███▎      | 471/1411 [0

 43%|████▎     | 604/1411 [06:43<09:16,  1.45it/s][A
 43%|████▎     | 605/1411 [06:44<09:17,  1.45it/s][A
 43%|████▎     | 606/1411 [06:45<09:13,  1.45it/s][A
 43%|████▎     | 607/1411 [06:45<08:58,  1.49it/s][A
 43%|████▎     | 608/1411 [06:46<08:52,  1.51it/s][A
 43%|████▎     | 609/1411 [06:47<08:55,  1.50it/s][A
 43%|████▎     | 610/1411 [06:47<08:44,  1.53it/s][A
 43%|████▎     | 611/1411 [06:48<09:25,  1.41it/s][A
 43%|████▎     | 612/1411 [06:49<09:05,  1.46it/s][A
 43%|████▎     | 613/1411 [06:50<09:06,  1.46it/s][A
 44%|████▎     | 614/1411 [06:50<09:09,  1.45it/s][A
 44%|████▎     | 615/1411 [06:51<08:59,  1.48it/s][A
 44%|████▎     | 616/1411 [06:52<09:03,  1.46it/s][A
 44%|████▎     | 617/1411 [06:52<08:47,  1.51it/s][A
 44%|████▍     | 618/1411 [06:53<09:21,  1.41it/s][A
 44%|████▍     | 619/1411 [06:54<09:18,  1.42it/s][A
 44%|████▍     | 620/1411 [06:54<09:07,  1.44it/s][A
 44%|████▍     | 621/1411 [06:55<08:52,  1.48it/s][A
 44%|████▍     | 622/1411 [0

 54%|█████▎    | 755/1411 [08:26<07:33,  1.45it/s][A
 54%|█████▎    | 756/1411 [08:27<07:27,  1.46it/s][A
 54%|█████▎    | 757/1411 [08:27<07:07,  1.53it/s][A
 54%|█████▎    | 758/1411 [08:28<07:07,  1.53it/s][A
 54%|█████▍    | 759/1411 [08:29<06:59,  1.55it/s][A
 54%|█████▍    | 760/1411 [08:29<07:30,  1.44it/s][A
 54%|█████▍    | 761/1411 [08:30<07:16,  1.49it/s][A
 54%|█████▍    | 762/1411 [08:31<07:13,  1.50it/s][A
 54%|█████▍    | 763/1411 [08:31<07:17,  1.48it/s][A
 54%|█████▍    | 764/1411 [08:32<07:05,  1.52it/s][A
 54%|█████▍    | 765/1411 [08:33<07:00,  1.54it/s][A
 54%|█████▍    | 766/1411 [08:33<07:07,  1.51it/s][A
 54%|█████▍    | 767/1411 [08:34<07:13,  1.49it/s][A
 54%|█████▍    | 768/1411 [08:35<07:01,  1.52it/s][A
 55%|█████▍    | 769/1411 [08:35<07:00,  1.53it/s][A
 55%|█████▍    | 770/1411 [08:36<07:07,  1.50it/s][A
 55%|█████▍    | 771/1411 [08:37<07:12,  1.48it/s][A
 55%|█████▍    | 772/1411 [08:37<07:11,  1.48it/s][A
 55%|█████▍    | 773/1411 [0

 64%|██████▍   | 906/1411 [10:10<05:41,  1.48it/s][A
 64%|██████▍   | 907/1411 [10:11<05:32,  1.52it/s][A
 64%|██████▍   | 908/1411 [10:12<05:32,  1.51it/s][A
 64%|██████▍   | 909/1411 [10:12<05:35,  1.49it/s][A
 64%|██████▍   | 910/1411 [10:13<05:56,  1.40it/s][A
 65%|██████▍   | 911/1411 [10:14<05:48,  1.43it/s][A
 65%|██████▍   | 912/1411 [10:15<05:47,  1.44it/s][A
 65%|██████▍   | 913/1411 [10:15<05:44,  1.44it/s][A
 65%|██████▍   | 914/1411 [10:16<05:43,  1.45it/s][A
 65%|██████▍   | 915/1411 [10:17<05:34,  1.48it/s][A
 65%|██████▍   | 916/1411 [10:17<05:29,  1.50it/s][A
 65%|██████▍   | 917/1411 [10:18<05:37,  1.47it/s][A
 65%|██████▌   | 918/1411 [10:19<05:31,  1.49it/s][A
 65%|██████▌   | 919/1411 [10:19<05:24,  1.52it/s][A
 65%|██████▌   | 920/1411 [10:20<05:28,  1.49it/s][A
 65%|██████▌   | 921/1411 [10:21<05:27,  1.50it/s][A
 65%|██████▌   | 922/1411 [10:21<05:18,  1.53it/s][A
 65%|██████▌   | 923/1411 [10:22<05:15,  1.55it/s][A
 65%|██████▌   | 924/1411 [1

 75%|███████▍  | 1056/1411 [11:53<04:16,  1.38it/s][A
 75%|███████▍  | 1057/1411 [11:53<04:14,  1.39it/s][A
 75%|███████▍  | 1058/1411 [11:54<04:12,  1.40it/s][A
 75%|███████▌  | 1059/1411 [11:55<04:09,  1.41it/s][A
 75%|███████▌  | 1060/1411 [11:56<04:18,  1.36it/s][A
 75%|███████▌  | 1061/1411 [11:56<04:13,  1.38it/s][A
 75%|███████▌  | 1062/1411 [11:57<04:09,  1.40it/s][A
 75%|███████▌  | 1063/1411 [11:58<04:03,  1.43it/s][A
 75%|███████▌  | 1064/1411 [11:58<04:03,  1.43it/s][A
 75%|███████▌  | 1065/1411 [11:59<04:03,  1.42it/s][A
 76%|███████▌  | 1066/1411 [12:00<04:02,  1.42it/s][A
 76%|███████▌  | 1067/1411 [12:01<04:12,  1.36it/s][A
 76%|███████▌  | 1068/1411 [12:01<04:04,  1.40it/s][A
 76%|███████▌  | 1069/1411 [12:02<04:03,  1.41it/s][A
 76%|███████▌  | 1070/1411 [12:03<04:03,  1.40it/s][A
 76%|███████▌  | 1071/1411 [12:03<04:04,  1.39it/s][A
 76%|███████▌  | 1072/1411 [12:04<04:00,  1.41it/s][A
 76%|███████▌  | 1073/1411 [12:05<03:59,  1.41it/s][A
 76%|█████

 85%|████████▌ | 1204/1411 [13:35<02:20,  1.47it/s][A
 85%|████████▌ | 1205/1411 [13:35<02:17,  1.50it/s][A
 85%|████████▌ | 1206/1411 [13:36<02:14,  1.53it/s][A
 86%|████████▌ | 1207/1411 [13:37<02:13,  1.53it/s][A
 86%|████████▌ | 1208/1411 [13:37<02:07,  1.59it/s][A
 86%|████████▌ | 1209/1411 [13:38<02:07,  1.58it/s][A
 86%|████████▌ | 1210/1411 [13:39<02:16,  1.47it/s][A
 86%|████████▌ | 1211/1411 [13:39<02:13,  1.50it/s][A
 86%|████████▌ | 1212/1411 [13:40<02:09,  1.54it/s][A
 86%|████████▌ | 1213/1411 [13:40<02:08,  1.55it/s][A
 86%|████████▌ | 1214/1411 [13:41<02:05,  1.57it/s][A
 86%|████████▌ | 1215/1411 [13:42<02:05,  1.57it/s][A
 86%|████████▌ | 1216/1411 [13:42<02:10,  1.50it/s][A
 86%|████████▋ | 1217/1411 [13:43<02:09,  1.50it/s][A
 86%|████████▋ | 1218/1411 [13:44<02:06,  1.53it/s][A
 86%|████████▋ | 1219/1411 [13:44<02:06,  1.51it/s][A
 86%|████████▋ | 1220/1411 [13:45<02:03,  1.55it/s][A
 87%|████████▋ | 1221/1411 [13:46<02:01,  1.56it/s][A
 87%|█████

 96%|█████████▌| 1352/1411 [15:15<00:39,  1.50it/s][A
 96%|█████████▌| 1353/1411 [15:16<00:39,  1.46it/s][A
 96%|█████████▌| 1354/1411 [15:17<00:38,  1.48it/s][A
 96%|█████████▌| 1355/1411 [15:17<00:37,  1.48it/s][A
 96%|█████████▌| 1356/1411 [15:18<00:35,  1.53it/s][A
 96%|█████████▌| 1357/1411 [15:19<00:35,  1.53it/s][A
 96%|█████████▌| 1358/1411 [15:19<00:34,  1.53it/s][A
 96%|█████████▋| 1359/1411 [15:20<00:33,  1.54it/s][A
 96%|█████████▋| 1360/1411 [15:21<00:34,  1.47it/s][A
 96%|█████████▋| 1361/1411 [15:21<00:33,  1.49it/s][A
 97%|█████████▋| 1362/1411 [15:22<00:32,  1.50it/s][A
 97%|█████████▋| 1363/1411 [15:23<00:32,  1.49it/s][A
 97%|█████████▋| 1364/1411 [15:23<00:31,  1.48it/s][A
 97%|█████████▋| 1365/1411 [15:24<00:30,  1.48it/s][A
 97%|█████████▋| 1366/1411 [15:25<00:30,  1.48it/s][A
 97%|█████████▋| 1367/1411 [15:26<00:31,  1.41it/s][A
 97%|█████████▋| 1368/1411 [15:26<00:30,  1.41it/s][A
 97%|█████████▋| 1369/1411 [15:27<00:28,  1.46it/s][A
 97%|█████

valid_auc:  0.9723561326072134


100%|██████████| 1/1 [4:27:47<00:00, 16067.42s/it, avg_loss=0.668]

   bnsp_auc  bpsn_auc                       subgroup  subgroup_auc  \
2  0.963416  0.903663      homosexual_gay_or_lesbian      0.865033   
7  0.963650  0.907260                          white      0.866349   
5  0.956636  0.926331                         muslim      0.874164   
6  0.969270  0.894249                          black      0.875013   
4  0.957454  0.949695                         jewish      0.921056   
1  0.958010  0.959049                         female      0.933433   
0  0.963167  0.952065                           male      0.933494   
3  0.947694  0.968912                      christian      0.935789   
8  0.977021  0.947789  psychiatric_or_mental_illness      0.958829   

   subgroup_size  
2           1104  
7           2419  
5           1957  
6           1378  
4            707  
1           5067  
0           3993  
3           3608  
8            412  
final metric is 0.9423242654210833



