#Imports 

In [14]:
!pip install pytorch_lightning
!pip install lightning-bolts 
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.distributions import Beta
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import Dataset, DataLoader
from torchmetrics import Accuracy
import pandas as pd
from pytorch_lightning.callbacks import ModelCheckpoint
from  pl_bolts import optimizers
import copy

from pytorch_lightning import seed_everything



#Data Download

In [15]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
!gunzip  covtype.data.gz

--2021-08-20 04:02:27--  https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11240707 (11M) [application/x-httpd-php]
Saving to: ‘covtype.data.gz’


2021-08-20 04:02:27 (65.5 MB/s) - ‘covtype.data.gz’ saved [11240707/11240707]

gzip: covtype.data already exists; do you wish to overwrite (y or n)? n
	not overwritten


#Utils

In [16]:
class Mixup:
  def __init__(self,alpha):
    '''
    mixup the data with mixing coeffecient
    '''
    super().__init__()
    self.alpha=alpha
  def __call__(self,x):
    lam=Beta(self.alpha,self.alpha).sample()
    randidx = torch.randperm(len(x)).to(x.device)
    x = lam * x + (1-lam) * x[randidx]
    return x, lam , randidx


In [17]:
class Maxout(nn.Module):
  '''
  Apply a maxout pooling layer
  '''
    def __init__(self, pool_size):
        super().__init__()
        self._pool_size = pool_size

    def forward(self, x):
        assert x.shape[-1] % self._pool_size == 0, \
            'Wrong input last dim size ({}) for Maxout({})'.format(x.shape[-1], self._pool_size)
        m, i = x.view(*x.shape[:-1], x.shape[-1] // self._pool_size, self._pool_size).max(-1)
        return m

#Model architecture

In [18]:
class TabulerModel(nn.Module):
  '''
  5 layer Mlp with Projection Head and Maxout layer
  '''
    def __init__(self,hid_dim,input_dim,head_dim,num_classes,pool_size):
        super(TabulerModel, self).__init__()
        
        self.layer = nn.Sequential(nn.Linear(input_dim,hid_dim),
                    nn.BatchNorm1d(hid_dim),
                    nn.ReLU(),
                    nn.Linear(hid_dim,hid_dim),
                    nn.BatchNorm1d(hid_dim),
                    nn.ReLU(),
                    nn.Linear(hid_dim,hid_dim*2),
                    nn.BatchNorm1d(hid_dim*2),
                    nn.ReLU(),
                    nn.Linear(hid_dim*2,hid_dim*2),
                    nn.BatchNorm1d(hid_dim*2),
                    nn.ReLU(),
                    nn.Linear(hid_dim*2,hid_dim*2*2),
                    nn.BatchNorm1d(hid_dim*2*2)) 
        self.maxout=Maxout(pool_size)
        self.projectHead = nn.Sequential(nn.Linear(hid_dim,hid_dim),
                                         nn.ReLU(),
                                         nn.Linear(hid_dim,head_dim))
        
       
        
    def forward(self, inputs,):
        x = self.layer(inputs)
        x_max = self.maxout(x)
        
        return self.projectHead(x_max)



#Dataset class

In [20]:
class DatasetTabular(Dataset):
  '''
  Tabular data class
  '''
  def __init__(self, data, y ):
    super().__init__()
    self.data=data.values
    self.y=y.values
  def __len__(self):
    return len(self.data)
  def __getitem__(self,idx):
    return self.data[idx],self.y[idx]


def create_dataset(path):
  '''
  create dataset object from path
  '''
  df=pd.read_csv(path,header=None)
  df.iloc[:,-1]=df.iloc[:,-1]-1
  df.iloc[:,:11]=(df.iloc[:,:11]-df.iloc[:,:11].mean())/df.iloc[:,:11].std()
  x,y=df.iloc[:,:-1],df.iloc[:,-1]
  dataset=DatasetTabular(x,y)
  return dataset

def generate_splits(dataset, split):
  '''
  split data 
  '''
  train_sz, test_sz=len(dataset)-int(len(dataset)*split),int(len(dataset)*split)
  train,test=torch.utils.data.random_split(dataset,[train_sz ,test_sz])
  return train, test


def tabularaugment(input,prob=0.2):
  '''
  Data Augmentation adding noise
  '''
  matrix=torch.ones(input.shape)*prob
  
  p=torch.bernoulli(matrix).to(input.device)
 
  output =input+p*torch.randn(input.shape,device=input.device)

  return output 



In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Trainer-Supervised 

In [22]:

class TestSUP(pl.LightningModule):
  '''
  create a fine tuned model
  '''
 
  def __init__(self,model,fc,learningRate,weight_decay, momentum,num_output,freeze_encoder=True,layers_to_freeze=6):
     super().__init__()
     self.model=model
     self.fc=fc
     self.learningRate=learningRate
     self.weight_decay=weight_decay
     self.momentum=momentum
     self.setup_criterion()
     self.train_acc=Accuracy(num_classes=num_output)
     self.validation_acc=Accuracy(num_classes=num_output)
     self.test_acc=Accuracy(num_classes=num_output)
     if freeze_encoder:
       self.model.eval()
       ct=0
       for child in self.model.children():
            ct += 1
            if ct < layers_to_freeze:
                for param in child.parameters():
                    param.requires_grad = False
       #self.model.projectHead=fc
  def forward(self,x):
    return self.model(x)
     

  def _shared_step(self, batch, batch_idx,accuracy):
    x,y=batch
    pretrain=self.model(x.float())
   
    #output=self.fc(pretrain)
    prediction=torch.sigmoid(pretrain)
    loss=self.criterion(pretrain.squeeze(),y)
  
    accuracy.update(prediction,y)
    return loss 

  def training_step(self, batch,batch_idx):
    
    loss=self._shared_step(batch,batch_idx,self.train_acc)
    self.log('train_loss', loss, on_step=True, 
                 on_epoch=True, prog_bar=True, logger=True)
  
    return {'loss': loss}
  def train_epoch_end(self, out): 
    self.log('training accuracy',self.train_acc.compute(),prog_bar=True,)
    self.train_acc.reset()
  def validation_step(self, batch,batch_idx):
    loss=self._shared_step(batch,batch_idx,self.validation_acc)
 
    self.log('val_loss', loss, on_step=True, 
                 on_epoch=True, prog_bar=True, logger=True)
  
 
  def validation_epoch_end(self, out):
    self.log('validation accuracy',self.validation_acc.compute(),prog_bar=True,)
    print(self.validation_acc.compute())
    self.validation_acc.reset()
  
  def test_step(self,batch,batch_idx):
     loss=self._shared_step(batch,batch_idx,self.test_acc)
     self.log('test_loss', loss, on_step=True, 
                 on_epoch=True, prog_bar=True, logger=True)
  def test_epoch_end(self, out): 
    self.log('test accuracy',self.test_acc.compute(),prog_bar=True,)
    self.test_acc.reset()

  def configure_optimizers(self): 
    optimizer =torch.optim.SGD(self.parameters(), lr=self.learningRate)
    scheduler=optimizers.lr_scheduler.LinearWarmupCosineAnnealingLR(optimizer,warmup_epochs=10, max_epochs=500)
    return {
                'optimizer': optimizer,
                "lr_scheduler": scheduler
            }

  def setup_criterion(self):
    self.criterion= nn.CrossEntropyLoss()

 


#Trainer-Self supervised(N-Pair)

In [23]:
class Npair(pl.LightningModule):
  '''
  pretrain an N-Pair model with i-mix Loss
  '''
 
  def __init__(self,model,alpha, tabularaugment,t,learningRate,weight_decay, momentum, use_imix=False):
     super().__init__()
     self.model=model
     self.alpha=alpha
     self.mixup=Mixup(alpha)
     self.augment=tabularaugment
     self.t=t
     self.learningRate=learningRate
     self.weight_decay=weight_decay
     self.momentum=momentum
     self.setup_criterion()
     self.use_imix=use_imix
     
  def _shared_step(self,batch,batch_idx):
    x,_=batch
    
    r= self.augment(x.float())
    
    r_prime=self.augment(x.float())
    #calculating the loss
    if self.use_imix:
      r_mix , lam , randidx=self.mixup(r)

      randidx = randidx.to(self.device)
      logits = torch.matmul(F.normalize(model(r_mix)), F.normalize(model(r_prime)).T) / self.t
      loss= lam * self.criterion(logits, torch.arange(len(x)).to(self.device)) + \
        (1-lam) * self.criterion(logits, randidx)
     
    else:
      logits=torch.matmul(F.normalize(model(r)), F.normalize(model(r_prime)).T) / self.t
      loss=self.criterion(logits, torch.arange(len(x)).to(x.device))
      
    return loss


  def training_step(self, batch,batch_idx):
    loss=self._shared_step(batch,'train')
    print(loss)
    self.log('loss', loss, on_step=True, 
                 on_epoch=True, prog_bar=True, logger=True)
    return {'loss': loss}
  def validation_step(self,batch,batch_idx):
    loss=self._shared_step(batch,'val')
 
    self.log('val_loss', loss, on_step=True, 
                 on_epoch=True, prog_bar=True, logger=True)
    
    return {'val_loss': loss}
  def configure_optimizers(self): 
    optimizer =torch.optim.SGD(self.parameters(), lr=self.learningRate,weight_decay=self.weight_decay,momentum=self.momentum)
    scheduler=optimizers.lr_scheduler.LinearWarmupCosineAnnealingLR(optimizer,warmup_epochs=10, max_epochs=5000)
    return {
                'optimizer': optimizer,
                "lr_scheduler": scheduler
            }



  def setup_criterion(self):
    self.criterion= nn.CrossEntropyLoss()


#Setup Experiment 

In [24]:
#pretraining dataset
dataset=create_dataset(path='/content/covtype.data')
train, test=generate_splits(dataset,split=0.99)
#create pretrain dataloader
pin_memory = True if torch.cuda.is_available() else False
train_dataloader=DataLoader(train,batch_size=512,pin_memory=pin_memory)
test_dataloader=DataLoader(test,batch_size=512,pin_memory=pin_memory)
#create finetune dataloader
trainF, valF=generate_splits(train,split=0.2)
trainF_dataloader=DataLoader(trainF,batch_size=512,pin_memory=pin_memory)
valF_dataloader=DataLoader(valF,batch_size=512,pin_memory=pin_memory)
#create model
model=TabulerModel(hid_dim=2048,input_dim=54,head_dim=128,num_classes=7,pool_size=4)

In [25]:
pretrained_checkpoint='/content/drive/MyDrive/checkpoints6/i-Mix-NpairFineTuneepoch=341-loss=2.67.ckpt'
experiment='pre-train'
hid_dim=2048
num_classes=7
seed_everything(123)
if experiment=='pre-train':
    LitModel=Npair(model,alpha=2,tabularaugment=tabularaugment,t=0.1,
                       learningRate=0.125,weight_decay=0.0001,momentum=0.9,use_imix=True)
elif experiment=='fine-tune':
    fc=nn.Linear(hid_dim,num_classes)
    LitModel=TestSUP(model,fc=fc,learningRate=0.001,
                     weight_decay=0,momentum=0.9,num_output=num_classes,freeze_encoder=True)
    if pretrained_checkpoint:
      unsup_param=dict(model=model,alpha=2,
                       tabularaugment=tabularaugment,t=0.1,
                       learningRate=0.125*2
                      ,weight_decay=0.001,momentum=0.9)
      pretrained_Model=Npair(**unsup_param)
      pretrained_Model.load_from_checkpoint(checkpoint_path=pretrained_checkpoint,**unsup_param,strict=False)
      LitModel.model=copy.deepcopy(pretrained_Model.model)
      LitModel.model.projectHead= fc



Global seed set to 123


In [26]:
#Trainign
checkpoint_callback = ModelCheckpoint(monitor="loss",mode='min',
                                      filename="i-Mix-pretrain{epoch:02d}-{val-loss:.2f}",
                                      dirpath='/content/drive/MyDrive/checkpoints8',
                                      verbose=True,
                                      save_last=True)

In [None]:

logger = TensorBoardLogger("tb4_logs", name="LitModel")
trainer=pl.Trainer(max_epochs=400, gpus=-1,auto_select_gpus=True,
                   callbacks=[checkpoint_callback],
                   #default_root_dir='/content/checkpoints',
                   logger=logger)
if experiment=='pre-train':
    trainer.fit(LitModel,train_dataloader)
    checkpoint=checkpoint_callback.best_model_path
else:
    trainer.fit(LitModel,trainF_dataloader,valF_dataloader)

#score= checkpoint_callback.best_model_score


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(f"you defined a {step_name} but have no {loader_name}. Skipping {stage} loop")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type         | Params
---------------------------------------
0 | model | TabulerModel | 67.5 M
---------------------------------------
67.5 M    Trainable params
0         Non-trainable params
67.5 M    Total params
270.181   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 123
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

tensor(6.0793, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.0425, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.9613, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.1033, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.0764, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.0942, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.0958, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.9703, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.9540, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.0576, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.0642, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 0, global step 11: loss reached 6.01385 (best 6.01385), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=00-val-loss=0.00.ckpt" as top 1


tensor(5.0230, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.1018, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.5715, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.5197, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.3833, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.0068, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.3049, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5978, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.0905, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3255, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.8699, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 1, global step 23: loss reached 5.10406 (best 5.10406), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=01-val-loss=0.00.ckpt" as top 1


tensor(4.8889, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6286, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.7424, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5982, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6946, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1871, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7285, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.8371, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6313, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.8416, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6060, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6521, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 2, global step 35: loss reached 4.37218 (best 4.37218), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=02-val-loss=0.00.ckpt" as top 1


tensor(4.8459, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5976, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.7738, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3617, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5472, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0863, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2005, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5386, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6360, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6960, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7824, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8825, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 3, global step 47: loss reached 4.12315 (best 4.12315), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=03-val-loss=0.00.ckpt" as top 1


tensor(4.2495, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9792, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6075, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2614, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4182, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5960, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5612, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6699, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6350, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6765, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1837, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6635, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 4, global step 59: loss was not in top 1


tensor(4.6182, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2217, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2319, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4471, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5816, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9909, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3316, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5970, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8029, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2176, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.7473, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4640, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 5, global step 71: loss was not in top 1


tensor(4.7123, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7656, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4165, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5193, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4019, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5997, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2266, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1041, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0125, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7328, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1954, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5964, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 6, global step 83: loss reached 4.02723 (best 4.02723), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=06-val-loss=0.00.ckpt" as top 1


tensor(3.5667, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8199, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6314, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8487, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.7064, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.7284, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3113, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7416, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4739, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5745, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6836, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 7, global step 95: loss was not in top 1


tensor(4.5320, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4442, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2912, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5764, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1453, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3146, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6417, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4379, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4228, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9797, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5951, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5948, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 8, global step 107: loss was not in top 1


tensor(4.6387, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3866, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5160, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4825, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3807, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5821, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0839, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4982, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2916, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5237, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3976, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4594, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 9, global step 119: loss was not in top 1


tensor(4.7120, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5103, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2823, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7337, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1858, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2602, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5370, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2093, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1452, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9497, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5649, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3917, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 10, global step 131: loss was not in top 1


tensor(4.4840, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4474, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2799, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8119, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4787, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4106, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5890, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4809, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5411, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1586, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5745, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4070, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 11, global step 143: loss was not in top 1


tensor(3.2418, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4871, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4918, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4376, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6851, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1560, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4800, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3326, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5249, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5713, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8059, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5601, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 12, global step 155: loss was not in top 1


tensor(4.0215, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1967, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5132, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0476, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3275, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9367, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5469, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1177, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3466, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4547, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5072, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4594, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 13, global step 167: loss reached 3.84703 (best 3.84703), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=13-val-loss=0.00.ckpt" as top 1


tensor(3.5979, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3084, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.6327, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5129, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6994, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4660, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9680, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3889, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3315, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5394, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0669, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5561, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 14, global step 179: loss was not in top 1


tensor(4.0345, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5756, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2118, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1498, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2399, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5269, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4190, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5473, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3607, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0897, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4911, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2061, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 15, global step 191: loss was not in top 1


tensor(3.6220, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1784, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3959, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1270, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2779, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9665, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4196, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3315, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9969, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6446, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4274, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0273, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 16, global step 203: loss reached 3.70454 (best 3.70454), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=16-val-loss=0.00.ckpt" as top 1


tensor(3.7821, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8544, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5691, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1581, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5875, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0156, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8914, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4615, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4745, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6755, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0850, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0469, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 17, global step 215: loss was not in top 1


tensor(4.3837, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4791, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6098, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3190, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2677, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2829, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4959, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2040, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0232, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7977, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4483, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4139, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 18, global step 227: loss was not in top 1


tensor(3.1742, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9718, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4079, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4792, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4758, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3733, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8451, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9944, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0492, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0693, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2961, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5413, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 19, global step 239: loss was not in top 1


tensor(4.1776, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4262, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4475, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4934, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2400, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4151, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4981, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4339, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1706, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1348, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4571, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8573, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 20, global step 251: loss was not in top 1


tensor(4.4580, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0456, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1070, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8414, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3783, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4104, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1693, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6052, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3830, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9535, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7618, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5015, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 21, global step 263: loss was not in top 1


tensor(4.5674, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2057, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5255, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8213, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3947, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9865, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4953, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0668, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1758, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3521, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6575, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3704, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 22, global step 275: loss was not in top 1


tensor(4.1282, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3823, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6317, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9546, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1609, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5975, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9312, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9922, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7007, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4891, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4876, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5729, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 23, global step 287: loss was not in top 1


tensor(4.3652, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7313, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3940, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9550, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8552, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7399, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7558, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0377, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7875, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9841, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5271, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4863, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 24, global step 299: loss was not in top 1


tensor(3.6738, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5117, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1986, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1434, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4714, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4145, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3205, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3445, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1501, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1867, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2466, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0643, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 25, global step 311: loss was not in top 1


tensor(4.6048, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3029, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2312, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4865, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3932, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8118, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8562, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3658, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3903, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7606, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4008, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1755, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 26, global step 323: loss was not in top 1


tensor(4.3883, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1896, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2205, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9813, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4227, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3443, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8670, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1118, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1670, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4883, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1665, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8870, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 27, global step 335: loss was not in top 1


tensor(4.4707, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1781, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3810, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0691, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2450, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2979, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3390, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0768, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7356, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4586, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1785, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6323, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 28, global step 347: loss was not in top 1


tensor(4.2930, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3767, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2796, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2935, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7423, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3441, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4148, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3239, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2429, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1825, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7419, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 29, global step 359: loss was not in top 1


tensor(4.0061, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6787, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9056, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3979, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6312, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2636, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3664, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0782, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8255, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3376, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9828, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0950, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 30, global step 371: loss was not in top 1


tensor(4.1629, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2670, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3385, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7814, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4085, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9501, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0213, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2148, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1197, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9251, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2341, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4106, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 31, global step 383: loss was not in top 1


tensor(4.4696, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7301, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9928, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4844, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1982, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2344, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3256, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1251, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2926, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6275, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2613, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4216, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 32, global step 395: loss was not in top 1


tensor(2.4283, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1473, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2987, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4428, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3137, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8708, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3689, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3623, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0508, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3585, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5066, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1337, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 33, global step 407: loss was not in top 1


tensor(3.6829, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2040, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6487, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2394, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1191, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0609, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6773, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3678, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2549, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3558, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3255, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1084, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 34, global step 419: loss was not in top 1


tensor(3.2216, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6334, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9201, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3740, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8122, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6240, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5733, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3212, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4531, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2129, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3036, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3743, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 35, global step 431: loss was not in top 1


tensor(3.6296, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1624, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2591, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2430, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6143, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7951, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6276, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5396, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3782, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8668, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 36, global step 443: loss reached 3.69760 (best 3.69760), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=36-val-loss=0.00.ckpt" as top 1


tensor(4.5286, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3504, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4278, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2506, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1761, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3436, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9969, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4334, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3153, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1453, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2782, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 37, global step 455: loss was not in top 1


tensor(3.3218, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1916, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8790, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0675, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0370, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8241, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7417, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5842, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4475, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3975, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2043, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8852, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 38, global step 467: loss was not in top 1


tensor(4.4588, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3011, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6163, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4434, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9111, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9478, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6313, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6360, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7109, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3562, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2909, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3815, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 39, global step 479: loss reached 3.68510 (best 3.68510), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=39-val-loss=0.00.ckpt" as top 1


tensor(3.7077, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4080, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2583, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7483, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9680, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3662, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1047, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5710, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5324, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0921, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6378, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8219, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 40, global step 491: loss was not in top 1


tensor(3.7479, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3752, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3411, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4195, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9556, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4391, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2519, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2196, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6301, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8408, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4104, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0050, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 41, global step 503: loss was not in top 1


tensor(3.3187, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4718, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9040, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4745, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3737, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6239, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1687, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5930, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3068, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2560, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9151, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8927, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 42, global step 515: loss was not in top 1


tensor(4.3557, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0456, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9496, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0260, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2476, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0676, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7116, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3159, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4191, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1270, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4512, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2019, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 43, global step 527: loss was not in top 1


tensor(4.3331, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0773, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2652, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1580, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3130, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2564, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6062, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4722, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9121, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0961, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8874, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9182, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 44, global step 539: loss was not in top 1


tensor(3.5843, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2253, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4225, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2515, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0906, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2299, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7741, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1916, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2126, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2229, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3418, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0999, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 45, global step 551: loss was not in top 1


tensor(3.6679, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4697, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1250, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8244, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6616, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8357, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6922, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3089, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2710, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1963, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2959, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9555, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 46, global step 563: loss was not in top 1


tensor(4.2820, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9082, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2319, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1115, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3126, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7601, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5695, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3382, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7228, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0994, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3021, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1673, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 47, global step 575: loss was not in top 1


tensor(3.9217, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2205, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9727, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9869, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8626, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2420, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7747, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2202, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6753, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2626, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9043, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1851, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 48, global step 587: loss was not in top 1


tensor(3.1862, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8194, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3118, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8655, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6748, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3059, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1704, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2603, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3917, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8069, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2055, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8676, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 49, global step 599: loss was not in top 1


tensor(4.1711, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3635, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0893, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1966, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5127, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1400, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2655, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2570, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2544, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9242, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2993, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 50, global step 611: loss was not in top 1


tensor(3.3817, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1699, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8212, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1803, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3571, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0854, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2855, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3291, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2841, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7600, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7230, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2374, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 51, global step 623: loss reached 3.33751 (best 3.33751), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=51-val-loss=0.00.ckpt" as top 1


tensor(2.7233, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1272, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0083, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7924, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8797, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3608, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2448, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3964, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3868, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0328, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7418, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7617, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 52, global step 635: loss was not in top 1


tensor(3.9617, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1456, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1805, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8088, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8745, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9378, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4476, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2872, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2245, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9176, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0130, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2283, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 53, global step 647: loss was not in top 1


tensor(3.7777, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9462, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3038, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2433, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9327, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1040, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0458, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3190, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7394, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0947, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7502, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7691, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 54, global step 659: loss was not in top 1


tensor(3.1508, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3015, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1001, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2147, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6039, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1806, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8349, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2363, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7047, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8758, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4419, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5111, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 55, global step 671: loss was not in top 1


tensor(4.2270, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0604, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3098, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1564, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4570, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2640, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2222, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2648, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9666, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6077, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2013, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6990, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 56, global step 683: loss was not in top 1


tensor(3.8572, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3462, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9453, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5154, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2227, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2870, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9722, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0964, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6397, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3200, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3890, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0264, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 57, global step 695: loss was not in top 1


tensor(3.8686, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0391, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2313, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1973, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3024, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8266, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3115, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1637, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0712, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1634, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0203, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0507, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 58, global step 707: loss was not in top 1


tensor(2.5265, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6662, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4466, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2464, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2870, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6710, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0254, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2174, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0515, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2324, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0065, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1171, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 59, global step 719: loss was not in top 1


tensor(4.2222, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5697, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1490, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8789, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1616, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0371, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2964, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3819, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8640, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2692, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4437, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3189, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 60, global step 731: loss was not in top 1


tensor(3.9309, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7113, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1829, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4407, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7773, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0443, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5245, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2347, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7512, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3502, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3598, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8575, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 61, global step 743: loss was not in top 1


tensor(3.2445, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2452, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2969, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3506, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4556, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1782, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4296, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3722, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2277, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0909, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0457, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1337, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 62, global step 755: loss was not in top 1


tensor(3.5551, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9999, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0180, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0965, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2490, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7086, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1737, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4164, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2198, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4890, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1718, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2713, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 63, global step 767: loss was not in top 1


tensor(4.3191, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3834, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9035, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7988, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7209, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1590, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.4191, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1257, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2836, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8423, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2528, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0773, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 64, global step 779: loss was not in top 1


tensor(3.0958, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0307, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1800, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6612, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6540, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9813, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1555, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4072, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2415, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4453, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2449, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2592, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 65, global step 791: loss was not in top 1


tensor(2.7639, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7741, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0139, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9678, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8308, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2352, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6166, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1293, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9494, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2220, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6394, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9544, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 66, global step 803: loss was not in top 1


tensor(4.1343, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9165, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3779, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9886, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5766, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0439, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2194, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5799, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0914, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1862, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2328, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1327, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 67, global step 815: loss was not in top 1


tensor(4.1153, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4468, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1905, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2338, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2731, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0907, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9425, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0799, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0560, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0302, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5560, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9751, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 68, global step 827: loss was not in top 1


tensor(4.1237, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8650, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0585, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9717, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1919, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8458, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6588, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2189, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8344, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6234, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2718, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9301, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 69, global step 839: loss was not in top 1


tensor(2.3493, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9008, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2487, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6811, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5621, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5812, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2412, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9677, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2258, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1902, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8467, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2709, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 70, global step 851: loss was not in top 1


tensor(4.1896, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1337, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2644, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0683, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9534, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8374, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2105, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1061, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4803, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0840, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2064, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1950, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 71, global step 863: loss was not in top 1


tensor(4.3085, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7738, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2790, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1248, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2302, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1968, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1433, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0058, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2044, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1588, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0029, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1388, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 72, global step 875: loss was not in top 1


tensor(4.2903, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1680, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2737, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4743, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6904, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2280, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9978, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3848, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6459, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7874, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1753, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0040, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 73, global step 887: loss was not in top 1


tensor(4.0872, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9151, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1480, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1437, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9060, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6892, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0731, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0353, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3403, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3146, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2175, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0370, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 74, global step 899: loss was not in top 1


tensor(4.0745, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2391, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1779, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2561, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7025, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7231, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5514, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1351, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1156, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0711, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8794, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8190, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 75, global step 911: loss was not in top 1


tensor(2.8006, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2437, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9936, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0107, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8592, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1415, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1817, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9157, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0647, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1544, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8907, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6426, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 76, global step 923: loss was not in top 1


tensor(4.0319, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2221, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7869, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1395, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1699, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5499, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0128, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3634, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2229, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9101, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8601, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1467, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 77, global step 935: loss was not in top 1


tensor(3.3962, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2731, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7825, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1206, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8488, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7030, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1068, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1471, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0718, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2168, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4769, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8720, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 78, global step 947: loss was not in top 1


tensor(3.7591, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2029, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2659, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2563, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0923, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6091, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6670, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4602, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6074, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5520, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4620, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1818, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 79, global step 959: loss was not in top 1


tensor(4.0183, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6665, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2723, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3151, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4085, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1847, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9876, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2736, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1245, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9802, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2709, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1917, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 80, global step 971: loss was not in top 1


tensor(4.1964, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9021, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3790, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7916, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0468, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0946, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0433, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1406, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7209, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1540, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0513, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3494, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 81, global step 983: loss reached 3.32981 (best 3.32981), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=81-val-loss=0.00.ckpt" as top 1


tensor(3.2080, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3242, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7329, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7651, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1865, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1881, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1312, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1218, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7256, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4064, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0885, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0836, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 82, global step 995: loss was not in top 1


tensor(4.2385, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0850, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5253, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7016, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8381, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1466, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9823, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2610, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5253, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1042, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7992, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1136, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 83, global step 1007: loss was not in top 1


tensor(2.5587, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7450, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2453, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7297, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4581, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1406, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4676, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0590, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2045, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5206, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1404, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2160, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 84, global step 1019: loss was not in top 1


tensor(3.9505, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6315, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5873, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1909, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9174, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9452, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3566, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0637, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5975, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9020, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1094, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7540, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 85, global step 1031: loss was not in top 1


tensor(3.7875, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0168, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9243, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2575, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2105, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9685, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2194, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2689, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8126, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1712, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9497, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0643, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 86, global step 1043: loss was not in top 1


tensor(4.1937, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0764, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4916, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0286, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2298, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2020, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2338, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3463, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8411, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3293, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9688, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0287, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 87, global step 1055: loss was not in top 1


tensor(3.7684, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2259, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2940, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0049, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7241, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2447, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9993, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1051, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2121, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6055, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4728, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8702, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 88, global step 1067: loss was not in top 1


tensor(4.1756, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1837, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2784, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4077, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5739, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7281, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9772, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3774, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1448, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1837, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3541, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2892, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 89, global step 1079: loss was not in top 1


tensor(3.6581, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9630, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0381, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9119, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0751, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4767, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2783, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9700, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3572, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3475, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1948, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7495, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 90, global step 1091: loss was not in top 1


tensor(4.2479, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2243, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4206, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7677, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2298, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1617, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5853, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7432, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2164, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7715, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0050, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2339, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 91, global step 1103: loss was not in top 1


tensor(4.1183, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1203, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5636, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3140, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4929, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3216, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5780, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8878, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8037, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2615, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9328, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7678, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 92, global step 1115: loss was not in top 1


tensor(4.1338, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4291, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2968, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0475, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9337, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0445, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7629, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0940, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1024, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1895, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3057, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6605, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 93, global step 1127: loss was not in top 1


tensor(4.1937, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4924, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1018, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1330, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1506, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0244, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2848, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1393, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2479, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2904, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1617, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7125, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 94, global step 1139: loss was not in top 1


tensor(3.0928, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7367, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6047, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6456, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8349, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3437, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9839, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8847, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5051, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1167, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0677, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8947, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 95, global step 1151: loss was not in top 1


tensor(3.5095, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4056, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5700, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6890, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5466, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5274, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4502, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6799, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6298, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6577, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8542, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3837, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 96, global step 1163: loss reached 3.00280 (best 3.00280), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=96-val-loss=0.00.ckpt" as top 1


tensor(2.5928, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4285, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3859, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2584, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0361, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8938, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4812, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1731, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9584, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2324, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8911, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8271, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 97, global step 1175: loss was not in top 1


tensor(3.4082, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6794, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0027, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6321, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1448, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1595, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4495, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8102, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8512, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5807, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1763, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9668, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 98, global step 1187: loss was not in top 1


tensor(2.8549, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0678, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1720, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0132, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2361, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2320, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9330, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0246, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3890, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0724, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0966, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1563, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 99, global step 1199: loss was not in top 1


tensor(4.0759, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2454, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9887, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1711, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7684, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0080, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1132, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5786, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8385, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1495, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4681, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4890, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 100, global step 1211: loss was not in top 1


tensor(3.9685, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2262, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9503, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9636, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1224, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1361, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1708, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1247, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1449, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0610, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2078, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8060, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 101, global step 1223: loss was not in top 1


tensor(3.4344, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0390, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3655, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6803, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2064, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8148, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2698, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8840, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7542, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1936, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1449, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1439, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 102, global step 1235: loss was not in top 1


tensor(2.2788, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2592, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.3762, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1060, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9843, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2203, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0283, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9733, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8814, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8666, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9486, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1913, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 103, global step 1247: loss was not in top 1


tensor(3.8075, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2275, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5434, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7947, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2072, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1691, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0299, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1524, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1389, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3533, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8982, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8985, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 104, global step 1259: loss was not in top 1


tensor(3.5664, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0812, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9376, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0834, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7995, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8087, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3635, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1194, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9470, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7627, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0959, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1717, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 105, global step 1271: loss was not in top 1


tensor(2.7995, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2367, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1378, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7640, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1923, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1326, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0374, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4008, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0175, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0751, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9588, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1366, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 106, global step 1283: loss was not in top 1


tensor(2.0264, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5147, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2359, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1599, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6804, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2169, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8951, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6804, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1519, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0705, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0202, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 107, global step 1295: loss was not in top 1


tensor(3.1114, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8051, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8553, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7650, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3845, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9202, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5386, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9860, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1141, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4954, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1931, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2515, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 108, global step 1307: loss was not in top 1


tensor(4.1965, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0733, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6787, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5959, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1443, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0604, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8862, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6106, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1645, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1563, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2500, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0161, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 109, global step 1319: loss was not in top 1


tensor(2.8963, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1902, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7980, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3684, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1688, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1037, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1048, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1734, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1116, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9759, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0675, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8151, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 110, global step 1331: loss was not in top 1


tensor(4.1849, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3403, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1663, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1768, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4924, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4918, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1314, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8953, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2337, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4466, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6063, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1122, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4113, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 111, global step 1343: loss was not in top 1


tensor(3.3091, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2331, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0154, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8854, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7642, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9567, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1939, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0193, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7611, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8635, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6625, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 112, global step 1355: loss was not in top 1


tensor(2.7484, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4958, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1160, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6335, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7572, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7116, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3924, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3253, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0193, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2683, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9266, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6956, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 113, global step 1367: loss was not in top 1


tensor(4.3518, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8228, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1036, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0612, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3133, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8473, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8466, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4894, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5613, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8543, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4699, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2195, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 114, global step 1379: loss was not in top 1


tensor(3.6769, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0810, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9874, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0217, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0421, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0215, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0987, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8106, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9962, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7317, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5452, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4941, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 115, global step 1391: loss was not in top 1


tensor(2.7597, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9378, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2056, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4943, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8967, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3429, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2467, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1399, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2720, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0528, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8362, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0832, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 116, global step 1403: loss was not in top 1


tensor(3.1372, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0576, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1485, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8249, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0349, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1295, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9541, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2475, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8596, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0179, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2497, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1572, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 117, global step 1415: loss was not in top 1


tensor(4.0095, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2012, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0638, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8703, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0228, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6895, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8858, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7683, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3832, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5665, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1115, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6084, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 118, global step 1427: loss was not in top 1


tensor(2.4807, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0784, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9830, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9606, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6546, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0862, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2570, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2920, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0207, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1320, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1022, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9147, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 119, global step 1439: loss was not in top 1


tensor(3.1111, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0696, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9869, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8978, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4770, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9715, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0696, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3775, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0867, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0736, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0447, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2317, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 120, global step 1451: loss was not in top 1


tensor(3.9158, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0410, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2032, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7114, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8465, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7492, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2292, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8247, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5450, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9019, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1231, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1268, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 121, global step 1463: loss was not in top 1


tensor(4.1302, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0334, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8976, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8818, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9251, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1753, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0053, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1543, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7759, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1097, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4051, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0161, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 122, global step 1475: loss was not in top 1


tensor(3.8259, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0467, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4651, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0563, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9515, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9416, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0619, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0764, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2351, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1807, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1893, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2558, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 123, global step 1487: loss was not in top 1


tensor(3.7748, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5009, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0475, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5934, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9371, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1972, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4236, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1542, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9704, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0772, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4872, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 124, global step 1499: loss was not in top 1


tensor(3.9125, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8926, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6113, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3465, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8998, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1390, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8347, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7575, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2338, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9160, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7528, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4745, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 125, global step 1511: loss was not in top 1


tensor(2.5458, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9767, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2985, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1829, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2481, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4279, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7234, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1758, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1513, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9791, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5203, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8929, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 126, global step 1523: loss was not in top 1


tensor(3.7770, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.5277, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7242, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1760, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0295, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9854, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1525, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1106, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2851, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0557, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1367, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5426, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 127, global step 1535: loss was not in top 1


tensor(4.0126, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2080, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0776, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7228, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0972, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6185, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6457, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1066, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0199, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4802, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8816, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1010, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 128, global step 1547: loss was not in top 1


tensor(2.9240, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6379, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4991, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6232, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5469, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0695, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0667, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9789, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9444, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7394, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7587, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1940, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 129, global step 1559: loss was not in top 1


tensor(3.4582, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8533, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2065, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0227, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1879, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7676, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9002, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0993, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5736, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0106, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1239, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8284, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 130, global step 1571: loss was not in top 1


tensor(3.8636, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9329, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7131, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9685, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8306, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6268, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3705, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1528, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9876, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1501, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7190, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0548, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 131, global step 1583: loss was not in top 1


tensor(4.0913, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4804, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0485, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8551, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0414, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0332, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1133, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8133, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4545, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1203, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1286, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0205, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 132, global step 1595: loss was not in top 1


tensor(2.9318, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0173, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6507, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1324, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0699, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8901, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7417, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8985, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0147, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0062, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0805, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7304, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 133, global step 1607: loss was not in top 1


tensor(4.1056, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4288, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3636, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8256, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7801, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3819, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.2104, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9890, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0960, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9641, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9911, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0531, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 134, global step 1619: loss was not in top 1


tensor(3.2807, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2056, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1769, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0725, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0885, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7792, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9908, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9719, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5527, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8306, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1130, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 135, global step 1631: loss was not in top 1


tensor(3.7665, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9786, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1607, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6747, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2898, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1126, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9170, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0230, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2847, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2813, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1347, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0771, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 136, global step 1643: loss was not in top 1


tensor(3.4512, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5584, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0833, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8679, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7776, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7835, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9406, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1208, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9506, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8504, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1498, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4248, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 137, global step 1655: loss was not in top 1


tensor(4.1288, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9937, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5527, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1521, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0827, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8244, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8710, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6779, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5221, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3067, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0299, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5677, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 138, global step 1667: loss was not in top 1


tensor(3.9931, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9210, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1011, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9359, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5836, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8919, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3113, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9964, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1006, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9825, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0300, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0561, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 139, global step 1679: loss was not in top 1


tensor(3.9262, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3014, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9124, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0988, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0419, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0271, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8069, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5827, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4933, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8592, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9785, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1054, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 140, global step 1691: loss was not in top 1


tensor(2.3591, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1160, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8011, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7517, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8711, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9761, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5518, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0535, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8681, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0766, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0076, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 141, global step 1703: loss was not in top 1


tensor(3.9963, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0325, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9516, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0325, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0349, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1877, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7342, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3999, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8848, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0710, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0246, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8141, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 142, global step 1715: loss was not in top 1


tensor(3.5731, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7032, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9696, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7237, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6141, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6486, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0800, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5629, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7922, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4145, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0110, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4531, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 143, global step 1727: loss was not in top 1


tensor(3.9243, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9854, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1273, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9051, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9606, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5708, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0856, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4831, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9882, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9963, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8929, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8225, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 144, global step 1739: loss was not in top 1


tensor(2.1419, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4317, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8135, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0087, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9994, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5629, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3649, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5591, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8058, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0661, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6837, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9838, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 145, global step 1751: loss was not in top 1


tensor(4.0523, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2560, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0699, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3311, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7682, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9662, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0853, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0487, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0138, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9183, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6148, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7615, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 146, global step 1763: loss was not in top 1


tensor(3.1372, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9485, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0437, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2082, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4879, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7078, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6606, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1262, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9211, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0495, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0427, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0347, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 147, global step 1775: loss was not in top 1


tensor(2.7623, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8963, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4725, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3128, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8315, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9305, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8617, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9503, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0967, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1456, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8047, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0848, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 148, global step 1787: loss was not in top 1


tensor(4.0964, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0026, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0463, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9155, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6999, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9062, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0151, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2965, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0635, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3614, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1076, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2068, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 149, global step 1799: loss was not in top 1


tensor(4.0294, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1141, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9828, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1486, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8884, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5859, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0514, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4038, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7947, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4624, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7004, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1542, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 150, global step 1811: loss was not in top 1


tensor(4.0637, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0509, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0690, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9360, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9843, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8231, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1543, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9294, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1823, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0197, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8707, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 151, global step 1823: loss was not in top 1


tensor(3.3059, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9318, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9154, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4766, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5831, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7303, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4563, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6773, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2670, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3912, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7670, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8993, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 152, global step 1835: loss was not in top 1


tensor(4.1942, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0251, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0480, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4821, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3348, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5894, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7974, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2286, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7476, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0456, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0205, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8500, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 153, global step 1847: loss was not in top 1


tensor(4.0914, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1560, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5678, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0218, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4925, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6959, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7366, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8073, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4126, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4244, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9594, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2068, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 154, global step 1859: loss was not in top 1


tensor(4.0084, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0665, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7546, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7737, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7723, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7464, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9619, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7895, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6313, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9485, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7293, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6654, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 155, global step 1871: loss was not in top 1


tensor(3.9746, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7204, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9284, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8777, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5645, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2314, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5152, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2266, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6249, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6035, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1301, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5803, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 156, global step 1883: loss was not in top 1


tensor(3.4580, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2569, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0699, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6991, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7205, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5198, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8762, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0947, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8187, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1422, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0912, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9814, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 157, global step 1895: loss was not in top 1


tensor(4.0744, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0882, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6188, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6741, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2725, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9250, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9160, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3665, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4781, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9134, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9116, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7188, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 158, global step 1907: loss was not in top 1


tensor(3.8442, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9543, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6841, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0215, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8427, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9656, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1495, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3131, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5460, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7847, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9865, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2170, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 159, global step 1919: loss was not in top 1


tensor(3.8396, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9879, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8060, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7122, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3500, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0127, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9005, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1276, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0756, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6229, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7155, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7436, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 160, global step 1931: loss was not in top 1


tensor(3.7181, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2062, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8791, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9687, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8074, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4223, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1508, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0557, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8848, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0644, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7866, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3585, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 161, global step 1943: loss was not in top 1


tensor(3.9550, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9514, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2106, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7809, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6777, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5320, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9631, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7054, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0519, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9157, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9751, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7293, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 162, global step 1955: loss was not in top 1


tensor(3.6739, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5452, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7208, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8669, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9992, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5833, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9473, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9153, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0422, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6298, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0282, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7019, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 163, global step 1967: loss was not in top 1


tensor(3.9886, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0240, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3724, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4128, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9691, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3303, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3811, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0877, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6475, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7595, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9607, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0912, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 164, global step 1979: loss was not in top 1


tensor(4.0009, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6141, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6009, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7675, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7861, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9475, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0439, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0575, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2005, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5541, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2761, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7612, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 165, global step 1991: loss was not in top 1


tensor(3.7644, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7379, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0961, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7254, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0815, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7154, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8395, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0423, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8180, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2113, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8829, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0974, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 166, global step 2003: loss was not in top 1


tensor(3.8842, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7301, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0255, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6875, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4481, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8779, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3317, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1508, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7476, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9410, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7662, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5867, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 167, global step 2015: loss was not in top 1


tensor(3.7709, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3306, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4846, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9458, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0624, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8698, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0562, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7253, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9914, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3845, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1021, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9894, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 168, global step 2027: loss was not in top 1


tensor(3.9779, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9241, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6376, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5029, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8770, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9740, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7202, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3851, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8944, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4739, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6711, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4394, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 169, global step 2039: loss was not in top 1


tensor(3.9070, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4548, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6686, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6433, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0796, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1948, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6025, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7222, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9344, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8666, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4677, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 170, global step 2051: loss was not in top 1


tensor(3.3253, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8929, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0300, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1825, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9309, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6104, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4050, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5080, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8553, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8359, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5915, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6324, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 171, global step 2063: loss was not in top 1


tensor(1.7784, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8324, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9393, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0443, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4925, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9649, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3904, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5646, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1363, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0067, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6320, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6534, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 172, global step 2075: loss was not in top 1


tensor(3.8829, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6237, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3858, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9953, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0973, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1375, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3866, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8937, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9912, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9179, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7085, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8166, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 173, global step 2087: loss was not in top 1


tensor(4.0757, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7712, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0290, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7433, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8263, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8504, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1483, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7827, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5291, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2703, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8419, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8050, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 174, global step 2099: loss was not in top 1


tensor(3.7958, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9888, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9676, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3198, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0467, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0545, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0367, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9571, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8286, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0134, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9430, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9485, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 175, global step 2111: loss was not in top 1


tensor(3.3491, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0876, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3278, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8108, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6665, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1010, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9795, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9930, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7927, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8554, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9721, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3195, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 176, global step 2123: loss was not in top 1


tensor(4.0989, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.2516, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9920, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8987, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5872, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0914, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0420, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8820, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8713, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2791, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2712, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6313, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 177, global step 2135: loss was not in top 1


tensor(4.1183, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0484, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0866, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6401, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9643, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8823, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5124, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7932, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9330, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7995, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5324, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4687, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 178, global step 2147: loss was not in top 1


tensor(4.0029, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1301, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9802, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5707, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8180, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6888, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8035, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9881, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7354, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9465, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3999, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8500, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 179, global step 2159: loss was not in top 1


tensor(4.0040, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4161, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9857, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7166, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3137, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4769, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9931, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9968, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3694, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4797, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9195, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3377, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 180, global step 2171: loss was not in top 1


tensor(2.0968, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8075, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1208, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8152, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1313, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8388, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5162, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7942, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0422, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9319, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4199, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0720, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 181, global step 2183: loss was not in top 1


tensor(2.4645, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9932, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8501, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0846, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8209, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6409, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1442, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8785, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9927, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2369, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9998, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4724, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 182, global step 2195: loss was not in top 1


tensor(3.9307, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2459, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2448, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0157, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5017, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5440, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7431, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2350, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8740, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2655, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8582, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7668, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 183, global step 2207: loss was not in top 1


tensor(3.7759, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3828, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9380, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8659, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0341, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4980, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9371, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1883, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7648, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9587, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7634, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8642, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 184, global step 2219: loss was not in top 1


tensor(3.9114, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8872, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7878, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2826, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2576, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6960, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9162, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7609, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5544, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6359, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8052, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7559, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 185, global step 2231: loss was not in top 1


tensor(4.0048, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5910, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8628, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8830, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9909, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2520, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6912, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4133, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0165, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8529, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0804, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9752, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 186, global step 2243: loss was not in top 1


tensor(3.7137, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9178, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8405, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6961, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9940, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8783, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8292, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7338, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3265, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7424, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7811, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3839, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 187, global step 2255: loss was not in top 1


tensor(3.7432, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0205, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0313, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9330, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3078, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8586, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0586, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2009, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9233, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7147, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9718, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7546, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 188, global step 2267: loss was not in top 1


tensor(3.7606, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5432, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8676, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7580, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6465, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9340, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9221, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4954, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1525, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8939, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7662, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8827, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 189, global step 2279: loss was not in top 1


tensor(3.4807, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9579, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0008, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9143, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8011, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8000, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9105, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1601, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8204, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6223, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7137, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4888, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 190, global step 2291: loss was not in top 1


tensor(3.9452, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9879, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0792, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5780, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7970, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7301, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1855, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2504, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9341, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5500, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9587, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7959, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 191, global step 2303: loss was not in top 1


tensor(3.1685, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7917, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1481, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6242, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7181, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8808, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7313, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8094, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0342, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7944, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5999, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5034, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 192, global step 2315: loss was not in top 1


tensor(3.9856, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5255, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9096, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7533, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2187, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7549, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1951, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1854, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3765, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0056, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2738, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 193, global step 2327: loss reached 2.96123 (best 2.96123), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=193-val-loss=0.00.ckpt" as top 1


tensor(2.3944, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9065, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0044, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6471, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7771, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9533, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5050, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3786, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9232, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7846, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9309, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6628, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 194, global step 2339: loss was not in top 1


tensor(3.3642, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3805, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2140, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6660, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3418, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5001, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4434, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9776, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3852, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9498, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8884, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2546, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 195, global step 2351: loss was not in top 1


tensor(3.9470, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3918, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6904, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8025, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9002, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7621, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8165, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8930, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1234, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6656, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1128, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5837, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 196, global step 2363: loss was not in top 1


tensor(3.8818, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7657, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1139, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7441, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4061, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9117, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8201, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8037, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4024, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0229, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7786, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9972, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 197, global step 2375: loss was not in top 1


tensor(3.7652, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3838, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8944, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6785, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8284, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5771, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8661, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0482, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8587, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9142, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8256, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8572, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 198, global step 2387: loss was not in top 1


tensor(3.3001, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3094, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6731, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7171, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9422, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7920, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6142, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4987, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4515, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8024, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1700, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7667, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 199, global step 2399: loss was not in top 1


tensor(3.8531, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1050, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9106, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5537, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6433, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6540, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4957, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0382, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8058, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6882, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4758, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 200, global step 2411: loss was not in top 1


tensor(3.4610, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2931, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9619, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7044, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5381, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0322, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7849, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4870, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5898, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8858, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0595, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6818, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 201, global step 2423: loss was not in top 1


tensor(3.1807, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9755, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3757, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7976, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4281, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6602, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0028, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0125, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7149, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1674, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6843, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9802, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 202, global step 2435: loss was not in top 1


tensor(3.4414, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2950, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8550, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2440, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1572, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6815, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6521, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4532, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9968, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3211, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1747, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8812, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 203, global step 2447: loss was not in top 1


tensor(3.4050, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9602, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6048, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4928, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8729, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1502, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2006, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7094, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8605, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6865, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.1022, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0431, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 204, global step 2459: loss was not in top 1


tensor(1.8267, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9815, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5389, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5917, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9011, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6165, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4161, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8651, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7033, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0303, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8518, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3914, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 205, global step 2471: loss was not in top 1


tensor(3.9971, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8976, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8974, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8017, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8014, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0012, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7993, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3933, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0957, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9028, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9053, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7927, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 206, global step 2483: loss was not in top 1


tensor(2.3307, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7915, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3101, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5458, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9315, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6855, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8797, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4054, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8622, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9122, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3897, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9511, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 207, global step 2495: loss was not in top 1


tensor(2.5698, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0988, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6337, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6764, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8582, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0055, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8285, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9885, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8934, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9502, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8960, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 208, global step 2507: loss was not in top 1


tensor(3.1861, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6096, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6448, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7166, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7376, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1814, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5643, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7519, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3326, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8965, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8509, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7790, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 209, global step 2519: loss was not in top 1


tensor(3.9847, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6990, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9408, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7116, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4929, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9528, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6556, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6460, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8541, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7892, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8932, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4964, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 210, global step 2531: loss was not in top 1


tensor(2.1268, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8080, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6971, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7816, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9378, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9195, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6018, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5247, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0975, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4587, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8265, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9375, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 211, global step 2543: loss reached 2.74396 (best 2.74396), saving model to "/content/drive/MyDrive/checkpoints8/i-Mix-pretrainepoch=211-val-loss=0.00.ckpt" as top 1


tensor(2.3579, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8666, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8267, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6012, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0035, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9604, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9813, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1642, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6905, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3520, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8527, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6340, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 212, global step 2555: loss was not in top 1


tensor(3.9087, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7841, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5850, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9051, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1719, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5414, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8447, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6030, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8403, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1979, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2428, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3214, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 213, global step 2567: loss was not in top 1


tensor(2.8413, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6628, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0954, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4926, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8372, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6465, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7515, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8621, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5874, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8131, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9600, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1800, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 214, global step 2579: loss was not in top 1


tensor(2.6042, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8577, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4958, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7547, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5747, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8032, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2380, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9757, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7793, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7406, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3827, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 215, global step 2591: loss was not in top 1


tensor(3.8521, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4382, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4796, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8478, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.5953, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7502, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7889, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0034, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5048, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7121, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8790, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4089, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 216, global step 2603: loss was not in top 1


tensor(3.5222, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8602, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8215, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6311, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8406, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7029, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4586, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7264, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6314, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8047, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8078, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6678, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 217, global step 2615: loss was not in top 1


tensor(3.9127, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6647, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8520, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7564, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2528, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6057, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7096, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5260, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0925, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6608, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8484, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8006, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 218, global step 2627: loss was not in top 1


tensor(2.1685, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0250, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9004, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0538, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1357, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7576, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8103, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5021, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9627, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8273, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8597, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9079, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 219, global step 2639: loss was not in top 1


tensor(2.4535, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4536, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3763, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7125, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8757, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6829, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8462, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6995, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2344, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8001, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5715, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3573, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 220, global step 2651: loss was not in top 1


tensor(3.7551, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7330, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0125, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6272, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0147, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7484, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1898, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7606, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3824, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3027, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4295, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7522, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 221, global step 2663: loss was not in top 1


tensor(2.8355, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1066, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6450, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9579, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3520, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8737, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8768, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5491, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8847, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3637, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9065, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7027, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 222, global step 2675: loss was not in top 1


tensor(3.6976, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5210, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9395, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8153, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8947, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8446, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6206, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9722, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8756, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5461, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7311, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9141, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 223, global step 2687: loss was not in top 1


tensor(3.7442, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5855, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3745, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5246, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8816, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7812, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9099, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9127, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6800, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5405, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1336, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 224, global step 2699: loss was not in top 1


tensor(3.9002, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0230, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3824, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4566, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7582, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8246, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9666, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3851, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2756, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7487, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8639, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8796, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 225, global step 2711: loss was not in top 1


tensor(3.5361, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9796, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3604, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5949, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5601, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0099, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6598, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5256, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8546, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7572, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8223, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8177, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 226, global step 2723: loss was not in top 1


tensor(3.8484, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9874, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2719, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0894, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2895, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8555, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3388, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8156, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6546, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7660, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1836, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9067, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 227, global step 2735: loss was not in top 1


tensor(3.8700, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4519, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4922, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3464, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7862, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2020, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6920, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2040, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2635, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4233, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8537, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7195, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 228, global step 2747: loss was not in top 1


tensor(2.2258, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5908, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8494, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6870, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2337, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7804, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6621, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7330, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4648, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1129, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5912, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6782, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 229, global step 2759: loss was not in top 1


tensor(3.7371, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8513, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7384, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7571, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9542, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5598, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8477, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8153, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6363, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6387, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1498, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4494, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 230, global step 2771: loss was not in top 1


tensor(3.9370, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6197, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5132, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6131, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4700, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9387, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2071, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8187, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3267, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7823, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8317, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4045, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 231, global step 2783: loss was not in top 1


tensor(3.8350, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9134, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7080, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1349, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5718, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4326, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8882, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7541, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8646, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7383, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7780, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4633, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 232, global step 2795: loss was not in top 1


tensor(3.7100, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6277, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5842, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7282, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7528, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2938, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8772, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7172, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6647, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8069, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3723, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5055, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 233, global step 2807: loss was not in top 1


tensor(3.3871, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9070, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6235, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7199, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5242, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7945, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3970, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0965, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5819, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8011, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6164, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3623, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 234, global step 2819: loss was not in top 1


tensor(2.2052, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8537, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8155, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9171, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9105, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9393, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6487, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4567, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1815, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9076, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6982, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4653, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 235, global step 2831: loss was not in top 1


tensor(2.3336, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4023, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6352, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6780, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2801, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1330, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7042, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9280, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1599, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3699, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2797, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1642, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 236, global step 2843: loss was not in top 1


tensor(3.9042, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7474, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2517, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5963, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8846, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4922, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5145, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2821, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7107, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7005, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7780, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 237, global step 2855: loss was not in top 1


tensor(3.5146, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9972, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1993, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5195, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6465, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2418, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4807, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3886, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6209, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5755, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3180, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6065, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 238, global step 2867: loss was not in top 1


tensor(3.8406, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8312, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4377, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0280, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7617, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7134, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4664, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9619, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6015, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7397, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8054, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3821, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 239, global step 2879: loss was not in top 1


tensor(3.6792, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6003, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7784, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6272, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9251, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7371, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1963, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0018, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6215, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8156, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9080, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8005, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 240, global step 2891: loss was not in top 1


tensor(3.4925, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8993, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0976, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5825, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8639, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4356, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3547, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6963, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2461, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8889, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9639, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9271, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 241, global step 2903: loss was not in top 1


tensor(2.8464, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8371, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6656, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8626, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7732, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7262, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4625, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2778, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2850, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8423, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1718, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9441, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 242, global step 2915: loss was not in top 1


tensor(3.7662, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7127, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1684, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8666, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3949, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9746, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6647, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8787, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8404, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7950, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8308, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5832, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 243, global step 2927: loss was not in top 1


tensor(3.7567, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8987, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4491, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5712, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8252, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9301, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9136, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0934, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5731, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0612, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9372, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7413, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 244, global step 2939: loss was not in top 1


tensor(2.1126, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2759, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6215, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2869, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8042, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4789, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3985, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7307, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9067, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7452, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8260, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3561, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 245, global step 2951: loss was not in top 1


tensor(3.1246, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6958, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0013, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6570, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8220, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5847, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0943, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8593, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5922, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4434, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0567, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5654, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 246, global step 2963: loss was not in top 1


tensor(3.4757, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0763, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1657, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8735, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7972, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8905, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5742, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9043, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8633, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7842, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6394, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7448, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 247, global step 2975: loss was not in top 1


tensor(3.5530, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6267, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7697, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8223, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0691, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4370, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8226, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3162, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8765, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7852, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8022, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5636, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 248, global step 2987: loss was not in top 1


tensor(1.8452, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4932, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8582, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4488, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9673, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6767, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7046, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2420, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7185, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6676, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8849, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6854, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 249, global step 2999: loss was not in top 1


tensor(3.9131, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9317, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8116, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7275, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8290, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6379, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6978, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6564, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.0135, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5948, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8827, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 250, global step 3011: loss was not in top 1


tensor(2.8755, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8801, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6091, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2841, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1714, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8254, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7671, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4143, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5137, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2721, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7936, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1721, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 251, global step 3023: loss was not in top 1


tensor(2.1343, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6288, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8878, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6855, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9388, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2794, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0472, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2606, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8928, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6495, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2884, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7089, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 252, global step 3035: loss was not in top 1


tensor(3.9398, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2876, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8134, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3435, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7981, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7097, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1928, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8682, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8564, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3233, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5136, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8330, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 253, global step 3047: loss was not in top 1


tensor(3.7003, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6965, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1575, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0025, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7668, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3223, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9100, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8690, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7877, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4133, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7776, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9047, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 254, global step 3059: loss was not in top 1


tensor(3.8428, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7060, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4522, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3994, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9069, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4170, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9835, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5510, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4746, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7168, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3599, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6141, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 255, global step 3071: loss was not in top 1


tensor(3.8491, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6265, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9730, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4967, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0108, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6872, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4990, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7476, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8293, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5040, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7720, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4104, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 256, global step 3083: loss was not in top 1


tensor(2.0533, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0244, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8782, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7346, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8097, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7158, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6805, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2325, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8971, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3973, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8943, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5602, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 257, global step 3095: loss was not in top 1


tensor(3.8054, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7810, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1574, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9107, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6013, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4839, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4769, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8925, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7514, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5229, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2470, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8975, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 258, global step 3107: loss was not in top 1


tensor(2.1400, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5830, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7818, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7990, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7169, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2792, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0493, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6643, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7756, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9984, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7979, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8966, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 259, global step 3119: loss was not in top 1


tensor(3.3872, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7565, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7442, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0594, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6141, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6867, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4624, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2509, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5823, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1310, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1423, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4473, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 260, global step 3131: loss was not in top 1


tensor(2.9633, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7274, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0459, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3353, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8370, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2330, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8609, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8673, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8965, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7848, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8320, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3721, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 261, global step 3143: loss was not in top 1


tensor(3.8070, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9145, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7329, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2754, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4928, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6616, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6163, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9265, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6256, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0634, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3216, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3779, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 262, global step 3155: loss was not in top 1


tensor(2.3762, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9391, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5202, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6591, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6370, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2621, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8613, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5691, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4697, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5494, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6946, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5612, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 263, global step 3167: loss was not in top 1


tensor(3.4160, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8086, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4888, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8504, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8780, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5052, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6579, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5589, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5857, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8691, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0045, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8926, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 264, global step 3179: loss was not in top 1


tensor(3.5681, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4832, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8880, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7846, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8142, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5624, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6167, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7690, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7192, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5391, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2993, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6792, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 265, global step 3191: loss was not in top 1


tensor(3.4446, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7042, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7183, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.8609, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5339, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7189, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5515, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6942, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4340, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6826, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3002, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8493, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 266, global step 3203: loss was not in top 1


tensor(3.8653, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4026, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2610, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2748, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8829, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4885, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7983, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7390, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1511, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5929, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8167, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7785, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 267, global step 3215: loss was not in top 1


tensor(3.6586, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9530, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3782, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3448, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7925, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2529, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6020, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9003, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2899, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6588, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6644, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0543, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 268, global step 3227: loss was not in top 1


tensor(1.5108, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4935, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8338, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.6295, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8894, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5485, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8134, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5070, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6297, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.8609, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6344, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1734, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 269, global step 3239: loss was not in top 1


tensor(2.4429, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9281, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2363, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7556, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7209, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7140, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7457, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8543, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9864, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0928, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8219, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.6563, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 270, global step 3251: loss was not in top 1


tensor(3.8666, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.7496, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.9037, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5499, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4837, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0446, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7711, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8211, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7252, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3913, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.3873, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5082, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 271, global step 3263: loss was not in top 1


tensor(3.4017, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9960, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.8576, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0680, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4086, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7407, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2262, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7593, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2185, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1777, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.5932, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0065, device='cuda:0', grad_fn=<AddBackward0>)


Epoch 272, global step 3275: loss was not in top 1


tensor(3.8510, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0498, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3959, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7930, device='cuda:0', grad_fn=<AddBackward0>)


#Test

In [None]:
trainer.test(ckpt_path='best',
                    test_dataloaders=test_dataloader,
                    )


#Tensor Board

In [None]:
# Start tensorboard.
%reload_ext tensorboard
%tensorboard --logdir ./tb_logs/

#Configs 

In [None]:
from argparse import Namespace
args=Namespace(
    # experiment parameters
    experiment = 'pre-train', # {'pre-train', 'fine-tune'}

    pretrained_checkpoint = None, #'/content/drive/MyDrive/checkpoints6/i-Mix-NpairFineTuneepoch=341-loss=2.67.ckpt',
    

    #path to data
    path='/content/covtype.data',

    #Parameters for the model 
    hid_dim=2048,
    input_dim=54,
    head_dim=128,
    num_classes=7,
    pool_size=4,
    
    #Parameters for mixup
    alpha=2,

    # parameter for contrastive loss
    t = 0.2,


    # parameters for training
    batch_size=512,
    learningRate = 0.125,
    weight_decay = 0.0001,
    momentum=0.99,
    freeze_encoder = True, # freeze transformer layer

    num_epochs = 400
    no_of_gpus = -1,
    seed = 123,
    resume_checkpoint = None,
    monitor = 'val_loss', # {val_loss, val_auroc_epoch}
    monitor_mode = 'min' # {max, min}
    resume_checkpoint = None

   
)