In [1]:
import os
import sys
import wandb
import argparse
import numpy as np


sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../")))
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "")))
import torch
import torchvision.transforms as T
import torchvision

from dataloaders.dataloader_cifar10 import get_cifar10
from dataloaders.dataloader_cifar100 import get_cifar100
from utils.eval_metrics import linear_evaluation, get_t_SNE_plot
from models.linear_classifer import LinearClassifier
from models.ssl import  SimSiam, Siamese, Encoder, Predictor

from trainers.train_simsiam import train_simsiam
from trainers.train_infomax import train_infomax
from trainers.train_barlow import train_barlow

from trainers.train_PFR import train_PFR_simsiam
from trainers.train_PFR_contrastive import train_PFR_contrastive_simsiam
from trainers.train_contrastive import train_contrastive_simsiam
from trainers.train_ering import train_ering_simsiam

from torchsummary import summary
import random
from utils.lr_schedulers import LinearWarmupCosineAnnealingLR, SimSiamScheduler
from utils.eval_metrics import Knn_Validation_cont
from copy import deepcopy
from loss import invariance_loss,CovarianceLoss,ErrorCovarianceLoss
import torch.nn as nn
import time
import torch.nn.functional as F
import wandb
import torch
import numpy as np
from copy import deepcopy
import torch.nn.functional as F

import torch.nn as nn

from utils.lr_schedulers import LinearWarmupCosineAnnealingLR, SimSiamScheduler
from utils.eval_metrics import Knn_Validation_cont
from copy import deepcopy
from loss import invariance_loss,CovarianceLoss,ErrorCovarianceLoss,BarlowTwinsLoss
from utils.lars import LARS

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class GaussianBlur(object):
    """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709"""

    def __init__(self, sigma=[0.1, 2.0]):
        self.sigma = sigma

    def __call__(self, x):
        sigma = random.uniform(self.sigma[0], self.sigma[1])
        x = torchvision.transforms.functional.gaussian_blur(x,kernel_size=[3,3],sigma=sigma)#kernel size and sigma are open problems but right now seems ok!
        return x


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

In [3]:
class Args():
    normalization = 'batch'
    weight_standard = True
    same_lr = False
    pretrain_batch_size = 512
    pretrain_warmup_epochs = 10
    pretrain_warmup_lr = 3e-3
    pretrain_base_lr = 0.03
    pretrain_momentum = 0.9
    pretrain_weight_decay = 5e-4
    min_lr = 0.00
    lambdap = 1.0
    appr = 'barlow_PFR'
    knn_report_freq = 10
    cuda_device = 4
    num_workers = 8
    contrastive_ratio = 0.001
    dataset = 'cifar100'
    class_split = [25,25,25,25]
    epochs = [500,500,500,500]
    cov_loss_weight = 1.0
    sim_loss_weight = 250.0
    info_loss = 'invariance'
    lambda_norm = 1.0
    subspace_rate = 0.99
    lambda_param = 5e-3
    bsize = 32
    msize = 150
    proj_hidden = 2048
    proj_out = 2048 #infomax
    pred_hidden = 512
    pred_out = 2048
    scale_loss = 0.1
    contrastive_ratio = 0.1



In [4]:
args = Args()

In [5]:
if args.dataset == "cifar10":
        get_dataloaders = get_cifar10
        num_classes=10
elif args.dataset == "cifar100":
    get_dataloaders = get_cifar100
    num_classes=100
assert sum(args.class_split) == num_classes
assert len(args.class_split) == len(args.epochs)

In [6]:
num_worker = args.num_workers
#device
device = torch.device("cuda:" + str(args.cuda_device) if torch.cuda.is_available() else "cpu")
print(device)

cuda:4


In [7]:
#wandb init
wandb.init(project="CSSL",  entity="yavuz-team",
            mode="disabled",
            config=args,
            name= str(args.dataset) + '-algo' + str(args.appr) + "-e" + str(args.epochs) + "-b" 
            + str(args.pretrain_batch_size) + "-lr" + str(args.pretrain_base_lr)+"-CS"+str(args.class_split))

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




In [8]:
if 'infomax' in args.appr or 'barlow' in args.appr:
    transform = T.Compose([
            T.RandomResizedCrop(size=32, scale=(0.2, 1.0)),
            T.RandomHorizontalFlip(),
            T.RandomApply(torch.nn.ModuleList([T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)]), p=0.8),
            T.RandomGrayscale(p=0.2),
            T.RandomApply([GaussianBlur()], p=0.5), 
            T.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261])])

    transform_prime = T.Compose([
            T.RandomResizedCrop(size=32, scale=(0.2, 1.0)),
            T.RandomHorizontalFlip(),
            T.RandomApply(torch.nn.ModuleList([T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)]), p=0.8),
            T.RandomGrayscale(p=0.2),
            T.RandomApply([GaussianBlur()], p=0.5), 
            T.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261])])

In [9]:
#Dataloaders
print("Creating Dataloaders..")
#Class Based
train_data_loaders, train_data_loaders_knn, test_data_loaders, _, train_data_loaders_linear, train_data_loaders_pure  = get_dataloaders(transform, transform_prime, \
                                    classes=args.class_split, valid_rate = 0.00, batch_size=args.pretrain_batch_size, seed = 0, num_worker= num_worker)
_, train_data_loaders_knn_all, test_data_loaders_all, _, train_data_loaders_linear_all, train_data_loaders_pure_all = get_dataloaders(transform, transform_prime, \
                                        classes=[num_classes], valid_rate = 0.00, batch_size=args.pretrain_batch_size, seed = 0, num_worker= num_worker)


Creating Dataloaders..


Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [10]:
device = torch.device("cuda:" + str(args.cuda_device) if torch.cuda.is_available() else "cpu")
print(device)
if 'infomax' in args.appr or 'barlow' in args.appr:
    proj_hidden = args.proj_hidden
    proj_out = args.proj_out
    encoder = Encoder(hidden_dim=proj_hidden, output_dim=proj_out, normalization = args.normalization, weight_standard = args.weight_standard,appr_name =args.appr)
    model = Siamese(encoder)
    model.to(device) #automatically detects from model

cuda:4


In [11]:
model.temporal_projector = nn.Sequential(
            nn.Linear(args.proj_out, args.proj_hidden, bias=False),
            nn.BatchNorm1d(args.proj_hidden),
            nn.ReLU(),
            nn.Linear(args.proj_hidden, args.proj_out),
        ).to(device)

In [12]:
def loss_fn(x, y):
    x = F.normalize(x, dim=-1, p=2)
    y = F.normalize(y, dim=-1, p=2)
    return  - (x * y).sum(dim=-1).mean()

def get_linear_vector(model, loader, rate=0.99,device = None, task=None):
    model.eval()
    outs = []
    for x,y in loader:
        x = x.to(device)
        out = model(x).cpu().detach().numpy()
        outs.append(out)
        
    outs = np.concatenate(outs)
    outs = outs.transpose()
    outs = torch.tensor(outs)

    

    remaining = outs
    U, S, V = torch.svd(remaining)
    for i in range(len(S)):
        total = torch.norm(outs)**2 
        hand =  torch.norm(S[0:i+1])**2
        
        if hand / total > rate:
            break

    print(U[:,0:i+1].shape)

    
    Q = U[:,0:i+1]
    Q_weighted = Q * S[0:i+1].reshape(1,-1)

    vector = torch.mean(Q_weighted,dim=1)
    vector = torch.nn.functional.normalize(vector,dim=0)
   

    return vector

def extract_subspace(model, loader, rate=0.99,device = None, Q_prev = None, task=None):
    model.eval()
    outs = []
    for x,y in loader:
        x = x.to(device)
        out = model(x).cpu().detach().numpy()
        outs.append(out)
        
    outs = np.concatenate(outs)
    outs = outs.transpose()
    outs = torch.tensor(outs)

    if Q_prev == None:
        projected = torch.zeros(1)
    else:
        Q_prev = Q_prev.to('cpu')
        projected = Q_prev  @ Q_prev.T @ outs 

    remaining = outs - projected
    U, S, V = torch.svd(remaining)
    for i in range(len(S)):
        total = torch.norm(outs)**2 
        hand =  torch.norm(projected)**2 + torch.norm(S[0:i+1])**2
        
        if hand / total > rate:
            break

    print(U[:,0:i+1].shape)

    if Q_prev == None:
        Q_prev = U[:,0:i+1]
    else:
        Q_prev = torch.cat((Q_prev, U[:,0:i+1]),dim=1)
        Q_prev, _ = torch.linalg.qr(Q_prev, mode="reduced")
    wandb.log({"Task": task, "LRD Space Used ": Q_prev.shape[1]/Q_prev.shape[0] })  
    print(Q_prev.shape)
    return Q_prev

In [17]:
def update_memory(memory_x, memory_y, dataloader, size, task):
    indices = np.random.choice(len(dataloader.dataset), size=size, replace=False)
    x, _ =  dataloader.dataset[indices]
    memory_x = torch.cat((memory_x, x), dim=0)
    y = torch.ones(x.shape[0],dtype=torch.long) * task
    memory_y = torch.cat((memory_y, y), dim=0)
    return memory_x, memory_y.to(dtype=torch.long)

def train_LRD_cross_barlow(model, train_data_loaders, knn_train_data_loaders, train_data_loaders_pure, test_data_loaders, device, args):#just for 2 tasks
    
    memory_x = torch.Tensor()
    memory_y = torch.Tensor()

    epoch_counter = 0
    criterion = nn.CosineSimilarity(dim=1)
    cross_loss = BarlowTwinsLoss(lambda_param= args.lambda_param, scale_loss =args.scale_loss)
    Q = None
    contrastive_classifier = None

    cross_loss = nn.CrossEntropyLoss()

    for task_id, loader in enumerate(train_data_loaders):
        # Optimizer and Scheduler
        model.task_id = task_id
        init_lr = args.pretrain_base_lr*args.pretrain_batch_size/256.
        if task_id != 0 and args.same_lr != True:
            init_lr = init_lr / 10

        project_dim = args.proj_out
        covarince_loss = CovarianceLoss(project_dim,device=device)

            
        # optimizer = torch.optim.SGD(model.parameters(), lr=init_lr, momentum=args.pretrain_momentum, weight_decay= args.pretrain_weight_decay)
        # scheduler = LinearWarmupCosineAnnealingLR(optimizer, warmup_epochs=args.pretrain_warmup_epochs , max_epochs=args.epochs[task_id],warmup_start_lr=args.pretrain_warmup_lr,eta_min=args.min_lr) #eta_min=2e-4 is removed scheduler + values ref: infomax paper
       
        optimizer = LARS(model.parameters(),lr=init_lr, momentum=args.pretrain_momentum, weight_decay= args.pretrain_weight_decay, eta=0.02, clip_lr=True, exclude_bias_n_norm=True)  
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs[task_id]) #eta_min=2e-4 is removed scheduler + values ref: infomax paper

        loss_ = []
        for epoch in range(args.epochs[task_id]):
            start = time.time()
            model.train()
            epoch_loss_task = []
            epoch_loss_kd = []
            epoch_loss_contrastive = []
            for x1, x2, y in loader:
                x1,x2 = x1.to(device), x2.to(device)
                f1 = model.encoder.backbone(x1).squeeze() # NxC
                f2 = model.encoder.backbone(x2).squeeze() # NxC


                if task_id > 0:
                    #samples from old tasks with labels (task ids)
                    indices = np.random.choice(len(memory_x), size=min(args.bsize, len(memory_x)), replace=False)
                    x_old = memory_x[indices].to(device)
                    y_old = memory_y[indices].to(device)
                    #samples from the new task
                    indices = np.random.choice(len(train_data_loaders_pure[task_id].dataset), size=args.bsize, replace=False)
                    x_new, _ =  train_data_loaders_pure[task_id].dataset[indices]
                    x_new = x_new.to(device)
                    y_new = torch.ones(x_new.shape[0],dtype=torch.long).to(device) * task_id          
                    #concatenate
                    x = torch.cat((x_old,x_new),dim=0)
                    y = torch.cat((y_old,y_new),dim=0)
                    

                    #pass from the model
                    encoded_vectors = model.encoder.backbone(x).squeeze() # NxC

                    #do classification loss
                    outputs = contrastive_classifier(encoded_vectors) 
                    contrastive_loss = cross_loss(outputs, y)

                    
                    f1_projected = f1 @ Q @ Q.T  
                    f2_projected = f2 @ Q @ Q.T

                else:
                    contrastive_loss = torch.tensor(0)



                z1 = model.encoder.projector(f1) # NxC
                z2 = model.encoder.projector(f2) # NxC

                z1 = F.normalize(z1, p=2)
                z2 = F.normalize(z2, p=2)

                loss_task = cross_loss(z1, z2) 

                if task_id != 0: #do Distillation
                    f1Old = oldModel(x1).squeeze().detach()
                    f2Old = oldModel(x2).squeeze().detach()

                    lossKD = (-(criterion(f1_projected, f1Old).mean() * 0.5
                                            + criterion(f2_projected, f2Old).mean() * 0.5) )
                else:
                    lossKD = torch.tensor(0)
                


                epoch_loss_task.append(loss_task.item())
                epoch_loss_kd.append(lossKD.item())
                epoch_loss_contrastive.append(contrastive_loss.item())
                
                if task_id > 0:
                    #tune the classifier (might be optional in the future)
                    contrastive_optimizer.zero_grad()
                optimizer.zero_grad()
                loss = loss_task +  args.lambdap * lossKD + args.contrastive_ratio * contrastive_loss
                loss.backward()

                if task_id > 0:
                    #tune the classifier (might be optional in the future)
                    contrastive_optimizer.step()
            
                optimizer.step() 
                    
            epoch_counter += 1
            scheduler.step()
            loss_.append(np.mean(epoch_loss_task))
            end = time.time()
            print('epoch end')
            if (epoch+1) % args.knn_report_freq == 0:
                knn_acc, task_acc_arr = Knn_Validation_cont(model, knn_train_data_loaders[:task_id+1], test_data_loaders[:task_id+1], device=device, K=200, sigma=0.5) 
                wandb.log({" Global Knn Accuracy ": knn_acc, " Epoch ": epoch_counter})
                for i, acc in enumerate(task_acc_arr):
                    wandb.log({" Knn Accuracy Task-"+str(i): acc, " Epoch ": epoch_counter})
                    print(f" Knn Accuracy Task- {str(i)} : {acc},  Epoch : {epoch_counter}")
                print(f'Task {task_id:2d} | Epoch {epoch:3d} | Time:  {end-start:.1f}s  | Loss: {np.mean(epoch_loss_task):.4f} | KDLoss: {np.mean(epoch_loss_kd):.4f} | Contrastive_Loss: {np.mean(epoch_loss_contrastive):.4f}   | Knn:  {knn_acc*100:.2f}')
                print(task_acc_arr)
            else:
                print(f'Task {task_id:2d} | Epoch {epoch:3d} | Time:  {end-start:.1f}s  | Loss: {np.mean(epoch_loss_task):.4f} | KDLoss: {np.mean(epoch_loss_kd):.4f} | Contrastive_Loss: {np.mean(epoch_loss_contrastive):.4f} ')
        
            wandb.log({" Average Training Loss ": np.mean(epoch_loss_task), " Epoch ": epoch_counter, " Average KD Loss ": np.mean(epoch_loss_kd) , " Average Contrastive Loss ": np.mean(epoch_loss_contrastive) })  
            wandb.log({" lr ": optimizer.param_groups[0]['lr'], " Epoch ": epoch_counter})
            

        oldModel = deepcopy(model.encoder.backbone)  # save t-1 model
        oldModel.to(device)
        oldModel.eval()
        for param in oldModel.parameters(): #Freeze old model
            param.requires_grad = False

        Q = None # each time make Q empty
        
        Q = extract_subspace(model, knn_train_data_loaders[task_id], rate= args.subspace_rate,device = device, Q_prev = Q, task=task_id)
        Q = Q.to(device)

        vec = get_linear_vector(model, knn_train_data_loaders[task_id], rate= args.subspace_rate, device = device, task=task_id)

        new_contrastive_classifier = nn.Linear(f1.shape[1], task_id+2, bias=False).to(device)

        with torch.no_grad():
            if contrastive_classifier != None:
                new_contrastive_classifier.weight[:task_id,:] = contrastive_classifier.weight[:task_id,:].detach().cpu()

            new_contrastive_classifier.weight[task_id,:] = vec.detach().cpu()
            new_contrastive_classifier.weight[task_id+1,:] = torch.nn.functional.normalize(new_contrastive_classifier.weight[task_id+1,:],dim=0)
            

        contrastive_classifier = new_contrastive_classifier
        contrastive_optimizer = torch.optim.SGD(contrastive_classifier.parameters(), lr=0.001)

        memory_x, memory_y = update_memory(memory_x,memory_y, train_data_loaders_pure[task_id], args.msize, task_id)

        

    return model, loss_, optimizer

In [18]:
class Args():
    normalization = 'batch'
    weight_standard = True
    same_lr = False
    pretrain_batch_size = 512
    pretrain_warmup_epochs = 10
    pretrain_warmup_lr = 3e-3
    pretrain_base_lr = 0.03
    pretrain_momentum = 0.9
    pretrain_weight_decay = 5e-4
    min_lr = 0.00
    lambdap = 1.0
    appr = 'barlow_PFR'
    knn_report_freq = 5
    cuda_device = 4
    num_workers = 8
    contrastive_ratio = 0.001
    dataset = 'cifar100'
    class_split = [25,25,25,25]
    epochs = [5,5,500,500]
    cov_loss_weight = 1.0
    sim_loss_weight = 250.0
    info_loss = 'invariance'
    lambda_norm = 1.0
    subspace_rate = 0.99
    lambda_param = 5e-3
    bsize = 32
    msize = 150
    proj_hidden = 2048
    proj_out = 2048 #infomax
    pred_hidden = 512
    pred_out = 2048
    scale_loss = 0.1

In [19]:
args = Args()

In [20]:
model, loss, optimizer = train_LRD_cross_barlow(model, train_data_loaders, train_data_loaders_knn,train_data_loaders_pure , test_data_loaders, device, args)

epoch end
Task  0 | Epoch   0 | Time:  13.2s  | Loss: -343.9242 | KDLoss: 0.0000 | Contrastive_Loss: 0.0000 
epoch end
Task  0 | Epoch   1 | Time:  13.2s  | Loss: -344.2932 | KDLoss: 0.0000 | Contrastive_Loss: 0.0000 
epoch end
Task  0 | Epoch   2 | Time:  13.1s  | Loss: -343.8786 | KDLoss: 0.0000 | Contrastive_Loss: 0.0000 
epoch end
Task  0 | Epoch   3 | Time:  13.3s  | Loss: -344.4109 | KDLoss: 0.0000 | Contrastive_Loss: 0.0000 
epoch end
 Knn Accuracy Task- 0 : 0.1444,  Epoch : 5
Task  0 | Epoch   4 | Time:  13.4s  | Loss: -344.4367 | KDLoss: 0.0000 | Contrastive_Loss: 0.0000   | Knn:  14.44
[0.1444]
torch.Size([512, 3])
torch.Size([512, 3])
torch.Size([512, 3])
epoch end
Task  1 | Epoch   0 | Time:  17.2s  | Loss: -344.3045 | KDLoss: -0.9721 | Contrastive_Loss: 2.4481 
epoch end
Task  1 | Epoch   1 | Time:  17.4s  | Loss: -343.8689 | KDLoss: -0.9385 | Contrastive_Loss: 1.7123 
epoch end
Task  1 | Epoch   2 | Time:  17.2s  | Loss: -344.0959 | KDLoss: -0.9354 | Contrastive_Loss: 1.5

RuntimeError: The expanded size of the tensor (1) must match the existing size (2) at non-singleton dimension 0.  Target sizes: [1, 512].  Tensor sizes: [2, 512]

In [None]:
if 'infomax' in args.appr or 'barlow' in args.appr:
    proj_hidden = args.proj_hidden
    proj_out = args.proj_out
    encoder = Encoder(hidden_dim=proj_hidden, output_dim=proj_out, normalization = args.normalization, weight_standard = args.weight_standard,appr_name=args.appr)
    old_model = Siamese(encoder)
    old_model.to(device) #automatically detects from model

In [None]:
old_model.load_state_dict(dict['state_dict'])

<All keys matched successfully>

In [None]:
Q = extract_subspace(old_model, train_data_loaders_knn[0], rate= args.subspace_rate,device = device,Q_prev = None)
old_model = None

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:7 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)

In [None]:
device = 'cpu'
Q = Q.to(device)
model.eval()
model.to(device)
for x1, x2, y in train_data_loaders[1]:
    x1,x2 = x1.to(device), x2.to(device)
    f1 = model.encoder.backbone(x1).squeeze() # NxC
    f2 = model.encoder.backbone(x2).squeeze() # NxC

    if Q != None:#let's do projection
        f1_projected = f1 @ Q @ Q.T  
        f2_projected = f2 @ Q @ Q.T

        f1 = f1 - f1_projected
        f2 = f2 - f2_projected

        norm_loss_1 = torch.norm(f1_projected,dim =1) / (torch.norm(f1,dim =1) + 0.0000001) 
        norm_loss_1 = torch.mean(norm_loss_1)

        norm_loss_2 = torch.norm(f2_projected,dim =1) / (torch.norm(f2,dim =1) + 0.0000001) 
        norm_loss_2 = torch.mean(norm_loss_2)

        loss_norm = (norm_loss_1 + norm_loss_2) / 2
        print(loss_norm)

tensor(0.9881, grad_fn=<DivBackward0>)


KeyboardInterrupt: 

In [None]:
knn_acc, task_acc_arr = Knn_Validation_cont(model, train_data_loaders_knn[:1], test_data_loaders[:task_id+1], device=device, K=200, sigma=0.5) 

In [None]:
torch.cuda.empty_cache()

In [None]:
#Test Linear classification acc
print("Starting Classifier Training..")
lin_epoch = 100
if args.dataset == 'cifar10':
    classifier = LinearClassifier(num_classes = 10).to(device)
elif args.dataset == 'cifar100':
    classifier = LinearClassifier(num_classes = 100).to(device)

lin_optimizer = torch.optim.SGD(classifier.parameters(), 0.1, momentum=0.9) # Infomax: no weight decay, epoch 100, cosine scheduler
lin_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(lin_optimizer, lin_epoch, eta_min=2e-4) #scheduler + values ref: infomax paper
test_loss, test_acc1, test_acc5, classifier = linear_evaluation(model, train_data_loaders_knn_all[0],
                                                                    test_data_loaders_all[0],lin_optimizer, classifier, 
                                                                    lin_scheduler, epochs=lin_epoch, device=device) 


Starting Classifier Training..


Lin.Train Epoch: [1] Loss: 1.9388 : 100%|██████████| 98/98 [00:09<00:00, 10.29it/s]
Lin.Test Epoch: [1] Loss: 1.7908 ACC@1: 63.40% ACC@5: 80.53% : 100%|██████████| 20/20 [00:02<00:00,  6.98it/s]
Lin.Train Epoch: [2] Loss: 1.6584 : 100%|██████████| 98/98 [00:09<00:00, 10.25it/s]
Lin.Test Epoch: [2] Loss: 1.6033 ACC@1: 69.04% ACC@5: 85.03% : 100%|██████████| 20/20 [00:02<00:00,  7.09it/s]
Lin.Train Epoch: [3] Loss: 1.4899 : 100%|██████████| 98/98 [00:09<00:00, 10.60it/s]
Lin.Test Epoch: [3] Loss: 1.4646 ACC@1: 71.50% ACC@5: 87.95% : 100%|██████████| 20/20 [00:02<00:00,  6.97it/s]
Lin.Train Epoch: [4] Loss: 1.3616 : 100%|██████████| 98/98 [00:09<00:00, 10.46it/s]
Lin.Test Epoch: [4] Loss: 1.3554 ACC@1: 72.52% ACC@5: 90.94% : 100%|██████████| 20/20 [00:02<00:00,  7.05it/s]
Lin.Train Epoch: [5] Loss: 1.2603 : 100%|██████████| 98/98 [00:09<00:00, 10.20it/s]
Lin.Test Epoch: [5] Loss: 1.2686 ACC@1: 73.70% ACC@5: 92.30% : 100%|██████████| 20/20 [00:02<00:00,  6.93it/s]
Lin.Train Epoch: [6] Loss

KeyboardInterrupt: 