In [None]:
from __future__ import print_function
import os
import time
import logging
import random
import argparse
import numpy as np
from visdom import Visdom
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from utils import *
from metric.loss import FitNet, AttentionTransfer, RKdAngle, RkdDistance

# Teacher models:
# VGG11/VGG13/VGG16/VGG19, GoogLeNet, AlxNet, ResNet18, ResNet34, 
# ResNet50, ResNet101, ResNet152, ResNeXt29_2x64d, ResNeXt29_4x64d, 
# ResNeXt29_8x64d, ResNeXt29_32x64d, PreActResNet18, PreActResNet34, 
# PreActResNet50, PreActResNet101, PreActResNet152, 
# DenseNet121, DenseNet161, DenseNet169, DenseNet201, 
import models

# Student models:
# myNet, LeNet, FitNet

start_time = time.time()

# Training settings
# Help texts render their default through argparse's ``%(default)s`` so the
# documented default cannot drift away from the real one.
parser = argparse.ArgumentParser(description='PyTorch LR_adaptive_AT')

parser.add_argument('--dataset',
                    choices=['CIFAR10',
                             'CIFAR100'
                            ],
                    default='CIFAR10',
                    help='torchvision dataset to train on (default: %(default)s)')
# One or more teacher architectures; each name is looked up in ``models``.
parser.add_argument('--teachers',
                    choices=['ResNet32',
                             'ResNet44',
                             'ResNet50',
                             'ResNet56',
                             'ResNet110'
                            ],
                    default=['ResNet44', 'ResNet56', 'ResNet110'],
                    nargs='+',
                    help='teacher architectures (default: %(default)s)')
parser.add_argument('--student',
                    choices=['ResNet20',
                             'myNet'
                            ],
                    default='ResNet20',
                    help='student architecture (default: %(default)s)')
parser.add_argument('--kd_ratio', default=0.7, type=float,
                    help='weight of the distillation term in the adapter loss (default: %(default)s)')
parser.add_argument('--n_class', type=int, default=10, metavar='N', help='num of classes')
parser.add_argument('--T', type=float, default=20.0, metavar='TEMPERATURE',
                    help='temperature for distillation (default: %(default)s)')
parser.add_argument('--batch_size', type=int, default=128, metavar='N', help='input batch size for training')
parser.add_argument('--test_batch_size', type=int, default=128, metavar='N', help='input batch size for testing')
parser.add_argument('--epochs', type=int, default=20, metavar='N',
                    help='number of epochs to train (default: %(default)s)')
parser.add_argument('--lr', type=float, default=0.1, metavar='LR',
                    help='learning rate (default: %(default)s)')
parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                    help='SGD momentum (default: %(default)s)')
parser.add_argument('--device', default='cuda:1', type=str, help='device: cuda or cpu')
parser.add_argument('--print_freq', type=int, default=10, metavar='N', help='how many batches to wait before logging training status')

# Notebook usage: the command line is supplied as an explicit list instead of
# being read from sys.argv.
config = ['--epochs', '200', '--teachers', 'ResNet32', 'ResNet56', 'ResNet110', '--T', '5.0', '--device', 'cuda:1']
args = parser.parse_args(config)

# Fall back to the CPU when no CUDA device is available.
device = args.device if torch.cuda.is_available() else 'cpu'
load_dir = './checkpoint/' + args.dataset + '/'

# teachers model
# Every teacher is built from `models`, restored from
# <load_dir>/<model_name>.pth, moved to `device` and put in eval mode.
teacher_models = []
for te in args.teachers:
    te_model = getattr(models, te)(num_classes=args.n_class)
    # map_location makes checkpoints that were saved from a GPU loadable when
    # this run ends up on a different device (e.g. the CPU fallback above).
    te_state = torch.load(load_dir + te_model.model_name + '.pth', map_location=device)
    te_model.load_state_dict(te_state)
    te_model.to(device)
    te_model.eval()  # eval mode
    teacher_models.append(te_model)

# NOTE(review): the student is built with its default constructor arguments,
# i.e. args.n_class is not forwarded here as it is for the teachers -- confirm
# the default class count matches the dataset.
st_model = getattr(models, args.student)()  # args.student()
st_model.to(device)

# logging
# Start every run with a fresh log file.
logfile = load_dir + 'adapter2_distill_' + st_model.model_name + '.log'
if os.path.exists(logfile):
    os.remove(logfile)
def log_out(info):
    """Append `info` as one line to the run's log file and echo it to stdout.

    Args:
        info: text to record; a newline is appended in the file.
    """
    # The context manager closes the handle even if a write fails.
    with open(logfile, mode='a') as f:
        f.write(info)
        f.write('\n')
    print(info)
    
# visualizer
# Two Visdom line plots in the 'distill' environment; both are created with a
# placeholder point at the origin and are appended to later.
vis = Visdom(env='distill')

# Training-loss curve (single trace).
loss_win = vis.line(
    X=np.array([0]),
    Y=np.array([0]),
    opts={
        'title': 'adapter + multi-AT Loss',
        'xlabel': 'epoch',
        'xtickmin': 0,
        'ylabel': 'loss',
        'ytickmin': 0,
        'ytickstep': 0.5,
    },
    name="loss",
)

# Accuracy curves: two traces (train / test) sharing one window.
acc_win = vis.line(
    X=np.column_stack((0, 0)),
    Y=np.column_stack((0, 0)),
    opts={
        'title': 'adapter + multi-AT Acc',
        'xlabel': 'epoch',
        'xtickmin': 0,
        'ylabel': 'accuracy',
        'ytickmin': 0,
        'ytickmax': 100,
        'legend': ['train_acc', 'test_acc'],
    },
    name="acc",
)

# get triplets
def random_triplets(st_maps, te_maps):
    """Sample random (anchor, positive, negative) triplets of student maps.

    One triplet is drawn per sample in the batch. Three distinct batch indices
    are picked at random; the first is the anchor. Of the other two, the one
    whose teacher map has the smaller squared Euclidean distance to the
    teacher anchor becomes the positive, the other the negative.

    Args:
        st_maps: student feature maps, one entry per sample along dim 0.
        te_maps: teacher feature maps, indexed with the same batch indices.

    Returns:
        Tensor of shape [3, N, ...] with N = st_maps.size(0); index 0, 1 and 2
        along dim 0 select the anchors, positives and negatives.

    Raises:
        ValueError: raised by random.sample when the batch has fewer than
            three samples.
    """
    # input: t1, t2 - triplet pair
    def triplet_distance(t1, t2):
        return (t1 - t2).pow(2).sum()

    triplet_set_size = st_maps.size(0)
    batch_indices = range(triplet_set_size)
    st_triplet_list = []
    for _ in range(triplet_set_size):
        # the 1st sampled index is the anchor
        anchor, positive, negative = random.sample(batch_indices, 3)
        distance_01 = triplet_distance(te_maps[anchor], te_maps[positive])
        distance_02 = triplet_distance(te_maps[anchor], te_maps[negative])
        if distance_01 > distance_02:
            # Swap the *indices*. Swapping rows of the gathered tensor with a
            # tuple assignment operates on views and would leave both rows
            # holding the same data.
            positive, negative = negative, positive
        st_triplet_list.append(st_maps[[anchor, positive, negative]])

    return torch.stack(st_triplet_list, dim=1)


# adapter model
class Adapter():
    """Per-sample weighting of several teachers' soft targets.

    Holds two trainable tensors: `theta` (one row per teacher) and `W`
    (one column). `forward` turns a student feature map into softmax weights
    over the teachers and returns the weighted sum of the teachers' outputs.
    `loss` combines a KL term against those targets with cross-entropy and,
    until `eval()` is called, an L2 penalty on `theta` and `W`.
    """

    def __init__(self, in_models, pool_size):
        """Create the adapter parameters on the module-level `device`.

        Args:
            in_models: the teachers; only their count is used.
            pool_size: size of the student feature map fed to `forward`;
                index 1 is taken as the channel count and index 2 as the
                spatial width.
        """
        # representations of teachers
        pool_ch = pool_size[1]  # 64
        pool_w = pool_size[2]   # 8
        # NOTE: this reseeds torch's global RNG, so it also affects every
        # random draw made after the adapter is constructed.
        torch.manual_seed(1)
        self.theta = torch.randn(len(in_models), pool_ch).to(device)  # [n_teachers, pool_ch]
        self.theta.requires_grad_(True)

        # Max-pool over a pool_w x pool_w window.
        self.max_feat = nn.MaxPool2d(kernel_size=(pool_w, pool_w), stride=pool_w).to(device)
        self.W = torch.randn(pool_ch, 1).to(device)
        self.W.requires_grad_(True)
        self.val = False

    def loss(self, y, labels, weighted_logits, T=10.0, alpha=0.7):
        """Return the distillation loss for student logits `y`.

        Args:
            y: student logits; softmax is taken over dim 1.
            labels: ground-truth class indices for the cross-entropy term.
            weighted_logits: target distribution produced by `forward`.
            T: temperature applied to `y` in the KL term.
            alpha: weight of the KL term; cross-entropy gets (1 - alpha).

        Returns:
            Scalar loss tensor. While not in eval mode, 0.1 times the summed
            squares of `W` and `theta` is added.
        """
        # KLDivLoss keeps its default reduction so the loss scale is unchanged.
        kd_term = nn.KLDivLoss()(F.log_softmax(y / T, dim=1), weighted_logits) * (T * T * 2.0 * alpha)
        ce_term = F.cross_entropy(y, labels) * (1. - alpha)
        ls = kd_term + ce_term
        if not self.val:
            ls = ls + 0.1 * (torch.sum(self.W * self.W) + torch.sum(self.theta * self.theta))
        return ls

    def gradient(self, lr=0.01):
        """Apply one plain gradient-descent step to `W` and clear its gradient."""
        with torch.no_grad():
            self.W -= lr * self.W.grad
            # Manually zero the gradients after updating weights
            self.W.grad.zero_()

    def eval(self):
        """Freeze the adapter: drop the L2 penalty and stop tracking gradients.

        `Tensor.detach()` returns a new tensor and leaves the original
        untouched, so the parameters are frozen via `requires_grad_(False)`.
        """
        self.val = True
        self.theta.requires_grad_(False)
        self.W.requires_grad_(False)

    # input size: [64, 8, 8], [128, 3, 10]
    def forward(self, conv_map, te_logits_list):
        """Blend the teachers' outputs with sample-specific weights.

        Args:
            conv_map: student feature map, [batch, channels, H, W]; after the
                max-pool it must have `theta.size(1)` features per sample.
            te_logits_list: stacked teacher outputs,
                [batch, n_teachers, n_classes].

        Returns:
            Tensor [batch, n_classes]: the teacher outputs summed with
            per-sample softmax weights.
        """
        beta = self.max_feat(conv_map)
        # Flatten explicitly: a bare squeeze() would also drop the batch
        # dimension when the batch holds a single sample.
        beta = beta.flatten(1)  # [batch, channels]

        # One score per (teacher, sample): (beta * theta_t) @ W.
        alpha = torch.stack([(beta * t).mm(self.W) for t in self.theta], dim=0)  # [n_teachers, batch, 1]
        # Squeeze only the trailing axis so a single teacher keeps its axis.
        alpha = alpha.squeeze(2).transpose(0, 1)  # [batch, n_teachers]
        miu = F.softmax(alpha, dim=1)  # weights over teachers
        miu = torch.unsqueeze(miu, dim=2)
        weighted_logits = miu * te_logits_list  # [batch, n_teachers, n_classes]
        weighted_logits = torch.sum(weighted_logits, dim=1)

        return weighted_logits

# adapter instance
# A dummy forward pass reveals the size of the student's 4th output, which
# fixes the adapter's channel count and max-pool window. No gradients are
# needed for a shape probe.
# NOTE(review): the probe is 128x128 while the training crops below are
# 32x32; if that output's spatial size depends on the input resolution the
# pooling window will not match the training maps -- confirm.
with torch.no_grad():
    _,_,_,pool_m,_ = st_model(torch.randn(1,3, 128, 128).to(device))  # get pool_size of student
# create adapter instance
adapter = Adapter(teacher_models, pool_m.size())


# data
# NOTE(review): these look like the usual ImageNet channel statistics rather
# than CIFAR ones; presumably they match what the teacher checkpoints were
# trained with -- confirm before changing.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, 4),
    transforms.ToTensor(),
    normalize,
])
test_transform = transforms.Compose([transforms.ToTensor(), normalize])
train_set = getattr(datasets, args.dataset)(root='../data', train=True, download=True, transform=train_transform)
test_set = getattr(datasets, args.dataset)(root='../data', train=False, download=False, transform=test_transform)
train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=args.test_batch_size, shuffle=False)
# optim
# Momentum comes from --momentum (default 0.9) instead of a hard-coded 0.9,
# so the command-line flag is actually honoured.
optimizer_W = optim.SGD([adapter.W], lr=args.lr, momentum=args.momentum)
optimizer_theta = optim.SGD([adapter.theta], lr=args.lr, momentum=args.momentum)
optimizer_sgd = optim.SGD(st_model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=5e-4)
# Student schedule uses milestones 100/150; the two adapter schedules use
# milestones 40/50 with MultiStepLR's default gamma.
lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer_sgd, gamma=0.1, milestones=[100, 150])
lr_scheduler2 = optim.lr_scheduler.MultiStepLR(optimizer_W, milestones=[40, 50])
lr_scheduler3 = optim.lr_scheduler.MultiStepLR(optimizer_theta, milestones=[40, 50])

# losses
dist_criterion = RkdDistance().to(device)
angle_criterion = RKdAngle().to(device)
# triplet loss
triplet_loss = nn.TripletMarginLoss(margin=0.2, p=2).to(device)


def train_adapter(n_epochs=70, model=st_model):
    """Jointly train the student and the adapter parameters.

    Every batch is run through the student and, without gradients, through
    all teachers. The loss is the sum of the adapter loss, the angle and
    distance criteria between the student output and the adapter's weighted
    teacher targets, and a triplet loss on the student's second feature map.
    The student optimizer and both adapter optimizers are stepped each batch.

    Args:
        n_epochs: number of passes over `train_loader`.
        model: the student network; must return five outputs, the 2nd, 4th
            and 5th being used as feature map, pooled map and logits.
    """
    print('Training adapter:')
    start_time = time.time()
    model.train()

    for ep in range(n_epochs):
        # NOTE(review): the schedulers are stepped at the start of the epoch,
        # before any optimizer step; current PyTorch documentation asks for
        # the opposite order. Left unchanged because moving it shifts the
        # milestones by one epoch -- confirm against the PyTorch version used.
        lr_scheduler2.step()
        lr_scheduler3.step()
        for images, target in train_loader:

            images, target = images.to(device), target.to(device)
            # compute outputs
            _, b2, _, pool, output = model(images)  # out_feat: 16, 32, 64, 64, -

            te_scores_list = []
            with torch.no_grad():
                for te in teacher_models:
                    _, t_b2, _, _, t_output = te(images)
                    # Temperature-softened class probabilities (dim 1 = classes).
                    te_scores_list.append(F.softmax(t_output / args.T, dim=1))
            te_scores_Tensor = torch.stack(te_scores_list, dim=1)  # size: [128, 3, 10]

            optimizer_sgd.zero_grad()
            optimizer_W.zero_grad()
            optimizer_theta.zero_grad()

            # t_b2 is the feature map of the LAST teacher in teacher_models;
            # only that teacher drives the triplet ordering.
            st_tripets = random_triplets(b2, t_b2)
            relation_loss = triplet_loss(st_tripets[0], st_tripets[1], st_tripets[2])

            weighted_logits = adapter.forward(pool, te_scores_Tensor)

            angle_loss = angle_criterion(output, weighted_logits)
            dist_loss = dist_criterion(output, weighted_logits)
            # compute gradient and do SGD step
            ada_loss = adapter.loss(output, target, weighted_logits, T=args.T, alpha=args.kd_ratio)
            loss = ada_loss + angle_loss + dist_loss + relation_loss

            # The graph is used for exactly one backward pass, so it is not
            # retained; keeping it would only hold on to activation memory.
            loss.backward()
            optimizer_sgd.step()
            optimizer_W.step()
            optimizer_theta.step()

        # Logs the loss of the last batch of the epoch.
        log_out('epoch[{}/{}]adapter Loss: {:.4f}'.format(ep, n_epochs, loss.item()))
    end_time = time.time()
    log_out("--- adapter training cost {:.3f} mins ---".format((end_time - start_time)/60))


# train with multi-teacher
def train(epoch, model):
    """Train the student for one epoch against the frozen adapter and teachers.

    Only `optimizer_sgd` (the student) is stepped; the adapter is switched to
    eval mode and the teachers are evaluated without gradients.

    Args:
        epoch: current epoch number (not used inside the function).
        model: the student network; must return five outputs, the 2nd, 4th
            and 5th being used as feature map, pooled map and logits.

    Returns:
        Tuple (average loss over the epoch, epoch-average top-1 accuracy as a
        numpy value).
    """
    print('Training:')
    # switch to train mode
    model.train()
    adapter.eval()
    for te in teacher_models:
        te.eval()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):

        # measure data loading time
        data_time.update(time.time() - end)

        images, target = images.to(device), target.to(device)

        # compute outputs
        _, b2, _, pool, output = model(images)

        te_scores_list = []
        # Teachers are fixed: skip building their autograd graphs entirely.
        with torch.no_grad():
            for te in teacher_models:
                _, t_b2, _, _, t_output = te(images)
                # Temperature-softened class probabilities (dim 1 = classes).
                te_scores_list.append(F.softmax(t_output / args.T, dim=1))
        te_scores_Tensor = torch.stack(te_scores_list, dim=1)  # size: [128, 3, 10]
        # Computed once and shared by all loss terms below.
        weighted_logits = adapter.forward(pool, te_scores_Tensor)

        angle_loss = angle_criterion(output, weighted_logits)
        dist_loss = dist_criterion(output, weighted_logits)

        # t_b2 is the feature map of the LAST teacher in teacher_models.
        st_tripets = random_triplets(b2, t_b2)
        relation_loss = triplet_loss(st_tripets[0], st_tripets[1], st_tripets[2])

        ada_loss = adapter.loss(output, target, weighted_logits, T=args.T, alpha=args.kd_ratio)
        loss = ada_loss + angle_loss + dist_loss + relation_loss

        # compute gradient and do SGD step
        optimizer_sgd.zero_grad()
        # Single backward pass per graph, so the graph is not retained.
        loss.backward()
        optimizer_sgd.step()

        # measure accuracy and record loss
        train_acc = accuracy(output.detach().float(), target)[0]
        losses.update(loss.item(), images.size(0))
        top1.update(train_acc, images.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            log_out('[{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1))
    # Report the epoch average rather than the accuracy of whatever batch
    # happened to come last (which is also the smallest one).
    return losses.avg, top1.avg.cpu().numpy()


def test(model):
    """Evaluate `model` on `test_loader` without gradients.

    Progress is logged every `args.print_freq` batches and the running
    top-1 average is logged once at the end.

    Args:
        model: network returning five outputs; the last one is used as logits.

    Returns:
        Tuple (average cross-entropy loss, top-1 accuracy of the last batch
        as numpy, running-average top-1 accuracy as numpy).
    """
    print('Testing:')
    model.eval()  # evaluation mode for the whole pass

    timer = AverageMeter()
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    tick = time.time()
    with torch.no_grad():
        for step, (images, labels) in enumerate(test_loader):
            images = images.to(device)
            labels = labels.to(device)

            # only the final output (logits) is needed here
            _b1, _b2, _b3, _pool, logits = model(images)
            ce = F.cross_entropy(logits, labels)

            logits = logits.float()
            ce = ce.float()

            # accuracy of this batch, then fold it into the running meters
            test_acc = accuracy(logits.data, labels.data)[0]
            n_samples = images.size(0)
            loss_meter.update(ce.item(), n_samples)
            acc_meter.update(test_acc, n_samples)

            # time spent on this batch
            timer.update(time.time() - tick)
            tick = time.time()

            if step % args.print_freq != 0:
                continue
            log_out('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      step, len(test_loader), batch_time=timer, loss=loss_meter,
                      top1=acc_meter))

    log_out(' * Prec@1 {top1.avg:.3f}'.format(top1=acc_meter))

    last_batch_acc = test_acc.cpu().numpy()
    mean_acc = acc_meter.avg.cpu().numpy()
    return loss_meter.avg, last_batch_acc, mean_acc

# """
# Driver: initialise the student, train student + adapter jointly, then run
# the main distillation epochs while plotting loss/accuracy to Visdom.
print('StudentNet:\n')
print(st_model)
st_model.apply(weights_init_normal)
train_adapter(n_epochs=80)
# st_model.apply(weights_init_normal)
best_acc = 0
for epoch in range(1, args.epochs + 1):
    log_out("\n===> epoch: {}/{}".format(epoch, args.epochs))
    # Step the schedule first so the logged value is the learning rate this
    # epoch actually trains with (it used to be stale at milestone epochs).
    # NOTE(review): passing the epoch to step() is deprecated in recent
    # PyTorch releases -- confirm against the version in use.
    lr_scheduler.step(epoch)
    log_out('current lr {:.5e}'.format(optimizer_sgd.param_groups[0]['lr']))
    train_loss, train_acc = train(epoch, st_model)
    # visualize loss
    vis.line(np.array([train_loss]), np.array([epoch]), loss_win, update="append")
    _, test_acc, top1 = test(st_model)
    # One point per trace: train accuracy and average test accuracy.
    vis.line(np.column_stack((train_acc, top1)), np.column_stack((epoch, epoch)), acc_win, update="append")
    if top1 > best_acc:
        best_acc = top1

# release GPU memory
torch.cuda.empty_cache()
log_out("BEST ACC: {:.3f}".format(best_acc))
log_out("--- {:.3f} mins ---".format((time.time() - start_time)/60))
# """

  init.kaiming_normal(m.weight)


Files already downloaded and verified
StudentNet:

ResNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential()
    )
    (1): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2



epoch[0/80]adapter Loss: 2.6623
epoch[1/80]adapter Loss: 1.8386
epoch[2/80]adapter Loss: 1.7279
epoch[3/80]adapter Loss: 1.4636
epoch[4/80]adapter Loss: 1.2764
epoch[5/80]adapter Loss: 1.3325
epoch[6/80]adapter Loss: 1.2382
epoch[7/80]adapter Loss: 1.1367
epoch[8/80]adapter Loss: 1.3161
epoch[9/80]adapter Loss: 1.0568
epoch[10/80]adapter Loss: 1.0053
epoch[11/80]adapter Loss: 1.1947
epoch[12/80]adapter Loss: 0.6687
epoch[13/80]adapter Loss: 1.2803
epoch[14/80]adapter Loss: 1.0175
epoch[15/80]adapter Loss: 0.9144
epoch[16/80]adapter Loss: 0.7287
epoch[17/80]adapter Loss: 1.0714
epoch[18/80]adapter Loss: 1.0551
epoch[19/80]adapter Loss: 0.9994
epoch[20/80]adapter Loss: 0.9893
epoch[21/80]adapter Loss: 0.7603
epoch[22/80]adapter Loss: 0.8927
epoch[23/80]adapter Loss: 0.8880
epoch[24/80]adapter Loss: 1.0651
epoch[25/80]adapter Loss: 1.0390
epoch[26/80]adapter Loss: 0.6986
epoch[27/80]adapter Loss: 0.7813
epoch[28/80]adapter Loss: 0.7678
epoch[29/80]adapter Loss: 0.8396
epoch[30/80]adapter 



[10/391]	Time 0.128 (0.132)	Data 0.018 (0.019)	Loss 0.7020 (0.7328)	Prec@1 87.500 (86.790)
[20/391]	Time 0.128 (0.130)	Data 0.018 (0.019)	Loss 0.7128 (0.7221)	Prec@1 86.719 (86.719)
[30/391]	Time 0.128 (0.130)	Data 0.018 (0.019)	Loss 0.7236 (0.7359)	Prec@1 89.844 (86.492)
[40/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.6952 (0.7299)	Prec@1 91.406 (86.871)
[50/391]	Time 0.128 (0.130)	Data 0.018 (0.019)	Loss 0.6955 (0.7424)	Prec@1 86.719 (86.443)
[60/391]	Time 0.130 (0.129)	Data 0.018 (0.018)	Loss 0.6806 (0.7565)	Prec@1 89.062 (86.027)
[70/391]	Time 0.128 (0.129)	Data 0.018 (0.018)	Loss 0.7115 (0.7625)	Prec@1 87.500 (85.673)
[80/391]	Time 0.128 (0.129)	Data 0.018 (0.018)	Loss 0.7256 (0.7577)	Prec@1 85.156 (85.822)
[90/391]	Time 0.128 (0.129)	Data 0.018 (0.018)	Loss 0.7354 (0.7596)	Prec@1 88.281 (85.757)
[100/391]	Time 0.128 (0.129)	Data 0.018 (0.018)	Loss 0.9417 (0.7631)	Prec@1 78.906 (85.620)
[110/391]	Time 0.145 (0.129)	Data 0.030 (0.019)	Loss 0.7173 (0.7632)	Prec@1 88.281 (85.73

Test: [50/79]	Time 0.019 (0.019)	Loss 1.0297 (0.9760)	Prec@1 78.125 (77.681)
Test: [60/79]	Time 0.020 (0.019)	Loss 1.1102 (0.9755)	Prec@1 72.656 (77.574)
Test: [70/79]	Time 0.019 (0.019)	Loss 0.9757 (0.9787)	Prec@1 79.688 (77.586)
 * Prec@1 77.700

===> epoch: 3/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.133 (0.133)	Data 0.020 (0.020)	Loss 0.6933 (0.6933)	Prec@1 89.844 (89.844)
[10/391]	Time 0.128 (0.131)	Data 0.019 (0.019)	Loss 0.6844 (0.7352)	Prec@1 87.500 (86.790)
[20/391]	Time 0.130 (0.132)	Data 0.018 (0.019)	Loss 0.7125 (0.7367)	Prec@1 88.281 (86.272)
[30/391]	Time 0.130 (0.131)	Data 0.018 (0.019)	Loss 0.7871 (0.7599)	Prec@1 82.812 (85.988)
[40/391]	Time 0.130 (0.131)	Data 0.018 (0.019)	Loss 0.6034 (0.7562)	Prec@1 89.844 (85.995)
[50/391]	Time 0.132 (0.131)	Data 0.019 (0.019)	Loss 0.8629 (0.7602)	Prec@1 79.688 (86.106)
[60/391]	Time 0.134 (0.131)	Data 0.019 (0.019)	Loss 0.8076 (0.7656)	Prec@1 85.938 (85.976)
[70/391]	Time 0.132 (0.131)	Data 0.018 (0.019)	Loss 0.8876 (0.76

[390/391]	Time 0.084 (0.130)	Data 0.012 (0.018)	Loss 0.8412 (0.7773)	Prec@1 85.000 (85.286)
Testing:
Test: [0/79]	Time 0.020 (0.020)	Loss 1.2113 (1.2113)	Prec@1 73.438 (73.438)
Test: [10/79]	Time 0.019 (0.019)	Loss 1.6233 (1.4791)	Prec@1 64.062 (71.804)
Test: [20/79]	Time 0.019 (0.019)	Loss 1.6752 (1.5384)	Prec@1 64.844 (69.792)
Test: [30/79]	Time 0.019 (0.019)	Loss 1.1677 (1.4988)	Prec@1 73.438 (70.388)
Test: [40/79]	Time 0.019 (0.019)	Loss 2.0215 (1.4922)	Prec@1 63.281 (70.293)
Test: [50/79]	Time 0.019 (0.019)	Loss 1.6252 (1.5104)	Prec@1 67.969 (69.868)
Test: [60/79]	Time 0.019 (0.019)	Loss 1.2916 (1.4754)	Prec@1 74.219 (70.133)
Test: [70/79]	Time 0.019 (0.019)	Loss 1.5390 (1.4842)	Prec@1 69.531 (70.169)
 * Prec@1 70.150

===> epoch: 5/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.134 (0.134)	Data 0.020 (0.020)	Loss 0.9132 (0.9132)	Prec@1 81.250 (81.250)
[10/391]	Time 0.130 (0.131)	Data 0.019 (0.019)	Loss 0.7246 (0.7846)	Prec@1 86.719 (84.162)
[20/391]	Time 0.130 (0.131)	Data 0

[340/391]	Time 0.132 (0.129)	Data 0.018 (0.018)	Loss 0.7873 (0.7699)	Prec@1 86.719 (85.530)
[350/391]	Time 0.130 (0.129)	Data 0.019 (0.018)	Loss 0.8111 (0.7693)	Prec@1 84.375 (85.564)
[360/391]	Time 0.132 (0.130)	Data 0.018 (0.018)	Loss 0.7633 (0.7693)	Prec@1 90.625 (85.593)
[370/391]	Time 0.130 (0.130)	Data 0.018 (0.018)	Loss 0.7923 (0.7686)	Prec@1 81.250 (85.580)
[380/391]	Time 0.129 (0.130)	Data 0.019 (0.018)	Loss 0.9209 (0.7688)	Prec@1 80.469 (85.575)
[390/391]	Time 0.086 (0.129)	Data 0.012 (0.018)	Loss 0.8759 (0.7706)	Prec@1 77.500 (85.522)
Testing:
Test: [0/79]	Time 0.019 (0.019)	Loss 0.8177 (0.8177)	Prec@1 81.250 (81.250)
Test: [10/79]	Time 0.019 (0.019)	Loss 0.7283 (0.7602)	Prec@1 79.688 (81.321)
Test: [20/79]	Time 0.019 (0.019)	Loss 0.7429 (0.8084)	Prec@1 80.469 (80.692)
Test: [30/79]	Time 0.019 (0.019)	Loss 0.6447 (0.8073)	Prec@1 82.812 (80.670)
Test: [40/79]	Time 0.019 (0.019)	Loss 0.8948 (0.7957)	Prec@1 79.688 (80.907)
Test: [50/79]	Time 0.019 (0.019)	Loss 0.8665 (0.7982)	P

[290/391]	Time 0.128 (0.130)	Data 0.018 (0.018)	Loss 0.8177 (0.7719)	Prec@1 83.594 (85.588)
[300/391]	Time 0.128 (0.130)	Data 0.018 (0.018)	Loss 0.6480 (0.7695)	Prec@1 89.062 (85.665)
[310/391]	Time 0.130 (0.130)	Data 0.018 (0.018)	Loss 0.6831 (0.7685)	Prec@1 88.281 (85.704)
[320/391]	Time 0.128 (0.130)	Data 0.018 (0.018)	Loss 0.7344 (0.7696)	Prec@1 84.375 (85.638)
[330/391]	Time 0.129 (0.130)	Data 0.019 (0.018)	Loss 0.7919 (0.7689)	Prec@1 83.594 (85.661)
[340/391]	Time 0.128 (0.130)	Data 0.018 (0.018)	Loss 0.8394 (0.7694)	Prec@1 86.719 (85.637)
[350/391]	Time 0.128 (0.130)	Data 0.018 (0.018)	Loss 0.8180 (0.7710)	Prec@1 82.812 (85.593)
[360/391]	Time 0.133 (0.130)	Data 0.018 (0.018)	Loss 0.7334 (0.7715)	Prec@1 84.375 (85.563)
[370/391]	Time 0.130 (0.130)	Data 0.018 (0.018)	Loss 0.8763 (0.7725)	Prec@1 82.812 (85.554)
[380/391]	Time 0.127 (0.130)	Data 0.018 (0.018)	Loss 0.6708 (0.7735)	Prec@1 92.188 (85.523)
[390/391]	Time 0.085 (0.130)	Data 0.012 (0.018)	Loss 0.7977 (0.7746)	Prec@1 85.0

[240/391]	Time 0.129 (0.129)	Data 0.018 (0.018)	Loss 0.7832 (0.7682)	Prec@1 85.156 (85.720)
[250/391]	Time 0.128 (0.129)	Data 0.018 (0.018)	Loss 0.6817 (0.7676)	Prec@1 90.625 (85.741)
[260/391]	Time 0.127 (0.129)	Data 0.018 (0.018)	Loss 0.5619 (0.7656)	Prec@1 93.750 (85.830)
[270/391]	Time 0.129 (0.129)	Data 0.018 (0.018)	Loss 0.6724 (0.7659)	Prec@1 86.719 (85.828)
[280/391]	Time 0.129 (0.129)	Data 0.018 (0.018)	Loss 0.6547 (0.7656)	Prec@1 88.281 (85.835)
[290/391]	Time 0.128 (0.129)	Data 0.018 (0.018)	Loss 0.6647 (0.7653)	Prec@1 90.625 (85.822)
[300/391]	Time 0.129 (0.129)	Data 0.018 (0.018)	Loss 0.9027 (0.7660)	Prec@1 79.688 (85.782)
[310/391]	Time 0.128 (0.129)	Data 0.018 (0.018)	Loss 0.7911 (0.7666)	Prec@1 86.719 (85.779)
[320/391]	Time 0.130 (0.129)	Data 0.018 (0.018)	Loss 0.8768 (0.7688)	Prec@1 81.250 (85.706)
[330/391]	Time 0.127 (0.129)	Data 0.018 (0.018)	Loss 0.8139 (0.7689)	Prec@1 83.594 (85.697)
[340/391]	Time 0.130 (0.129)	Data 0.019 (0.018)	Loss 0.6497 (0.7675)	Prec@1 92.1

[190/391]	Time 0.131 (0.130)	Data 0.019 (0.019)	Loss 0.7037 (0.7660)	Prec@1 89.062 (85.672)
[200/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.7298 (0.7676)	Prec@1 89.844 (85.615)
[210/391]	Time 0.131 (0.130)	Data 0.018 (0.019)	Loss 0.7254 (0.7675)	Prec@1 85.156 (85.641)
[220/391]	Time 0.130 (0.130)	Data 0.019 (0.019)	Loss 0.7700 (0.7689)	Prec@1 85.938 (85.580)
[230/391]	Time 0.132 (0.130)	Data 0.019 (0.019)	Loss 0.7606 (0.7683)	Prec@1 86.719 (85.576)
[240/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.6651 (0.7661)	Prec@1 89.844 (85.672)
[250/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.7651 (0.7664)	Prec@1 89.062 (85.670)
[260/391]	Time 0.128 (0.130)	Data 0.019 (0.019)	Loss 0.7394 (0.7666)	Prec@1 82.812 (85.671)
[270/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.7115 (0.7673)	Prec@1 85.156 (85.620)
[280/391]	Time 0.128 (0.130)	Data 0.018 (0.019)	Loss 0.7526 (0.7670)	Prec@1 86.719 (85.626)
[290/391]	Time 0.130 (0.130)	Data 0.019 (0.019)	Loss 0.8926 (0.7673)	Prec@1 81.2

[140/391]	Time 0.128 (0.129)	Data 0.018 (0.019)	Loss 0.7649 (0.7472)	Prec@1 84.375 (86.314)
[150/391]	Time 0.127 (0.129)	Data 0.018 (0.019)	Loss 0.8461 (0.7519)	Prec@1 86.719 (86.207)
[160/391]	Time 0.131 (0.129)	Data 0.020 (0.019)	Loss 0.9063 (0.7564)	Prec@1 84.375 (86.141)
[170/391]	Time 0.127 (0.129)	Data 0.018 (0.019)	Loss 0.6492 (0.7602)	Prec@1 89.062 (86.011)
[180/391]	Time 0.129 (0.129)	Data 0.018 (0.019)	Loss 0.6969 (0.7614)	Prec@1 89.062 (85.976)
[190/391]	Time 0.130 (0.129)	Data 0.018 (0.019)	Loss 0.5653 (0.7608)	Prec@1 91.406 (86.027)
[200/391]	Time 0.127 (0.129)	Data 0.018 (0.019)	Loss 0.7196 (0.7630)	Prec@1 83.594 (85.976)
[210/391]	Time 0.128 (0.129)	Data 0.018 (0.019)	Loss 0.7849 (0.7642)	Prec@1 87.500 (85.926)
[220/391]	Time 0.130 (0.129)	Data 0.019 (0.019)	Loss 0.6563 (0.7635)	Prec@1 90.625 (85.959)
[230/391]	Time 0.127 (0.129)	Data 0.019 (0.019)	Loss 0.7988 (0.7664)	Prec@1 84.375 (85.795)
[240/391]	Time 0.129 (0.129)	Data 0.019 (0.019)	Loss 0.8390 (0.7651)	Prec@1 83.5

[90/391]	Time 0.132 (0.151)	Data 0.019 (0.024)	Loss 0.6408 (0.7454)	Prec@1 92.188 (86.538)
[100/391]	Time 0.134 (0.149)	Data 0.020 (0.024)	Loss 0.8719 (0.7537)	Prec@1 83.594 (86.363)
[110/391]	Time 0.132 (0.148)	Data 0.019 (0.024)	Loss 0.8598 (0.7552)	Prec@1 84.375 (86.325)
[120/391]	Time 0.150 (0.148)	Data 0.023 (0.023)	Loss 0.6789 (0.7548)	Prec@1 89.062 (86.357)
[130/391]	Time 0.143 (0.147)	Data 0.025 (0.023)	Loss 0.8067 (0.7556)	Prec@1 85.156 (86.313)
[140/391]	Time 0.145 (0.147)	Data 0.026 (0.023)	Loss 0.5886 (0.7534)	Prec@1 94.531 (86.298)
[150/391]	Time 0.201 (0.149)	Data 0.032 (0.024)	Loss 0.7236 (0.7549)	Prec@1 89.062 (86.294)
[160/391]	Time 0.134 (0.152)	Data 0.019 (0.024)	Loss 0.8044 (0.7573)	Prec@1 82.031 (86.195)
[170/391]	Time 0.139 (0.151)	Data 0.024 (0.024)	Loss 0.7969 (0.7589)	Prec@1 85.938 (86.129)
[180/391]	Time 0.149 (0.150)	Data 0.024 (0.024)	Loss 0.7342 (0.7557)	Prec@1 84.375 (86.231)
[190/391]	Time 0.128 (0.149)	Data 0.018 (0.024)	Loss 0.6127 (0.7547)	Prec@1 87.50

[40/391]	Time 0.149 (0.156)	Data 0.034 (0.027)	Loss 0.7562 (0.7409)	Prec@1 83.594 (86.147)
[50/391]	Time 0.141 (0.157)	Data 0.019 (0.028)	Loss 0.6847 (0.7324)	Prec@1 84.375 (86.229)
[60/391]	Time 0.144 (0.158)	Data 0.019 (0.028)	Loss 0.6410 (0.7316)	Prec@1 92.969 (86.399)
[70/391]	Time 0.138 (0.159)	Data 0.020 (0.027)	Loss 0.7072 (0.7266)	Prec@1 89.062 (86.642)
[80/391]	Time 0.136 (0.160)	Data 0.019 (0.027)	Loss 0.8343 (0.7299)	Prec@1 83.594 (86.487)
[90/391]	Time 0.137 (0.160)	Data 0.020 (0.028)	Loss 0.9235 (0.7353)	Prec@1 82.812 (86.255)
[100/391]	Time 0.138 (0.160)	Data 0.019 (0.028)	Loss 0.6891 (0.7429)	Prec@1 90.625 (86.146)
[110/391]	Time 0.149 (0.161)	Data 0.025 (0.027)	Loss 0.6921 (0.7413)	Prec@1 87.500 (86.170)
[120/391]	Time 0.287 (0.162)	Data 0.020 (0.027)	Loss 0.6102 (0.7385)	Prec@1 89.844 (86.331)
[130/391]	Time 0.158 (0.166)	Data 0.035 (0.027)	Loss 0.5989 (0.7356)	Prec@1 91.406 (86.403)
[140/391]	Time 0.133 (0.163)	Data 0.019 (0.027)	Loss 0.7423 (0.7376)	Prec@1 86.719 (86

Test: [70/79]	Time 0.024 (0.025)	Loss 0.8073 (0.9214)	Prec@1 82.031 (79.632)
 * Prec@1 79.540

===> epoch: 20/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.135 (0.135)	Data 0.020 (0.020)	Loss 0.7414 (0.7414)	Prec@1 85.156 (85.156)
[10/391]	Time 0.146 (0.158)	Data 0.025 (0.024)	Loss 0.7733 (0.7393)	Prec@1 86.719 (86.293)
[20/391]	Time 0.149 (0.161)	Data 0.025 (0.026)	Loss 0.7261 (0.7234)	Prec@1 86.719 (86.644)
[30/391]	Time 0.142 (0.161)	Data 0.019 (0.025)	Loss 0.7875 (0.7399)	Prec@1 85.156 (86.240)
[40/391]	Time 0.139 (0.161)	Data 0.019 (0.026)	Loss 0.6620 (0.7395)	Prec@1 88.281 (86.071)
[50/391]	Time 0.133 (0.161)	Data 0.020 (0.026)	Loss 0.6260 (0.7362)	Prec@1 90.625 (86.428)
[60/391]	Time 0.133 (0.161)	Data 0.020 (0.027)	Loss 0.9059 (0.7384)	Prec@1 83.594 (86.219)
[70/391]	Time 0.132 (0.161)	Data 0.019 (0.027)	Loss 0.8005 (0.7436)	Prec@1 85.156 (86.136)
[80/391]	Time 0.133 (0.162)	Data 0.020 (0.027)	Loss 0.7121 (0.7400)	Prec@1 89.062 (86.352)
[90/391]	Time 0.131 (0.162)	Data 0.

Test: [10/79]	Time 0.019 (0.028)	Loss 0.8912 (0.8048)	Prec@1 82.031 (81.676)
Test: [20/79]	Time 0.019 (0.024)	Loss 0.6840 (0.8760)	Prec@1 82.812 (80.246)
Test: [30/79]	Time 0.019 (0.022)	Loss 0.6865 (0.8611)	Prec@1 83.594 (80.091)
Test: [40/79]	Time 0.024 (0.022)	Loss 0.8839 (0.8624)	Prec@1 78.125 (80.011)
Test: [50/79]	Time 0.019 (0.022)	Loss 0.9121 (0.8599)	Prec@1 79.688 (80.086)
Test: [60/79]	Time 0.047 (0.024)	Loss 0.8726 (0.8635)	Prec@1 80.469 (79.982)
Test: [70/79]	Time 0.024 (0.026)	Loss 0.6349 (0.8571)	Prec@1 83.594 (80.128)
 * Prec@1 80.310

===> epoch: 22/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.136 (0.136)	Data 0.024 (0.024)	Loss 0.6738 (0.6738)	Prec@1 86.719 (86.719)
[10/391]	Time 0.132 (0.159)	Data 0.019 (0.027)	Loss 0.7608 (0.7348)	Prec@1 86.719 (86.435)
[20/391]	Time 0.130 (0.161)	Data 0.019 (0.028)	Loss 0.8550 (0.7480)	Prec@1 81.250 (86.086)
[30/391]	Time 0.136 (0.162)	Data 0.020 (0.028)	Loss 0.7770 (0.7613)	Prec@1 85.938 (85.711)
[40/391]	Time 0.132 (0.162)	

[360/391]	Time 0.145 (0.164)	Data 0.023 (0.027)	Loss 0.6546 (0.7537)	Prec@1 89.062 (86.013)
[370/391]	Time 0.150 (0.164)	Data 0.025 (0.027)	Loss 0.7665 (0.7544)	Prec@1 85.938 (85.986)
[380/391]	Time 0.148 (0.164)	Data 0.025 (0.027)	Loss 0.7632 (0.7553)	Prec@1 85.156 (85.956)
[390/391]	Time 0.095 (0.164)	Data 0.013 (0.027)	Loss 0.8549 (0.7568)	Prec@1 86.250 (85.896)
Testing:
Test: [0/79]	Time 0.020 (0.020)	Loss 1.5247 (1.5247)	Prec@1 73.438 (73.438)
Test: [10/79]	Time 0.051 (0.041)	Loss 1.2506 (1.7252)	Prec@1 75.000 (69.886)
Test: [20/79]	Time 0.024 (0.035)	Loss 1.5156 (1.7454)	Prec@1 70.312 (69.345)
Test: [30/79]	Time 0.019 (0.030)	Loss 1.4385 (1.7261)	Prec@1 73.438 (69.304)
Test: [40/79]	Time 0.020 (0.027)	Loss 1.6917 (1.6874)	Prec@1 67.188 (69.722)
Test: [50/79]	Time 0.030 (0.026)	Loss 1.4704 (1.6881)	Prec@1 69.531 (69.547)
Test: [60/79]	Time 0.021 (0.026)	Loss 1.7132 (1.6821)	Prec@1 74.219 (69.787)
Test: [70/79]	Time 0.028 (0.026)	Loss 2.0963 (1.6856)	Prec@1 64.844 (69.729)
 * Prec@

[310/391]	Time 0.154 (0.157)	Data 0.021 (0.025)	Loss 0.7439 (0.7690)	Prec@1 89.062 (85.510)
[320/391]	Time 0.141 (0.157)	Data 0.020 (0.025)	Loss 0.7955 (0.7700)	Prec@1 82.812 (85.487)
[330/391]	Time 0.139 (0.157)	Data 0.021 (0.026)	Loss 0.7560 (0.7682)	Prec@1 89.062 (85.553)
[340/391]	Time 0.154 (0.157)	Data 0.033 (0.026)	Loss 0.7462 (0.7671)	Prec@1 85.156 (85.582)
[350/391]	Time 0.147 (0.158)	Data 0.020 (0.026)	Loss 0.7232 (0.7674)	Prec@1 86.719 (85.559)
[360/391]	Time 0.138 (0.158)	Data 0.021 (0.026)	Loss 0.8705 (0.7669)	Prec@1 80.469 (85.559)
[370/391]	Time 0.136 (0.158)	Data 0.020 (0.026)	Loss 0.8819 (0.7664)	Prec@1 82.812 (85.580)
[380/391]	Time 0.141 (0.158)	Data 0.021 (0.026)	Loss 0.6690 (0.7658)	Prec@1 91.406 (85.620)
[390/391]	Time 0.124 (0.159)	Data 0.030 (0.026)	Loss 0.7347 (0.7657)	Prec@1 87.500 (85.624)
Testing:
Test: [0/79]	Time 0.045 (0.045)	Loss 0.9852 (0.9852)	Prec@1 78.125 (78.125)
Test: [10/79]	Time 0.025 (0.031)	Loss 0.9587 (1.2250)	Prec@1 78.906 (74.361)
Test: [20/

[260/391]	Time 0.146 (0.157)	Data 0.032 (0.026)	Loss 0.7028 (0.7481)	Prec@1 89.062 (85.976)
[270/391]	Time 0.264 (0.157)	Data 0.061 (0.026)	Loss 0.7972 (0.7484)	Prec@1 85.938 (85.998)
[280/391]	Time 0.147 (0.156)	Data 0.032 (0.026)	Loss 0.7390 (0.7475)	Prec@1 85.156 (86.038)
[290/391]	Time 0.139 (0.157)	Data 0.019 (0.026)	Loss 0.8664 (0.7499)	Prec@1 82.031 (85.994)
[300/391]	Time 0.137 (0.157)	Data 0.020 (0.026)	Loss 0.7508 (0.7495)	Prec@1 86.719 (86.010)
[310/391]	Time 0.133 (0.157)	Data 0.019 (0.026)	Loss 0.8339 (0.7503)	Prec@1 81.250 (85.993)
[320/391]	Time 0.136 (0.157)	Data 0.019 (0.026)	Loss 0.6585 (0.7512)	Prec@1 91.406 (85.940)
[330/391]	Time 0.135 (0.157)	Data 0.020 (0.026)	Loss 0.7026 (0.7534)	Prec@1 91.406 (85.853)
[340/391]	Time 0.135 (0.157)	Data 0.021 (0.026)	Loss 0.6195 (0.7530)	Prec@1 91.406 (85.855)
[350/391]	Time 0.137 (0.158)	Data 0.020 (0.026)	Loss 0.6995 (0.7526)	Prec@1 87.500 (85.860)
[360/391]	Time 0.137 (0.158)	Data 0.020 (0.026)	Loss 0.7984 (0.7536)	Prec@1 82.0

[210/391]	Time 0.243 (0.163)	Data 0.019 (0.027)	Loss 0.7880 (0.7610)	Prec@1 86.719 (85.889)
[220/391]	Time 0.255 (0.164)	Data 0.020 (0.027)	Loss 0.7508 (0.7613)	Prec@1 82.031 (85.849)
[230/391]	Time 0.277 (0.164)	Data 0.061 (0.028)	Loss 0.9170 (0.7641)	Prec@1 81.250 (85.768)
[240/391]	Time 0.174 (0.163)	Data 0.029 (0.027)	Loss 0.8470 (0.7677)	Prec@1 82.031 (85.665)
[250/391]	Time 0.284 (0.163)	Data 0.059 (0.027)	Loss 0.7745 (0.7678)	Prec@1 85.938 (85.670)
[260/391]	Time 0.247 (0.164)	Data 0.043 (0.027)	Loss 0.5947 (0.7692)	Prec@1 90.625 (85.656)
[270/391]	Time 0.193 (0.163)	Data 0.024 (0.027)	Loss 0.7582 (0.7678)	Prec@1 85.156 (85.727)
[280/391]	Time 0.250 (0.163)	Data 0.048 (0.027)	Loss 0.8064 (0.7671)	Prec@1 83.594 (85.757)
[290/391]	Time 0.150 (0.163)	Data 0.026 (0.027)	Loss 0.5687 (0.7650)	Prec@1 91.406 (85.817)
[300/391]	Time 0.213 (0.164)	Data 0.043 (0.027)	Loss 0.7154 (0.7665)	Prec@1 85.938 (85.764)
[310/391]	Time 0.131 (0.164)	Data 0.019 (0.027)	Loss 0.9607 (0.7679)	Prec@1 78.9

[160/391]	Time 0.131 (0.130)	Data 0.018 (0.019)	Loss 0.8626 (0.7559)	Prec@1 82.812 (86.044)
[170/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.7240 (0.7542)	Prec@1 85.938 (86.065)
[180/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.7645 (0.7539)	Prec@1 86.719 (86.058)
[190/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.7846 (0.7572)	Prec@1 89.062 (85.974)
[200/391]	Time 0.129 (0.130)	Data 0.019 (0.019)	Loss 0.7137 (0.7575)	Prec@1 86.719 (85.926)
[210/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.8203 (0.7573)	Prec@1 81.250 (85.912)
[220/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.8027 (0.7555)	Prec@1 85.156 (85.952)
[230/391]	Time 0.129 (0.130)	Data 0.019 (0.019)	Loss 0.9214 (0.7549)	Prec@1 82.031 (85.938)
[240/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.8190 (0.7544)	Prec@1 81.250 (85.938)
[250/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.7406 (0.7532)	Prec@1 85.156 (85.994)
[260/391]	Time 0.128 (0.130)	Data 0.018 (0.019)	Loss 0.9230 (0.7546)	Prec@1 82.8

[110/391]	Time 0.129 (0.131)	Data 0.018 (0.019)	Loss 0.6971 (0.7500)	Prec@1 88.281 (86.142)
[120/391]	Time 0.141 (0.131)	Data 0.019 (0.019)	Loss 0.8222 (0.7528)	Prec@1 85.938 (86.054)
[130/391]	Time 0.128 (0.131)	Data 0.018 (0.019)	Loss 0.7278 (0.7504)	Prec@1 86.719 (86.104)
[140/391]	Time 0.128 (0.131)	Data 0.018 (0.019)	Loss 0.6522 (0.7478)	Prec@1 89.062 (86.165)
[150/391]	Time 0.128 (0.131)	Data 0.018 (0.019)	Loss 0.6968 (0.7485)	Prec@1 86.719 (86.207)
[160/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.7032 (0.7461)	Prec@1 87.500 (86.316)
[170/391]	Time 0.128 (0.130)	Data 0.018 (0.019)	Loss 0.7476 (0.7476)	Prec@1 87.500 (86.234)
[180/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.7279 (0.7503)	Prec@1 88.281 (86.153)
[190/391]	Time 0.127 (0.130)	Data 0.018 (0.019)	Loss 0.6386 (0.7479)	Prec@1 89.062 (86.199)
[200/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.8282 (0.7481)	Prec@1 82.812 (86.175)
[210/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.6485 (0.7483)	Prec@1 89.0

[60/391]	Time 0.128 (0.131)	Data 0.018 (0.019)	Loss 0.8551 (0.7427)	Prec@1 84.375 (86.539)
[70/391]	Time 0.132 (0.131)	Data 0.018 (0.019)	Loss 0.9561 (0.7414)	Prec@1 79.688 (86.620)
[80/391]	Time 0.130 (0.131)	Data 0.019 (0.019)	Loss 0.7368 (0.7422)	Prec@1 85.938 (86.545)
[90/391]	Time 0.133 (0.131)	Data 0.019 (0.019)	Loss 0.7656 (0.7380)	Prec@1 88.281 (86.530)
[100/391]	Time 0.127 (0.130)	Data 0.018 (0.019)	Loss 1.0258 (0.7433)	Prec@1 82.031 (86.463)
[110/391]	Time 0.128 (0.130)	Data 0.019 (0.019)	Loss 0.8318 (0.7443)	Prec@1 86.719 (86.402)
[120/391]	Time 0.128 (0.130)	Data 0.018 (0.019)	Loss 0.6491 (0.7416)	Prec@1 92.969 (86.454)
[130/391]	Time 0.132 (0.130)	Data 0.019 (0.019)	Loss 0.7803 (0.7435)	Prec@1 85.938 (86.397)
[140/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.6990 (0.7493)	Prec@1 86.719 (86.259)
[150/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.8063 (0.7505)	Prec@1 85.156 (86.191)
[160/391]	Time 0.131 (0.130)	Data 0.018 (0.019)	Loss 0.7439 (0.7497)	Prec@1 85.156 (

[10/391]	Time 0.127 (0.129)	Data 0.018 (0.018)	Loss 0.5670 (0.7352)	Prec@1 94.531 (87.642)
[20/391]	Time 0.127 (0.129)	Data 0.018 (0.018)	Loss 0.7998 (0.7571)	Prec@1 85.156 (86.570)
[30/391]	Time 0.127 (0.129)	Data 0.018 (0.018)	Loss 0.7202 (0.7470)	Prec@1 89.844 (86.668)
[40/391]	Time 0.127 (0.129)	Data 0.018 (0.018)	Loss 0.7380 (0.7535)	Prec@1 84.375 (86.319)
[50/391]	Time 0.129 (0.129)	Data 0.018 (0.018)	Loss 0.9596 (0.7666)	Prec@1 79.688 (85.830)
[60/391]	Time 0.128 (0.129)	Data 0.018 (0.018)	Loss 0.6694 (0.7590)	Prec@1 87.500 (86.091)
[70/391]	Time 0.128 (0.129)	Data 0.018 (0.018)	Loss 0.6646 (0.7521)	Prec@1 87.500 (86.323)
[80/391]	Time 0.127 (0.129)	Data 0.018 (0.018)	Loss 0.7611 (0.7433)	Prec@1 82.812 (86.507)
[90/391]	Time 0.129 (0.128)	Data 0.018 (0.018)	Loss 0.6233 (0.7410)	Prec@1 89.844 (86.581)
[100/391]	Time 0.126 (0.128)	Data 0.018 (0.018)	Loss 0.8434 (0.7443)	Prec@1 82.812 (86.463)
[110/391]	Time 0.126 (0.128)	Data 0.018 (0.018)	Loss 0.6837 (0.7428)	Prec@1 89.062 (86.52

Test: [50/79]	Time 0.019 (0.019)	Loss 1.0167 (0.9251)	Prec@1 81.250 (78.784)
Test: [60/79]	Time 0.019 (0.019)	Loss 0.9557 (0.9245)	Prec@1 78.906 (78.650)
Test: [70/79]	Time 0.019 (0.019)	Loss 0.9963 (0.9265)	Prec@1 75.781 (78.642)
 * Prec@1 78.840

===> epoch: 39/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.137 (0.137)	Data 0.020 (0.020)	Loss 0.8433 (0.8433)	Prec@1 85.156 (85.156)
[10/391]	Time 0.128 (0.132)	Data 0.018 (0.019)	Loss 0.7004 (0.7258)	Prec@1 88.281 (86.861)
[20/391]	Time 0.128 (0.131)	Data 0.018 (0.019)	Loss 0.7665 (0.7278)	Prec@1 85.938 (86.756)
[30/391]	Time 0.128 (0.131)	Data 0.018 (0.019)	Loss 0.7139 (0.7334)	Prec@1 89.844 (86.618)
[40/391]	Time 0.130 (0.130)	Data 0.018 (0.019)	Loss 0.6927 (0.7261)	Prec@1 86.719 (86.814)
[50/391]	Time 0.128 (0.130)	Data 0.018 (0.019)	Loss 0.7972 (0.7267)	Prec@1 83.594 (86.719)
[60/391]	Time 0.128 (0.130)	Data 0.018 (0.019)	Loss 0.5981 (0.7248)	Prec@1 93.750 (86.924)
[70/391]	Time 0.129 (0.130)	Data 0.018 (0.019)	Loss 0.7191 (0.7

[390/391]	Time 0.091 (0.132)	Data 0.013 (0.019)	Loss 0.7750 (0.7649)	Prec@1 87.500 (85.652)
Testing:
Test: [0/79]	Time 0.020 (0.020)	Loss 1.6040 (1.6040)	Prec@1 64.062 (64.062)
Test: [10/79]	Time 0.019 (0.021)	Loss 1.3129 (1.6279)	Prec@1 76.562 (66.335)
Test: [20/79]	Time 0.020 (0.021)	Loss 1.8004 (1.6077)	Prec@1 65.625 (67.262)
Test: [30/79]	Time 0.020 (0.021)	Loss 1.4366 (1.5666)	Prec@1 65.625 (67.490)
Test: [40/79]	Time 0.022 (0.021)	Loss 1.5834 (1.5509)	Prec@1 68.750 (67.797)
Test: [50/79]	Time 0.022 (0.021)	Loss 1.7438 (1.5585)	Prec@1 65.625 (67.816)
Test: [60/79]	Time 0.022 (0.021)	Loss 1.1734 (1.5596)	Prec@1 72.656 (67.879)
Test: [70/79]	Time 0.024 (0.021)	Loss 1.6469 (1.5687)	Prec@1 67.969 (67.661)
 * Prec@1 67.730

===> epoch: 41/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.144 (0.144)	Data 0.022 (0.022)	Loss 0.6690 (0.6690)	Prec@1 88.281 (88.281)
[10/391]	Time 0.135 (0.138)	Data 0.022 (0.021)	Loss 0.6970 (0.7549)	Prec@1 89.062 (86.932)
[20/391]	Time 0.135 (0.137)	Data 

[340/391]	Time 0.128 (0.133)	Data 0.018 (0.019)	Loss 0.7386 (0.7583)	Prec@1 86.719 (85.754)
[350/391]	Time 0.130 (0.133)	Data 0.019 (0.019)	Loss 0.7558 (0.7588)	Prec@1 82.031 (85.759)
[360/391]	Time 0.131 (0.133)	Data 0.019 (0.019)	Loss 0.7502 (0.7588)	Prec@1 86.719 (85.786)
[370/391]	Time 0.128 (0.133)	Data 0.018 (0.019)	Loss 0.7388 (0.7592)	Prec@1 89.844 (85.782)
[380/391]	Time 0.130 (0.133)	Data 0.019 (0.019)	Loss 0.8976 (0.7599)	Prec@1 83.594 (85.788)
[390/391]	Time 0.086 (0.133)	Data 0.012 (0.019)	Loss 0.6997 (0.7617)	Prec@1 86.250 (85.724)
Testing:
Test: [0/79]	Time 0.019 (0.019)	Loss 1.2832 (1.2832)	Prec@1 71.094 (71.094)
Test: [10/79]	Time 0.020 (0.019)	Loss 1.0630 (1.2739)	Prec@1 75.781 (74.432)
Test: [20/79]	Time 0.019 (0.019)	Loss 1.2429 (1.3507)	Prec@1 75.000 (73.103)
Test: [30/79]	Time 0.020 (0.019)	Loss 1.5388 (1.3696)	Prec@1 71.094 (72.681)
Test: [40/79]	Time 0.019 (0.019)	Loss 1.3100 (1.3804)	Prec@1 72.656 (72.694)
Test: [50/79]	Time 0.020 (0.019)	Loss 1.3903 (1.3866)	P

[290/391]	Time 0.132 (0.133)	Data 0.019 (0.020)	Loss 0.8031 (0.7434)	Prec@1 85.156 (86.174)
[300/391]	Time 0.130 (0.133)	Data 0.019 (0.020)	Loss 0.6541 (0.7437)	Prec@1 89.844 (86.174)
[310/391]	Time 0.130 (0.133)	Data 0.018 (0.020)	Loss 0.8841 (0.7442)	Prec@1 85.156 (86.143)
[320/391]	Time 0.131 (0.133)	Data 0.018 (0.020)	Loss 0.7673 (0.7455)	Prec@1 85.938 (86.122)
[330/391]	Time 0.129 (0.133)	Data 0.018 (0.020)	Loss 0.8103 (0.7469)	Prec@1 85.156 (86.093)
[340/391]	Time 0.129 (0.133)	Data 0.018 (0.020)	Loss 0.7321 (0.7493)	Prec@1 84.375 (86.013)
[350/391]	Time 0.130 (0.133)	Data 0.018 (0.020)	Loss 0.7395 (0.7503)	Prec@1 86.719 (85.980)
[360/391]	Time 0.141 (0.133)	Data 0.028 (0.020)	Loss 0.6959 (0.7506)	Prec@1 88.281 (85.985)
[370/391]	Time 0.145 (0.133)	Data 0.024 (0.020)	Loss 0.7907 (0.7503)	Prec@1 85.938 (86.018)
[380/391]	Time 0.139 (0.133)	Data 0.019 (0.019)	Loss 0.8464 (0.7508)	Prec@1 78.906 (86.011)
[390/391]	Time 0.084 (0.133)	Data 0.012 (0.019)	Loss 0.8084 (0.7504)	Prec@1 88.7

[240/391]	Time 0.128 (0.132)	Data 0.018 (0.019)	Loss 0.7870 (0.7548)	Prec@1 88.281 (85.999)
[250/391]	Time 0.134 (0.132)	Data 0.019 (0.019)	Loss 0.7666 (0.7562)	Prec@1 86.719 (85.944)
[260/391]	Time 0.132 (0.132)	Data 0.019 (0.019)	Loss 0.6153 (0.7574)	Prec@1 92.188 (85.908)
[270/391]	Time 0.137 (0.132)	Data 0.020 (0.019)	Loss 0.7330 (0.7567)	Prec@1 86.719 (85.923)
[280/391]	Time 0.133 (0.132)	Data 0.019 (0.019)	Loss 0.6341 (0.7557)	Prec@1 91.406 (85.938)
[290/391]	Time 0.138 (0.132)	Data 0.022 (0.019)	Loss 0.9630 (0.7554)	Prec@1 80.469 (85.956)
[300/391]	Time 0.151 (0.133)	Data 0.025 (0.019)	Loss 0.8296 (0.7556)	Prec@1 82.031 (85.899)
[310/391]	Time 0.138 (0.133)	Data 0.020 (0.019)	Loss 0.8835 (0.7570)	Prec@1 80.469 (85.865)
[320/391]	Time 0.132 (0.133)	Data 0.019 (0.019)	Loss 0.8709 (0.7567)	Prec@1 79.688 (85.855)
[330/391]	Time 0.131 (0.133)	Data 0.019 (0.019)	Loss 0.8517 (0.7565)	Prec@1 81.250 (85.862)
[340/391]	Time 0.128 (0.133)	Data 0.019 (0.019)	Loss 0.7977 (0.7563)	Prec@1 85.9

[190/391]	Time 0.133 (0.133)	Data 0.019 (0.019)	Loss 0.8964 (0.7586)	Prec@1 81.250 (85.659)
[200/391]	Time 0.133 (0.133)	Data 0.019 (0.019)	Loss 0.7911 (0.7599)	Prec@1 81.250 (85.623)
[210/391]	Time 0.135 (0.133)	Data 0.020 (0.019)	Loss 0.7649 (0.7617)	Prec@1 85.938 (85.612)
[220/391]	Time 0.133 (0.133)	Data 0.019 (0.019)	Loss 0.8240 (0.7624)	Prec@1 82.031 (85.616)
[230/391]	Time 0.131 (0.133)	Data 0.019 (0.019)	Loss 0.7457 (0.7620)	Prec@1 85.938 (85.653)
[240/391]	Time 0.135 (0.133)	Data 0.020 (0.019)	Loss 0.6940 (0.7614)	Prec@1 87.500 (85.668)
[250/391]	Time 0.131 (0.133)	Data 0.019 (0.019)	Loss 0.7617 (0.7615)	Prec@1 85.938 (85.673)
[260/391]	Time 0.135 (0.133)	Data 0.020 (0.019)	Loss 0.6900 (0.7606)	Prec@1 86.719 (85.674)
[270/391]	Time 0.131 (0.133)	Data 0.019 (0.019)	Loss 0.6518 (0.7593)	Prec@1 87.500 (85.704)
[280/391]	Time 0.132 (0.133)	Data 0.019 (0.019)	Loss 0.7457 (0.7595)	Prec@1 89.062 (85.698)
[290/391]	Time 0.133 (0.133)	Data 0.020 (0.019)	Loss 0.6955 (0.7586)	Prec@1 85.1

[140/391]	Time 0.131 (0.133)	Data 0.020 (0.019)	Loss 0.7170 (0.7503)	Prec@1 88.281 (85.993)
[150/391]	Time 0.130 (0.133)	Data 0.018 (0.019)	Loss 0.7098 (0.7487)	Prec@1 88.281 (86.124)
[160/391]	Time 0.129 (0.133)	Data 0.018 (0.019)	Loss 0.7354 (0.7497)	Prec@1 89.062 (86.146)
[170/391]	Time 0.134 (0.132)	Data 0.018 (0.019)	Loss 0.6483 (0.7513)	Prec@1 88.281 (86.043)
[180/391]	Time 0.130 (0.132)	Data 0.020 (0.019)	Loss 0.6727 (0.7519)	Prec@1 88.281 (86.041)
[190/391]	Time 0.131 (0.132)	Data 0.018 (0.019)	Loss 0.7834 (0.7534)	Prec@1 85.938 (85.999)
[200/391]	Time 0.137 (0.132)	Data 0.023 (0.019)	Loss 0.6503 (0.7524)	Prec@1 90.625 (86.019)
[210/391]	Time 0.129 (0.132)	Data 0.018 (0.019)	Loss 0.8770 (0.7526)	Prec@1 84.375 (86.026)
[220/391]	Time 0.130 (0.132)	Data 0.018 (0.019)	Loss 0.5814 (0.7514)	Prec@1 92.188 (86.068)
[230/391]	Time 0.137 (0.132)	Data 0.019 (0.019)	Loss 0.7420 (0.7519)	Prec@1 87.500 (86.039)
[240/391]	Time 0.131 (0.132)	Data 0.020 (0.019)	Loss 0.6965 (0.7511)	Prec@1 91.4