In [None]:
from __future__ import print_function
import os
import time
import logging
import argparse
import numpy as np
from visdom import Visdom
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from utils import *
from metric.loss import FitNet, AttentionTransfer, RKdAngle, RkdDistance

# Teacher models:
# VGG11/VGG13/VGG16/VGG19, GoogLeNet, AlexNet, ResNet18, ResNet34,
# ResNet50, ResNet101, ResNet152, ResNeXt29_2x64d, ResNeXt29_4x64d,
# ResNeXt29_8x64d, ResNeXt29_32x64d, PreActResNet18, PreActResNet34,
# PreActResNet50, PreActResNet101, PreActResNet152,
# DenseNet121, DenseNet161, DenseNet169, DenseNet201,
import models

# Student models:
# myNet, LeNet, FitNet

# Wall-clock reference used for the final "--- N mins ---" report.
start_time = time.time()
# os.makedirs('./checkpoint', exist_ok=True)

# Training settings
parser = argparse.ArgumentParser(description='PyTorch ada. FitNet')

parser.add_argument('--dataset',
                    choices=['CIFAR10',
                             'CIFAR100'
                            ],
                    default='CIFAR10')
parser.add_argument('--teachers',
                    choices=['ResNet32',
                             'ResNet50',
                             'ResNet56',
                             'ResNet110',
                             'DenseNet121'
                            ],
                    default=['ResNet32', 'ResNet56', 'ResNet110'],
                    nargs='+')
parser.add_argument('--student',
                    choices=['ResNet8',
                             'ResNet15',
                             'ResNet16',
                             'ResNet20',
                             'myNet'
                            ],
                    default='ResNet20')

parser.add_argument('--kd_ratio', default=0.7, type=float)
# NOTE(review): n_class is independent of --dataset; CIFAR100 presumably
# needs --n_class 100 to be passed explicitly.
parser.add_argument('--n_class', type=int, default=10, metavar='N', help='num of classes')
parser.add_argument('--T', type=float, default=20.0, metavar='Temputure', help='Temputure for distillation')
parser.add_argument('--batch_size', type=int, default=128, metavar='N', help='input batch size for training')
parser.add_argument('--test_batch_size', type=int, default=128, metavar='N', help='input batch size for testing')
parser.add_argument('--epochs', type=int, default=20, metavar='N', help='number of epochs to train (default: 20)')
# Help texts below now quote the defaults that are actually configured.
parser.add_argument('--lr', type=float, default=0.1, metavar='LR', help='learning rate (default: 0.1)')
parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='SGD momentum (default: 0.9)')
parser.add_argument('--device', default='cuda:0', type=str, help='device: cuda or cpu')
parser.add_argument('--print_freq', type=int, default=10, metavar='N', help='how many batches to wait before logging training status')

# The argument list is fixed here instead of being read from sys.argv.
config = ['--epochs', '200', '--T', '5.0', '--device', 'cuda:0']
args = parser.parse_args(config)

# Fall back to the CPU whenever CUDA is not available.
device = args.device if torch.cuda.is_available() else 'cpu'
load_dir = './checkpoint/' + args.dataset + '/'

# teachers model
teacher_models = []
for te in args.teachers:
    te_model = getattr(models, te)(num_classes=args.n_class)
#     print(te_model)
    # map_location lets a checkpoint that was saved from a GPU be restored
    # when this run has fallen back to the CPU.
    te_model.load_state_dict(
        torch.load(load_dir + te_model.model_name + '.pth', map_location=device))
    te_model.to(device)
    teacher_models.append(te_model)

st_model = getattr(models, args.student)(num_classes=args.n_class)  # args.student()
st_model.to(device)

# logging
logfile = load_dir + 'ada_fitnet_' + st_model.model_name + '.log'
# Start every run with a fresh log file.
if os.path.exists(logfile):
    os.remove(logfile)
def log_out(info):
    """Append ``info`` and a newline to the run log, then echo it to stdout.

    The destination is the module-level ``logfile`` path.

    Args:
        info: text of the log line (without trailing newline).
    """
    # The context manager closes the handle even if one of the writes raises.
    with open(logfile, mode='a') as log_handle:
        log_handle.write(info)
        log_handle.write('\n')
    print(info)
    
# visualizer
# Connect to the Visdom server, using the 'distill' environment.
vis = Visdom(env='distill')

# Loss window: one trace, seeded with the single point (0, 0).
loss_win = vis.line(
    X=np.array([0]),
    Y=np.array([0]),
    opts={
        'title': 'FitNet ada. loss',
        'xtickmin': 0,
        'ytickmin': 0,
        'ytickstep': 0.5,
    },
    name="loss",
)

# Accuracy window: two traces (train / test), each seeded at (0, 0).
acc_win = vis.line(
    X=np.column_stack((0, 0)),
    Y=np.column_stack((0, 0)),
    opts={
        'title': 'FitNet ada. ACC',
        'xtickmin': 0,
        'ytickmin': 0,
        'ytickmax': 100,
        'legend': ['train_acc', 'test_acc'],
    },
    name="acc",
)


# adapter model
class Adapter():
    """Learnable per-sample weighting over several teachers.

    Holds two trainable tensors: ``theta`` (one row per teacher, one column
    per pooled channel) and ``W`` (one column vector over the channels).
    ``forward`` turns a student feature map into a softmax weight per
    teacher; ``loss`` combines a KL term against the weighted teacher
    distribution with cross-entropy on the labels.

    Tensors are created on the module-level ``device``.
    """

    def __init__(self, in_models, pool_size):
        """Create the trainable tensors and the pooling layer.

        Args:
            in_models: sequence of teachers; only its length is used.
            pool_size: size of the student's pooled map. Index 1 is read as
                the channel count and index 2 as the pooling window.
        """
        # representations of teachers
        pool_ch = pool_size[1]
        pool_w = pool_size[2]
        # Fixed seed so theta and W start from the same values every run.
        # NOTE: this reseeds the global torch RNG as a side effect.
        torch.manual_seed(1)
        self.theta = torch.randn(len(in_models), pool_ch).to(device)  # [teachers, channels]
        self.theta.requires_grad_(True)

        self.max_feat = nn.MaxPool2d(kernel_size=(pool_w, pool_w), stride=pool_w).to(device)
        self.W = torch.randn(pool_ch, 1).to(device)  # [channels, 1]
        self.W.requires_grad_(True)
        # When False, ``loss`` adds the L2 penalty on W and theta.
        self.val = False

    def loss(self, y, labels, weighted_logits, T=10.0, alpha=0.7):
        """Distillation loss of the student against the weighted teachers.

        Args:
            y: student logits, ``[batch, classes]``.
            labels: ground-truth class indices for ``F.cross_entropy``.
            weighted_logits: target distribution for the KL term.
            T: temperature applied to the student logits.
            alpha: weight of the KL term; ``1 - alpha`` weighs cross-entropy.

        Returns:
            Scalar loss tensor. Unless ``self.val`` is set, an L2 penalty of
            ``0.1 * (||W||^2 + ||theta||^2)`` is added.
        """
        # dim=1 is what the implicit-dim call resolved to for 2-D logits;
        # stating it avoids the deprecation warning.
        # NOTE: KLDivLoss keeps its default reduction here, so the loss scale
        # is unchanged.
        soft_term = nn.KLDivLoss()(F.log_softmax(y / T, dim=1), weighted_logits) * (T * T * 2.0 * alpha)
        hard_term = F.cross_entropy(y, labels) * (1. - alpha)
        ls = soft_term + hard_term
        if not self.val:
            ls = ls + 0.1 * (torch.sum(self.W * self.W) + torch.sum(self.theta * self.theta))
        return ls

    def gradient(self, lr=0.01):
        """Apply one plain gradient-descent step to ``W`` and clear its grad.

        ``theta`` is not touched. ``W.grad`` must already be populated.
        """
        self.W.data = self.W.data - lr * self.W.grad.data
        # Manually zero the gradients after updating weights
        self.W.grad.data.zero_()

    def eval(self):
        """Set ``val`` so that ``loss`` stops adding the L2 penalty.

        The parameters keep ``requires_grad=True``. The previous
        ``theta.detach()`` / ``W.detach()`` calls discarded their results and
        therefore had no effect, so they were dropped.
        """
        self.val = True

    def forward(self, conv_map, te_logits_list):
        """Compute one softmax weight per teacher for every sample.

        Args:
            conv_map: student feature map ``[batch, channels, h, w]`` that the
                max-pool reduces to one value per channel.
            te_logits_list: accepted for interface compatibility; not read.

        Returns:
            Tensor ``[batch, teachers]`` whose rows sum to 1.
        """
        beta = self.max_feat(conv_map)
        # Flatten explicitly instead of squeeze(), which would also drop the
        # batch axis for a batch of one.
        beta = beta.reshape(beta.size(0), -1)  # [batch, channels]

        # alpha[n, k] = sum_c beta[n, c] * theta[k, c] * W[c], i.e. the same
        # value as (beta * theta[k]).mm(W) for each teacher k, computed in a
        # single matmul that also works for one teacher.
        alpha = beta.mm((self.theta * self.W.t()).t())  # [batch, teachers]
        weight = F.softmax(alpha, dim=1)

        return weight

# adapter instance
# Probe the student once with a dummy batch to read the size of its pooled
# map; no autograd graph is needed for that.
with torch.no_grad():
    _, _, _, pool_m, _ = st_model(torch.randn(1, 3, 128, 128).to(device))  # get pool_size of student
# create adapter instance
adapter = Adapter(teacher_models, pool_m.size())


# data
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, 4),
    transforms.ToTensor(),
    normalize,
])
test_transform = transforms.Compose([transforms.ToTensor(), normalize])
train_set = getattr(datasets, args.dataset)(root='../data', train=True, download=True, transform=train_transform)
test_set = getattr(datasets, args.dataset)(root='../data', train=False, download=False, transform=test_transform)
train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=args.test_batch_size, shuffle=False)
# optim
# --momentum is honoured here; its default (0.9) matches the value that used
# to be hard-coded.
optimizer_W = optim.SGD([adapter.W], lr=args.lr, momentum=args.momentum)
optimizer_theta = optim.SGD([adapter.theta], lr=args.lr, momentum=args.momentum)
optimizer_sgd = optim.SGD(st_model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=5e-4)
lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer_sgd, gamma=0.1, milestones=[100, 150])
lr_scheduler2 = optim.lr_scheduler.MultiStepLR(optimizer_W, milestones=[40, 50])
lr_scheduler3 = optim.lr_scheduler.MultiStepLR(optimizer_theta, milestones=[40, 50])

# loss
dist_criterion = RkdDistance().to(device)
angle_criterion = RKdAngle().to(device)
# NOTE(review): if the FitNet modules own parameters, none of the optimizers
# above receives them — confirm that is intended.
fitnet_criterion = [FitNet(32, 64), FitNet(64, 64), FitNet(64, 64)]
for fit_module in fitnet_criterion:
    fit_module.to(device)


def train_adapter(n_epochs=70, model=st_model):
    """Jointly fit the adapter tensors and the student for ``n_epochs`` epochs.

    For every batch the teachers' temperature-softened predictions are mixed
    with the adapter's per-sample weights, and the sum of ``Adapter.loss`` and
    the FitNet hint losses is back-propagated. The student optimizer and both
    adapter optimizers are stepped.

    Args:
        n_epochs: number of passes over ``train_loader``.
        model: student network returning ``(b1, b2, b3, pool, output)``.
    """
    print('Training adapter:')
    start_time = time.time()
    model.train()
    # NOTE(review): eval() sets adapter.val, which switches off the L2 penalty
    # in Adapter.loss while the adapter is being trained — confirm intended.
    adapter.eval()
    # Teachers are only queried for predictions; put them in inference mode
    # once instead of on every batch.
    for te in teacher_models:
        te.eval()

    for ep in range(n_epochs):
        for images, target in train_loader:
            images, target = images.to(device), target.to(device)
            # compute outputs
            b1, b2, b3, pool, output = model(images)
            st_maps = [b1, b2, b3, pool]

            te_scores_list = []
            hint_maps = []
            for te in teacher_models:
                with torch.no_grad():
                    _, _, _, t_pool, t_output = te(images)
                hint_maps.append(t_pool)
                # dim=1 is what the implicit-dim call used for 2-D logits.
                te_scores_list.append(F.softmax(t_output / args.T, dim=1))
            te_scores_Tensor = torch.stack(te_scores_list, dim=1)  # [batch, teachers, classes]

            weight = adapter.forward(pool, te_scores_Tensor)
            weight_t = torch.unsqueeze(weight, dim=2)
            # Per-sample convex combination of the teachers' distributions.
            weighted_logits = torch.sum(weight_t * te_scores_Tensor, dim=1)

            optimizer_sgd.zero_grad()
            optimizer_W.zero_grad()
            optimizer_theta.zero_grad()

            # Evaluated but currently left out of the total loss below.
            angle_loss = angle_criterion(output, weighted_logits)
            dist_loss = dist_criterion(output, weighted_logits)

            ada_loss = adapter.loss(output, target, weighted_logits, T=args.T, alpha=args.kd_ratio)

            # Hint losses pair student map j+1 with teacher j's pooled map,
            # for all teachers but the last.
            fit_loss = 0
            for j in range(len(teacher_models) - 1):
                fit_loss += fitnet_criterion[j](st_maps[j + 1], hint_maps[j])
            loss = ada_loss + fit_loss  # + dist_loss + angle_loss

            # The graph is rebuilt every iteration, so it need not be retained.
            loss.backward()
            optimizer_sgd.step()
            optimizer_W.step()
            optimizer_theta.step()

        # Step the schedulers after the epoch's optimizer updates, as the
        # PyTorch docs require; stepping first skips the first schedule value.
        lr_scheduler2.step()
        lr_scheduler3.step()

        log_out('epoch[{}/{}]adapter Loss: {:.4f}'.format(ep, n_epochs, loss.item()))
    end_time = time.time()
    log_out("--- adapter training cost {:.3f} mins ---".format((end_time - start_time)/60))


# train with multi-teacher
def train(epoch, model):
    """Run one epoch of multi-teacher distillation on the student.

    Only ``optimizer_sgd`` (the student's optimizer) is stepped; the adapter
    mixes the teachers' softened predictions but its own optimizers are not
    stepped here.

    Args:
        epoch: current epoch number; accepted but not read in the function.
        model: student network returning ``(b1, b2, b3, pool, output)``.

    Returns:
        Tuple ``(losses.avg, top1_avg)`` where ``top1_avg`` is the epoch's
        running top-1 average as a numpy array. The accuracy used to be the
        last batch's value only; it is now the same kind of figure that
        ``test`` reports as its third return value.
    """
    print('Training:')
    # switch to train mode
    model.train()
    adapter.eval()
    # Teachers are only queried for predictions; set inference mode once.
    for te in teacher_models:
        te.eval()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):

        # measure data loading time
        data_time.update(time.time() - end)

        images, target = images.to(device), target.to(device)

        # compute outputs
        b1, b2, b3, pool, output = model(images)
        st_maps = [b1, b2, b3, pool]

        te_scores_list = []
        hint_maps = []
        for te in teacher_models:
            with torch.no_grad():
                _, _, _, t_pool, t_output = te(images)
            hint_maps.append(t_pool)
            # dim=1 is what the implicit-dim call used for 2-D logits.
            te_scores_list.append(F.softmax(t_output / args.T, dim=1))
        te_scores_Tensor = torch.stack(te_scores_list, dim=1)  # [batch, teachers, classes]

        weight = adapter.forward(pool, te_scores_Tensor)
        weight_t = torch.unsqueeze(weight, dim=2)
        # Per-sample convex combination of the teachers' distributions.
        weighted_logits = torch.sum(weight_t * te_scores_Tensor, dim=1)

        optimizer_sgd.zero_grad()

        # Evaluated but currently left out of the total loss below.
        angle_loss = angle_criterion(output, weighted_logits)
        dist_loss = dist_criterion(output, weighted_logits)

        ada_loss = adapter.loss(output, target, weighted_logits, T=args.T, alpha=args.kd_ratio)
        # Hint losses pair student map j+1 with teacher j's pooled map, for
        # all teachers but the last.
        fit_loss = 0
        for j in range(len(teacher_models) - 1):
            fit_loss += fitnet_criterion[j](st_maps[j + 1], hint_maps[j])

        loss = ada_loss + fit_loss  # + dist_loss + angle_loss

        # The graph is rebuilt every iteration, so it need not be retained.
        loss.backward()
        optimizer_sgd.step()

        output = output.float()
        loss = loss.float()
        # measure accuracy and record loss
        train_acc = accuracy(output.data, target.data)[0]
        losses.update(loss.item(), images.size(0))
        top1.update(train_acc, images.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            log_out('[{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1))
    # Report the epoch average, not the accuracy of the final (possibly
    # smaller) batch.
    return losses.avg, top1.avg.cpu().numpy()


def test(model):
    """Evaluate ``model`` on ``test_loader`` and log running statistics.

    Args:
        model: network whose fifth return value holds the class logits.

    Returns:
        Tuple ``(losses.avg, last_batch_acc, top1_avg)``: the loss meter's
        average, the final batch's top-1 accuracy and the accuracy meter's
        average, the latter two as numpy arrays.
    """
    print('Testing:')
    # Inference mode for the evaluated network.
    model.eval()
    timer = AverageMeter()
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    progress_line = ('Test: [{0}/{1}]\t'
                     'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                     'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                     'Prec@1 {top1.val:.3f} ({top1.avg:.3f})')
    n_batches = len(test_loader)

    tick = time.time()
    with torch.no_grad():
        for step, (images, labels) in enumerate(test_loader):
            images = images.to(device)
            labels = labels.to(device)

            # Only the logits (last of the five outputs) are needed.
            logits = model(images)[4]
            batch_loss = F.cross_entropy(logits, labels)

            logits = logits.float()
            batch_loss = batch_loss.float()

            # Update the loss / accuracy meters, weighted by batch size.
            test_acc = accuracy(logits.data, labels.data)[0]
            n_samples = images.size(0)
            loss_meter.update(batch_loss.item(), n_samples)
            acc_meter.update(test_acc, n_samples)

            # Time spent on this batch.
            now = time.time()
            timer.update(now - tick)
            tick = time.time()

            if step % args.print_freq != 0:
                continue
            log_out(progress_line.format(
                step, n_batches, batch_time=timer, loss=loss_meter,
                top1=acc_meter))

    log_out(' * Prec@1 {top1.avg:.3f}'.format(top1=acc_meter))

    return loss_meter.avg, test_acc.cpu().numpy(), acc_meter.avg.cpu().numpy()

# """
print('StudentNet:\n')
print(st_model)
# Initialise the student, then fit adapter and student jointly.
st_model.apply(weights_init_normal)
train_adapter(n_epochs=80)
# st_model.apply(weights_init_normal)
best_acc = 0.0
for epoch in range(1, args.epochs + 1):
    log_out("\n===> epoch: {}/{}".format(epoch, args.epochs))
    # Set this epoch's learning rate first so the logged value is the one the
    # epoch actually trains with.
    lr_scheduler.step(epoch)
    log_out('current lr {:.5e}'.format(optimizer_sgd.param_groups[0]['lr']))
    train_loss, train_acc = train(epoch, st_model)
    # visualize loss
    vis.line(np.array([train_loss]), np.array([epoch]), loss_win, update="append")
    _, test_acc, top1 = test(st_model)
    vis.line(np.column_stack((train_acc, top1)), np.column_stack((epoch, epoch)), acc_win, update="append")
    # Track the best accuracy as a plain float: '{:.3f}' cannot format a
    # numpy array that has one or more dimensions.
    top1_value = float(np.asarray(top1).reshape(-1)[0])
    if top1_value > best_acc:
        best_acc = top1_value

# release GPU memory
torch.cuda.empty_cache()
log_out("BEST ACC: {:.3f}".format(best_acc))
log_out("--- {:.3f} mins ---".format((time.time() - start_time)/60))
# """

  init.kaiming_normal(m.weight)


Files already downloaded and verified
StudentNet:

ResNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential()
    )
    (1): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2



epoch[0/80]adapter Loss: 1.6207
epoch[1/80]adapter Loss: 1.3406
epoch[2/80]adapter Loss: 1.1132
epoch[3/80]adapter Loss: 0.9417
epoch[4/80]adapter Loss: 0.8863
epoch[5/80]adapter Loss: 1.0321
epoch[6/80]adapter Loss: 0.7275
epoch[7/80]adapter Loss: 0.6101
epoch[8/80]adapter Loss: 0.7095
epoch[9/80]adapter Loss: 0.8646
epoch[10/80]adapter Loss: 0.7112
epoch[11/80]adapter Loss: 0.6941
epoch[12/80]adapter Loss: 0.8521
epoch[13/80]adapter Loss: 0.6496
epoch[14/80]adapter Loss: 0.5462
epoch[15/80]adapter Loss: 0.6668
epoch[16/80]adapter Loss: 0.7586
epoch[17/80]adapter Loss: 0.4472
epoch[18/80]adapter Loss: 0.5592
epoch[19/80]adapter Loss: 0.5962
epoch[20/80]adapter Loss: 0.4775
epoch[21/80]adapter Loss: 0.6044
epoch[22/80]adapter Loss: 0.5636
epoch[23/80]adapter Loss: 0.4450
epoch[24/80]adapter Loss: 0.7001
epoch[25/80]adapter Loss: 0.5509
epoch[26/80]adapter Loss: 0.4664
epoch[27/80]adapter Loss: 0.5320
epoch[28/80]adapter Loss: 0.6596
epoch[29/80]adapter Loss: 0.5473
epoch[30/80]adapter 



[10/391]	Time 0.153 (0.155)	Data 0.034 (0.040)	Loss 0.4179 (0.4835)	Prec@1 89.844 (87.500)
[20/391]	Time 0.158 (0.154)	Data 0.037 (0.040)	Loss 0.4267 (0.4809)	Prec@1 88.281 (87.426)
[30/391]	Time 0.143 (0.145)	Data 0.033 (0.038)	Loss 0.4615 (0.4712)	Prec@1 88.281 (87.450)
[40/391]	Time 0.153 (0.145)	Data 0.033 (0.038)	Loss 0.4179 (0.4580)	Prec@1 87.500 (87.919)
[50/391]	Time 0.133 (0.149)	Data 0.028 (0.039)	Loss 0.4177 (0.4562)	Prec@1 89.062 (88.006)
[60/391]	Time 0.140 (0.147)	Data 0.034 (0.037)	Loss 0.5587 (0.4601)	Prec@1 85.938 (87.935)
[70/391]	Time 0.168 (0.148)	Data 0.032 (0.038)	Loss 0.5440 (0.4642)	Prec@1 85.938 (87.709)
[80/391]	Time 0.147 (0.150)	Data 0.032 (0.038)	Loss 0.4663 (0.4649)	Prec@1 86.719 (87.703)
[90/391]	Time 0.185 (0.150)	Data 0.039 (0.038)	Loss 0.4062 (0.4625)	Prec@1 87.500 (87.740)
[100/391]	Time 0.151 (0.151)	Data 0.035 (0.038)	Loss 0.4191 (0.4678)	Prec@1 89.062 (87.515)
[110/391]	Time 0.212 (0.152)	Data 0.052 (0.038)	Loss 0.5544 (0.4721)	Prec@1 82.812 (87.33

Test: [40/79]	Time 0.047 (0.043)	Loss 0.7483 (0.7840)	Prec@1 77.344 (81.612)
Test: [50/79]	Time 0.039 (0.041)	Loss 0.6333 (0.7726)	Prec@1 85.938 (81.847)
Test: [60/79]	Time 0.031 (0.040)	Loss 0.7182 (0.7812)	Prec@1 79.688 (81.532)
Test: [70/79]	Time 0.032 (0.039)	Loss 0.5295 (0.7783)	Prec@1 85.156 (81.382)
 * Prec@1 81.400

===> epoch: 3/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.134 (0.134)	Data 0.029 (0.029)	Loss 0.4404 (0.4404)	Prec@1 87.500 (87.500)
[10/391]	Time 0.129 (0.152)	Data 0.032 (0.038)	Loss 0.3935 (0.4561)	Prec@1 88.281 (86.790)
[20/391]	Time 0.146 (0.153)	Data 0.030 (0.039)	Loss 0.3633 (0.4431)	Prec@1 89.062 (87.946)
[30/391]	Time 0.183 (0.151)	Data 0.062 (0.038)	Loss 0.4267 (0.4484)	Prec@1 89.844 (87.853)
[40/391]	Time 0.226 (0.154)	Data 0.043 (0.039)	Loss 0.5196 (0.4541)	Prec@1 85.156 (87.786)
[50/391]	Time 0.149 (0.156)	Data 0.032 (0.039)	Loss 0.4562 (0.4578)	Prec@1 85.938 (87.592)
[60/391]	Time 0.162 (0.153)	Data 0.040 (0.038)	Loss 0.4844 (0.4612)	Prec@1 88.

[380/391]	Time 0.132 (0.148)	Data 0.033 (0.039)	Loss 0.5097 (0.4945)	Prec@1 85.156 (86.614)
[390/391]	Time 0.121 (0.148)	Data 0.029 (0.039)	Loss 0.4362 (0.4948)	Prec@1 93.750 (86.612)
Testing:
Test: [0/79]	Time 0.047 (0.047)	Loss 0.9255 (0.9255)	Prec@1 84.375 (84.375)
Test: [10/79]	Time 0.039 (0.048)	Loss 0.6879 (0.7546)	Prec@1 84.375 (82.386)
Test: [20/79]	Time 0.034 (0.047)	Loss 0.7703 (0.7800)	Prec@1 82.031 (81.622)
Test: [30/79]	Time 0.045 (0.046)	Loss 0.4677 (0.7836)	Prec@1 89.062 (81.401)
Test: [40/79]	Time 0.054 (0.045)	Loss 1.0797 (0.7756)	Prec@1 77.344 (81.555)
Test: [50/79]	Time 0.034 (0.044)	Loss 0.7944 (0.7575)	Prec@1 82.812 (81.909)
Test: [60/79]	Time 0.032 (0.043)	Loss 0.7346 (0.7510)	Prec@1 85.156 (82.070)
Test: [70/79]	Time 0.035 (0.041)	Loss 0.6467 (0.7463)	Prec@1 84.375 (82.218)
 * Prec@1 82.300

===> epoch: 5/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.149 (0.149)	Data 0.034 (0.034)	Loss 0.6049 (0.6049)	Prec@1 80.469 (80.469)
[10/391]	Time 0.146 (0.169)	Data 

[330/391]	Time 0.150 (0.159)	Data 0.050 (0.040)	Loss 0.3138 (0.4902)	Prec@1 92.188 (86.792)
[340/391]	Time 0.153 (0.160)	Data 0.038 (0.041)	Loss 0.4818 (0.4905)	Prec@1 88.281 (86.785)
[350/391]	Time 0.126 (0.160)	Data 0.035 (0.041)	Loss 0.6273 (0.4915)	Prec@1 82.031 (86.728)
[360/391]	Time 0.157 (0.160)	Data 0.019 (0.041)	Loss 0.4574 (0.4915)	Prec@1 88.281 (86.717)
[370/391]	Time 0.146 (0.160)	Data 0.036 (0.040)	Loss 0.5925 (0.4917)	Prec@1 81.250 (86.687)
[380/391]	Time 0.148 (0.160)	Data 0.033 (0.041)	Loss 0.3699 (0.4920)	Prec@1 87.500 (86.670)
[390/391]	Time 0.097 (0.159)	Data 0.029 (0.040)	Loss 0.4741 (0.4922)	Prec@1 88.750 (86.658)
Testing:
Test: [0/79]	Time 0.030 (0.030)	Loss 0.5221 (0.5221)	Prec@1 84.375 (84.375)
Test: [10/79]	Time 0.028 (0.034)	Loss 0.6655 (0.6337)	Prec@1 85.156 (84.020)
Test: [20/79]	Time 0.031 (0.033)	Loss 0.7171 (0.6772)	Prec@1 81.250 (82.999)
Test: [30/79]	Time 0.040 (0.035)	Loss 0.7091 (0.6993)	Prec@1 82.812 (82.560)
Test: [40/79]	Time 0.043 (0.036)	Loss 0.

[280/391]	Time 0.193 (0.153)	Data 0.067 (0.041)	Loss 0.4679 (0.4932)	Prec@1 87.500 (86.758)
[290/391]	Time 0.123 (0.154)	Data 0.040 (0.041)	Loss 0.5199 (0.4917)	Prec@1 86.719 (86.818)
[300/391]	Time 0.163 (0.154)	Data 0.047 (0.042)	Loss 0.4818 (0.4918)	Prec@1 85.938 (86.817)
[310/391]	Time 0.166 (0.154)	Data 0.040 (0.041)	Loss 0.6068 (0.4917)	Prec@1 82.812 (86.827)
[320/391]	Time 0.128 (0.154)	Data 0.034 (0.041)	Loss 0.5752 (0.4918)	Prec@1 84.375 (86.811)
[330/391]	Time 0.132 (0.154)	Data 0.027 (0.041)	Loss 0.6355 (0.4907)	Prec@1 80.469 (86.865)
[340/391]	Time 0.129 (0.154)	Data 0.032 (0.041)	Loss 0.5325 (0.4909)	Prec@1 82.812 (86.865)
[350/391]	Time 0.150 (0.153)	Data 0.054 (0.041)	Loss 0.4729 (0.4914)	Prec@1 89.844 (86.839)
[360/391]	Time 0.168 (0.153)	Data 0.051 (0.041)	Loss 0.4648 (0.4909)	Prec@1 85.156 (86.851)
[370/391]	Time 0.123 (0.153)	Data 0.041 (0.041)	Loss 0.4850 (0.4909)	Prec@1 89.062 (86.843)
[380/391]	Time 0.146 (0.152)	Data 0.035 (0.041)	Loss 0.4026 (0.4913)	Prec@1 89.8

[230/391]	Time 0.132 (0.152)	Data 0.030 (0.040)	Loss 0.4741 (0.4925)	Prec@1 88.281 (86.621)
[240/391]	Time 0.155 (0.152)	Data 0.035 (0.041)	Loss 0.5153 (0.4937)	Prec@1 82.031 (86.573)
[250/391]	Time 0.175 (0.152)	Data 0.037 (0.040)	Loss 0.4595 (0.4937)	Prec@1 88.281 (86.560)
[260/391]	Time 0.128 (0.152)	Data 0.034 (0.040)	Loss 0.4391 (0.4919)	Prec@1 89.062 (86.626)
[270/391]	Time 0.137 (0.152)	Data 0.036 (0.040)	Loss 0.5281 (0.4918)	Prec@1 84.375 (86.644)
[280/391]	Time 0.149 (0.152)	Data 0.027 (0.040)	Loss 0.5347 (0.4928)	Prec@1 86.719 (86.602)
[290/391]	Time 0.148 (0.151)	Data 0.035 (0.040)	Loss 0.4944 (0.4931)	Prec@1 86.719 (86.598)
[300/391]	Time 0.131 (0.152)	Data 0.039 (0.040)	Loss 0.4158 (0.4927)	Prec@1 90.625 (86.594)
[310/391]	Time 0.138 (0.152)	Data 0.034 (0.040)	Loss 0.4127 (0.4916)	Prec@1 85.156 (86.638)
[320/391]	Time 0.165 (0.151)	Data 0.033 (0.040)	Loss 0.4793 (0.4910)	Prec@1 87.500 (86.702)
[330/391]	Time 0.138 (0.151)	Data 0.033 (0.040)	Loss 0.4640 (0.4923)	Prec@1 87.5

[180/391]	Time 0.146 (0.150)	Data 0.037 (0.039)	Loss 0.3988 (0.4876)	Prec@1 90.625 (87.116)
[190/391]	Time 0.142 (0.150)	Data 0.035 (0.039)	Loss 0.6400 (0.4890)	Prec@1 84.375 (87.083)
[200/391]	Time 0.125 (0.150)	Data 0.029 (0.039)	Loss 0.4797 (0.4892)	Prec@1 85.156 (87.041)
[210/391]	Time 0.170 (0.150)	Data 0.051 (0.039)	Loss 0.5224 (0.4901)	Prec@1 83.594 (86.937)
[220/391]	Time 0.124 (0.150)	Data 0.034 (0.039)	Loss 0.5396 (0.4909)	Prec@1 83.594 (86.920)
[230/391]	Time 0.158 (0.150)	Data 0.027 (0.039)	Loss 0.4610 (0.4931)	Prec@1 89.844 (86.874)
[240/391]	Time 0.174 (0.150)	Data 0.055 (0.039)	Loss 0.4001 (0.4935)	Prec@1 91.406 (86.894)
[250/391]	Time 0.132 (0.151)	Data 0.041 (0.039)	Loss 0.5223 (0.4945)	Prec@1 84.375 (86.853)
[260/391]	Time 0.131 (0.150)	Data 0.041 (0.039)	Loss 0.4829 (0.4942)	Prec@1 88.281 (86.886)
[270/391]	Time 0.146 (0.149)	Data 0.037 (0.039)	Loss 0.5497 (0.4934)	Prec@1 84.375 (86.892)
[280/391]	Time 0.156 (0.150)	Data 0.048 (0.039)	Loss 0.4680 (0.4926)	Prec@1 85.1

[130/391]	Time 0.153 (0.148)	Data 0.033 (0.039)	Loss 0.3817 (0.4684)	Prec@1 89.844 (87.232)
[140/391]	Time 0.157 (0.148)	Data 0.039 (0.040)	Loss 0.4643 (0.4711)	Prec@1 86.719 (87.134)
[150/391]	Time 0.210 (0.149)	Data 0.062 (0.040)	Loss 0.3832 (0.4736)	Prec@1 87.500 (87.034)
[160/391]	Time 0.147 (0.149)	Data 0.050 (0.040)	Loss 0.6264 (0.4774)	Prec@1 82.812 (86.952)
[170/391]	Time 0.129 (0.149)	Data 0.028 (0.040)	Loss 0.4879 (0.4822)	Prec@1 85.156 (86.796)
[180/391]	Time 0.120 (0.150)	Data 0.035 (0.040)	Loss 0.4971 (0.4845)	Prec@1 85.156 (86.758)
[190/391]	Time 0.136 (0.149)	Data 0.036 (0.040)	Loss 0.6012 (0.4863)	Prec@1 79.688 (86.739)
[200/391]	Time 0.156 (0.149)	Data 0.030 (0.040)	Loss 0.4625 (0.4874)	Prec@1 84.375 (86.703)
[210/391]	Time 0.148 (0.149)	Data 0.034 (0.040)	Loss 0.4735 (0.4882)	Prec@1 89.062 (86.685)
[220/391]	Time 0.120 (0.148)	Data 0.033 (0.040)	Loss 0.4236 (0.4880)	Prec@1 88.281 (86.683)
[230/391]	Time 0.140 (0.148)	Data 0.057 (0.040)	Loss 0.3713 (0.4885)	Prec@1 92.1

[80/391]	Time 0.186 (0.157)	Data 0.059 (0.040)	Loss 0.5290 (0.4966)	Prec@1 87.500 (86.449)
[90/391]	Time 0.161 (0.156)	Data 0.046 (0.039)	Loss 0.4412 (0.4979)	Prec@1 88.281 (86.444)
[100/391]	Time 0.157 (0.155)	Data 0.045 (0.040)	Loss 0.4880 (0.4980)	Prec@1 84.375 (86.417)
[110/391]	Time 0.133 (0.154)	Data 0.028 (0.040)	Loss 0.4616 (0.4964)	Prec@1 89.062 (86.508)
[120/391]	Time 0.190 (0.155)	Data 0.061 (0.040)	Loss 0.4905 (0.4960)	Prec@1 85.156 (86.493)
[130/391]	Time 0.142 (0.154)	Data 0.053 (0.040)	Loss 0.3859 (0.4935)	Prec@1 91.406 (86.594)
[140/391]	Time 0.155 (0.154)	Data 0.042 (0.040)	Loss 0.4751 (0.4930)	Prec@1 86.719 (86.553)
[150/391]	Time 0.156 (0.154)	Data 0.035 (0.040)	Loss 0.5164 (0.4920)	Prec@1 85.938 (86.657)
[160/391]	Time 0.157 (0.154)	Data 0.037 (0.040)	Loss 0.5244 (0.4928)	Prec@1 83.594 (86.607)
[170/391]	Time 0.196 (0.153)	Data 0.070 (0.040)	Loss 0.4985 (0.4945)	Prec@1 87.500 (86.518)
[180/391]	Time 0.187 (0.152)	Data 0.037 (0.040)	Loss 0.5681 (0.4947)	Prec@1 83.594

[30/391]	Time 0.155 (0.137)	Data 0.030 (0.036)	Loss 0.5720 (0.4981)	Prec@1 82.812 (86.971)
[40/391]	Time 0.147 (0.141)	Data 0.033 (0.037)	Loss 0.4680 (0.4912)	Prec@1 86.719 (87.119)
[50/391]	Time 0.130 (0.142)	Data 0.039 (0.037)	Loss 0.4059 (0.4894)	Prec@1 92.188 (87.102)
[60/391]	Time 0.167 (0.144)	Data 0.030 (0.037)	Loss 0.5474 (0.4941)	Prec@1 86.719 (86.949)
[70/391]	Time 0.139 (0.145)	Data 0.037 (0.037)	Loss 0.5426 (0.4991)	Prec@1 83.594 (86.708)
[80/391]	Time 0.149 (0.146)	Data 0.034 (0.037)	Loss 0.4590 (0.4954)	Prec@1 89.062 (86.873)
[90/391]	Time 0.138 (0.146)	Data 0.027 (0.037)	Loss 0.4809 (0.4939)	Prec@1 85.156 (86.925)
[100/391]	Time 0.157 (0.148)	Data 0.034 (0.037)	Loss 0.5258 (0.4889)	Prec@1 85.938 (87.044)
[110/391]	Time 0.163 (0.148)	Data 0.033 (0.037)	Loss 0.3962 (0.4889)	Prec@1 89.844 (86.923)
[120/391]	Time 0.102 (0.148)	Data 0.029 (0.037)	Loss 0.4216 (0.4859)	Prec@1 89.062 (87.087)
[130/391]	Time 0.175 (0.148)	Data 0.039 (0.037)	Loss 0.4581 (0.4834)	Prec@1 87.500 (87.

Test: [60/79]	Time 0.029 (0.039)	Loss 0.7579 (0.9374)	Prec@1 82.031 (79.226)
Test: [70/79]	Time 0.036 (0.039)	Loss 0.7759 (0.9391)	Prec@1 85.938 (79.126)
 * Prec@1 79.060

===> epoch: 20/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.166 (0.166)	Data 0.069 (0.069)	Loss 0.5354 (0.5354)	Prec@1 85.156 (85.156)
[10/391]	Time 0.145 (0.149)	Data 0.041 (0.040)	Loss 0.5286 (0.4952)	Prec@1 85.156 (87.500)
[20/391]	Time 0.142 (0.146)	Data 0.029 (0.037)	Loss 0.6155 (0.4720)	Prec@1 80.469 (87.835)
[30/391]	Time 0.141 (0.142)	Data 0.033 (0.035)	Loss 0.4117 (0.4667)	Prec@1 88.281 (88.029)
[40/391]	Time 0.123 (0.142)	Data 0.032 (0.035)	Loss 0.3083 (0.4666)	Prec@1 91.406 (88.167)
[50/391]	Time 0.182 (0.145)	Data 0.061 (0.037)	Loss 0.4547 (0.4693)	Prec@1 88.281 (88.021)
[60/391]	Time 0.157 (0.148)	Data 0.036 (0.037)	Loss 0.4119 (0.4698)	Prec@1 92.969 (87.743)
[70/391]	Time 0.136 (0.148)	Data 0.034 (0.038)	Loss 0.5062 (0.4769)	Prec@1 88.281 (87.456)
[80/391]	Time 0.130 (0.148)	Data 0.026 (0.038)	Lo

Test: [10/79]	Time 0.050 (0.038)	Loss 0.6261 (0.6875)	Prec@1 84.375 (82.670)
Test: [20/79]	Time 0.040 (0.038)	Loss 0.6574 (0.7218)	Prec@1 82.031 (81.734)
Test: [30/79]	Time 0.032 (0.039)	Loss 0.6226 (0.7083)	Prec@1 84.375 (82.056)
Test: [40/79]	Time 0.033 (0.040)	Loss 0.5517 (0.6983)	Prec@1 83.594 (82.679)
Test: [50/79]	Time 0.034 (0.040)	Loss 0.8032 (0.6988)	Prec@1 82.812 (82.767)
Test: [60/79]	Time 0.034 (0.040)	Loss 0.8316 (0.7032)	Prec@1 78.125 (82.518)
Test: [70/79]	Time 0.048 (0.041)	Loss 0.4893 (0.7073)	Prec@1 82.812 (82.361)
 * Prec@1 82.360

===> epoch: 22/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.169 (0.169)	Data 0.033 (0.033)	Loss 0.3273 (0.3273)	Prec@1 92.969 (92.969)
[10/391]	Time 0.159 (0.171)	Data 0.046 (0.039)	Loss 0.5136 (0.4975)	Prec@1 84.375 (87.571)
[20/391]	Time 0.170 (0.163)	Data 0.035 (0.040)	Loss 0.4187 (0.4929)	Prec@1 87.500 (87.054)
[30/391]	Time 0.149 (0.163)	Data 0.034 (0.041)	Loss 0.4522 (0.4866)	Prec@1 88.281 (86.946)
[40/391]	Time 0.192 (0.159)	

[360/391]	Time 0.146 (0.151)	Data 0.031 (0.039)	Loss 0.4589 (0.4856)	Prec@1 86.719 (86.909)
[370/391]	Time 0.126 (0.151)	Data 0.033 (0.039)	Loss 0.5162 (0.4845)	Prec@1 87.500 (86.946)
[380/391]	Time 0.153 (0.151)	Data 0.036 (0.039)	Loss 0.4228 (0.4847)	Prec@1 89.844 (86.953)
[390/391]	Time 0.110 (0.151)	Data 0.040 (0.039)	Loss 0.5174 (0.4863)	Prec@1 85.000 (86.900)
Testing:
Test: [0/79]	Time 0.037 (0.037)	Loss 0.7478 (0.7478)	Prec@1 82.031 (82.031)
Test: [10/79]	Time 0.034 (0.040)	Loss 0.7745 (0.9157)	Prec@1 82.812 (79.688)
Test: [20/79]	Time 0.034 (0.038)	Loss 0.8160 (0.9539)	Prec@1 81.250 (78.757)
Test: [30/79]	Time 0.037 (0.041)	Loss 0.9557 (0.9438)	Prec@1 82.031 (78.931)
Test: [40/79]	Time 0.028 (0.039)	Loss 0.9310 (0.9296)	Prec@1 73.438 (79.287)
Test: [50/79]	Time 0.035 (0.040)	Loss 0.7606 (0.9187)	Prec@1 83.594 (79.366)
Test: [60/79]	Time 0.083 (0.041)	Loss 0.8018 (0.9337)	Prec@1 79.688 (79.188)
Test: [70/79]	Time 0.038 (0.040)	Loss 0.7722 (0.9331)	Prec@1 83.594 (79.159)
 * Prec@

[310/391]	Time 0.146 (0.149)	Data 0.037 (0.038)	Loss 0.5303 (0.4887)	Prec@1 83.594 (86.761)
[320/391]	Time 0.111 (0.149)	Data 0.025 (0.038)	Loss 0.4572 (0.4892)	Prec@1 85.156 (86.736)
[330/391]	Time 0.146 (0.149)	Data 0.049 (0.038)	Loss 0.4672 (0.4896)	Prec@1 89.844 (86.740)
[340/391]	Time 0.161 (0.149)	Data 0.033 (0.038)	Loss 0.6009 (0.4914)	Prec@1 82.812 (86.675)
[350/391]	Time 0.149 (0.149)	Data 0.028 (0.038)	Loss 0.5416 (0.4924)	Prec@1 87.500 (86.641)
[360/391]	Time 0.160 (0.149)	Data 0.065 (0.038)	Loss 0.5137 (0.4931)	Prec@1 84.375 (86.604)
[370/391]	Time 0.141 (0.149)	Data 0.034 (0.038)	Loss 0.4526 (0.4930)	Prec@1 87.500 (86.605)
[380/391]	Time 0.154 (0.149)	Data 0.032 (0.038)	Loss 0.4787 (0.4933)	Prec@1 86.719 (86.579)
[390/391]	Time 0.110 (0.150)	Data 0.025 (0.038)	Loss 0.5694 (0.4936)	Prec@1 78.750 (86.568)
Testing:
Test: [0/79]	Time 0.059 (0.059)	Loss 0.5303 (0.5303)	Prec@1 83.594 (83.594)
Test: [10/79]	Time 0.036 (0.037)	Loss 0.5705 (0.6285)	Prec@1 86.719 (83.594)
Test: [20/

[260/391]	Time 0.146 (0.148)	Data 0.042 (0.039)	Loss 0.4098 (0.4796)	Prec@1 90.625 (86.934)
[270/391]	Time 0.149 (0.149)	Data 0.031 (0.039)	Loss 0.4976 (0.4800)	Prec@1 86.719 (86.921)
[280/391]	Time 0.144 (0.149)	Data 0.030 (0.039)	Loss 0.3984 (0.4799)	Prec@1 92.188 (86.950)
[290/391]	Time 0.165 (0.149)	Data 0.044 (0.039)	Loss 0.4600 (0.4817)	Prec@1 86.719 (86.907)
[300/391]	Time 0.148 (0.148)	Data 0.049 (0.039)	Loss 0.4584 (0.4808)	Prec@1 90.625 (86.950)
[310/391]	Time 0.131 (0.148)	Data 0.030 (0.039)	Loss 0.4855 (0.4810)	Prec@1 86.719 (86.952)
[320/391]	Time 0.186 (0.148)	Data 0.046 (0.039)	Loss 0.4341 (0.4809)	Prec@1 84.375 (86.938)
[330/391]	Time 0.138 (0.148)	Data 0.036 (0.039)	Loss 0.3860 (0.4798)	Prec@1 89.844 (86.981)
[340/391]	Time 0.128 (0.148)	Data 0.027 (0.039)	Loss 0.5789 (0.4801)	Prec@1 88.281 (87.007)
[350/391]	Time 0.156 (0.148)	Data 0.040 (0.039)	Loss 0.3782 (0.4798)	Prec@1 89.844 (87.019)
[360/391]	Time 0.128 (0.148)	Data 0.032 (0.039)	Loss 0.6022 (0.4808)	Prec@1 83.5

[210/391]	Time 0.151 (0.151)	Data 0.032 (0.040)	Loss 0.4953 (0.4933)	Prec@1 88.281 (86.678)
[220/391]	Time 0.156 (0.152)	Data 0.035 (0.040)	Loss 0.4811 (0.4941)	Prec@1 84.375 (86.645)
[230/391]	Time 0.146 (0.152)	Data 0.043 (0.040)	Loss 0.4653 (0.4937)	Prec@1 85.156 (86.624)
[240/391]	Time 0.143 (0.151)	Data 0.035 (0.040)	Loss 0.4727 (0.4935)	Prec@1 86.719 (86.625)
[250/391]	Time 0.188 (0.151)	Data 0.025 (0.040)	Loss 0.3442 (0.4926)	Prec@1 90.625 (86.669)
[260/391]	Time 0.144 (0.152)	Data 0.034 (0.040)	Loss 0.5672 (0.4907)	Prec@1 86.719 (86.755)
[270/391]	Time 0.133 (0.153)	Data 0.032 (0.040)	Loss 0.5315 (0.4908)	Prec@1 88.281 (86.750)
[280/391]	Time 0.167 (0.153)	Data 0.039 (0.040)	Loss 0.5936 (0.4923)	Prec@1 82.812 (86.652)
[290/391]	Time 0.161 (0.153)	Data 0.036 (0.040)	Loss 0.4875 (0.4940)	Prec@1 85.938 (86.598)
[300/391]	Time 0.117 (0.153)	Data 0.028 (0.040)	Loss 0.4848 (0.4936)	Prec@1 87.500 (86.625)
[310/391]	Time 0.128 (0.152)	Data 0.037 (0.040)	Loss 0.5441 (0.4932)	Prec@1 85.9

[160/391]	Time 0.139 (0.150)	Data 0.035 (0.038)	Loss 0.4604 (0.4829)	Prec@1 89.844 (86.981)
[170/391]	Time 0.131 (0.150)	Data 0.037 (0.038)	Loss 0.4087 (0.4843)	Prec@1 89.062 (86.915)
[180/391]	Time 0.203 (0.150)	Data 0.065 (0.039)	Loss 0.5858 (0.4842)	Prec@1 84.375 (86.870)
[190/391]	Time 0.136 (0.149)	Data 0.034 (0.038)	Loss 0.4213 (0.4838)	Prec@1 89.062 (86.882)
[200/391]	Time 0.145 (0.148)	Data 0.033 (0.038)	Loss 0.5131 (0.4871)	Prec@1 87.500 (86.715)
[210/391]	Time 0.181 (0.150)	Data 0.048 (0.039)	Loss 0.5074 (0.4898)	Prec@1 86.719 (86.619)
[220/391]	Time 0.167 (0.150)	Data 0.037 (0.039)	Loss 0.5367 (0.4922)	Prec@1 85.156 (86.538)
[230/391]	Time 0.183 (0.150)	Data 0.039 (0.039)	Loss 0.6738 (0.4952)	Prec@1 86.719 (86.499)
[240/391]	Time 0.138 (0.150)	Data 0.028 (0.039)	Loss 0.5282 (0.4948)	Prec@1 82.031 (86.524)
[250/391]	Time 0.166 (0.151)	Data 0.064 (0.039)	Loss 0.4885 (0.4944)	Prec@1 86.719 (86.579)
[260/391]	Time 0.183 (0.151)	Data 0.065 (0.039)	Loss 0.4236 (0.4952)	Prec@1 89.0

[110/391]	Time 0.165 (0.147)	Data 0.064 (0.040)	Loss 0.5497 (0.4611)	Prec@1 90.625 (87.810)
[120/391]	Time 0.121 (0.149)	Data 0.039 (0.040)	Loss 0.4177 (0.4607)	Prec@1 89.062 (87.797)
[130/391]	Time 0.136 (0.149)	Data 0.041 (0.040)	Loss 0.4864 (0.4618)	Prec@1 86.719 (87.750)
[140/391]	Time 0.155 (0.150)	Data 0.027 (0.040)	Loss 0.4967 (0.4654)	Prec@1 84.375 (87.616)
[150/391]	Time 0.224 (0.150)	Data 0.068 (0.040)	Loss 0.4239 (0.4693)	Prec@1 89.062 (87.448)
[160/391]	Time 0.155 (0.151)	Data 0.031 (0.040)	Loss 0.3575 (0.4693)	Prec@1 90.625 (87.398)
[170/391]	Time 0.158 (0.151)	Data 0.044 (0.039)	Loss 0.7208 (0.4713)	Prec@1 80.469 (87.340)
[180/391]	Time 0.135 (0.152)	Data 0.039 (0.039)	Loss 0.3764 (0.4740)	Prec@1 93.750 (87.310)
[190/391]	Time 0.169 (0.152)	Data 0.035 (0.040)	Loss 0.4300 (0.4756)	Prec@1 88.281 (87.193)
[200/391]	Time 0.160 (0.152)	Data 0.036 (0.039)	Loss 0.5936 (0.4764)	Prec@1 84.375 (87.150)
[210/391]	Time 0.173 (0.153)	Data 0.057 (0.039)	Loss 0.4504 (0.4775)	Prec@1 85.1

[60/391]	Time 0.183 (0.152)	Data 0.039 (0.038)	Loss 0.5627 (0.4843)	Prec@1 85.156 (86.629)
[70/391]	Time 0.186 (0.152)	Data 0.063 (0.039)	Loss 0.4377 (0.4853)	Prec@1 92.188 (86.565)
[80/391]	Time 0.180 (0.152)	Data 0.055 (0.039)	Loss 0.4618 (0.4861)	Prec@1 89.844 (86.526)
[90/391]	Time 0.164 (0.154)	Data 0.039 (0.039)	Loss 0.4893 (0.4845)	Prec@1 88.281 (86.684)
[100/391]	Time 0.148 (0.153)	Data 0.029 (0.039)	Loss 0.4757 (0.4807)	Prec@1 92.188 (86.920)
[110/391]	Time 0.165 (0.153)	Data 0.036 (0.038)	Loss 0.3960 (0.4807)	Prec@1 92.188 (86.909)
[120/391]	Time 0.171 (0.153)	Data 0.049 (0.039)	Loss 0.7037 (0.4835)	Prec@1 84.375 (86.900)
[130/391]	Time 0.174 (0.153)	Data 0.044 (0.039)	Loss 0.5689 (0.4846)	Prec@1 85.156 (86.838)
[140/391]	Time 0.136 (0.153)	Data 0.034 (0.038)	Loss 0.4238 (0.4850)	Prec@1 88.281 (86.708)
[150/391]	Time 0.148 (0.153)	Data 0.039 (0.038)	Loss 0.5453 (0.4857)	Prec@1 85.938 (86.698)
[160/391]	Time 0.136 (0.153)	Data 0.032 (0.038)	Loss 0.4113 (0.4842)	Prec@1 89.062 (

[10/391]	Time 0.133 (0.124)	Data 0.046 (0.037)	Loss 0.5561 (0.4444)	Prec@1 85.938 (88.565)
[20/391]	Time 0.178 (0.132)	Data 0.050 (0.035)	Loss 0.4427 (0.4656)	Prec@1 88.281 (87.574)
[30/391]	Time 0.158 (0.140)	Data 0.036 (0.037)	Loss 0.3749 (0.4658)	Prec@1 89.844 (87.450)
[40/391]	Time 0.136 (0.142)	Data 0.023 (0.036)	Loss 0.5203 (0.4679)	Prec@1 88.281 (87.595)
[50/391]	Time 0.169 (0.146)	Data 0.037 (0.036)	Loss 0.6479 (0.4623)	Prec@1 79.688 (87.883)
[60/391]	Time 0.158 (0.147)	Data 0.048 (0.037)	Loss 0.5078 (0.4663)	Prec@1 85.938 (87.666)
[70/391]	Time 0.139 (0.146)	Data 0.027 (0.036)	Loss 0.4675 (0.4704)	Prec@1 86.719 (87.335)
[80/391]	Time 0.158 (0.147)	Data 0.039 (0.036)	Loss 0.3824 (0.4692)	Prec@1 91.406 (87.365)
[90/391]	Time 0.132 (0.146)	Data 0.033 (0.036)	Loss 0.5312 (0.4660)	Prec@1 81.250 (87.509)
[100/391]	Time 0.142 (0.146)	Data 0.036 (0.037)	Loss 0.4674 (0.4669)	Prec@1 88.281 (87.430)
[110/391]	Time 0.138 (0.146)	Data 0.034 (0.037)	Loss 0.4443 (0.4712)	Prec@1 85.938 (87.26

Test: [40/79]	Time 0.033 (0.039)	Loss 1.5837 (1.3121)	Prec@1 66.406 (72.199)
Test: [50/79]	Time 0.041 (0.038)	Loss 1.3296 (1.2980)	Prec@1 71.875 (72.457)
Test: [60/79]	Time 0.035 (0.039)	Loss 0.9588 (1.3066)	Prec@1 79.688 (72.515)
Test: [70/79]	Time 0.044 (0.038)	Loss 1.2082 (1.2991)	Prec@1 76.562 (72.766)
 * Prec@1 72.840

===> epoch: 39/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.177 (0.177)	Data 0.040 (0.040)	Loss 0.4595 (0.4595)	Prec@1 86.719 (86.719)
[10/391]	Time 0.124 (0.146)	Data 0.031 (0.040)	Loss 0.5559 (0.4819)	Prec@1 82.031 (87.500)
[20/391]	Time 0.112 (0.148)	Data 0.037 (0.042)	Loss 0.4056 (0.4674)	Prec@1 89.844 (87.723)
[30/391]	Time 0.158 (0.153)	Data 0.041 (0.042)	Loss 0.4373 (0.4728)	Prec@1 88.281 (87.550)
[40/391]	Time 0.183 (0.150)	Data 0.044 (0.039)	Loss 0.3546 (0.4636)	Prec@1 91.406 (87.900)
[50/391]	Time 0.152 (0.152)	Data 0.038 (0.040)	Loss 0.4209 (0.4631)	Prec@1 87.500 (88.006)
[60/391]	Time 0.137 (0.153)	Data 0.028 (0.040)	Loss 0.5139 (0.4635)	Prec@1 88

[380/391]	Time 0.142 (0.150)	Data 0.051 (0.039)	Loss 0.4654 (0.4828)	Prec@1 86.719 (87.061)
[390/391]	Time 0.151 (0.150)	Data 0.052 (0.039)	Loss 0.4035 (0.4832)	Prec@1 86.250 (87.042)
Testing:
Test: [0/79]	Time 0.036 (0.036)	Loss 0.8753 (0.8753)	Prec@1 82.031 (82.031)
Test: [10/79]	Time 0.036 (0.035)	Loss 1.5283 (1.0207)	Prec@1 71.875 (76.918)
Test: [20/79]	Time 0.039 (0.036)	Loss 1.0634 (1.1188)	Prec@1 80.469 (76.339)
Test: [30/79]	Time 0.041 (0.037)	Loss 0.7629 (1.0949)	Prec@1 78.125 (76.462)
Test: [40/79]	Time 0.040 (0.039)	Loss 1.3979 (1.0664)	Prec@1 72.656 (77.153)
Test: [50/79]	Time 0.034 (0.039)	Loss 0.8051 (1.0538)	Prec@1 79.688 (77.191)
Test: [60/79]	Time 0.034 (0.040)	Loss 1.4290 (1.0476)	Prec@1 75.000 (77.241)
Test: [70/79]	Time 0.032 (0.039)	Loss 0.9584 (1.0440)	Prec@1 77.344 (77.212)
 * Prec@1 77.290

===> epoch: 41/200
current lr 1.00000e-01
Training:
[0/391]	Time 0.140 (0.140)	Data 0.037 (0.037)	Loss 0.4678 (0.4678)	Prec@1 90.625 (90.625)
[10/391]	Time 0.139 (0.145)	Data

[330/391]	Time 0.143 (0.149)	Data 0.030 (0.038)	Loss 0.4928 (0.4903)	Prec@1 88.281 (86.728)
