In [4]:
import os
import torch
import torch.nn as nn
from conf import global_settings as settings
import torch.optim as optim

import torchvision
from torch.utils.data import DataLoader
from torchvision import transforms
from torch.utils.tensorboard import SummaryWriter

import time
from datetime import datetime

from utils import  get_training_dataloader, get_test_dataloader, WarmUpLR, \
    most_recent_folder, most_recent_weights, last_epoch, best_acc_weights



In [5]:

class BasicBlock(nn.Module):
    """Basic Block for resnet 18 and resnet 34

    """

    #BasicBlock and BottleNeck block
    #have different output size
    #we use class attribute expansion
    #to distinct
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        #residual function
        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels * BasicBlock.expansion)
        )

        #shortcut
        self.shortcut = nn.Sequential()

        #the shortcut output dimension is not the same with residual function
        #use 1*1 convolution to match the dimension
        if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * BasicBlock.expansion)
            )

    def forward(self, x):
        return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))

In [6]:
class ResNet(nn.Module):

    def __init__(self, block, num_block, num_classes=100):
        super().__init__()

        self.in_channels = 64

        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True))
        #we use a different inputsize than the original paper
        #so conv2_x's stride is 1
        self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
        self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
        self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
        self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """make resnet layers(by layer i didnt mean this 'layer' was the
        same as a neuron netowork layer, ex. conv layer), one layer may
        contain more than one residual block

        Args:
            block: block type, basic block or bottle neck block
            out_channels: output depth channel number of this layer
            num_blocks: how many blocks per layer
            stride: the stride of the first block of this layer

        Return:
            return a resnet layer
        """

        # we have num_block blocks per layer, the first block
        # could be 1 or 2, other blocks would always be 1
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion

        return nn.Sequential(*layers)

    def forward(self, x):
        output = self.conv1(x)
        output = self.conv2_x(output)
        output = self.conv3_x(output)
        output = self.conv4_x(output)
        output = self.conv5_x(output)
        output = self.avg_pool(output)
        output = output.view(output.size(0), -1)
        output = self.fc(output)

        return output

In [7]:
def resnet18():
    """ return a ResNet 18 object
    """
    return ResNet(BasicBlock, [2, 2, 2, 2])

In [5]:
def train(epoch):

    start = time.time()
    net.train()
    for batch_index, (images, labels) in enumerate(cifar100_training_loader):

        if settings.GPU:
            labels = labels.cuda()
            images = images.cuda()

        optimizer.zero_grad()
        outputs = net(images)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        n_iter = (epoch - 1) * len(cifar100_training_loader) + batch_index + 1

        last_layer = list(net.children())[-1]
        for name, para in last_layer.named_parameters():
            if 'weight' in name:
                writer.add_scalar('LastLayerGradients/grad_norm2_weights', para.grad.norm(), n_iter)
            if 'bias' in name:
                writer.add_scalar('LastLayerGradients/grad_norm2_bias', para.grad.norm(), n_iter)

        print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
            loss.item(),
            optimizer.param_groups[0]['lr'],
            epoch=epoch,
            trained_samples=batch_index * settings.BATCH_SIZE + len(images),
            total_samples=len(cifar100_training_loader.dataset)
        ))

        #update training loss for each iteration
        writer.add_scalar('Train/loss', loss.item(), n_iter)

        if epoch <= settings.BATCH_SIZE:
            warmup_scheduler.step()

    for name, param in net.named_parameters():
        layer, attr = os.path.splitext(name)
        attr = attr[1:]
        writer.add_histogram("{}/{}".format(layer, attr), param, epoch)

    finish = time.time()

    print('epoch {} training time consumed: {:.2f}s'.format(epoch, finish - start))


In [6]:
@torch.no_grad()
def eval_training(epoch=0, tb=True):

    start = time.time()
    net.eval()

    test_loss = 0.0 # cost function error
    correct = 0.0

    for (images, labels) in cifar100_test_loader:

        if settings.GPU:
            images = images.cuda()
            labels = labels.cuda()

        outputs = net(images)
        loss = loss_function(outputs, labels)

        test_loss += loss.item()
        _, preds = outputs.max(1)
        correct += preds.eq(labels).sum()

    finish = time.time()
    if settings.GPU:
        print('GPU INFO.....')
        print(torch.cuda.memory_summary(), end='')
    print('Evaluating Network.....')
    print('Test set: Epoch: {}, Average loss: {:.4f}, Accuracy: {:.4f}, Time consumed:{:.2f}s'.format(
        epoch,
        test_loss / len(cifar100_test_loader.dataset),
        correct.float() / len(cifar100_test_loader.dataset),
        finish - start
    ))
    print()

    #add informations to tensorboard
    if tb:
        writer.add_scalar('Test/Average loss', test_loss / len(cifar100_test_loader.dataset), epoch)
        writer.add_scalar('Test/Accuracy', correct.float() / len(cifar100_test_loader.dataset), epoch)

    return correct.float() / len(cifar100_test_loader.dataset)

In [7]:
def get_training_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
    """ return training dataloader
    Args:
        mean: mean of cifar100 training dataset
        std: std of cifar100 training dataset
        path: path to cifar100 training python dataset
        batch_size: dataloader batchsize
        num_workers: dataloader num_works
        shuffle: whether to shuffle
    Returns: train_data_loader:torch dataloader object
    """

    transform_train = transforms.Compose([
        #transforms.ToPILImage(),
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    #cifar100_training = CIFAR100Train(path, transform=transform_train)
    cifar100_training = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform_train)
    cifar100_training_loader = DataLoader(
        cifar100_training, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)

    return cifar100_training_loader

In [8]:
def get_test_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
    """ return training dataloader
    Args:
        mean: mean of cifar100 test dataset
        std: std of cifar100 test dataset
        path: path to cifar100 test python dataset
        batch_size: dataloader batchsize
        num_workers: dataloader num_works
        shuffle: whether to shuffle
    Returns: cifar100_test_loader:torch dataloader object
    """

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    #cifar100_test = CIFAR100Test(path, transform=transform_test)
    cifar100_test = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_test)
    cifar100_test_loader = DataLoader(
        cifar100_test, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)

    return cifar100_test_loader


In [9]:
'''
训练阶段
'''



'\n训练阶段\n'

In [10]:
net=resnet18()
if settings.GPU:
    net=net.cuda()
#data preprocessing:
cifar100_training_loader = get_training_dataloader(
    settings.CIFAR100_TRAIN_MEAN,
    settings.CIFAR100_TRAIN_STD,
    num_workers=4,
    batch_size=settings.BATCH_SIZE,
    shuffle=True
)

cifar100_test_loader = get_test_dataloader(
    settings.CIFAR100_TRAIN_MEAN,
    settings.CIFAR100_TRAIN_STD,
    num_workers=4,
    batch_size=settings.BATCH_SIZE,
    shuffle=True
)


loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=settings.LR, momentum=0.9, weight_decay=5e-4)
train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2) #learning rate decay
iter_per_epoch = len(cifar100_training_loader)
warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * settings.WARMUP)



Files already downloaded and verified
Files already downloaded and verified


In [11]:
if settings.RESUME:
    recent_folder = most_recent_folder(os.path.join(settings.CHECKPOINT_PATH, settings.NET), fmt=settings.DATE_FORMAT)
    if not recent_folder:
        raise Exception('no recent folder were found')
    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, settings.NET, recent_folder)
else:
    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, settings.NET, settings.TIME_NOW)

#use tensorboard
if not os.path.exists(settings.LOG_DIR):
    os.mkdir(settings.LOG_DIR)

#since tensorboard can't overwrite old values
#so the only way is to create a new tensorboard log
writer = SummaryWriter(log_dir=os.path.join(
        settings.LOG_DIR, settings.NET, settings.TIME_NOW))
input_tensor = torch.Tensor(1, 3, 32, 32)
if settings.GPU:
    input_tensor = input_tensor.cuda()
writer.add_graph(net, input_tensor)

#create checkpoint folder to save model
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

best_acc = 0.0
if settings.RESUME:
    best_weights = best_acc_weights(os.path.join(settings.CHECKPOINT_PATH, settings.NET, recent_folder))
    if best_weights:
        weights_path = os.path.join(settings.CHECKPOINT_PATH, settings.NET, recent_folder, best_weights)
        print('found best acc weights file:{}'.format(weights_path))
        print('load best training file to test acc...')
        net.load_state_dict(torch.load(weights_path))
        best_acc = eval_training(tb=False)
        print('best acc is {:0.2f}'.format(best_acc))

    recent_weights_file = most_recent_weights(os.path.join(settings.CHECKPOINT_PATH, settings.NET, recent_folder))
    if not recent_weights_file:
        raise Exception('no recent weights file were found')
    weights_path = os.path.join(settings.CHECKPOINT_PATH, settings.NET, recent_folder, recent_weights_file)
    print('loading weights file {} to resume training.....'.format(weights_path))
    net.load_state_dict(torch.load(weights_path))

    resume_epoch = last_epoch(os.path.join(settings.CHECKPOINT_PATH, settings.NET, recent_folder))

NameError: name 'resume_epoch' is not defined

In [None]:
for epoch in range(1, settings.EPOCH + 1):
    if epoch > settings.WARMUP:
        train_scheduler.step(epoch)
    if settings.RESUME:
        if epoch <= resume_epoch:
            continue
    train(epoch)
    acc = eval_training(epoch)
    #start to save best performance model after learning rate decay to 0.01
    if epoch > settings.MILESTONES[1] and best_acc < acc:
        weights_path = checkpoint_path.format(net=settings.NET, epoch=epoch, type='best')
        print('saving weights file to {}'.format(weights_path))
        torch.save(net.state_dict(), weights_path)
        best_acc = acc
        continue

    if not epoch % settings.SAVE_EPOCH:
        weights_path = checkpoint_path.format(net=settings.NET, epoch=epoch, type='regular')
        print('saving weights file to {}'.format(weights_path))
        torch.save(net.state_dict(), weights_path)

writer.close()

Training Epoch: 1 [128/50000]	Loss: 4.7416	LR: 0.000000
Training Epoch: 1 [256/50000]	Loss: 4.7507	LR: 0.000256
Training Epoch: 1 [384/50000]	Loss: 4.7553	LR: 0.000512
Training Epoch: 1 [512/50000]	Loss: 4.7575	LR: 0.000767
Training Epoch: 1 [640/50000]	Loss: 4.7439	LR: 0.001023
Training Epoch: 1 [768/50000]	Loss: 4.7048	LR: 0.001279
Training Epoch: 1 [896/50000]	Loss: 4.7390	LR: 0.001535
Training Epoch: 1 [1024/50000]	Loss: 4.7267	LR: 0.001790
Training Epoch: 1 [1152/50000]	Loss: 4.6013	LR: 0.002046
Training Epoch: 1 [1280/50000]	Loss: 4.6719	LR: 0.002302
Training Epoch: 1 [1408/50000]	Loss: 4.6062	LR: 0.002558
Training Epoch: 1 [1536/50000]	Loss: 4.6495	LR: 0.002813
Training Epoch: 1 [1664/50000]	Loss: 4.5117	LR: 0.003069
Training Epoch: 1 [1792/50000]	Loss: 4.6379	LR: 0.003325
Training Epoch: 1 [1920/50000]	Loss: 4.5186	LR: 0.003581
Training Epoch: 1 [2048/50000]	Loss: 4.5133	LR: 0.003836
Training Epoch: 1 [2176/50000]	Loss: 4.5414	LR: 0.004092
Training Epoch: 1 [2304/50000]	Loss: 4



In [None]:
'''
测试阶段
'''
net = resnet18()

cifar100_test_loader = get_test_dataloader(
    settings.CIFAR100_TRAIN_MEAN,
    settings.CIFAR100_TRAIN_STD,
    #settings.CIFAR100_PATH,
    num_workers=4,
    batch_size=settings.BATCH_SIZE,
)

net.load_state_dict(torch.load('checkpoint\resnet18\Tuesday_26_July_2022_06h_54m_48s\resnet18-121-best.pth'))
print(net)
net.eval()

correct_1 = 0.0
correct_5 = 0.0
total = 0

with torch.no_grad():
    for n_iter, (image, label) in enumerate(cifar100_test_loader):
        print("iteration: {}\ttotal {} iterations".format(n_iter + 1, len(cifar100_test_loader)))

        if settings.GPU:
            image = image.cuda()
            label = label.cuda()
            print('GPU INFO.....')
            print(torch.cuda.memory_summary(), end='')


        output = net(image)
        _, pred = output.topk(5, 1, largest=True, sorted=True)

        label = label.view(label.size(0), -1).expand_as(pred)
        correct = pred.eq(label).float()

        #compute top 5
        correct_5 += correct[:, :5].sum()

        #compute top1
        correct_1 += correct[:, :1].sum()

if settings.GPU:
    print('GPU INFO.....')
    print(torch.cuda.memory_summary(), end='')

print()
print("Top 1 err: ", 1 - correct_1 / len(cifar100_test_loader.dataset))
print("Top 5 err: ", 1 - correct_5 / len(cifar100_test_loader.dataset))
print("Parameter numbers: {}".format(sum(p.numel() for p in net.parameters())))



Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data\cifar-100-python.tar.gz


14.5%IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

19.4%IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

20.3%