In [1]:
#imports
import numpy as np
import torch
import torch.nn as nn
import pickle
import torchvision
from typing import Union, List, Dict, Any, cast
from torchvision.transforms import transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import Caltech256, CIFAR10, STL10, Caltech101
from torchvision.models import VGG

from torch.nn.parameter import Parameter
from torch.autograd import Function
import torch.nn.functional as F

In [2]:
# group norm
def make_gn_layers(cfg: List[Union[str, int]], batch_norm: bool = False, norm_layer = None) -> nn.Sequential:
    """Build the VGG feature extractor with a pluggable normalisation layer.

    Args:
        cfg: Layer specification. ``'M'`` inserts a 2x2 max-pool; an integer
            inserts a 3x3 convolution with that many output channels.
        batch_norm: If True, a normalisation layer is inserted between every
            convolution and its ReLU.
        norm_layer: Higher-order callable used when ``batch_norm`` is True. It
            is called as ``norm_layer(v // 2, v)`` (``v`` = output channels of
            the preceding convolution) and must return a zero-argument factory
            that builds the ``nn.Module``.

    Returns:
        ``nn.Sequential`` holding the layers in order.

    Raises:
        ValueError: If ``batch_norm`` is True, ``norm_layer`` is None and the
            configuration contains at least one convolution.
    """
    layers: List[nn.Module] = []
    in_channels = 3  # the network starts from 3-channel input
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            v = cast(int, v)
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            # v is the output channel
            if batch_norm:
                if norm_layer is None:
                    # The original raised the undefined name `Error`, which
                    # surfaced as a NameError instead of this message.
                    raise ValueError("Please specify a norm layer")
                # norm_layer is a higher-order function: the first call binds
                # (v // 2, v), the second call instantiates the module.
                layers += [conv2d, norm_layer(v//2, v)(), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)

def make_gn_vgg(arch: str, cfg: str, batch_norm: bool, pretrained: bool, progress: bool, norm_layer=None, num_classes = None, **kwargs: Any) -> VGG:
    """Assemble a torchvision ``VGG`` whose features come from ``make_gn_layers``.

    Args:
        arch: Architecture label. Currently unused because no pretrained
            weights can be loaded.
        cfg: Key selecting the layer configuration: ``'A'``, ``'B'``, ``'D'``
            or ``'E'``.
        batch_norm: Forwarded to ``make_gn_layers``.
        pretrained: Must be False; loading weights is not implemented.
        progress: Currently unused (it only mattered for weight downloads).
        norm_layer: Forwarded to ``make_gn_layers``.
        num_classes: Size of the classifier output. When None, the ``VGG``
            constructor's own default is used.
        **kwargs: Extra keyword arguments passed to ``VGG``.

    Returns:
        The constructed ``VGG`` model.

    Raises:
        NotImplementedError: If ``pretrained`` is True.
        KeyError: If ``cfg`` is not a known configuration key.
    """
    if pretrained:
        # Fail before building the model. The original built it first and then
        # raised, leaving unreachable code that used undefined helpers.
        raise NotImplementedError()
    cfgs: Dict[str, List[Union[str, int]]] = {
        'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
        'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
    }
    if num_classes is not None:
        # Only override VGG's default when the caller supplied a value;
        # passing None through would break the classifier's final Linear layer.
        kwargs['num_classes'] = num_classes
    features = make_gn_layers(cfgs[cfg], batch_norm=batch_norm, norm_layer=norm_layer)
    return VGG(features, **kwargs)

def vgg11_gn(pretrained: bool = False, progress: bool = True, norm_layer = None, num_classes = None, **kwargs: Any) -> VGG:
    r"""
    VGG 11-layer model (configuration "A") with a caller-supplied normalisation
    layer (intended for group norm) after every convolution.
    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_.
    Args:
        pretrained (bool): Must be False; pretrained weights are not available
            and requesting them raises ``NotImplementedError``.
        progress (bool): Accepted for signature compatibility; currently unused.
        norm_layer: Higher-order callable ``norm_layer(num_groups, num_channels)``
            returning a zero-argument module factory, e.g. ``get_group_norm_layer``.
            Required; the layer builder raises when it is None.
        num_classes (int): Number of output classes; must be given.
        **kwargs: Forwarded to the ``VGG`` constructor.
    """
    assert num_classes is not None, "give a number of class in accordance to dataset"
    return make_gn_vgg(
        'vgg11_bn', 'A', True, pretrained, progress,
        norm_layer=norm_layer, num_classes=num_classes, **kwargs
    )



In [3]:
# channel batch
def make_bc_layers(cfg: List[Union[str, int]], batch_norm: bool = False, norm_layer = None) -> nn.Sequential:
    """Build VGG features from weight-standardised convs and optional BCNorm.

    ``'M'`` entries in ``cfg`` become 2x2 max-pools; integer entries become a
    3x3 ``WConv2d`` with that many output channels followed by a ReLU. With
    ``batch_norm`` set, a ``BCNorm`` (groups = channels // 2, eps = 1e-08) sits
    between the convolution and the ReLU. ``norm_layer`` is accepted but not
    used by this builder.
    """
    modules: List[nn.Module] = []
    prev_channels = 3
    for entry in cfg:
        if entry == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        out_channels = cast(int, entry)
        block: List[nn.Module] = [WConv2d(prev_channels, out_channels, kernel_size=3, padding=1)]
        if batch_norm:
            # hyperparameters: half as many groups as channels, tiny epsilon
            block.append(BCNorm(out_channels, out_channels // 2, 1e-08))
        block.append(nn.ReLU(inplace=True))
        modules.extend(block)
        prev_channels = out_channels
    return nn.Sequential(*modules)

def make_bc_vgg(arch: str, cfg: str, batch_norm: bool, pretrained: bool, progress: bool, norm_layer=None, num_classes = None, **kwargs: Any) -> VGG:
    """Assemble a torchvision ``VGG`` whose features come from ``make_bc_layers``.

    Args:
        arch: Architecture label. Currently unused because no pretrained
            weights can be loaded.
        cfg: Key selecting the layer configuration: ``'A'``, ``'B'``, ``'D'``
            or ``'E'``.
        batch_norm: Forwarded to ``make_bc_layers``.
        pretrained: Must be False; loading weights is not implemented.
        progress: Currently unused (it only mattered for weight downloads).
        norm_layer: Forwarded to ``make_bc_layers``.
        num_classes: Size of the classifier output. When None, the ``VGG``
            constructor's own default is used.
        **kwargs: Extra keyword arguments passed to ``VGG``.

    Returns:
        The constructed ``VGG`` model.

    Raises:
        NotImplementedError: If ``pretrained`` is True.
        KeyError: If ``cfg`` is not a known configuration key.
    """
    if pretrained:
        # Fail before building the model. The original built it first and then
        # raised, leaving unreachable code that used undefined helpers.
        raise NotImplementedError()
    cfgs: Dict[str, List[Union[str, int]]] = {
        'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
        'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
    }
    if num_classes is not None:
        # Only override VGG's default when the caller supplied a value;
        # passing None through would break the classifier's final Linear layer.
        kwargs['num_classes'] = num_classes
    features = make_bc_layers(cfgs[cfg], batch_norm=batch_norm, norm_layer=norm_layer)
    return VGG(features, **kwargs)

def vgg11_bcwn(pretrained: bool = False, progress: bool = True, norm_layer = None, num_classes = None, **kwargs: Any) -> VGG:
    r"""
    VGG 11-layer model (configuration "A") built from weight-standardised
    convolutions (``WConv2d``) with ``BCNorm`` after every convolution.
    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_.
    Args:
        pretrained (bool): Must be False; pretrained weights are not available
            and requesting them raises ``NotImplementedError``.
        progress (bool): Accepted for signature compatibility; currently unused.
        norm_layer: Accepted for signature parity with the other factories; the
            layer builder does not use it.
        num_classes (int): Number of output classes; must be given.
        **kwargs: Forwarded to the ``VGG`` constructor.
    """
    assert num_classes is not None, "give a number of class in accordance to dataset"
    return make_bc_vgg(
        'vgg11_bn', 'A', True, pretrained, progress,
        norm_layer=norm_layer, num_classes=num_classes, **kwargs
    )



In [4]:
# batch norm
def make_bn_layers(cfg: List[Union[str, int]], batch_norm: bool = False, norm_layer = None) -> nn.Sequential:
    """Build the VGG feature extractor with optional ``nn.BatchNorm2d`` layers.

    Args:
        cfg: Layer specification. ``'M'`` inserts a 2x2 max-pool; an integer
            inserts a 3x3 convolution with that many output channels.
        batch_norm: If True, ``nn.BatchNorm2d`` (default hyperparameters) is
            inserted between every convolution and its ReLU.
        norm_layer: Accepted for signature parity with the other builders; not
            used here.

    Returns:
        ``nn.Sequential`` holding the layers in order.
    """
    layers: List[nn.Module] = []
    in_channels = 3  # the network starts from 3-channel input
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            v = cast(int, v)
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            # v is the output channel
            if batch_norm:
                # The original assigned an `eps` local here that was never
                # passed on. It is removed so the code no longer implies a
                # custom epsilon; BatchNorm2d keeps its default.
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)

def make_bn_vgg(arch: str, cfg: str, batch_norm: bool, pretrained: bool, progress: bool, norm_layer=None, num_classes = None, **kwargs: Any) -> VGG:
    """Assemble a torchvision ``VGG`` whose features come from ``make_bn_layers``.

    Args:
        arch: Architecture label. Currently unused because no pretrained
            weights can be loaded.
        cfg: Key selecting the layer configuration: ``'A'``, ``'B'``, ``'D'``
            or ``'E'``.
        batch_norm: Forwarded to ``make_bn_layers``.
        pretrained: Must be False; loading weights is not implemented.
        progress: Currently unused (it only mattered for weight downloads).
        norm_layer: Forwarded to ``make_bn_layers``.
        num_classes: Size of the classifier output. When None, the ``VGG``
            constructor's own default is used.
        **kwargs: Extra keyword arguments passed to ``VGG``.

    Returns:
        The constructed ``VGG`` model.

    Raises:
        NotImplementedError: If ``pretrained`` is True.
        KeyError: If ``cfg`` is not a known configuration key.
    """
    if pretrained:
        # Fail before building the model. The original built it first and then
        # raised, leaving unreachable code that used undefined helpers.
        raise NotImplementedError()
    cfgs: Dict[str, List[Union[str, int]]] = {
        'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
        'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
    }
    if num_classes is not None:
        # Only override VGG's default when the caller supplied a value;
        # passing None through would break the classifier's final Linear layer.
        kwargs['num_classes'] = num_classes
    features = make_bn_layers(cfgs[cfg], batch_norm=batch_norm, norm_layer=norm_layer)
    return VGG(features, **kwargs)

def vgg11_bn(pretrained: bool = False, progress: bool = True, norm_layer = None, num_classes = None, **kwargs: Any) -> VGG:
    r"""
    VGG 11-layer model (configuration "A") with batch normalization after every
    convolution.
    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_.
    Args:
        pretrained (bool): Must be False; pretrained weights are not available
            and requesting them raises ``NotImplementedError``.
        progress (bool): Accepted for signature compatibility; currently unused.
        norm_layer: Accepted for signature parity with the other factories; the
            layer builder does not use it.
        num_classes (int): Number of output classes; must be given.
        **kwargs: Forwarded to the ``VGG`` constructor.
    """
    assert num_classes is not None, "give a number of class in accordance to dataset"
    return make_bn_vgg(
        'vgg11_bn', 'A', True, pretrained, progress,
        norm_layer=norm_layer, num_classes=num_classes, **kwargs
    )



In [5]:
# no normalization (plain VGG)
def make_layers(cfg: List[Union[str, int]], batch_norm: bool = False, norm_layer = None) -> nn.Sequential:
    """Build the plain (normalisation-free) VGG feature extractor.

    Args:
        cfg: Layer specification. ``'M'`` inserts a 2x2 max-pool; an integer
            inserts a 3x3 convolution with that many output channels followed
            by a ReLU.
        batch_norm: Must be False. This builder deliberately has no
            normalisation variant.
        norm_layer: Accepted for signature parity with the other builders; not
            used here.

    Returns:
        ``nn.Sequential`` holding the layers in order.

    Raises:
        NotImplementedError: If ``batch_norm`` is True and the configuration
            contains at least one convolution.
    """
    layers: List[nn.Module] = []
    in_channels = 3  # the network starts from 3-channel input
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            v = cast(int, v)
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            # v is the output channel
            if batch_norm:
                # `raise NotImplemented` (the constant) is a TypeError at
                # runtime; the exception class is NotImplementedError.
                raise NotImplementedError(
                    "make_layers builds the normalisation-free VGG; batch_norm is not supported"
                )
            layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)

def make_vgg(arch: str, cfg: str, batch_norm: bool, pretrained: bool, progress: bool, norm_layer=None, num_classes = None, **kwargs: Any) -> VGG:
    """Assemble a torchvision ``VGG`` whose features come from ``make_layers``.

    Args:
        arch: Architecture label. Currently unused because no pretrained
            weights can be loaded.
        cfg: Key selecting the layer configuration: ``'A'``, ``'B'``, ``'D'``
            or ``'E'``.
        batch_norm: Forwarded to ``make_layers``.
        pretrained: Must be False; loading weights is not implemented.
        progress: Currently unused (it only mattered for weight downloads).
        norm_layer: Forwarded to ``make_layers``.
        num_classes: Size of the classifier output. When None, the ``VGG``
            constructor's own default is used.
        **kwargs: Extra keyword arguments passed to ``VGG``.

    Returns:
        The constructed ``VGG`` model.

    Raises:
        NotImplementedError: If ``pretrained`` is True.
        KeyError: If ``cfg`` is not a known configuration key.
    """
    if pretrained:
        # Fail before building the model. The original built it first and then
        # raised, leaving unreachable code that used undefined helpers.
        raise NotImplementedError()
    cfgs: Dict[str, List[Union[str, int]]] = {
        'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
        'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
    }
    if num_classes is not None:
        # Only override VGG's default when the caller supplied a value;
        # passing None through would break the classifier's final Linear layer.
        kwargs['num_classes'] = num_classes
    features = make_layers(cfgs[cfg], batch_norm=batch_norm, norm_layer=norm_layer)
    return VGG(features, **kwargs)

def vgg11(pretrained: bool = False, progress: bool = True, norm_layer = None, num_classes = None, **kwargs: Any) -> VGG:
    r"""
    VGG 11-layer model (configuration "A") without any normalization layers.
    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_.
    Args:
        pretrained (bool): Must be False; pretrained weights are not available
            and requesting them raises ``NotImplementedError``.
        progress (bool): Accepted for signature compatibility; currently unused.
        norm_layer: Accepted for signature parity with the other factories; the
            layer builder does not use it.
        num_classes (int): Number of output classes; must be given.
        **kwargs: Forwarded to the ``VGG`` constructor.
    """
    assert num_classes is not None, "give a number of class in accordance to dataset"
    return make_vgg(
        'vgg11', 'A', False, pretrained, progress,
        norm_layer=norm_layer, num_classes=num_classes, **kwargs
    )


In [6]:
# helper functions
def get_group_norm_layer(in_channel, out_channel):
    """Return a zero-argument factory that builds an ``nn.GroupNorm``.

    Despite their names, the two arguments are passed positionally to
    ``nn.GroupNorm``: ``in_channel`` is used as the number of groups and
    ``out_channel`` as the number of channels. The module is only created when
    the returned callable is invoked.
    """
    return lambda: nn.GroupNorm(in_channel, out_channel)

# channel norm + weight
# weight
class WConv2d(nn.Conv2d):
    """2-D convolution that standardises its kernel on every forward pass.

    For each output filter, the kernel has its mean (over input channels and
    both spatial dims) subtracted and is divided by ``sqrt(var + 1e-5)`` of that
    filter's flattened weights before being applied. The stored ``weight``
    parameter itself is left untouched.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True):
        super().__init__(in_channels, out_channels, kernel_size, stride,
                         padding, dilation, groups, bias)

    def forward(self, x):
        kernel = self.weight

        # Per-filter mean, reduced one axis at a time so the result keeps
        # shape (out_channels, 1, 1, 1).
        filter_mean = kernel
        for axis in (1, 2, 3):
            filter_mean = filter_mean.mean(dim=axis, keepdim=True)
        centered = kernel - filter_mean

        # Per-filter standard deviation; the 1e-5 keeps the division stable.
        flat = centered.view(centered.size(0), -1)
        filter_std = torch.pow(flat.var(dim=1) + 1e-5, 0.5).view(-1, 1, 1, 1)
        standardized = centered / filter_std.expand_as(centered)

        return F.conv2d(x, standardized, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)



class BCNorm(nn.Module):
    """Batch normalisation followed by a per-sample, per-group normalisation.

    Args:
        num_channels: Number of channels of the input feature map.
        num_groups: Number of channel groups for the second normalisation;
            must be positive and divide ``num_channels``.
        eps: Epsilon used by the group-wise normalisation.
        estimate: If True, ``EstBN`` replaces ``nn.BatchNorm2d`` as the first
            stage.

    Raises:
        ValueError: If ``num_groups`` is not positive or does not divide
            ``num_channels``.
    """

    def __init__(self, num_channels, num_groups, eps, estimate=False):
        super(BCNorm, self).__init__()
        if num_groups <= 0 or num_channels % num_groups != 0:
            # Without this check the reshapes in forward() can still succeed
            # when C*H*W is divisible by num_groups, silently producing groups
            # that straddle channel boundaries.
            raise ValueError(
                "num_channels ({}) must be divisible by a positive num_groups ({})".format(
                    num_channels, num_groups))
        self.num_channels = num_channels
        self.num_groups = num_groups
        self.eps = eps
        # One learnable scale/shift per group, broadcast over batch and positions.
        self.weight = Parameter(torch.ones(1, num_groups, 1))
        self.bias = Parameter(torch.zeros(1, num_groups, 1))
        if estimate:
            # NOTE(review): EstBN is not defined in the visible part of this
            # notebook; estimate=True raises NameError unless it is provided
            # elsewhere.
            self.bn = EstBN(num_channels)
        else:
            self.bn = nn.BatchNorm2d(num_channels)

    def forward(self, inp):
        """Normalise ``inp`` and return a tensor of the same shape."""
        out = self.bn(inp)
        # Fold (sample, group) pairs into the channel axis so torch.batch_norm
        # computes statistics for each pair separately.
        out = out.view(1, inp.size(0) * self.num_groups, -1)
        # No affine params and no running stats; training=True means the
        # statistics are always taken from the current input, also in eval mode.
        out = torch.batch_norm(out, None, None, None, None, True, 0, self.eps, True)
        out = out.view(inp.size(0), self.num_groups, -1)
        out = self.weight * out + self.bias
        out = out.view_as(inp)
        return out
# batch norm


In [7]:
# dataset loading code
class GreyscaleToRGBTransform(object):
    """Transform that expands a single-channel image tensor to three channels.

    Tensors whose first dimension is 1 are tiled three times along that
    dimension; any other input is returned unchanged (the same object).
    """

    def __call__(self, image):
        if image.shape[0] == 1:
            # Repeat directly; wrapping this in a fresh transforms.Lambda on
            # every call added nothing.
            return image.repeat(3, 1, 1)
        return image
    
def get_caltech_dataset(batch_size):
    """Load Caltech256 from ``../data`` and split it into train/val loaders.

    The dataset must already be on disk (``download=False``). The split sizes
    are hard-coded to 24486 / 6122, so this only works with Caltech256.

    Args:
        batch_size: Batch size used for both loaders.

    Returns:
        ``(train_loader, val_loader)`` as ``torch.utils.data.DataLoader`` objects.
    """
    # One transform serves both splits: random_split returns views onto the
    # same underlying dataset, so a separate validation transform would never
    # be applied (the original built one and left it unused).
    transform = transforms.Compose(
        [transforms.Resize((224, 224)),
         transforms.ToTensor(),
         # expand single-channel tensors so the 3-channel Normalize applies
         GreyscaleToRGBTransform(),
         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])])
    # download link is broken
    dataset = Caltech256(root="../data", download=False, transform=transform)
    print(dataset)
    train_subset, val_subset = torch.utils.data.random_split(dataset, [24486, 6122])
    train_loader = torch.utils.data.DataLoader(train_subset, batch_size=batch_size)
    # Wrap the validation subset. The original wrapped the already-built
    # training DataLoader, so the "validation" loader yielded batches of
    # training batches and the held-out data was never used.
    val_loader = torch.utils.data.DataLoader(val_subset, batch_size=batch_size)
    return train_loader, val_loader

def get_stl_dataset(batch_size):
    """Load STL10 from ``../data`` and split it into train/val loaders.

    The dataset must already be on disk (``download=False``). The split sizes
    are hard-coded to 4000 / 1000.

    Args:
        batch_size: Batch size used for both loaders.

    Returns:
        ``(train_loader, val_loader)`` as ``torch.utils.data.DataLoader`` objects.
    """
    # One transform serves both splits: random_split returns views onto the
    # same underlying dataset, so a separate validation transform would never
    # be applied (the original built one and left it unused).
    transform = transforms.Compose(
        [transforms.Resize((224, 224)),
         transforms.ToTensor(),
         # expand single-channel tensors so the 3-channel Normalize applies
         GreyscaleToRGBTransform(),
         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])])
    dataset = STL10(root="../data", download=False, transform=transform)
    print(dataset)
    train_subset, val_subset = torch.utils.data.random_split(dataset, [4000, 1000])
    train_loader = torch.utils.data.DataLoader(train_subset, batch_size=batch_size)
    # Wrap the validation subset. The original wrapped the already-built
    # training DataLoader, so the "validation" loader yielded batches of
    # training batches and the held-out data was never used.
    val_loader = torch.utils.data.DataLoader(val_subset, batch_size=batch_size)
    return train_loader, val_loader

def get_cifar_dataset(batch_size):
    """Load CIFAR10 (downloading to ``../data`` if needed) and split it.

    The split sizes are hard-coded to 40000 / 10000.

    Args:
        batch_size: Batch size used for both loaders.

    Returns:
        ``(train_loader, val_loader)`` as ``torch.utils.data.DataLoader`` objects.
    """
    # One transform serves both splits: random_split returns views onto the
    # same underlying dataset, so a separate validation transform would never
    # be applied (the original built one and left it unused).
    transform = transforms.Compose(
        [transforms.Resize((224, 224)),
         transforms.ToTensor(),
         # expand single-channel tensors so the 3-channel Normalize applies
         GreyscaleToRGBTransform(),
         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])])
    dataset = CIFAR10(root="../data", download=True, transform=transform)
    print(dataset)
    train_subset, val_subset = torch.utils.data.random_split(dataset, [40000, 10000])
    train_loader = torch.utils.data.DataLoader(train_subset, batch_size=batch_size)
    # Wrap the validation subset. The original wrapped the already-built
    # training DataLoader, so the "validation" loader yielded batches of
    # training batches and the held-out data was never used.
    val_loader = torch.utils.data.DataLoader(val_subset, batch_size=batch_size)
    return train_loader, val_loader


In [8]:
def train(args, model, device, train_loader, optimizer, clip_grad, epoch, save_name):
    """Train ``model`` with cross-entropy loss and checkpoint after every epoch.

    Args:
        args: Dict providing ``"log_interval"`` (log every N batches) and
            ``"dry_run"`` (if truthy, stop the current epoch right after the
            first logged batch).
        model: The ``nn.Module`` to train; it is moved to ``device``.
        device: Device (string or ``torch.device``) for the model and batches.
        train_loader: Iterable of ``(data, target)`` batches. It must expose
            ``__len__`` and a ``dataset`` attribute for the progress message.
        optimizer: Optimizer already bound to ``model``'s parameters.
        clip_grad: If truthy, clip the global gradient L2 norm to 2.0 before
            each optimizer step.
        epoch: Number of epochs to run.
        save_name: Checkpoint path prefix; epoch ``e`` is written to
            ``"{save_name}_{e}.p"``.

    Returns:
        The trained model. The original returned None even though the caller
        stores the result.
    """
    model = model.train()
    model = model.to(device)
    # Stateless, so build it once instead of once per batch.
    criterion = torch.nn.CrossEntropyLoss()
    train_losses = []
    for e in range(epoch):
        for batch_idx, (data, target) in enumerate(train_loader):
            target = torch.as_tensor(target) # caltech256 target is int
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            if clip_grad:
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
            optimizer.step()
            train_losses.append(loss.item())
            if batch_idx % args["log_interval"] == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    e, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
                if args["dry_run"]:
                    break
        # cache model; the loss history is cumulative across epochs
        path = "{}_{}.p".format(save_name, e)
        torch.save({
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            "loss": train_losses
            }, path)
        print("saved model state and loss to {}".format(path))
    return model

In [9]:
def run_tests(args):
    """Train every VGG11 variant on every dataset and checkpoint each run.

    Args:
        args: Experiment configuration dict. Reads ``"batch_size"``,
            ``"device"``, ``"lr"`` and ``"epoch"`` here; the whole dict is also
            passed on to ``train``.
    """
    batch_size = args["batch_size"]
    device = torch.device(args["device"])

    # Load all datasets up front so a missing dataset fails before any
    # training time is spent.
    datasets = [get_cifar_dataset(batch_size), get_stl_dataset(batch_size), get_caltech_dataset(batch_size)]
    dataset_names = ["CIFAR10", "STL10", "CALTECH256"]
    class_counts = [10, 10, 257]

    # Factories instead of instances: only one VGG is alive at a time rather
    # than all four being allocated before the first one trains.
    model_factories = [
        ("vgg11_bcwn", lambda n: vgg11_bcwn(num_classes=n)),
        ("vgg11_gn", lambda n: vgg11_gn(norm_layer=get_group_norm_layer, num_classes=n)),
        ("vgg_bn", lambda n: vgg11_bn(num_classes=n)),
        ("vgg_nn", lambda n: vgg11(num_classes=n)),
    ]

    for num_classes, dn, (train_set, _val_set) in zip(class_counts, dataset_names, datasets):
        for mn, build_model in model_factories:
            model = build_model(num_classes)
            # only the normalisation-free network gets gradient clipping
            clip_grad = mn == "vgg_nn"
            optimizer = torch.optim.Adam(model.parameters(), lr=args["lr"])
            name = "{}_{}".format(mn, dn)
            train(args, model, device, train_set, optimizer, clip_grad, args["epoch"], name)
    # The original called exit(0) inside the dataset loop, which stopped after
    # the first dataset (and exits the interpreter). It is removed so every
    # dataset is processed.

In [10]:
def main():
    """Assemble the hard-coded experiment settings and start the sweep."""
    # in theory load from cmd, but ... jupyter
    args = {
        "device": "cuda:3",
        "lr": 1e-4,          # learning rate
        "epoch": 1,
        "batch_size": 32,
        "log_interval": 10,
        "dry_run": True,
    }
    run_tests(args)

In [None]:
# Run the experiment sweep with the settings hard-coded in main().
main()

Files already downloaded and verified
Dataset CIFAR10
    Number of datapoints: 50000
    Root location: ../data
    Split: Train
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
               ToTensor()
               <__main__.GreyscaleToRGBTransform object at 0x7f91002e79d0>
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )
Dataset STL10
    Number of datapoints: 5000
    Root location: ../data
    Split: train
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
               ToTensor()
               <__main__.GreyscaleToRGBTransform object at 0x7f907e265730>
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )
Dataset Caltech256
    Number of datapoints: 30608
    Root location: ../data/caltech256
    StandardTransform
Transform:

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


saved model state and loss to vgg11_bcwn_CIFAR10_0.p
saved model state and loss to vgg11_gn_CIFAR10_0.p
saved model state and loss to vgg_bn_CIFAR10_0.p
