In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from tensorboardX import SummaryWriter

import os
import random
import numpy as np

from others.datasets import get_dataset
from models.spectral_normalization_deflate_complex_both_bn import SpectralNorm


# Choose the compute device once; later cells read this module-level name
# when moving models and batches.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('==========', device)



First let's define regular train and test functions:

In [3]:
def train(trainloader, net, epoch, optimizer, scheduler, criterion, writer=None, model_path="./checkpoints/"):
    """Run one training epoch, log its metrics and step the LR scheduler once.

    Args:
        trainloader: iterable yielding ``(inputs, targets)`` batches.
        net: model to optimise; it is switched to training mode.
        epoch: epoch index, printed and used as the step of the logged scalars.
        optimizer: optimiser; gradients are zeroed and a step is taken per batch.
        scheduler: learning-rate scheduler, stepped once at the end of the call.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.
        writer: optional object exposing ``add_scalar(tag, value, step)``.
            When ``None`` (the default) no scalars are logged.
        model_path: unused; kept so existing call sites keep working.

    Returns:
        ``(mean_batch_loss, accuracy_percent)``. Both are ``0.0`` when the
        loader yields no batches.

    Side effects:
        Increments the module-level ``count_setp`` once per batch; that name
        must already exist when the first batch is processed.
    """
    global count_setp
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # Index of the largest logit along dim 1 is the predicted class.
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        count_setp += 1
        num_batches += 1

    # Guard the averages: an empty loader reports zeros instead of raising
    # ZeroDivisionError.
    mean_loss = train_loss / num_batches if num_batches else 0.0
    accuracy = 100. * correct / total if total else 0.0

    # The signature advertises writer=None, so logging has to be optional.
    if writer is not None:
        writer.add_scalar('train/acc', accuracy, epoch)
        writer.add_scalar('train/loss', mean_loss, epoch)
    print('train/acc', accuracy)
    print('train/loss', mean_loss)

    scheduler.step()
    return mean_loss, accuracy


def test(testloader, net, epoch, criterion, optimizer, scheduler, writer=None, model_path="./checkpoints/"):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    batch_idx = -1
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    writer.add_scalar('test/acc', 100.*correct/total, epoch)
    writer.add_scalar('test/loss', test_loss/(batch_idx+1), epoch)
    print(' Test acc', 100.*correct/total)

    return test_loss/(batch_idx+1), 100.*correct/total

Now let's define a simple convolutional model. To control the spectral norm of the convolutional and linear layers, we just wrap them in the SpectralNorm module, which automatically keeps track of the largest singular value and bounds it to an arbitrary value. To only keep track of the largest singular value without changing it, set the clip_flag to False; to store the spectral norm at each iteration and plot it in TensorBoard, set the summary flag to True. 

In [4]:
class ConvModel(nn.Module):
    """Conv -> batch norm -> ReLU -> linear classifier with 10 outputs.

    Every layer is wrapped in ``SpectralNorm``. The convolution and the linear
    layer are created with ``clip_flag=True, clip=1.``; the batch norm wrapper
    uses ``clip_flag=False``. All three pass ``summary=True`` and the given
    ``writer``.

    Args:
        in_chan: number of input channels.
        out_chan: number of convolution output channels.
        kernel_size: convolution kernel size.
        padding: convolution padding.
        width: side length of the square input image.
        writer: summary writer forwarded to each ``SpectralNorm`` wrapper.
    """

    def __init__(self, in_chan=1, out_chan=64, kernel_size=3, padding=1, width=28, writer=None):
        super().__init__()
        # Side length of the feature map produced by the (stride-1) convolution.
        feature_side = width - kernel_size + 2 * padding + 1
        flat_features = feature_side * feature_side * out_chan

        conv = nn.Conv2d(in_chan, out_chan, kernel_size=kernel_size, padding=padding)
        self.conv1 = SpectralNorm(conv, summary=True, writer=writer, clip_flag=True, clip=1.)

        batch_norm = nn.BatchNorm2d(out_chan)
        self.bn1 = SpectralNorm(batch_norm, writer=writer, clip_flag=False, summary=True)

        classifier = nn.Linear(flat_features, 10)
        self.fc1 = SpectralNorm(classifier, writer=writer, clip_flag=True, clip=1., summary=True)

    def forward(self, x):
        """Return the 10 class scores for a batch of images ``x``."""
        features = F.relu(self.bn1(self.conv1(x)))
        # Collapse everything but the batch dimension before the linear layer.
        flat = features.view(features.size(0), -1)
        return self.fc1(flat)

Now let's train the model on MNIST. 

In [5]:
# Number of epochs for the first experiment's training loop.
epochs = 60
best_acc = 0  # best test accuracy
# Running count of processed training batches; train() increments it via `global count_setp`.
count_setp = 0

# Seed every RNG used here (torch CPU, all CUDA devices, NumPy, Python's random) with one value.
seed_val = 1
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

# MNIST splits come from the project helper `others.datasets.get_dataset`.
trainset = get_dataset('mnist', 'train')
testset = get_dataset('mnist', 'test')
# Batches of 128 with one worker each; only the training loader shuffles.
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=128, num_workers=1)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, batch_size=128, num_workers=1)

In [4]:

outdir = 'simpleTest/'
# exist_ok=True replaces the separate existence check and its check-then-create race.
os.makedirs(outdir, exist_ok=True)
print('------------> Output Directory: ', outdir)
writer = SummaryWriter(outdir) ## -> if you want to keep track of the largest singular values of the layers during the training, you need to pass this writer to the model. If not, you can just pass None.

net = ConvModel(writer=writer)
net = net.to(device)

# Checkpoint paths handed to train()/test().
model_path = os.path.join(outdir, 'ckpt.pth')
model_path_test = os.path.join(outdir, 'ckpt_best_test.pth')

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0)
# Multiply the learning rate by 0.1 every 40 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1)

try:
    for epoch in range(epochs):
        tr_loss, tr_acc = train(trainloader, net, epoch, optimizer, scheduler, criterion, writer=writer, model_path=model_path)
        ts_loss, ts_acc = test(testloader, net, epoch, criterion, optimizer, scheduler, writer=writer, model_path=model_path_test)
        net.zero_grad()
finally:
    # Close the writer even when the run is interrupted, so buffered events
    # reach the event file and the handle is released.
    writer.close()


------------> Output Directory:  simpleTest/
Conv2d
!!!!!!! Clipping is active !!!!!!!! clip val:  1.0
BatchNorm2d
Linear
!!!!!!! Clipping is active !!!!!!!! clip val:  1.0

Epoch: 0
train/acc 87.025
train/loss 0.4081151445410145
 Test acc 92.27

Epoch: 1
train/acc 92.24666666666667
train/loss 0.246363132493074
 Test acc 93.01

Epoch: 2
train/acc 93.52333333333333
train/loss 0.20507483705401675
 Test acc 93.26

Epoch: 3
train/acc 93.96666666666667
train/loss 0.18773841408333544
 Test acc 93.47

Epoch: 4
train/acc 94.65833333333333
train/loss 0.1681281467601816
 Test acc 94.66

Epoch: 5
train/acc 94.84833333333333
train/loss 0.16057465913326247
 Test acc 94.91

Epoch: 6
train/acc 95.19
train/loss 0.15159145486094297
 Test acc 94.74

Epoch: 7
train/acc 95.23833333333333
train/loss 0.14695879793179822
 Test acc 94.51

Epoch: 8
train/acc 95.47666666666667
train/loss 0.14061766603131537
 Test acc 94.83

Epoch: 9
train/acc 95.57166666666667
train/loss 0.13725143281826332
 Test acc 92.43

Epo

To visualize the singular values of each layer during training, use tensorboardX:

```
tensorboard --logdir simpleTest/ --port 6008
```

### Composition of conv layer and batch normalization: 

In the previous model, we kept track of the spectral norm of the conv layer, dense layer, and batch normalization layer; however, we did not clip the spectral norm of the batch norm layer by setting the clip_flag to False. If this flag is set to True, the same approach as in Gouk et al. (2021) will be used. As pointed out in our paper, this method is not recommended as it impedes the training of the model and leads to very low accuracy on both training and test samples. Instead, we proposed the application of our method to the composition of the convolutional layer and its succeeding batch norm layer. In order to do this for the previous model, we first make a helper module that represents the composition of the two:

In [6]:

class CNNBN(nn.Module):
    """Convolution followed by an optional batch norm, each wrapped in ``SpectralNorm``.

    The convolution wrapper is created with ``clip_flag=True, clip=1.`` and
    identifier ``'_conv'``; the batch norm wrapper uses ``clip_flag=False`` and
    identifier ``'_bn'``. The batch norm wrapper is always constructed, but it
    is applied in ``forward`` only when ``bn`` is true.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        kernel_size: convolution kernel size.
        padding: convolution padding.
        device: device argument forwarded to both ``SpectralNorm`` wrappers.
        writer: summary writer forwarded to both ``SpectralNorm`` wrappers.
        bn: whether ``forward`` applies the batch norm after the convolution.
    """

    def __init__(self, in_planes, out_planes, kernel_size=3, padding=1, device='cpu', writer=None, bn=True):
        super().__init__()
        conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, padding=padding)
        self.sub_conv1 = SpectralNorm(conv, device=device, clip_flag=True, clip=1.,
                                      writer=writer, summary=True, identifier='_conv')

        batch_norm = nn.BatchNorm2d(out_planes, momentum=0.1, track_running_stats=True)
        self.bn1 = SpectralNorm(batch_norm, device=device, clip_flag=False, clip=1.,
                                writer=writer, summary=True, identifier='_bn')

        self.bn_flag = bn

    def forward(self, x):
        """Apply the convolution, then the batch norm when it is enabled."""
        out = self.sub_conv1(x)
        return self.bn1(out) if self.bn_flag else out

As you can see in the CNNBN module, we wrapped both the convolutional layer and the batch norm layer with the SpectralNorm module so that we can keep track of both of their spectral norms. By wrapping a CNNBN instance with our SpectralNorm module, we can keep track of the spectral norm of the composition of these two layers as well. For controlling the spectral norm of the composition of these layers, we follow the findings of our paper, which recommends clipping the spectral norm of CNNBN and its constituent convolutional layer. This allows the batch normalization layer to have a much larger spectral norm while controlling the spectral norm of the composition. 

In [9]:
class ConvModel_v2(nn.Module):
    """Classifier whose conv + batch norm pair is clipped as one composed operator.

    A ``CNNBN`` block is stored as ``conv1_`` and also wrapped in
    ``SpectralNorm`` (``clip_flag=True, clip=1.``, identifier ``'_concat'``)
    as ``conv1``. A spectrally clipped linear layer with 10 outputs follows.

    Args:
        in_chan: number of input channels.
        out_chan: number of convolution output channels.
        kernel_size: convolution kernel size.
        padding: convolution padding.
        width: side length of the square input image.
        writer: summary writer forwarded to the wrapped layers.
        bn: forwarded to ``CNNBN``.
    """

    def __init__(self, in_chan=1, out_chan=64, kernel_size=3, padding=1, width=28, writer=None, bn=True):
        super().__init__()
        # Side length of the feature map produced by the (stride-1) convolution.
        feature_side = width - kernel_size + 2 * padding + 1
        flat_features = feature_side * feature_side * out_chan

        # Keep the raw composition registered under its own name and wrap it
        # for the forward pass.
        self.conv1_ = CNNBN(in_chan, out_chan, kernel_size=kernel_size, padding=padding,
                            device=device, writer=writer, bn=bn)
        self.conv1 = SpectralNorm(self.conv1_, device=device, clip_flag=True, clip=1.,
                                  writer=writer, summary=True, identifier='_concat')

        classifier = nn.Linear(flat_features, 10)
        self.fc1 = SpectralNorm(classifier, writer=writer, clip_flag=True, clip=1., summary=True)

    def forward(self, x):
        """Return the 10 class scores for a batch of images ``x``."""
        features = F.relu(self.conv1(x))
        # Collapse everything but the batch dimension before the linear layer.
        flat = features.view(features.size(0), -1)
        return self.fc1(flat)

Now we do the same training procedure for the new model:

In [12]:

epochs = 80
outdir = 'simpleTest_v2/'
# exist_ok=True replaces the separate existence check and its check-then-create race.
os.makedirs(outdir, exist_ok=True)
print('------------> Output Directory: ', outdir)
writer = SummaryWriter(outdir) ## -> if you want to keep track of the largest singular values of the layers during the training, you need to pass this writer to the model. If not, you can just pass None.

net = ConvModel_v2(writer=writer)
net = net.to(device)

# Checkpoint paths handed to train()/test().
model_path = os.path.join(outdir, 'ckpt.pth')
model_path_test = os.path.join(outdir, 'ckpt_best_test.pth')

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0)
# Multiply the learning rate by 0.1 every 20 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

try:
    for epoch in range(epochs):
        tr_loss, tr_acc = train(trainloader, net, epoch, optimizer, scheduler, criterion, writer=writer, model_path=model_path)
        ts_loss, ts_acc = test(testloader, net, epoch, criterion, optimizer, scheduler, writer=writer, model_path=model_path_test)
        net.zero_grad()
finally:
    # Close the writer even when the run is interrupted, so buffered events
    # reach the event file and the handle is released.
    writer.close()

------------> Output Directory:  simpleTest_v2/
!!!!!!! Clipping is active !!!!!!!! clip val:  1.0
!!!!!!! Clipping is active !!!!!!!! clip val:  1.0
Linear
!!!!!!! Clipping is active !!!!!!!! clip val:  1.0

Epoch: 0


train/acc 87.025
train/loss 0.4227368251474173
 Test acc 90.75

Epoch: 1
train/acc 86.475
train/loss 0.6154305604475139
 Test acc 85.92

Epoch: 2
train/acc 82.58833333333334
train/loss 0.6999289380716108
 Test acc 83.81

Epoch: 3
train/acc 82.99
train/loss 0.6370966461167407
 Test acc 84.71

Epoch: 4
train/acc 83.45
train/loss 0.6012959308080328
 Test acc 85.74

Epoch: 5
train/acc 83.415
train/loss 0.6110366621632566
 Test acc 78.56

Epoch: 6
train/acc 83.38166666666666
train/loss 0.6101108016108653
 Test acc 85.07

Epoch: 7
train/acc 83.42833333333333
train/loss 0.609099855555146
 Test acc 80.09

Epoch: 8
train/acc 83.50166666666667
train/loss 0.6029768122284651
 Test acc 84.29

Epoch: 9
train/acc 83.66166666666666
train/loss 0.6020307180596821
 Test acc 86.01

Epoch: 10
train/acc 83.61666666666666
train/loss 0.6053915920415158
 Test acc 82.03

Epoch: 11
train/acc 83.77666666666667
train/loss 0.593336756168398
 Test acc 84.03

Epoch: 12
train/acc 83.92
train/loss 0.5906891341148409
 T


To visualize the singular values of each layer during training, use tensorboardX:

```
tensorboard --logdir simpleTest_v2/ --port 6008
```

The application of our method to the concatenation of convolutional and batch norm layer might not be as stable as the application of our method to individual layers and might need additional effort observing the model's behavior and tuning the hyperparameters. 

For the detail of our method, comprehensive results and discussions, please refer to our paper:

### References:

1. Miyato, Takeru, et al. "Spectral normalization for generative adversarial networks." arXiv preprint arXiv:1802.05957 (2018).
2. Gouk, Henry, et al. "Regularisation of neural networks by enforcing lipschitz continuity." Machine Learning 110 (2021): 393-416.
3. Senderovich, Alexandra, et al. "Towards practical control of singular values of convolutional layers." Advances in Neural Information Processing Systems 35 (2022): 10918-10930.
4. Delattre, Blaise, et al. "Efficient Bound of Lipschitz Constant for Convolutional Layers by Gram Iteration." arXiv preprint arXiv:2305.16173 (2023).



For the models (ResNet18 and DLA) and training them on cifar-10 we used codes from this repository: https://github.com/kuangliu/pytorch-cifar/tree/master

For the adversarial attacks and MNIST data, we used the code from this repository: https://github.com/AI-secure/Transferability-Reduced-Smooth-Ensemble/tree/main 