In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn.functional as F
import numpy as np

from ssgd import StreamingSGD

In [3]:
# Print tensors with 7 decimal places so small numerical differences stay visible.
torch.set_printoptions(precision=7)

In [4]:
# Seed PyTorch's random number generator so the run is reproducible.
torch.manual_seed(10)

<torch._C.Generator at 0x7f51eee9d3f0>

# Model definition

A StreamingSGD-compatible model needs to be able to "detach" layers as well as gather inputs/outputs and gradients. It also needs a list of layers. See below for an implementation example. In the future we want to implement this using PyTorch hooks and modules.

In [5]:
class ExampleNet(torch.nn.Module):
    """Small all-convolutional network used to compare streaming and normal SGD.

    The network consists of an input convolution, six blocks of three 3x3
    convolutions followed by a 2x2 max-pool, and a final 8x8 convolution with
    a single output channel. Every layer is registered as a named child module
    and is also stored, in execution order, in ``self.layers``.
    """

    def __init__(self):
        super(ExampleNet, self).__init__()

        self.input_layer = torch.nn.Conv2d(3, 3, kernel_size=3, padding=1)
        self.layers = [self.input_layer]

        for i in range(6):  # use 9 for 8194 x 8194 images
            self.add_block(i)

        final_conv5 = torch.nn.Conv2d(3, 1, kernel_size=8)
        self.add_module("final", final_conv5)

        self.layers.extend([final_conv5])

    def add_block(self, i):
        """Append block ``i``: three 3x3 convolutions followed by a 2x2 max-pool.

        The layers are registered under the names ``conv1-<i>``, ``conv2-<i>``,
        ``conv3-<i>`` and ``maxpool-<i>`` and appended to ``self.layers``.
        """
        conv1 = torch.nn.Conv2d(3, 3, kernel_size=3, padding=1)
        conv2 = torch.nn.Conv2d(3, 3, kernel_size=3, padding=1)
        conv3 = torch.nn.Conv2d(3, 3, kernel_size=3, padding=1)
        maxpool = torch.nn.MaxPool2d(2, stride=2)

        self.add_module("conv1-" + str(i), conv1)
        self.add_module("conv2-" + str(i), conv2)
        self.add_module("conv3-" + str(i), conv3)
        self.add_module("maxpool-" + str(i), maxpool)

        self.layers.extend([conv1, conv2, conv3, maxpool])

    def forward(self, x, stop_at_layer=None, start_at_layer=None):
        """Run (part of) the network on ``x``.

        Args:
            x: input tensor for the first layer that is executed.
            stop_at_layer: name of the last layer to execute (inclusive).
                ``None`` or an unknown name runs until the end of the network.
            start_at_layer: name of the layer *after which* execution resumes,
                i.e. ``x`` is expected to be the output of that layer.
                ``None`` or an unknown name starts at the first layer.

        Returns:
            The activation after the last executed layer. Every layer is
            followed by a ReLU, except the final layer of the network, whose
            output is reshaped to ``(-1, 1)`` and passed through a sigmoid.
        """
        # named_children() yields the layers in registration order, which is
        # the same order in which they were appended to self.layers.
        names = [name for name, _ in self.named_children()]

        first = names.index(start_at_layer) + 1 if start_at_layer in names else 0
        last = names.index(stop_at_layer) + 1 if stop_at_layer in names else len(self.layers)

        # Use absolute layer indices throughout, so that combining
        # start_at_layer with stop_at_layer stops at the requested layer.
        final_index = len(self.layers) - 1
        for index in range(first, last):
            layer = self.layers[index]
            if index == final_index:
                x = layer(x)
                x = x.view(-1, 1)
                x = torch.sigmoid(x)
            else:
                x = F.relu(layer(x))

        return x

model = ExampleNet()
model = model.double()

Weight initialization: we scale the default convolution weights by 2.5 and set the biases to zero. This generates large gradients, which makes it easier to test whether the final gradients are correct.

In [6]:
# Scale every convolution's weights by 2.5 and zero its bias (in place).
conv_layers = [module for module in model.modules()
               if isinstance(module, torch.nn.Conv2d)]
for conv in conv_layers:
    conv.weight.data.mul_(2.5)
    conv.bias.data.zero_()

In [7]:
# Show the registered layers of the network and their names.
print(model)

ExampleNet(
  (input_layer): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv1-0): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2-0): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3-0): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (maxpool-0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv1-1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2-1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3-1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (maxpool-1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv1-2): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2-2): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3-2): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (maxpool-2): MaxPool2d(kernel_si

In [8]:
stop_layer = 'conv1-3'  # layer name passed as stream_to_layer; use 'conv3-4' for 8194x8194 images
img_size = 512  # height and width of the square input image; try 8194, see last segment for details

cuda = False  # set to True to execute this notebook on the GPU
verbose = True  # enable / disable logging
divide_in = 8  # passed to StreamingSGD as divide_in; tip: use 25 for 8194x8194 when memory constraint

In [9]:
# Build the streaming wrapper around the model with the settings chosen above.
sCNN = StreamingSGD(
    model,
    stream_to_layer=stop_layer,
    input_shape=(1, 3, img_size, img_size),
    divide_in=divide_in,
    cuda=cuda,
    verbose=verbose,
)

Feature map to be reconstructed shape: (64.0, 64.0)
Feature map divided in tile sizes: (16.0, 16.0)
back (IOShape batch:0.0 channels:0.0 height:244.0 width:244.0) (IOShape batch:1.0 channels:3.0 height:30.0 width:30.0)
new (IOShape batch:0.0 channels:0.0 height:248.0 width:248.0) (IOShape batch:1.0 channels:3.0 height:31.0 width:31.0)
Tile size forward: (128, 128)
Tile size backward (for forward pass): (248, 248)
*** Approximate memory reduction of streaming: 76.5% ***


# Configure streaming SGD

In [10]:
# Move the model parameters to the GPU when CUDA execution was requested.
if cuda:
    model.cuda()

# Generate random image and fake label

In the current implementation the whole image needs to be able to fit in memory (RAM).

In [11]:
# Random normal image (3 x img_size x img_size) and an all-zero label, both
# sampled/created as float32 and then converted to float64.
image = torch.FloatTensor(3, img_size, img_size).normal_(0, 1).double()
target = torch.FloatTensor(1, 1).fill_(0).double()

# Only the label is moved to the GPU here; the image stays in host memory.
target = target.cuda() if cuda else target

In [12]:
# Wrap the image for autograd and make sure it is float64.
image_var = torch.autograd.Variable(image).double()

In [13]:
# Binary cross-entropy loss, applied to the sigmoid output of the network.
criterion = torch.nn.BCELoss()

In [14]:
# Streaming forward pass; returns the network output and the feature map
# that is handed to sCNN.backward() further down.
output, feature_map = sCNN.forward(image_var)

 23%|██▎       | 15/64 [00:00<00:00, 144.44it/s]

Doing forward pass...


100%|██████████| 64/64 [00:00<00:00, 210.17it/s]


In [15]:
# Display the output of the streaming forward pass.
output

tensor([[0.5487426]], dtype=torch.float64)

In [16]:
# Same output, displayed as a NumPy array.
output.data.numpy()

array([[ 0.54874258]])

In [17]:
# Loss of the streamed output against the fake label.
loss = criterion(output, torch.autograd.Variable(target))
loss

tensor(0.7957173, dtype=torch.float64)

In [18]:
# Streaming backward pass. fill_gradients=True is passed so the reconstructed
# gradients can be inspected afterwards (see "Things to try" at the end).
full_gradients = sCNN.backward(image_var, feature_map, loss, fill_gradients=True)

  3%|▎         | 1/36 [00:00<00:04,  8.39it/s]

Doing backward pass...


100%|██████████| 36/36 [00:03<00:00, 11.76it/s]

Everything reconstructed:
 True





"Everything reconstructed: True" means that all gradients were reconstructed successfully!

Save the gradients of the conv2d layers to compare with normal SGD:

In [19]:
# Snapshot (clone) the weight gradient of every convolution that received one
# during the streaming pass, so later gradient updates cannot alter the copy.
streaming_conv_gradients = [
    conv.weight.grad.clone()
    for conv in model.layers
    if isinstance(conv, torch.nn.Conv2d) and conv.weight.grad is not None
]

# Compare to normal SGD

Reset the gradients and perform a normal forward and backward pass.

In [20]:
# Zero the accumulated gradients of every convolution so that the normal
# backward pass below starts from a clean slate.
for layer in model.layers:
    if not isinstance(layer, torch.nn.Conv2d):
        continue
    for param in (layer.weight, layer.bias):
        # Check each parameter on its own: the bias may be absent (bias=False)
        # and either gradient may still be unset.
        if param is not None and param.grad is not None:
            param.grad.data.zero_()

In [21]:
# Normal (non-streaming) forward pass on the whole image at once.
full_input = image_var[None]  # add the batch dimension
if cuda:
    # The model was moved to the GPU earlier, so its input must live there too.
    full_input = full_input.cuda()
output_full = model(full_input)
output_full

tensor([[0.5487426]], dtype=torch.float64)

This output should be the same as the streaming SGD output. If so, the loss will also be the same:

In [22]:
# Loss of the normal forward pass against the same label.
loss = criterion(output_full, target)
loss

tensor(0.7957173, dtype=torch.float64)

Here we do a normal PyTorch backward pass:

In [23]:
# Standard autograd backward pass through the whole network.
loss.backward()

# Compare the gradients of the conv2d layers

In [24]:
# Gather the weight gradient of every convolution after the normal backward
# pass (references to the live gradient tensors, not copies).
normal_conv_gradients = [
    conv.weight.grad
    for conv in model.layers
    if isinstance(conv, torch.nn.Conv2d) and conv.weight.grad is not None
]

In [25]:
# Report the mean absolute streaming gradient per conv layer, first layer first.
for number, gradient in enumerate(streaming_conv_gradients, start=1):
    mean_magnitude = gradient.data.abs().mean()
    print("Conv layer", number,
          "\taverage gradient size:", float(mean_magnitude))

Conv layer 1 	average gradient size: 0.5637252520502388
Conv layer 2 	average gradient size: 0.401020140176803
Conv layer 3 	average gradient size: 0.298340982137735
Conv layer 4 	average gradient size: 0.2839051401053676
Conv layer 5 	average gradient size: 0.5841406850347816
Conv layer 6 	average gradient size: 0.5016770921018061
Conv layer 7 	average gradient size: 0.1350360971577124
Conv layer 8 	average gradient size: 0.12467806661259845
Conv layer 9 	average gradient size: 0.13593417220524862
Conv layer 10 	average gradient size: 0.12390978649517517
Conv layer 11 	average gradient size: 0.1342508475250845
Conv layer 12 	average gradient size: 0.05376380760589953
Conv layer 13 	average gradient size: 0.036953496522598585
Conv layer 14 	average gradient size: 0.03073011884630139
Conv layer 15 	average gradient size: 0.03167988371318072
Conv layer 16 	average gradient size: 0.06534912477767163
Conv layer 17 	average gradient size: 0.035020591714705895
Conv layer 18 	average gradient

In [26]:
# Report, per conv layer, the largest absolute difference between the
# streaming gradient and the gradient of the normal backward pass.
layer_count = len(streaming_conv_gradients)
for position in range(layer_count):
    streamed = streaming_conv_gradients[position].data
    # Index the reference list from the end so both lists stay aligned on
    # their last entries.
    reference = normal_conv_gradients[position - layer_count].data
    max_diff = (streamed - reference).abs().max()
    print("Conv layer", position + 1,
          "\tmax difference between gradients:", float(max_diff))

Conv layer 1 	max difference between gradients: 2.6645352591003757e-15
Conv layer 2 	max difference between gradients: 1.9984014443252818e-15
Conv layer 3 	max difference between gradients: 1.3322676295501878e-15
Conv layer 4 	max difference between gradients: 1.9984014443252818e-15
Conv layer 5 	max difference between gradients: 2.220446049250313e-15
Conv layer 6 	max difference between gradients: 3.1086244689504383e-15
Conv layer 7 	max difference between gradients: 1.5543122344752192e-15
Conv layer 8 	max difference between gradients: 4.996003610813204e-16
Conv layer 9 	max difference between gradients: 2.7755575615628914e-16
Conv layer 10 	max difference between gradients: 1.3877787807814457e-16
Conv layer 11 	max difference between gradients: 2.7755575615628914e-16
Conv layer 12 	max difference between gradients: 0.0
Conv layer 13 	max difference between gradients: 0.0
Conv layer 14 	max difference between gradients: 0.0
Conv layer 15 	max difference between gradients: 0.0
Conv la

As you can see, the gradients of the conv2d layers computed by the two methods are (almost) numerically equivalent. The small differences are caused by loss of significance in the floating-point calculations.

---

# Things to try:

* Use doubles instead of floats to reduce the difference (use model.double() and image_var.double())
* Make the image bigger than would fit on a GPU 
    - e.g. 8194x8194, make sure to add 3 more blocks in the model (see comments)
* If you want you can compare the reconstructed input gradients of each layer: 
    - pass fill_gradients=True to the backward() function
    - compare full_gradients with self.model.gradients after the full model backward pass.
* For testing purposes the number of filters is small in this notebook, try increasing them