In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn.functional as F
import numpy as np

from ssgd import StreamingSGD

# Model definition

A StreamingSGD-compatible model needs to be able to "detach" layers and to gather the inputs, outputs and gradients of each layer. It also needs to expose a list of its layers. See below for an example implementation. In the future we want to implement this using PyTorch hooks and modules.

In [3]:
class ExampleNet(torch.nn.Module):
    """Small convolutional example network that is compatible with StreamingSGD.

    The network is a flat sequence of layers stored in ``self.layers``:
    one input convolution, six blocks of (conv, conv, conv, max-pool) and a
    final 8x8 convolution that reduces the three channels to one.

    Besides the regular forward pass, the model can run in "detach" mode, in
    which every layer receives a fresh leaf variable as input. The per-layer
    inputs and outputs are then recorded in ``self.input`` / ``self.output`` so
    that :meth:`backward` can propagate gradients one layer at a time.
    """

    def __init__(self):
        super(ExampleNet, self).__init__()

        self.input_layer = torch.nn.Conv2d(3, 3, kernel_size=3, padding=1)
        # Plain Python list holding the layers in execution order. The layers
        # are additionally registered as sub-modules (attribute assignment
        # above, add_module() below), so their parameters are still tracked.
        self.layers = [self.input_layer]

        for i in range(6):  # use 9 for 8194 x 8194 images
            self.add_block(i)

        final_conv5 = torch.nn.Conv2d(3, 1, kernel_size=8)
        self.add_module("final", final_conv5)
        self.layers.append(final_conv5)

    def add_block(self, i):
        """Append one block of three 3x3 convolutions and a 2x2 max-pool.

        Args:
            i: Block index; only used to build unique sub-module names such as
                ``conv1-<i>`` and ``maxpool-<i>``.
        """
        # Create the layers in the same order as they are registered.
        convs = [torch.nn.Conv2d(3, 3, kernel_size=3, padding=1) for _ in range(3)]
        maxpool = torch.nn.MaxPool2d(2, stride=2)

        for number, conv in enumerate(convs, start=1):
            self.add_module("conv" + str(number) + "-" + str(i), conv)
        self.add_module("maxpool-" + str(i), maxpool)

        self.layers.extend(convs + [maxpool])

    def backward(self, gradient):
        """Back-propagate layer by layer through the outputs recorded by a detached forward pass.

        Requires a preceding ``forward(..., detach=True)`` call, which fills
        ``self.input`` and ``self.output``. The gradients fed into each layer
        are stored in ``self.gradients``, ordered from the last recorded layer
        to the first.

        Args:
            gradient: Gradient with respect to the last recorded output.
        """
        self.gradients = []
        last_index = len(self.output) - 1
        for i in reversed(range(len(self.output))):
            if i < last_index:
                # The input of layer i+1 is a detached copy of the output of
                # layer i, so its .grad is the gradient w.r.t. that output.
                gradient = self.input[i + 1].grad
            self.output[i].backward(gradient=gradient, retain_graph=True)
            self.gradients.append(gradient)

    def forward(self, x, stop_index=-1, start_index=0, detach=False):
        """Run ``x`` through (a part of) the network.

        Every layer except the last one is followed by a ReLU; the output of
        the last layer is reshaped to ``(-1, 1)`` and passed through a sigmoid.

        Args:
            x: Input to the layer at ``start_index``.
            stop_index: Stop before executing the layer with this index. The
                index is counted relative to ``start_index``. The default of
                -1 never matches, so all remaining layers are executed.
            start_index: Index in ``self.layers`` of the first layer to run.
            detach: If True, reset ``self.input`` / ``self.output`` and feed
                every layer a new leaf variable built from the previous
                output's data, recording each layer's input and output.

        Returns:
            The output of the last executed layer, or the (possibly detached)
            input of the layer at ``stop_index`` when stopping early.
        """
        if detach:
            self.output = []
            self.input = []

        # Slice once instead of on every iteration.
        active_layers = self.layers[start_index:]
        final_position = len(active_layers) - 1

        for i, layer in enumerate(active_layers):
            if detach:
                # Cut the autograd graph between layers: each layer gets its
                # own leaf variable whose .grad is read in backward().
                x = torch.autograd.Variable(x.data, requires_grad=True)
                self.input.append(x)
            if i == stop_index:
                break
            if i == final_position:
                x = layer(x)
                x = x.view(-1, 1)
                # torch.sigmoid replaces the deprecated F.sigmoid.
                x = torch.sigmoid(x)
            else:
                x = F.relu(layer(x))
            if detach:
                self.output.append(x)
        return x

model = ExampleNet()
# model = model.double()

Weight initialization: we use positive values to generate large gradients, which makes it easier to test whether the final gradients are correct.

In [4]:
# Give every convolution the same positive weight and a zero bias. The loop
# index was never used, so iterate over the modules directly.
for layer in model.modules():
    if isinstance(layer, torch.nn.Conv2d):
        layer.weight.data.fill_(0.04)
        layer.bias.data.zero_()

In [5]:
# Print the registered sub-modules to inspect the architecture.
print(model)

ExampleNet(
  (input_layer): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv1-0): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2-0): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3-0): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (maxpool-0): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False)
  (conv1-1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2-1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3-1): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (maxpool-1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False)
  (conv1-2): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2-2): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3-2): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (maxpool-2): MaxPool2d(k

In [6]:
# Wrap the model in the StreamingSGD helper imported from the `ssgd` module.
sCNN = StreamingSGD(model)

# Configurations

In [7]:
stop_index = 13  # passed to sCNN.configure(); use 21 for 8194x8194 images
img_size = 514  # image height and width; try 8194, see last segment for details

cuda = False  # set to True to execute this notebook on the GPU
verbose = True  # enable / disable logging
divide_in = 2  # tip: use 25 for 8194x8194 when memory constraint

# Configure streaming SGD

In [8]:
# Move the model to the GPU only when requested in the configuration cell.
if cuda:
    model.cuda()

In [9]:
# Use the `divide_in` setting from the configuration cell instead of a
# hard-coded 2, so that changing the setting actually takes effect.
# NOTE(review): the fourth positional argument is assumed to be the number of
# divisions (the logged "tile sizes: (32, 32)" for a 64x64 embedding matches a
# value of 2) - confirm against StreamingSGD.configure.
sCNN.configure(model.layers, stop_index, (img_size, img_size, 3), divide_in, cuda, verbose)

Calculating patch boxes...
[1, 3, 514.0, 514.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 514.0, 514.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 514.0, 514.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 514.0, 514.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 257.0, 257.0] [2.0, 2.0] [0.0, 0.0, 0.0, 0.0] MaxPool2d
[1, 3, 257.0, 257.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 257.0, 257.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 257.0, 257.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 128.0, 128.0] [2.0, 2.0] [0.0, 1.0, 0.0, 1.0] MaxPool2d
[1, 3, 128.0, 128.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 128.0, 128.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 128.0, 128.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 64.0, 64.0] [2.0, 2.0] [0.0, 0.0, 0.0, 0.0] MaxPool2d
Embedding divided in tile sizes: (32, 32) 

[1, 3, 386.0, 386.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 386.0, 386.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 386.0, 386.0] [

In [10]:
# Log the per-layer information for the full list of layers.
# NOTE(review): this calls a private helper of StreamingSGD, and the shape
# tuple has a leading 1 (presumably the batch dimension), unlike the tuple
# passed to configure() - confirm against the ssgd module.
sCNN._getreconstructioninformation(model.layers, (1, img_size, img_size, 3));

[1, 3, 514.0, 514.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 514.0, 514.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 514.0, 514.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 514.0, 514.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 257.0, 257.0] [2.0, 2.0] [0.0, 0.0, 0.0, 0.0] MaxPool2d
[1, 3, 257.0, 257.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 257.0, 257.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 257.0, 257.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 128.0, 128.0] [2.0, 2.0] [0.0, 1.0, 0.0, 1.0] MaxPool2d
[1, 3, 128.0, 128.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 128.0, 128.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 128.0, 128.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 64.0, 64.0] [2.0, 2.0] [0.0, 0.0, 0.0, 0.0] MaxPool2d
[1, 3, 64.0, 64.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 64.0, 64.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 64.0, 64.0] [1.0, 1.0] [1.0, 1.0, 1.0, 1.0] Conv2d
[1, 3, 32.0, 32.0] [2.0, 2.0] [0.0, 0.0

# Generate random image and fake label

In the current implementation the whole image needs to be able to fit in memory (RAM).

In [11]:
# Random image of shape (3, img_size, img_size) filled in place with samples
# from a normal distribution (mean 0, std 1).
# NOTE(review): no random seed is set in the visible cells, so the image (and
# therefore the printed outputs) will differ between runs.
image = torch.FloatTensor(3, img_size, img_size).normal_(0, 1)
# Fake label: a 1x1 tensor containing 0.
target = torch.FloatTensor(1, 1).fill_(0)

# image = image.double()
# target = target.double()

# Only the target is moved to the GPU here; the image stays in main memory.
if cuda:
    target = target.cuda()

In [12]:
# Wrap the image in an autograd Variable; this is what is passed to
# sCNN.forward() and sCNN.backward().
image_var = torch.autograd.Variable(image)
# image_var = image_var.double()

In [13]:
# Binary cross-entropy loss between the model output and the fake label.
criterion = torch.nn.BCELoss()

In [14]:
# Streaming forward pass: returns the network output and a feature map that
# is handed back to sCNN.backward() for the streaming backward pass.
output, feature_map = sCNN.forward(image_var)

  0%|          | 0/4 [00:00<?, ?it/s]

Doing forward pass...


100%|██████████| 4/4 [00:00<00:00, 47.49it/s]


In [15]:
# .cpu() is a no-op for CPU tensors and is required before .numpy() when the
# output lives on the GPU (cuda = True).
output.data.cpu().numpy()

array([[ 0.96422863]], dtype=float32)

In [16]:
# Loss of the streaming forward pass against the fake label.
loss = criterion(output, torch.autograd.Variable(target))
# Display the loss value as the cell output.
loss

Variable containing:
 3.3306
[torch.FloatTensor of size 1]

In [17]:
# Streaming backward pass. It needs the image and the feature map from
# sCNN.forward() in addition to the loss.
# NOTE(review): fill_gradients=False presumably skips reconstructing the full
# per-layer input gradients - confirm against the ssgd module.
full_gradients = sCNN.backward(image_var, feature_map, loss, fill_gradients=False)

  0%|          | 0/4 [00:00<?, ?it/s]

Doing backward pass...


100%|██████████| 4/4 [00:00<00:00,  7.60it/s]


Filled gradient sizes:

   48    64    64
   94   128   131
   89   128   133
   88   128   138
   87   128   142
  177   257   295
  172   257   297
  171   257   301
  170   257   304
  344   514   616
  340   514   617
  339   514   620
  338   514   622
[torch.FloatTensor of size 13x3]
 

Everything filled:
 True





"Everything filled" means that all gradients were reconstructed successfully!

Save the gradients of the conv2d layers to compare with normal SGD:

In [18]:
# Snapshot the weight gradients of all convolution layers produced by the
# streaming backward pass. They are cloned because the gradient tensors are
# zeroed in place and re-used by the normal backward pass further down.
streaming_conv_gradients = [
    layer.weight.grad.clone()
    for layer in model.layers
    if isinstance(layer, torch.nn.Conv2d) and layer.weight.grad is not None
]

# Compare to normal SGD

Reset the gradients and perform a normal forward and backward pass.

In [19]:
# Zero the accumulated gradients of every convolution layer in place, so the
# normal backward pass starts from a clean slate. Weight and bias are checked
# independently: a parameter without a gradient is simply skipped instead of
# raising an AttributeError on `None`.
for layer in model.layers:
    if not isinstance(layer, torch.nn.Conv2d):
        continue
    for param in (layer.weight, layer.bias):
        if param is not None and param.grad is not None:
            param.grad.data.zero_()

In [20]:
# Regular (non-streaming) forward pass over the whole image.
full_image = image_var[None]  # add a leading batch dimension
if cuda:
    # The model was moved to the GPU when `cuda` is set, so the input has to
    # live on the same device for the normal forward pass.
    full_image = full_image.cuda()
output = model.forward(full_image, detach=False)

In [21]:
# .cpu() is a no-op for CPU tensors and is required before .numpy() when the
# model (and therefore its output) lives on the GPU.
output.data.cpu().numpy()

array([[ 0.96422863]], dtype=float32)

This output should be the same as the streaming SGD output; if so, the loss will also be the same:

In [22]:
# Loss of the normal forward pass against the same fake label.
loss = criterion(output, torch.autograd.Variable(target))
# Display the loss value as the cell output.
loss

Variable containing:
 3.3306
[torch.FloatTensor of size 1]

Here we do a normal PyTorch backward pass:

In [23]:
# Standard autograd backward pass; accumulates into the gradients that were
# zeroed before the normal forward pass.
loss.backward()

# Compare the gradients of the conv2d layers

In [24]:
# Collect the weight gradients of all convolution layers after the normal
# backward pass. These are references to the live gradient tensors (no clone),
# which is fine as long as no further backward pass or reset follows.
normal_conv_gradients = [
    layer.weight.grad
    for layer in model.layers
    if isinstance(layer, torch.nn.Conv2d) and layer.weight.grad is not None
]

In [25]:
# Print the mean of each streaming gradient tensor. The max-difference value
# that used to be computed here was never used; it is reported by the next
# cell instead.
for i, streaming_grad in enumerate(streaming_conv_gradients):
    print("Layer", i, "\taverage gradient size:", torch.mean(streaming_grad.data))

Layer 0 	average gradient size: 0.9803536254682659
Layer 1 	average gradient size: 0.9803535717504995
Layer 2 	average gradient size: 0.9803535298064903
Layer 3 	average gradient size: 0.9803536423930416
Layer 4 	average gradient size: 0.9803535629201818
Layer 5 	average gradient size: 0.9803536048641911
Layer 6 	average gradient size: 0.9803536269399855
Layer 7 	average gradient size: 0.9803535982414529
Layer 8 	average gradient size: 0.9803536159020884
Layer 9 	average gradient size: 0.9794397994324013
Layer 10 	average gradient size: 0.9803535629201818
Layer 11 	average gradient size: 0.9803535982414529
Layer 12 	average gradient size: 0.9803536048641911
Layer 13 	average gradient size: 0.9803536136945089
Layer 14 	average gradient size: 0.9803535540898641
Layer 15 	average gradient size: 0.9803536335627238
Layer 16 	average gradient size: 0.9803536401854621
Layer 17 	average gradient size: 0.9803536931673685
Layer 18 	average gradient size: 0.9803536070717705
Layer 19 	average grad

In [26]:
# Largest element-wise absolute deviation between the streaming and the
# normal gradient of every convolution layer.
for i, streaming_grad in enumerate(streaming_conv_gradients):
    normal_grad = normal_conv_gradients[i]
    abs_diff = torch.abs(streaming_grad.data - normal_grad.data)
    max_diff = torch.max(abs_diff)
    print("Layer", i, "\tmax difference between gradients:", max_diff)

Layer 0 	max difference between gradients: 9.5367431640625e-07
Layer 1 	max difference between gradients: 8.940696716308594e-07
Layer 2 	max difference between gradients: 8.940696716308594e-07
Layer 3 	max difference between gradients: 1.430511474609375e-06
Layer 4 	max difference between gradients: 4.172325134277344e-07
Layer 5 	max difference between gradients: 4.172325134277344e-07
Layer 6 	max difference between gradients: 4.76837158203125e-07
Layer 7 	max difference between gradients: 1.7881393432617188e-07
Layer 8 	max difference between gradients: 2.980232238769531e-07
Layer 9 	max difference between gradients: 0.0018034577369689941
Layer 10 	max difference between gradients: 0.0
Layer 11 	max difference between gradients: 0.0
Layer 12 	max difference between gradients: 0.0
Layer 13 	max difference between gradients: 0.0
Layer 14 	max difference between gradients: 0.0
Layer 15 	max difference between gradients: 0.0
Layer 16 	max difference between gradients: 0.0
Layer 17 	max di

As you can see, the gradients of the conv2d layers computed by the two methods are (almost) numerically equivalent. The small differences are caused by loss of significance in the floating-point calculations.

---

# Things to try:

* Use doubles instead of floats to reduce the difference (use model.double() and image_var.double())
* Make the image bigger than would fit on a GPU 
    - e.g. 8194x8194, make sure to add 3 more blocks in the model (see comments)
* If you want you can compare the reconstructed input gradients of each layer: 
    - pass fill_gradients=True to the backward() function
    - compare full_gradients with self.model.gradients after the full model backward pass.
* For testing purposes the number of filters is small in this notebook, try increasing them