In [1]:
import torch

from ssgd import StreamingCNN

In [2]:
# Seed the global RNG first so the randomly initialised conv weights and the
# random input image are the same on every Restart & Run All.
torch.manual_seed(0)

# Print tensors with 10 decimals so very small differences between the
# streaming and the conventional results remain visible.
torch.set_printoptions(precision=10)

# Model definition

In [3]:
padding = 0

# The network is three identical stages: conv-ReLU, conv-ReLU, 2x2 max-pool.
# Only the very first convolution differs, taking the 3 image channels as input.
blocks = []
in_channels = 3
for _ in range(3):
    blocks += [
        torch.nn.Conv2d(in_channels, 16, kernel_size=3, padding=padding),
        torch.nn.ReLU(),
        torch.nn.Conv2d(16, 16, kernel_size=3, padding=padding),
        torch.nn.ReLU(),
        torch.nn.MaxPool2d(2),
    ]
    in_channels = 16  # every later convolution consumes 16 feature maps

stream_net = torch.nn.Sequential(*blocks)

In [4]:
# Scale the initial weights of every convolution by 2.5 and zero its bias.
# NOTE(review): presumably done to get larger activations/gradients for the
# comparison below — confirm the intent of the 2.5 factor.
with torch.no_grad():
    for module in stream_net.modules():
        if not isinstance(module, torch.nn.Conv2d):
            continue
        module.weight.mul_(2.5)
        module.bias.zero_()

In [5]:
# Show the layer layout; the numbers in parentheses are the positions of the
# layers inside the Sequential container.
print(stream_net)

Sequential(
  (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))
  (11): ReLU()
  (12): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))
  (13): ReLU()
  (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)


# Configurations

In [6]:
tile_size = 512  # side length (pixels) of the tiles passed to StreamingCNN
img_size = 1024  # side length (pixels) of the random test image
# Run on the GPU when one is available; otherwise fall back to the CPU instead
# of failing on the first .cuda() call.
cuda = torch.cuda.is_available()
verbose = True   # enable / disable logging
dtype = torch.double  # test with double precision

# Configure streaming SGD

In [7]:
# Cast the model to the working precision on every device. Doing this only in
# the CUDA branch would leave a float32 model on the CPU while the image is
# cast to `dtype`, which fails with a dtype mismatch in the first convolution.
stream_net.type(dtype)
if cuda:
    stream_net.cuda()

In [8]:
# Wrap the network for streaming with a fixed tile shape of
# (batch=1, channels=3, tile_size, tile_size). Logging follows the `verbose`
# switch from the configuration cell instead of being hard-coded on.
sCNN = StreamingCNN(stream_net, tile_shape=(1, 3, tile_size, tile_size), verbose=verbose)


 Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1)) 
 (Lost top:0.0 left:0.0 bottom:0.0 right:0.0)

 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1)) 
 (Lost top:0.0 left:0.0 bottom:0.0 right:0.0)

 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 
 (Lost top:0.0 left:0.0 bottom:0.0 right:0.0)

 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1)) 
 (Lost top:0.0 left:0.0 bottom:0.0 right:0.0)

 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1)) 
 (Lost top:0.0 left:0.0 bottom:0.0 right:0.0)

 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 
 (Lost top:0.0 left:0.0 bottom:0.0 right:0.0)

 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1)) 
 (Lost top:0.0 left:0.0 bottom:0.0 right:0.0)

 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1)) 
 (Lost top:0.0 left:0.0 bottom:0.0 right:0.0)

 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 
 (Lost top:0.0 left:0.0 bottom:0.0 right:0.0)

 Output lost (Lost top:0.0 left:0.


# Generate random image and fake label

In the current implementation the whole image needs to be able to fit in memory (RAM).

In [9]:
# Draw a standard-normal image in float32 on the CPU, then cast it to the
# working precision.
image = torch.empty(3, img_size, img_size, dtype=torch.float32).normal_(0, 1).type(dtype)
# big value so we get larger gradients
target = torch.tensor(50.).type(dtype)

if cuda:
    image, target = image.cuda(), target.cuda()

In [10]:
# Binary cross-entropy on the sigmoid output. Note that `target` is 50, which
# lies outside the [0, 1] range BCELoss is defined for; that is why the loss
# values shown in this notebook are negative. The loss is only used here as a
# source of large gradients, not as a meaningful objective.
criterion = torch.nn.BCELoss()

# Run through network using streaming

In [11]:
# Add the batch dimension and run the image through the network in streaming mode.
stream_output = sCNN.forward(image.unsqueeze(0))
stream_output.shape

torch.Size([1, 16, 124, 124])

In [12]:
# Track gradients on the streamed output so that loss.backward() fills
# stream_output.grad, which is what is handed to sCNN.backward() afterwards.
stream_output.requires_grad = True

In [13]:
# Reduce the streamed feature map to a single value and squash it with a sigmoid.
output = stream_output.mean().sigmoid()
output

tensor(0.8080944048, device='cuda:0', dtype=torch.float64,
       grad_fn=<SigmoidBackward>)

In [14]:
# Loss of the streamed prediction against the fake label.
loss = criterion(output, target)
loss

tensor(-70.2330147828, device='cuda:0', dtype=torch.float64,
       grad_fn=<BinaryCrossEntropyBackward>)

In [15]:
# Backpropagate from the loss to the streamed output; this populates
# stream_output.grad, the starting point of the streaming backward pass.
loss.backward()

In [16]:
# Run the streaming backward pass over the image, starting from the gradient
# with respect to the streamed output.
# NOTE(review): presumably this also accumulates the conv layers' .grad as a
# side effect (they are read back a few cells later) — confirm against
# ssgd.StreamingCNN.backward, including what exactly is returned.
full_gradients = sCNN.backward(image[None], stream_output.grad)

  0%|          | 0/3 [00:00<?, ?it/s]

Number of tiles in backprop: 9


100%|██████████| 3/3 [00:11<00:00,  3.88s/it]


In [17]:
# NOTE(review): presumably switches the streaming machinery off so that
# `stream_net` can be run as a regular module further down — confirm against
# ssgd.StreamingCNN.disable.
sCNN.disable()

"Everything reconstructed" means that all gradients were reconstructed successfully!

Save the gradients of the conv2d layer to compare with normal SGD:

In [18]:
# Snapshot (clone) the weight gradient of every convolution that has one, so
# the values survive the gradient reset that precedes the conventional pass.
streaming_conv_gradients = [
    module.weight.grad.clone()
    for module in stream_net.modules()
    if isinstance(module, torch.nn.Conv2d) and module.weight.grad is not None
]

# Compare to normal SGD

Reset the gradients and perform a normal forward and backward pass.

In [19]:
# Zero the accumulated weight and bias gradients of every convolution so the
# conventional backward pass starts from a clean slate.
conv_layers = (m for m in stream_net.modules() if isinstance(m, torch.nn.Conv2d))
for conv in conv_layers:
    if conv.weight.grad is None:
        continue  # nothing accumulated on this layer
    conv.weight.grad.data.zero_()
    conv.bias.grad.data.zero_()

This output should be the same as the streaming SGD output; if so, the loss will also be the same:

In [20]:
# Ordinary forward pass of the whole image (with batch dimension) in one go.
conventional_output = stream_net(image.unsqueeze(0))
conventional_output.shape

torch.Size([])

In [21]:
# Largest element-wise deviation between the streamed and the conventional
# feature maps.
# NOTE: sometimes output can be slightly bigger (if tiles do not fit nicely on input image)
max_error = torch.abs(stream_output - conventional_output).max().item()

# 1e-7 is the tolerance below which the two outputs are treated as equal.
if max_error < 1e-7:
    print("Equal output to streaming")
else:
    print("NOT equal output to streaming")
    print("error:", max_error)

Equal output to streaming


In [22]:
# Same reduction as for the streamed output: mean over the feature map, then sigmoid.
output = conventional_output.mean().sigmoid()
output

tensor(0.8080944048, device='cuda:0', dtype=torch.float64,
       grad_fn=<SigmoidBackward>)

In [23]:
# Loss of the conventional prediction against the same fake label.
loss = criterion(output, target)
loss

tensor(-70.2330147828, device='cuda:0', dtype=torch.float64,
       grad_fn=<BinaryCrossEntropyBackward>)

In [24]:
# Ordinary backward pass through the whole network; the resulting
# layer.weight.grad values are collected in the next code cell.
loss.backward()

# Compare the gradients of the conv2d layers

Save the gradients of the conv2d layers to compare with streaming SGD:

In [25]:
# Collect the weight gradient of every convolution that has one, numbering the
# layers in the order they appear in the network.
normal_conv_gradients = []
convs_with_grad = (
    module for module in stream_net.modules()
    if isinstance(module, torch.nn.Conv2d) and module.weight.grad is not None
)
for index, conv in enumerate(convs_with_grad):
    normal_conv_gradients.append(conv.weight.grad)
    print('Conv layer', index, '\t', conv)

Conv layer 0 	 Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1))
Conv layer 1 	 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))
Conv layer 2 	 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))
Conv layer 3 	 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))
Conv layer 4 	 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))
Conv layer 5 	 Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1))


In [26]:
print('Conventional', '\n')

# Report the conventional-backprop gradients under the "Conventional" header
# (this cell previously iterated over the streaming gradients by mistake).
for i, gradient in enumerate(normal_conv_gradients):
    print("Conv layer", i, "\t average gradient size:",
          float(gradient.data.abs().mean()))

Conventional 

Conv layer 0 	 average gradient size: 0.6633402018338846
Conv layer 1 	 average gradient size: 1.6737470297611399
Conv layer 2 	 average gradient size: 2.774223800907205
Conv layer 3 	 average gradient size: 2.8184983626562485
Conv layer 4 	 average gradient size: 2.4635763198365033
Conv layer 5 	 average gradient size: 2.2630631082562744


In [27]:
print('Streaming', '\n')

# Report the streaming-SGD gradients under the "Streaming" header
# (this cell previously iterated over the conventional gradients by mistake).
for i, gradient in enumerate(streaming_conv_gradients):
    print("Conv layer", i, "\t average gradient size:",
          float(gradient.data.abs().mean()))

Streaming 

Conv layer 0 	 average gradient size: 0.6633402018338848
Conv layer 1 	 average gradient size: 1.6737470297611394
Conv layer 2 	 average gradient size: 2.7742238009072038
Conv layer 3 	 average gradient size: 2.818498362656249
Conv layer 4 	 average gradient size: 2.463576319836503
Conv layer 5 	 average gradient size: 2.2630631082562727


In [28]:
# Per layer: the largest absolute element-wise gap between the streaming and
# the conventional kernel gradients.
for i, streaming_gradient in enumerate(streaming_conv_gradients):
    max_diff = (streaming_gradient.data - normal_conv_gradients[i].data).abs().max()
    print("Conv layer", i, "\tmax difference between kernel gradients:", float(max_diff))

Conv layer 0 	max difference between kernel gradients: 7.37188088351104e-14
Conv layer 1 	max difference between kernel gradients: 1.8385293287792592e-13
Conv layer 2 	max difference between kernel gradients: 6.821210263296962e-13
Conv layer 3 	max difference between kernel gradients: 3.410605131648481e-13
Conv layer 4 	max difference between kernel gradients: 3.801403636316536e-13
Conv layer 5 	max difference between kernel gradients: 7.460698725481052e-14


As you can see, the gradients of the conv2d layers computed by the two methods are (almost) numerically identical. The small differences are caused by loss of significance in the floating-point calculations. 

---

# Things to try:

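* Switch between doubles and floats to see how precision affects the difference (this notebook already runs in double precision through `dtype = torch.double`; in your own code use model.double() and image_var.double())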
* Make the image bigger than would fit on a GPU 
    - e.g. 8194x8194, make sure to add 3 more blocks in the model (see comments)
* If you want you can compare the reconstructed input gradients of each layer: 
    - pass fill_gradient=True in backward() function
    - compare full_gradients with self.model.gradients after the full model backward pass.
* For testing purposes the number of filters is small in this notebook, try increasing them