# Octave Convolution Tests

We can use this notebook to test our implementation of the OctConv module.

The OctConv module itself is defined under `modules.py`.

## Setup

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
%load_ext autoreload
%autoreload 2

from modules import OctConv2dStackable, OctConv2dBN, get_stacked_4, get_stacked_4BN
from octconv_tests import test_octconv_shapes, test_octconv_as_conv

In [None]:
# Runtime configuration shared by the cells below.
USE_GPU = True  # set to False to force CPU execution even when CUDA is present
SEED = 0        # fixed seed so randomly generated tensors repeat across runs

dtype = torch.float32

# Seed PyTorch's global RNG before any random tensor is drawn, so that
# Restart Kernel -> Run All produces the same random inputs each time.
torch.manual_seed(SEED)

# Use CUDA only when it was requested AND is actually available; otherwise CPU.
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('using device:', device)

## Testing OctConv Behavior

Testing code is located in `octconv_tests.py`.

The `nn.Upsample` warning can be disregarded: according to [this PyTorch forum thread](https://discuss.pytorch.org/t/which-function-is-better-for-upsampling-upsampling-or-interpolate/21811/12), it is safe to keep using `nn.Upsample` as a layer.

In [4]:
# Example: an OctConv layer with stride=1 and padding=1 should leave the
# 32x32 spatial size unchanged.
# NOTE(review): 13 input / 26 output channels are presumably the "stacked"
# channel counts for 16 -> 32 channels at ratio 0.25 — confirm in modules.py.
batch_size, spatial = 128, 32
expected_shape = (batch_size, 26, spatial, spatial)

oc = OctConv2dStackable(16, 32, (3, 3), 0.25, 0.25, stride=1, padding=1)
input_stacked = torch.randn(batch_size, 13, spatial, spatial)
out = oc(input_stacked)

assert out.shape == expected_shape, "Shape mismatch for stride=1, padding=1"



In [None]:
# Run the test helpers imported from octconv_tests.py.
# NOTE(review): presumably each one raises (e.g. AssertionError) on failure
# and is silent on success — confirm in octconv_tests.py.
test_octconv_shapes()
test_octconv_as_conv()

## Building an Octconv Network

Here we use the four-layer OctConv network returned by `get_stacked_4`, which is imported from `modules.py`. That code is not very flexible, but it shows that a network built with OctConv layers can overfit a small dataset.

In [28]:
# Initialize random training data
N, C, H, W, D_out = 64, 3, 32, 32, 10
x = torch.randn(N, C, H, W, dtype=dtype, device=device)
y = torch.randint(0, D_out, (N, ), dtype=dtype, device=device)

In [29]:
# Create our model
alpha, freq_ratio, hidden_channels = .25, 2, 32
model = get_stacked_4(alpha, freq_ratio, hidden_channels, C, H, W, D_out)

In [34]:
# Print the names of the trainable parameters among the first ten registered.
leading_params = list(model.named_parameters())[:10]
for param_name, tensor in leading_params:
    if not tensor.requires_grad:
        continue
    print(param_name)

0.conv_hh.weight
0.conv_hh.bias
0.conv_hl.weight
0.conv_hl.bias
2.conv_hh.weight
2.conv_hh.bias
2.conv_ll.weight
2.conv_ll.bias
2.conv_lh.weight
2.conv_lh.bias


In [33]:
# Overfit on our fake dataset
# This training code is shamelessly adapted from Justin Johnson's PyTorch examples
model = model.to(device=device)
x = x.to(device=device, dtype=dtype)
y = y.to(device=device, dtype=torch.long)  # cross_entropy expects integer class indices

learning_rate = 1e-4
num_iterations = 250  # total optimizer steps
log_every = 25        # print loss/accuracy every this many steps

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(num_iterations):
    y_pred = model(x)
    loss = F.cross_entropy(y_pred, y)

    if t % log_every == 0:
        # Accuracy on the training batch itself (we are deliberately overfitting).
        class_preds = y_pred.argmax(dim=1)
        correct = (class_preds == y).sum().item()
        print("Iteration {}, loss: {}, train accuracy: {}".format(t, loss.item(), correct / len(y)))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# The final forward pass is for inspection only, so skip gradient tracking
# instead of keeping an unused autograd graph alive in the kernel.
with torch.no_grad():
    y_pred = model(x)


Iteration 0, loss: 2.2931301593780518, train accuracy: 0.15625
Iteration 25, loss: 2.1890764236450195, train accuracy: 0.265625
Iteration 50, loss: 2.0960497856140137, train accuracy: 0.3125
Iteration 75, loss: 1.9096107482910156, train accuracy: 0.4375
Iteration 100, loss: 1.5080409049987793, train accuracy: 0.90625
Iteration 125, loss: 0.7316662073135376, train accuracy: 1.0
Iteration 150, loss: 0.17228272557258606, train accuracy: 1.0
Iteration 175, loss: 0.038950126618146896, train accuracy: 1.0
Iteration 200, loss: 0.01598169282078743, train accuracy: 1.0
Iteration 225, loss: 0.009169764816761017, train accuracy: 1.0


## Building an OctConv network with Batchnorm

In [35]:
# Rebuild the model with the batch-norm variant, reusing the hyperparameters
# from the earlier model cell.
model = get_stacked_4BN(alpha, freq_ratio, hidden_channels, C, H, W, D_out)

# Print the names of the trainable parameters among the first ten registered.
leading_params = list(model.named_parameters())[:10]
for param_name, tensor in leading_params:
    if not tensor.requires_grad:
        continue
    print(param_name)

0.conv_hh.weight
0.conv_hh.bias
0.conv_hl.weight
0.conv_hl.bias
0.bn_h.weight
0.bn_h.bias
0.bn_l.weight
0.bn_l.bias
2.conv_hh.weight
2.conv_hh.bias


In [37]:
# Overfit on our fake dataset
# As expected, batch norm reaches 100% train accuracy in roughly 2x - 3x fewer
# iterations than the run without it.
# This training code is shamelessly adapted from Justin Johnson's PyTorch examples
model = model.to(device=device)
x = x.to(device=device, dtype=dtype)
y = y.to(device=device, dtype=torch.long)  # cross_entropy expects integer class indices

learning_rate = 1e-4
num_iterations = 250  # total optimizer steps
log_every = 25        # print loss/accuracy every this many steps

model.train()  # training mode for the optimization loop
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(num_iterations):
    y_pred = model(x)
    loss = F.cross_entropy(y_pred, y)

    if t % log_every == 0:
        # Accuracy on the training batch itself (we are deliberately overfitting).
        class_preds = y_pred.argmax(dim=1)
        correct = (class_preds == y).sum().item()
        print("Iteration {}, loss: {}, train accuracy: {}".format(t, loss.item(), correct / len(y)))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Switch to evaluation mode for the final forward pass; it is for inspection
# only, so skip gradient tracking as well.
model.eval()
with torch.no_grad():
    y_pred = model(x)

Iteration 0, loss: 2.330103874206543, train accuracy: 0.109375
Iteration 25, loss: 1.5433381795883179, train accuracy: 0.8125
Iteration 50, loss: 0.9170371890068054, train accuracy: 1.0
Iteration 75, loss: 0.4700774550437927, train accuracy: 1.0
Iteration 100, loss: 0.23350805044174194, train accuracy: 1.0
Iteration 125, loss: 0.12808947265148163, train accuracy: 1.0
Iteration 150, loss: 0.07951658964157104, train accuracy: 1.0
Iteration 175, loss: 0.054273564368486404, train accuracy: 1.0
Iteration 200, loss: 0.03954865783452988, train accuracy: 1.0
Iteration 225, loss: 0.030208997428417206, train accuracy: 1.0
