In [1]:
import torch
from models.CNN import CNN
from models.RNN import RNN
from models.two_stream import two_stream_model

In [2]:
def conv_output_size(input_size, kernel_size, stride=1, padding=0, dilation=1):
    """Compute the output length of a convolution along one spatial dimension.

    Evaluates
    ``(input_size + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1``
    and returns the result truncated to an ``int``.

    Args:
        input_size: Length of the input along the dimension.
        kernel_size: Length of the kernel along the dimension.
        stride: Step between kernel applications.
        padding: Padding added to each side of the input.
        dilation: Spacing between kernel elements.

    Returns:
        The output length as an ``int``. If the exact value is not a whole
        number, a ``Fractional output size`` message showing the exact value
        is printed before truncating.
    """
    numerator = input_size + 2*padding - dilation*(kernel_size-1) - 1
    exact_size = numerator / stride + 1

    # Whole-number result: nothing to report.
    if exact_size.is_integer():
        return int(exact_size)

    # Show the untruncated value so the caller can see the layer does not
    # tile the input evenly.
    print(f'Fractional output size: {exact_size}')
    return int(exact_size)

# Helper to check the output size of a stack of Conv2d-style layers
def check_Conv2d_architecture(input_size, conv_layers):
    """Print the square spatial size after each layer of a conv/pool stack.

    Args:
        input_size: Side length of the square input.
        conv_layers: Iterable of tuples. Each tuple is unpacked positionally
            after the running size in the call to ``conv_output_size``, so it
            holds ``(kernel_size, stride, padding, dilation)`` with trailing
            entries optional.

    Returns:
        None. One line is printed for the input, one per layer, and one for
        the final output size.
    """
    print(f'# In: {input_size} x {input_size}')
    # Seed the running size with the input so that an empty layer list
    # reports "Out == In" instead of raising UnboundLocalError at the
    # final print.
    output_size = input_size
    for conv_layer in conv_layers:
        output_size = conv_output_size(output_size, *conv_layer)
        print(f"nn.Conv2d(..., {str(conv_layer).strip('()')}), # {output_size} x {output_size}")
    print(f'# Out: {output_size} x {output_size}')

In [3]:
# Side length of the square (input_size x input_size) inputs used by the checks below.
input_size = 216
# Number of output classes passed to each model constructor.
num_classes = 174

## LRCN

In [4]:
# Trace the spatial size through the LRCN conv/pool stack
# (presumably mirrors the layers in models.CNN -- confirm against that module).
# Each tuple is (kernel_size, stride, padding), unpacked positionally into
# conv_output_size; pooling layers are modelled with the same size formula.
check_Conv2d_architecture(input_size, [
    (7, 1, 0), # conv1
    (2, 2, 0), # pool1
    (5, 1, 0), # conv2
    (2, 2, 0), # pool2: with input_size = 216 this sees 101, so 50.5 is truncated to 50
    (3, 1, 0), # conv3
    (3, 1, 0), # conv4
    (3, 1, 0), # conv5
    (2, 2, 0), # pool5
])

# Smoke-test the CNN forward pass on a random batch of 32 three-channel inputs.
cnn = CNN(num_classes)
x = torch.randn(32, 3, input_size, input_size)
# Only the output shape is inspected, so skip building the autograd graph.
with torch.no_grad():
    y = cnn(x)
print(f'CNN output shape: {y.shape}')
# Make the check self-verifying: one score per class for every sample in the batch.
assert y.shape == (x.shape[0], num_classes), f'Unexpected CNN output shape: {y.shape}'

# Smoke-test the RNN forward pass on a random batch of 32 sequences.
# NOTE(review): input_size (the image side length) is reused as the RNN's first
# constructor argument and as the last dimension of the dummy input --
# presumably the per-frame feature size; confirm against models.RNN.
rnn = RNN(input_size, num_classes)

num_video_frames = 500
x = torch.randn(32, num_video_frames, input_size)
# Only the output shape is inspected, so skip building the autograd graph.
with torch.no_grad():
    y = rnn(x)
print(f'RNN output shape: {y.shape}')
# Make the check self-verifying: one score per class for every sample in the batch.
assert y.shape == (x.shape[0], num_classes), f'Unexpected RNN output shape: {y.shape}'

# In: 216 x 216
nn.Conv2d(..., 7, 1, 0), # 210 x 210
nn.Conv2d(..., 2, 2, 0), # 105 x 105
nn.Conv2d(..., 5, 1, 0), # 101 x 101
Fractional output size: 50.5
nn.Conv2d(..., 2, 2, 0), # 50 x 50
nn.Conv2d(..., 3, 1, 0), # 48 x 48
nn.Conv2d(..., 3, 1, 0), # 46 x 46
nn.Conv2d(..., 3, 1, 0), # 44 x 44
nn.Conv2d(..., 2, 2, 0), # 22 x 22
# Out: 22 x 22
CNN output shape: torch.Size([32, 174])
RNN output shape: torch.Size([32, 174])


## Two Stream

In [5]:
# Spatial CNN architecture
# Each tuple is (kernel_size, stride, padding), unpacked positionally into
# conv_output_size; pooling layers are modelled with the same size formula.
# With input_size = 216 the first three layers do not divide evenly and are
# truncated (105.5 -> 105, 52.5 -> 52, 24.5 -> 24).
check_Conv2d_architecture(input_size, [
    (7, 2, 0), # conv1
    (2, 2, 0), # pool1
    (5, 2, 0), # conv2
    (2, 2, 0), # pool2
    (3, 1, 1), # conv3: 3x3 with padding 1 keeps the spatial size
    (3, 1, 1), # conv4
    (3, 1, 1), # conv5
    (2, 2, 0), # pool5
])

# In: 216 x 216
Fractional output size: 105.5
nn.Conv2d(..., 7, 2, 0), # 105 x 105
Fractional output size: 52.5
nn.Conv2d(..., 2, 2, 0), # 52 x 52
Fractional output size: 24.5
nn.Conv2d(..., 5, 2, 0), # 24 x 24
nn.Conv2d(..., 2, 2, 0), # 12 x 12
nn.Conv2d(..., 3, 1, 1), # 12 x 12
nn.Conv2d(..., 3, 1, 1), # 12 x 12
nn.Conv2d(..., 3, 1, 1), # 12 x 12
nn.Conv2d(..., 2, 2, 0), # 6 x 6
# Out: 6 x 6


In [6]:
# Smoke-test the two-stream model forward pass on random inputs.
two_stream = two_stream_model(num_classes)

# NOTE(review): 18 temporal channels -- presumably stacked optical-flow frames;
# confirm against models.two_stream.
temporal_input = torch.randn(32, 18, input_size, input_size)
spatial_input = torch.randn(32, 3, input_size, input_size)
# Only the output shape is inspected, so skip building the autograd graph.
with torch.no_grad():
    y = two_stream(spatial_input, temporal_input)
print(f'Two stream output shape: {y.shape}')
# Make the check self-verifying: one score per class for every sample in the batch.
assert y.shape == (spatial_input.shape[0], num_classes), f'Unexpected two stream output shape: {y.shape}'

Two stream output shape: torch.Size([32, 174])
