In [13]:
import timeit
import torch
from torch import nn

In [2]:
'''
Check speed/memory torch operations on gpu.
Conv2d VS depthwise + pointwise VS grouped
with different batch_size / in_channels / hw / kernel_size
'''

'\nCheck speed/memory torch operations on gpu.\nConv2d VS depthwise + pointwise VS grouped\nwith different batch_size / in_channels / hw / kernel_size\n'

In [3]:
# the best for memory
# torch.backends.cudnn.deterministic=True

In [4]:
# the best for speed
# torch.backends.cudnn.benchmark=True

In [7]:
class SeparableConv2d(nn.Module):
    ''' depthwise + pointwise
        depthwise 
        groups=in_channels, out_channels = kernel_size * in_channels '''

    def __init__(self, in_channels, out_channels, kernel_size, bias=False):
        super().__init__()
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, 
                                   groups=in_channels, bias=bias, padding=1)
        self.pointwise = nn.Conv2d(in_channels, out_channels, 
                                   kernel_size=1, bias=bias)

    def forward(self, x):
        out = self.depthwise(x)
        out = self.pointwise(out)
        return out

In [8]:
def benchmark(in_channels, out_channels, kernel_size, groups, H, W, batch_size, conv_type=None):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    
    start.record()
    if conv_type == 'depth':
        conv = SeparableConv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size)
    else:
        conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups)
    
    xs = torch.rand((batch_size, in_channels, H, W))

    conv.to(device)
    
    print('parameters:', sum(param.numel() for param in conv.parameters()))
    
    xs = xs.to(device)
    xs.requires_grad = True
    
    ys = conv(xs)

    mb = 1024 * 1024

    print('forward, gpu memory:', torch.cuda.memory_allocated()/mb, 'Mb')
    
    ys.backward(torch.ones_like(ys))
    end.record()
    print('backward, gpu memory:', torch.cuda.memory_allocated()/mb, 'Mb')
    print('backward, gpu memory (max):', torch.cuda.max_memory_allocated()/mb, 'Mb')
    
    torch.cuda.synchronize()
    print('Execution time:', start.elapsed_time(end) / 1000, 'sec')


In [21]:
def check_benchmark(in_channels, out_channels, kernel_size, H, W, batch_size):
    
    # usual convolution
    print('in_channels =', in_channels, 'out_channels =', out_channels, 'kernel_size =', kernel_size, 'H =', H, 'W =', W, 'batch_size =', batch_size)
    print()
    print('Usual convolution')
    print()
    benchmark(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=1, H=H, W=W, batch_size=batch_size)
    print()
    print('depthwise + pointwise convolution')
    print()
    benchmark(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups, H=H, W=W, batch_size=batch_size, conv_type='depth')
    print()
    # grouped
    print('grouped convolution')
    print()
    benchmark(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups, H=H, W=W, batch_size=batch_size)

In [24]:
in_channels = [16, 32]
out_channels = 256
kernel_size = [3, 5]
groups = 4
H = [160, 224]
W = [160, 224]
batch_size = [8, 16]

In [25]:
check_benchmark(in_channels[0], out_channels, kernel_size[0], H[0], W[0], batch_size[0])

in_channels = 16 out_channels = 256 kernel_size = 3 H = 160 W = 160 batch_size = 8

Usual convolution

parameters: 37120
forward, gpu memory: 207.6728515625 Mb
backward, gpu memory: 220.314453125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.044849151611328124 sec

depthwise + pointwise convolution

parameters: 4240
forward, gpu memory: 225.0166015625 Mb
backward, gpu memory: 225.033203125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.03440534210205078 sec

grouped convolution

parameters: 9472
forward, gpu memory: 207.5673828125 Mb
backward, gpu memory: 220.103515625 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.03783747100830078 sec


In [26]:
check_benchmark(in_channels[1], out_channels, kernel_size[0], H[0], W[0], batch_size[0])

in_channels = 32 out_channels = 256 kernel_size = 3 H = 160 W = 160 batch_size = 8

Usual convolution

parameters: 73984
forward, gpu memory: 220.3134765625 Mb
backward, gpu memory: 245.595703125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.08809017944335938 sec

depthwise + pointwise convolution

parameters: 8480
forward, gpu memory: 250.03271484375 Mb
backward, gpu memory: 250.0654296875 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.07118029022216797 sec

grouped convolution

parameters: 18688
forward, gpu memory: 220.1025390625 Mb
backward, gpu memory: 245.173828125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.06709410858154297 sec


In [27]:
check_benchmark(in_channels[1], out_channels, kernel_size[1], H[0], W[0], batch_size[0])

in_channels = 32 out_channels = 256 kernel_size = 5 H = 160 W = 160 batch_size = 8

Usual convolution

parameters: 205056
forward, gpu memory: 215.9072265625 Mb
backward, gpu memory: 241.689453125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.08014546966552734 sec

depthwise + pointwise convolution

parameters: 8992
forward, gpu memory: 244.44482421875 Mb
backward, gpu memory: 245.1005859375 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.06526604461669921 sec

grouped convolution

parameters: 51456
forward, gpu memory: 215.3212890625 Mb
backward, gpu memory: 240.517578125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.11503616333007813 sec


In [28]:
check_benchmark(in_channels[1], out_channels, kernel_size[1], H[1], W[1], batch_size[0])

in_channels = 32 out_channels = 256 kernel_size = 5 H = 224 W = 224 batch_size = 8

Usual convolution

parameters: 205056
forward, gpu memory: 427.9072265625 Mb
backward, gpu memory: 477.689453125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.19270005798339843 sec

depthwise + pointwise convolution

parameters: 8992
forward, gpu memory: 483.06591796875 Mb
backward, gpu memory: 483.1005859375 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.14364035034179687 sec

grouped convolution

parameters: 51456
forward, gpu memory: 427.3212890625 Mb
backward, gpu memory: 476.517578125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.24001741027832033 sec


In [29]:
check_benchmark(in_channels[1], out_channels, kernel_size[1], H[1], W[1], batch_size[1])

in_channels = 32 out_channels = 256 kernel_size = 5 H = 224 W = 224 batch_size = 16

Usual convolution

parameters: 205056
forward, gpu memory: 855.0322265625 Mb
backward, gpu memory: 953.814453125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.3881612548828125 sec

depthwise + pointwise convolution

parameters: 8992
forward, gpu memory: 964.35498046875 Mb
backward, gpu memory: 966.1318359375 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.21024520874023436 sec

grouped convolution

parameters: 51456
forward, gpu memory: 854.4462890625 Mb
backward, gpu memory: 952.642578125 Mb
backward, gpu memory (max): 3566.70703125 Mb
Execution time: 0.23246234130859375 sec
