In [1]:
%load_ext autoreload
%env CUDA_VISIBLE_DEVICES = 2 python benchmark.ipynb

env: CUDA_VISIBLE_DEVICES=2 python benchmark.ipynb


In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import gc

from ptflops import get_model_complexity_info

%autoreload 2
pd.set_option("display.precision", 2)

In [3]:
device = 'cuda'
device

'cuda'

In [4]:
class Catchtime:
    def __enter__(self):
        self.start = torch.cuda.Event(enable_timing=True)
        self.end = torch.cuda.Event(enable_timing=True)
        self.start.record()
        return self

    def __exit__(self, type, value, traceback):
        self.end.record()
        torch.cuda.synchronize()
        
    def __float__(self):
        
        return self.start.elapsed_time(self.end)

In [5]:
class Catchmemory:
    def __enter__(self, device=device):
        self.device=device
        self.start_memory = get_memory(device=self.device)
        
        return self

    def __exit__(self, type, value, traceback):
        self.end_memory = get_memory(reset_memory=False, device=self.device) 
        
    def __float__(self):
        
        return float(self.end_memory - self.start_memory)

In [6]:
def get_memory(reset_memory=True, device=None):
    """Take current max allocated memory, either with or without resetting"""
    if reset_memory:
        torch.cuda.reset_peak_memory_stats()
            
    max_memory = torch.cuda.max_memory_allocated(device)
        
    return max_memory
    
    
def make_initialization_inputs(inputs, device=None):
    """ Take either tensor, shape tuple or list of them, and always return tensor or list of them. """
    if isinstance(inputs, torch.Tensor):
        pass
    elif isinstance(inputs, tuple):
        inputs = torch.rand(*inputs, device=device)
    elif isinstance(inputs, list):
        inputs = [make_initialization_inputs(item, device=device) for item in inputs]
    return inputs

    
    
def tracker(module, inputs, repeats=300, warmup=40, device=None, track_backward=True, channels_last=False, amp=False) -> dict:
    """Track module #macs, #parameters, time and memory consumption on forward and backward pass for a given inputs tensor or inputs shape"""
    
    with Catchtime() as total_time:
    
        result = {}

        torch.cuda.empty_cache()

        inputs = make_initialization_inputs(inputs=inputs, device=device)
        module.to(device)

        if channels_last:
            inputs.to(memory_format=torch.channels_last)
            module.to(memory_format=torch.channels_last)          

        forward_timings = []
        backward_timings = []


        for i in range(repeats + warmup):

            with torch.cuda.amp.autocast(enabled=amp):
                if i < warmup:
                    outputs = module(inputs)
                    del outputs
                    i += 1
                    continue
                # calculate forward operation time  
                with Catchtime() as t:
                    module(inputs)

                forward_time = float(t)        
                forward_timings.append(forward_time) 

            if track_backward:
                # calculate backward operation time 
                outputs = module(inputs)
                with Catchtime() as t:
                    outputs.backward(outputs)
                backward_time = float(t) 
                backward_timings.append(backward_time)

        result['forward time mean(ms)'] = np.mean(forward_timings)
        result['forward time std(ms)'] = np.std(forward_timings)

        # calculate forward memory
        with Catchmemory() as cathed_memory:
            module(inputs)
        forward_memory = float(cathed_memory)
        result['forward memory(MB)'] = forward_memory / 2**20

        if track_backward:
            result['backward time mean(ms)'] = np.mean(backward_timings)
            result['backward time std(ms)'] = np.std(backward_timings)
            
            # calculate backward memory
            outputs = module(inputs)
            with Catchmemory() as cathed_memory:
                outputs.backward(outputs)
            backward_memory = float(cathed_memory)
            result['backward memory(MB)'] = backward_memory / 2**20

        macs, params = get_model_complexity_info(module, tuple(inputs.shape[1:]), as_strings=False, print_per_layer_stat=False)
        result['macs'] = macs
        result['parameters'] = float(params)

    result['time total(ms)'] = float(total_time)
    
    return result

In [7]:
shape = (1, 64, 128, 128)

module_collection = {'conv_64_512_1x1': nn.Conv2d(kernel_size=1, in_channels=64, out_channels=512), 
                     'conv_64_512_3x3': nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512), 
                     'bottleneck_64_512_3x3': nn.Sequential(*[nn.Conv2d(kernel_size=1, in_channels=64, out_channels=32), 
                                                             nn.Conv2d(kernel_size=3, in_channels=32, out_channels=512),
                                                              ]),
                      'conv_64_512_3x3_g2': nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=2),
                      'conv_64_512_3x3_g8': nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=8),
                      'conv_padding': nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512, padding=1),
                      'conv_nn_Padding': nn.Sequential(nn.ZeroPad2d(1),
                                                       nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512))
                    }

In [8]:
# contiguous
module_collection_stats = pd.DataFrame(index=module_collection.keys(), 
                                               columns=['forward time mean(ms)', 'forward time std(ms)',
                                                        'backward time mean(ms)', 'backward time std(ms)',
                                                        'forward memory(MB)','backward memory(MB)',
                                                        'macs', 'parameters', 'time total(ms)'])
        
for module_name, module_value in module_collection.items():
    module_collection_stats.loc[module_name] = tracker(module_value, inputs=shape, device=device, track_backward=True)

module_collection_stats

Unnamed: 0,forward time mean(ms),forward time std(ms),backward time mean(ms),backward time std(ms),forward memory(MB),backward memory(MB),macs,parameters,time total(ms)
conv_64_512_1x1,0.41,0.06,0.66,6.42,32.22,0.12,545259520.0,33280.0,1453.55
conv_64_512_3x3,0.82,0.03,1.02,0.01,35.13,1.12,4690151424.0,295424.0,845.21
bottleneck_64_512_3x3,0.75,0.03,1.27,0.09,35.57,2.56,2383218688.0,150048.0,869.4
conv_64_512_3x3_g2,1.98,0.03,2.43,0.02,33.23,3.06,11956733952.0,803328.0,2019.34
conv_64_512_3x3_g8,0.7,0.02,1.24,0.04,31.46,0.77,2994898944.0,201216.0,838.7
conv_padding,0.79,0.01,0.71,0.07,35.13,87.12,4840226816.0,295424.0,742.36
conv_nn_Padding,0.87,0.02,1.05,0.01,39.38,1.12,4840226816.0,295424.0,905.57


In [9]:
# channels last
module_collection_stats = pd.DataFrame(index=module_collection.keys(), 
                                               columns=['forward time mean(ms)', 'forward time std(ms)',
                                                        'backward time mean(ms)', 'backward time std(ms)',
                                                        'forward memory(MB)','backward memory(MB)',
                                                        'macs', 'parameters','time total(ms)'])
        
for module_name, module_value in module_collection.items():
    module_collection_stats.loc[module_name] = tracker(module_value, inputs=shape, device=device, track_backward=True, channels_last=True)

module_collection_stats

Unnamed: 0,forward time mean(ms),forward time std(ms),backward time mean(ms),backward time std(ms),forward memory(MB),backward memory(MB),macs,parameters,time total(ms)
conv_64_512_1x1,0.5,0.32,0.45,0.52,36.47,34.0,545259520.0,33280.0,460.42
conv_64_512_3x3,1.03,0.02,1.1,0.03,37.22,34.0,4690151424.0,295424.0,1013.03
bottleneck_64_512_3x3,0.92,0.03,2.13,0.11,39.65,34.0,2383218688.0,150048.0,1222.53
conv_64_512_3x3_g2,1.94,0.03,13.2,0.06,37.42,39.2,11956733952.0,803328.0,5244.86
conv_64_512_3x3_g8,1.09,0.01,11.72,0.07,35.64,34.77,2994898944.0,201216.0,4243.04
conv_padding,1.02,0.01,1.76,0.06,38.19,34.0,4840226816.0,295424.0,1197.79
conv_nn_Padding,1.1,0.02,1.13,0.2,42.25,34.0,4840226816.0,295424.0,1057.94


In [10]:
# amp
module_collection_stats = pd.DataFrame(index=module_collection.keys(), 
                                               columns=['forward time mean(ms)', 'forward time std(ms)',
                                                        'backward time mean(ms)', 'backward time std(ms)',
                                                        'forward memory(MB)','backward memory(MB)',
                                                        'macs', 'parameters','time total(ms)'])
        
for module_name, module_value in module_collection.items():
    module_collection_stats.loc[module_name] = tracker(module_value, inputs=shape, device=device, track_backward=True, amp=True)

module_collection_stats

Unnamed: 0,forward time mean(ms),forward time std(ms),backward time mean(ms),backward time std(ms),forward memory(MB),backward memory(MB),macs,parameters,time total(ms)
conv_64_512_1x1,0.45,0.01,0.37,0.06,36.22,34.0,545259520.0,33280.0,433.37
conv_64_512_3x3,0.53,0.02,1.09,0.01,37.22,34.0,4690151424.0,295424.0,835.27
bottleneck_64_512_3x3,0.74,0.03,2.09,0.06,39.65,34.0,2383218688.0,150048.0,1161.14
conv_64_512_3x3_g2,10.61,0.03,13.21,0.04,38.16,39.2,11956733952.0,803328.0,8190.9
conv_64_512_3x3_g8,2.57,0.02,11.73,0.05,36.38,34.77,2994898944.0,201216.0,4759.84
conv_padding,0.52,0.02,1.73,0.02,38.0,34.0,4840226816.0,295424.0,1017.54
conv_nn_Padding,0.59,0.02,1.1,0.01,42.25,34.0,4840226816.0,295424.0,875.46


In [11]:
# amp + channels last
module_collection_stats = pd.DataFrame(index=module_collection.keys(), 
                                               columns=['forward time mean(ms)', 'forward time std(ms)',
                                                        'backward time mean(ms)', 'backward time std(ms)',
                                                        'forward memory(MB)','backward memory(MB)',
                                                        'macs', 'parameters','time total(ms)'])
        
for module_name, module_value in module_collection.items():
    module_collection_stats.loc[module_name] = tracker(module_value, inputs=shape, device=device, track_backward=True, channels_last=True, amp=True)

module_collection_stats

Unnamed: 0,forward time mean(ms),forward time std(ms),backward time mean(ms),backward time std(ms),forward memory(MB),backward memory(MB),macs,parameters,time total(ms)
conv_64_512_1x1,0.44,0.05,0.39,0.06,36.22,34.0,545259520.0,33280.0,432.85
conv_64_512_3x3,0.51,0.02,1.1,0.02,37.22,34.0,4690151424.0,295424.0,828.32
bottleneck_64_512_3x3,0.71,0.04,2.13,0.09,39.65,34.0,2383218688.0,150048.0,1157.54
conv_64_512_3x3_g2,10.56,0.03,13.2,0.04,38.16,39.2,11956733952.0,803328.0,8158.94
conv_64_512_3x3_g8,2.52,0.01,11.73,0.04,36.38,34.77,2994898944.0,201216.0,4726.05
conv_padding,0.49,0.01,1.74,0.02,38.0,34.0,4840226816.0,295424.0,1008.89
conv_nn_Padding,0.55,0.01,1.12,0.01,42.25,34.0,4840226816.0,295424.0,858.61
