In [1]:
%load_ext autoreload
%env CUDA_VISIBLE_DEVICES = 2

env: CUDA_VISIBLE_DEVICES=2


In [4]:
import numpy as np
import time
import pandas as pd
from statistics import mean

import torch
import torch.nn as nn

from ptflops import get_model_complexity_info

%autoreload 2
pd.set_option("display.precision", 2)

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
# try/except/finally
# if `track_backward` (+)
# make one function (module, inputs=(shape or tensor), device, repeats, warmup) -> dict (+)
# torch.cuda.empty_cache() in the very beginning (+)
# please type list instead of array (+)
# module `repr` into the dataframe/dict

In [18]:
def calc_memory(reset_memory=True):
        
    mb = 2 ** 20
    if reset_memory:
        torch.cuda.reset_peak_memory_stats()
            
    max_memory = torch.cuda.max_memory_allocated(device) / mb
        
    return max_memory
    
    
def tracker(module, shape, device, repeats, warmup, track_backward=True) -> dict:
    
    torch.cuda.empty_cache()
    
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    
    # calculate time and memory of the inputs 
    start_memory = calc_memory()
        
    start.record()
    inputs = torch.randn(shape)
    inputs = inputs.to(device)
    end.record()
        
    torch.cuda.synchronize()
        
    inputs_memory = calc_memory(reset_memory=False) - start_memory
    
    # calculate time and memory for to(device) operation
    current_memory = calc_memory()
        
    start.record()
    module.to(device)
    end.record()
    
    torch.cuda.synchronize()
    loading_time = start.elapsed_time(end)
        
    new_current_memory = calc_memory()
    module_memory_consumption = new_current_memory - current_memory
    
    # calculate macs and parameters number
    macs, params = get_model_complexity_info(module, tuple(inputs.shape[1:]), print_per_layer_stat=False)
       
    inputs.requires_grad = True
    
    
    for i in range(warmup):
        outputs = module(inputs)
        del outputs
    
    # calculate time and memory for forward operation
    forward_start_memory = calc_memory()
    temp_outputs = module(inputs)
    forward_end_memory = calc_memory(reset_memory=False) - forward_start_memory
    
    # calculate time and memory for backward operation 
    if track_backward:
        backward_start_memory = calc_memory()
        temp_outputs.backward(torch.ones_like(temp_outputs))
        backward_end_memory = calc_memory(reset_memory=False) - backward_start_memory
        backward_timings = []
        
    del temp_outputs
    
    forward_timings = []
    
    for i in range(warmup, repeats):
                
        start.record()
        outputs = module(inputs)
        end.record()

        torch.cuda.synchronize()

        forward_timings.append(start.elapsed_time(end)) 
        
        if track_backward:
            start.record()
            outputs.backward(torch.ones_like(outputs))
            end.record()

            torch.cuda.synchronize()

            backward_timings.append(start.elapsed_time(end))

        del outputs
        
    result = {'forward time (s)': mean(forward_timings), 'forward memory (MB)': forward_end_memory,
              'module size (to_cuda) (MB)': module_memory_consumption, 'loading time (s)': loading_time,
              'macs': macs, 'parameters': params}
    
    if track_backward:
        result['backward time (s)'] = mean(backward_timings)
        result['backward memory (MB)'] = backward_end_memory
        
    del module
    del inputs
    
    return result

In [13]:
shape = (1, 64, 224, 224)

module_collection = {'conv_64_512_1x1': nn.Conv2d(kernel_size=1, in_channels=64, out_channels=512), 
                     'conv_64_512_3x3': nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512), 
                     'bottleneck_64_512_3x3': nn.Sequential(*[nn.Conv2d(kernel_size=1, in_channels=64, out_channels=16), 
                                              nn.Conv2d(kernel_size=3, in_channels=16, out_channels=512),
                                              ]),
                      'conv_64_512_3x3_g2': nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=2),
                      'conv_64_512_3x3_g8': nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=8),
                      'conv_padding': nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512, padding=1),
                      'conv_nn_Padding': nn.Sequential(nn.ZeroPad2d(1),
                                                       nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512))
                    }

In [19]:
module_collection_stats = pd.DataFrame(index=module_collection.keys(), 
                                               columns=['forward time (s)', 'backward time (s)',
                                                        'forward memory (MB)','backward memory (MB)',
                                                        'module size (to_cuda) (MB)', 'loading time (s)',
                                                        'macs', 'parameters'])
        
for module_name, module_value in module_collection.items():
    module_collection_stats.loc[module_name] = tracker(module_value, shape=shape, device=device, repeats=100,
                                                       warmup=10, track_backward=False)

In [20]:
module_collection_stats

Unnamed: 0,forward time (s),backward time (s),forward memory (MB),backward memory (MB),module size (to_cuda) (MB),loading time (s),macs,parameters
conv_64_512_1x1,0.87,,98.41,,0.0,0.21,1.67 GMac,33.28 k
conv_64_512_3x3,2.38,,99.39,,0.0,0.12,14.56 GMac,295.42 k
bottleneck_64_512_3x3,1.11,,99.89,,0.0,0.14,3.71 GMac,75.28 k
conv_64_512_3x3_g2,6.32,,97.01,,0.0,0.11,38.18 GMac,803.33 k
conv_64_512_3x3_g8,1.9,,95.77,,0.0,0.12,9.56 GMac,201.22 k
conv_padding,2.14,,101.13,,0.0,0.16,14.82 GMac,295.42 k
conv_nn_Padding,2.17,,113.6,,0.0,0.11,14.82 GMac,295.42 k
