In [1]:
%load_ext autoreload
%env CUDA_VISIBLE_DEVICES = 3

env: CUDA_VISIBLE_DEVICES=3


In [2]:
import numpy as np
import time
import pandas as pd

import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 

from ptflops import get_model_complexity_info
import torchvision.models as models

# from models import Resnet, ResnetD, ResNext

%autoreload 2
pd.set_option("display.precision", 2)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
class Tracker:
    def __init__(self, device, repeats=100, warmup=0, verbose=True):
        self.device = device 
        self.repeats = repeats
        self.warmup = warmup  
        self.verbose = verbose
        self.start = torch.cuda.Event(enable_timing=True)
        self.end = torch.cuda.Event(enable_timing=True)
        
    
    def track(self, module, inputs): # add no_grad ?
        
        current_memory = self.calc_memory()
        
        self.start.record()
        module.to(self.device)
        self.end.record()
        
        torch.cuda.synchronize()
        loading_time = self.start.elapsed_time(self.end)
        
        new_current_memory = self.calc_memory()
        module_memory_consumption = new_current_memory - current_memory
        
        macs, params = get_model_complexity_info(module, tuple(inputs.shape[1:]), print_per_layer_stat=False)
        
        timings = np.zeros((3, self.repeats))
        
        inputs.requires_grad = True
        
        for i in range(self.warmup):
            outputs = module(inputs)
            
        forward_start_memory = self.calc_memory()
        temp_outputs = module(inputs)
        forward_end_memory = self.calc_memory(reset_memory=False) - forward_start_memory
        
        backward_start_memory = self.calc_memory()
        temp_outputs.backward(torch.ones_like(temp_outputs))
        backward_end_memory = self.calc_memory(reset_memory=False) - backward_start_memory
        
        del temp_outputs
        
        for i in range(self.warmup, self.repeats):
                
                self.start.record()
                outputs = module(inputs)
                self.end.record()
                
                torch.cuda.synchronize()
                
                timings[0][i] = self.start.elapsed_time(self.end)
                
                self.start.record()
                outputs.backward(torch.ones_like(outputs))
                self.end.record()
                           
                torch.cuda.synchronize()
                           
                timings[1][i] = self.start.elapsed_time(self.end)

                del outputs
        
        
        result = {'forward time (s)':timings[0].mean(), 'backward time (s)':timings[1].mean(),
                  'forward memory (MB)':forward_end_memory, 'backward memory (MB)': backward_end_memory,
                  'module size (to_cuda) (MB)': module_memory_consumption, 'loading time (s)': loading_time,
                  'macs': macs, 'parameters': params}
        
        del module
        del inputs
            
        return result
    
    def calc_memory(self, reset_memory=True):
        
        mb = 2 ** 20
        if reset_memory:
            torch.cuda.reset_peak_memory_stats()
            
        max_memory = torch.cuda.max_memory_allocated(device) / mb
        
        return max_memory
    
    def track_module_collection(self, module_collection, shape):
        
        start_memory = self.calc_memory()
        
        self.start.record()
        inputs = torch.randn(shape)
        inputs = inputs.to(self.device)
        self.end.record()
        
        torch.cuda.synchronize()
        
        inputs_memory = self.calc_memory(reset_memory=False) - start_memory
        
        module_collection_stats = np.zeros((len(module_collection), 2))
        module_collection_stats = pd.DataFrame(index=module_collection.keys(), 
                                               columns=['forward time (s)', 'backward time (s)',
                                                        'forward memory (MB)','backward memory (MB)',
                                                        'module size (to_cuda) (MB)', 'loading time (s)',
                                                        'macs', 'parameters'])
        
        for i, module_name in enumerate(module_collection):
            module_collection_stats.loc[module_name] = self.track(module=module_collection[module_name], inputs=inputs)
            
        return module_collection_stats
        

In [5]:
tracker = Tracker(device)
shape = (1, 64, 224, 224)
inputs = torch.randn(shape)

module_collection = {'conv_64_512_1x1': nn.Conv2d(kernel_size=1, in_channels=64, out_channels=512), 
                     'conv_64_512_3x3': nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512), 
                     'bottleneck_64_512_3x3': nn.Sequential(*[nn.Conv2d(kernel_size=1, in_channels=64, out_channels=16), 
                                              nn.Conv2d(kernel_size=3, in_channels=16, out_channels=512),
                                              ]),
                      'conv_64_512_3x3_g2': nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=2),
                      'conv_64_512_3x3_g8': nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=8),
                    }

module_collection_stats = tracker.track_module_collection(module_collection=module_collection, shape=shape)

In [6]:
module_collection_stats

Unnamed: 0,forward time (s),backward time (s),forward memory (MB),backward memory (MB),module size (to_cuda) (MB),loading time (s),macs,parameters
conv_64_512_1x1,0.92,1.7,98.41,122.92,0.13,0.59,1.67 GMac,33.28 k
conv_64_512_3x3,2.32,6.15,99.39,122.51,1.75,0.64,14.56 GMac,295.42 k
bottleneck_64_512_3x3,1.19,4.47,99.89,121.33,0.29,0.34,3.71 GMac,75.28 k
conv_64_512_3x3_g2,5.8,17.32,96.43,108.51,3.06,1.44,38.18 GMac,803.33 k
conv_64_512_3x3_g8,1.81,15.78,95.77,105.84,0.77,0.39,9.56 GMac,201.22 k
