In [None]:
# TODO
# add no_grad
# compare resnets
# look into memory_stats 

In [1]:
%load_ext autoreload
%env CUDA_VISIBLE_DEVICES = 0

env: CUDA_VISIBLE_DEVICES=0


In [47]:
import numpy as np
import gc
import time

import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 

from models import Resnet, ResnetD, ResNext

%autoreload 2

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [38]:
class Tracker:
    def __init__(self, device, repeats=100, warmup=0, verbose=True):
        self.device = device 
        self.repeats = repeats
        self.warmup = warmup  
        self.verbose = verbose
        self.start = torch.cuda.Event(enable_timing=True)
        self.end = torch.cuda.Event(enable_timing=True)
        
        
    def get_current_memory(self):
        return torch.cuda.memory_stats(device=self.device)['active_bytes.all.current'] * 1e-6
    
    
    def track(self, module, inputs): # add no_grad ?
        
        current_memory = self.get_current_memory()
        
        self.start.record()
        module.to(self.device)
        self.end.record()
        
        torch.cuda.synchronize()
        loading_time = self.start.elapsed_time(self.end)
        
        new_current_memory = self.get_current_memory()
        module_memory_consumption = new_current_memory - current_memory
        
        timings = np.zeros(self.repeats)
        memory_on_runs = np.zeros(self.repeats)
        
        for i in range(self.warmup):
            outputs = module(inputs)
        
        for i in range(self.warmup, self.repeats):

            self.start.record()
            outputs = module(inputs)
            self.end.record()

            torch.cuda.synchronize()

            timings[i] = self.start.elapsed_time(self.end)
            memory_on_runs[i] = self.get_current_memory() - current_memory
            
            del outputs
            
        if self.verbose:
            print(f' {module}: run takes {timings.mean():.2f} +- {timings.std():.2f}s, loaded on cuda in {loading_time}s, \n'
                  f'    module size {module_memory_consumption:.2f} MB, outputs size {memory_on_runs.mean():.2f} MB') 
            
        del module
        del inputs
            
        return timings.mean(), memory_on_runs.mean()
    
    
    def track_module_collection(self, module_collection, inputs):
        gc.collect()
        torch.cuda.empty_cache()
        
        current_memory = self.get_current_memory()
        
        self.start.record()
        inputs = inputs.to(self.device)
        self.end.record()
        
        torch.cuda.synchronize()
        
        inputs_memory = self.get_current_memory() - current_memory
        
        if self.verbose:
            print(f'inputs: shape: {inputs.shape},'
                  f'loaded on cuda in: {self.start.elapsed_time(self.end):.2f}s, '
                  f'memory consumption: {inputs_memory:.2f} MB')
        
        
        module_collection_stats = np.zeros((len(module_collection), 2))
        
        for i, module in enumerate(module_collection):
            module_collection_stats[i] = self.track(module=module, inputs=inputs)
        
        if self.verbose:
            print(f'\nfastest module is {module_collection[module_collection_stats[:, 0].argmin()]} '
                    f'with {module_collection_stats[:, 0].min():.2f}s')
            print(f'slowest module is {module_collection[module_collection_stats[:, 0].argmax()]} '
                    f'with {module_collection_stats[:, 0].max():.2f}s\n')
            print(f'tiniest module is {module_collection[module_collection_stats[:, 1].argmin()]} '
                    f'with {module_collection_stats[:, 1].min():.2f}MB')
            print(f'largest module is {module_collection[module_collection_stats[:, 1].argmax()]} '
                    f'with {module_collection_stats[:, 1].max():.2f}MB')
            
        return module_collection_stats
            

In [45]:
tracker = Tracker(device)
inputs = torch.randn((1, 64, 224, 224))

module_collection = [nn.Conv2d(kernel_size=1, in_channels=64, out_channels=512), 
                      nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512), 
                      nn.Conv2d(kernel_size=5, in_channels=64, out_channels=512),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=2),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=4),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=8),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=16),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=64),
                     ]
module_collection_stats = tracker.track_module_collection(module_collection=module_collection, inputs=inputs)

inputs: shape: torch.Size([1, 64, 224, 224]),loaded on cuda in: 3.81s, memory consumption: 12.85 MB
 Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)): run takes 0.90 +- 0.03s, loaded on cuda in 0.5769919753074646s, 
    module size 0.13 MB, outputs size 102.89 MB
 Conv2d(64, 512, kernel_size=(3, 3), stride=(1, 1)): run takes 2.40 +- 0.17s, loaded on cuda in 0.48287999629974365s, 
    module size 1.18 MB, outputs size 102.12 MB
 Conv2d(64, 512, kernel_size=(5, 5), stride=(1, 1)): run takes 6.08 +- 0.07s, loaded on cuda in 0.8868160247802734s, 
    module size 3.28 MB, outputs size 102.40 MB
 Conv2d(64, 512, kernel_size=(7, 7), stride=(1, 1)): run takes 11.34 +- 0.13s, loaded on cuda in 1.5015360116958618s, 
    module size 6.42 MB, outputs size 103.75 MB
 Conv2d(64, 512, kernel_size=(7, 7), stride=(1, 1), groups=2): run takes 5.90 +- 0.05s, loaded on cuda in 0.8563519716262817s, 
    module size 3.21 MB, outputs size 100.54 MB
 Conv2d(64, 512, kernel_size=(7, 7), stride=(1, 1), groups

In [46]:
warmup_tracker = Tracker(device, warmup=10)
inputs = torch.randn((1, 64, 224, 224))

module_collection = [nn.Conv2d(kernel_size=1, in_channels=64, out_channels=512), 
                      nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512), 
                      nn.Conv2d(kernel_size=5, in_channels=64, out_channels=512),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=2),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=4),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=8),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=16),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=64),
                     ]
warmup_module_collection_stats = warmup_tracker.track_module_collection(module_collection=module_collection, inputs=inputs)

inputs: shape: torch.Size([1, 64, 224, 224]),loaded on cuda in: 4.15s, memory consumption: 12.85 MB
 Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)): run takes 0.81 +- 0.27s, loaded on cuda in 0.6057279706001282s, 
    module size 0.13 MB, outputs size 92.60 MB
 Conv2d(64, 512, kernel_size=(3, 3), stride=(1, 1)): run takes 2.21 +- 0.74s, loaded on cuda in 0.4540799856185913s, 
    module size 1.18 MB, outputs size 91.90 MB
 Conv2d(64, 512, kernel_size=(5, 5), stride=(1, 1)): run takes 5.50 +- 1.83s, loaded on cuda in 0.8712000250816345s, 
    module size 3.28 MB, outputs size 92.16 MB
 Conv2d(64, 512, kernel_size=(7, 7), stride=(1, 1)): run takes 10.22 +- 3.41s, loaded on cuda in 1.5380480289459229s, 
    module size 6.42 MB, outputs size 93.38 MB
 Conv2d(64, 512, kernel_size=(7, 7), stride=(1, 1), groups=2): run takes 5.30 +- 1.77s, loaded on cuda in 0.8895360231399536s, 
    module size 3.21 MB, outputs size 90.49 MB
 Conv2d(64, 512, kernel_size=(7, 7), stride=(1, 1), groups=4): r