In [None]:
# TODO
# add no_grad
# compare resnets
# look into memory_stats 
# in_channels / hw 
# loggers

In [1]:
%load_ext autoreload
%env CUDA_VISIBLE_DEVICES = 0

env: CUDA_VISIBLE_DEVICES=0


In [2]:
import numpy as np
import gc
import time

import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 

from models import Resnet, ResnetD, ResNext

%autoreload 2

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [18]:
curr_memory = torch.cuda.memory_stats(device=device)['active_bytes.all.current'] * 1e-6
print(f'initial memory: {curr_memory}')

big_tensor = torch.randn((1000, 1000)).to(device)
with_tensor_memory = torch.cuda.memory_stats(device=device)['active_bytes.all.current'] * 1e-6
print(f'memory with tensor: {with_tensor_memory}') 

del big_tensor 
memory_with_deleted_tensor = torch.cuda.memory_stats(device=device)['active_bytes.all.current'] * 1e-6
print(f'memory after tensor deletion: {memory_with_deleted_tensor}') 

initial memory: 0.262144
memory with tensor: 4.2623999999999995
memory after tensor deletion: 0.262144


In [12]:
class Tracker:
    def __init__(self, device, repeats=100, warmup=0, verbose=True):
        self.device = device 
        self.repeats = repeats
        self.warmup = warmup  
        self.verbose = verbose
        self.start = torch.cuda.Event(enable_timing=True)
        self.end = torch.cuda.Event(enable_timing=True)
        
        
    def get_current_memory(self):
        return torch.cuda.memory_stats(device=self.device)['active_bytes.all.current'] * 1e-6
    
    
    def track(self, module, inputs): # add no_grad ?
        
        current_memory = self.get_current_memory()
        
        self.start.record()
        module.to(self.device)
        self.end.record()
        
        torch.cuda.synchronize()
        loading_time = self.start.elapsed_time(self.end)
        
        new_current_memory = self.get_current_memory()
        module_memory_consumption = new_current_memory - current_memory
        
        timings = np.zeros(self.repeats)
        memory_on_runs = np.zeros(self.repeats)
        
        for i in range(self.warmup):
            outputs = module(inputs)
        
        for i in range(self.warmup, self.repeats):

            self.start.record()
            outputs = module(inputs)
            self.end.record()

            torch.cuda.synchronize()

            timings[i] = self.start.elapsed_time(self.end)
            memory_on_runs[i] = self.get_current_memory() - current_memory
            
            del outputs
            
        if self.verbose:
            print(f' {module}:\n        run takes {timings.mean():.2f} +- {timings.std():.2f}s, loaded on cuda in {loading_time}s, \n'
                  f'        module size {module_memory_consumption:.2f} MB, outputs size {memory_on_runs.mean():.2f} MB') 
            
        del module
        del inputs
            
        return timings.mean(), memory_on_runs.mean()
    
    
    def track_module_collection(self, module_collection, inputs):
        gc.collect()
        torch.cuda.empty_cache()
        
        current_memory = self.get_current_memory()
        
        self.start.record()
        inputs = inputs.to(self.device)
        self.end.record()
        
        torch.cuda.synchronize()
        
        inputs_memory = self.get_current_memory() - current_memory
        
        if self.verbose:
            print(f'inputs: shape: {inputs.shape},'
                  f'loaded on cuda in: {self.start.elapsed_time(self.end):.2f}s, '
                  f'memory consumption: {inputs_memory:.2f} MB')
        
        
        module_collection_stats = np.zeros((len(module_collection), 2))
        
        for i, module in enumerate(module_collection):
            module_collection_stats[i] = self.track(module=module, inputs=inputs)
        
        if self.verbose:
            print(f'\nfastest module is {module_collection[module_collection_stats[:, 0].argmin()]} '
                    f'with {module_collection_stats[:, 0].min():.2f}s')
            print(f'slowest module is {module_collection[module_collection_stats[:, 0].argmax()]} '
                    f'with {module_collection_stats[:, 0].max():.2f}s\n')
            print(f'tiniest module is {module_collection[module_collection_stats[:, 1].argmin()]} '
                    f'with {module_collection_stats[:, 1].min():.2f}MB')
            print(f'largest module is {module_collection[module_collection_stats[:, 1].argmax()]} '
                    f'with {module_collection_stats[:, 1].max():.2f}MB')
            
        return module_collection_stats
            

In [54]:
tracker = Tracker(device)
inputs = torch.randn((1, 64, 224, 224))

module_collection = [nn.Conv2d(kernel_size=1, in_channels=64, out_channels=512), 
                      nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512), 
                      nn.Sequential(*[nn.Conv2d(kernel_size=1, in_channels=64, out_channels=16), 
                                     nn.Conv2d(kernel_size=3, in_channels=16, out_channels=512),
                                    ]),
                      nn.Conv2d(kernel_size=5, in_channels=64, out_channels=512),
                      nn.Sequential(*[nn.Conv2d(kernel_size=1, in_channels=64, out_channels=16), 
                                     nn.Conv2d(kernel_size=5, in_channels=16, out_channels=512),
                                    ]),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512),
                      nn.Sequential(*[nn.Conv2d(kernel_size=1, in_channels=64, out_channels=16), 
                                     nn.Conv2d(kernel_size=7, in_channels=16, out_channels=512),
                                    ]),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=2),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=4),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=8),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=16),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=64),
                     ]
module_collection_stats = tracker.track_module_collection(module_collection=module_collection, inputs=inputs)

inputs: shape: torch.Size([1, 64, 224, 224]),loaded on cuda in: 3.60s, memory consumption: 12.85 MB
 Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)):
        run takes 0.90 +- 0.03s, loaded on cuda in 0.5762879848480225s, 
        module size 0.13 MB, outputs size 102.89 MB
 Conv2d(64, 512, kernel_size=(3, 3), stride=(1, 1)):
        run takes 2.45 +- 0.15s, loaded on cuda in 0.47231999039649963s, 
        module size 1.18 MB, outputs size 102.12 MB
 Sequential(
  (0): Conv2d(64, 16, kernel_size=(1, 1), stride=(1, 1))
  (1): Conv2d(16, 512, kernel_size=(3, 3), stride=(1, 1))
):
        run takes 1.10 +- 0.01s, loaded on cuda in 0.5945280194282532s, 
        module size 0.30 MB, outputs size 104.45 MB
 Conv2d(64, 512, kernel_size=(5, 5), stride=(1, 1)):
        run takes 6.08 +- 0.08s, loaded on cuda in 0.8852159976959229s, 
        module size 3.28 MB, outputs size 102.40 MB
 Sequential(
  (0): Conv2d(64, 16, kernel_size=(1, 1), stride=(1, 1))
  (1): Conv2d(16, 512, kernel_size=(5, 

In [56]:
warmup_tracker = Tracker(device, warmup=10)
inputs = torch.randn((1, 64, 224, 224))

module_collection = [nn.Conv2d(kernel_size=1, in_channels=64, out_channels=512), 
                      nn.Conv2d(kernel_size=3, in_channels=64, out_channels=512), 
                      nn.Sequential(*[nn.Conv2d(kernel_size=1, in_channels=64, out_channels=16), 
                                     nn.Conv2d(kernel_size=3, in_channels=16, out_channels=512),
                                    ]),
                      nn.Conv2d(kernel_size=5, in_channels=64, out_channels=512),
                      nn.Sequential(*[nn.Conv2d(kernel_size=1, in_channels=64, out_channels=16), 
                                     nn.Conv2d(kernel_size=5, in_channels=16, out_channels=512),
                                    ]),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512),
                      nn.Sequential(*[nn.Conv2d(kernel_size=1, in_channels=64, out_channels=16), 
                                     nn.Conv2d(kernel_size=7, in_channels=16, out_channels=512),
                                    ]),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=2),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=4),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=8),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=16),
                      nn.Conv2d(kernel_size=7, in_channels=64, out_channels=512, groups=64),
                     ]

warmup_module_collection_stats = warmup_tracker.track_module_collection(module_collection=module_collection, inputs=inputs)

inputs: shape: torch.Size([1, 64, 224, 224]),loaded on cuda in: 3.87s, memory consumption: 12.85 MB
 Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)):
        run takes 0.81 +- 0.27s, loaded on cuda in 0.3619840145111084s, 
        module size 0.13 MB, outputs size 92.60 MB
 Conv2d(64, 512, kernel_size=(3, 3), stride=(1, 1)):
        run takes 2.06 +- 0.69s, loaded on cuda in 0.47407999634742737s, 
        module size 1.18 MB, outputs size 91.90 MB
 Sequential(
  (0): Conv2d(64, 16, kernel_size=(1, 1), stride=(1, 1))
  (1): Conv2d(16, 512, kernel_size=(3, 3), stride=(1, 1))
):
        run takes 0.99 +- 0.33s, loaded on cuda in 0.5836799740791321s, 
        module size 0.30 MB, outputs size 94.00 MB
 Conv2d(64, 512, kernel_size=(5, 5), stride=(1, 1)):
        run takes 5.52 +- 1.84s, loaded on cuda in 0.8649920225143433s, 
        module size 3.28 MB, outputs size 92.16 MB
 Sequential(
  (0): Conv2d(64, 16, kernel_size=(1, 1), stride=(1, 1))
  (1): Conv2d(16, 512, kernel_size=(5, 5), 

In [15]:
for batch_size in [2**i for i in range(8)]:
    print(f'\n ---batch_size {batch_size}---\n')
    batch_tracker = Tracker(device, warmup=10)
    inputs = torch.randn((batch_size, 3, 64, 64))

    module_collection = [nn.Conv2d(kernel_size=1, in_channels=3, out_channels=256), 
                          nn.Conv2d(kernel_size=3, in_channels=3, out_channels=256), 
                          nn.Conv2d(kernel_size=5, in_channels=3, out_channels=256),
                          nn.Conv2d(kernel_size=7, in_channels=3, out_channels=256)
                         ]
    module_collection_stats = batch_tracker.track_module_collection(module_collection=module_collection, inputs=inputs)


 ---batch_size 1---

inputs: shape: torch.Size([1, 3, 64, 64]),loaded on cuda in: 0.24s, memory consumption: 0.05 MB
 Conv2d(3, 256, kernel_size=(1, 1), stride=(1, 1)):
        run takes 0.19 +- 0.09s, loaded on cuda in 0.5320640206336975s, 
        module size 0.00 MB, outputs size 3.78 MB
 Conv2d(3, 256, kernel_size=(3, 3), stride=(1, 1)):
        run takes 0.16 +- 0.05s, loaded on cuda in 0.4999360144138336s, 
        module size 0.03 MB, outputs size 3.57 MB
 Conv2d(3, 256, kernel_size=(5, 5), stride=(1, 1)):
        run takes 0.15 +- 0.05s, loaded on cuda in 0.14950400590896606s, 
        module size 0.08 MB, outputs size 3.39 MB
 Conv2d(3, 256, kernel_size=(7, 7), stride=(1, 1)):
        run takes 0.14 +- 0.05s, loaded on cuda in 0.3608640134334564s, 
        module size 0.15 MB, outputs size 3.24 MB

fastest module is Conv2d(3, 256, kernel_size=(7, 7), stride=(1, 1)) with 0.14s
slowest module is Conv2d(3, 256, kernel_size=(1, 1), stride=(1, 1)) with 0.19s

tiniest module is Con