In [None]:
# Load IPython's autoreload extension and set it to mode 2.
%load_ext autoreload
%autoreload 2

In [None]:
import math
import sys
import yaml
from copy import deepcopy
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from enot.latency import SearchSpaceLatencyContainer
from enot.latency.search_space_latency_calculator import SearchSpacePytorchCpuLatencyCalculator as CPUEstimator
from enot.latency.search_space_latency_calculator import SearchSpacePytorchCudaLatencyCalculator as CUDAEstimator
from enot.logging import prepare_log
from enot.models import SearchSpaceModel
from enot.models import SearchVariantsContainer
from thop import profile
from tqdm.auto import tqdm

from enot.latency import min_latency, max_latency, mean_latency, sample_latencies
from enot.latency.latency_calculator import MacCalculatorFvcore
from enot.latency import initialize_latency
from enot.visualization import plot_latency_heatmap

In [None]:
import os

# Print the PyTorch, CUDA and NumPy versions in use.
print(torch.__version__, torch.version.cuda, np.__version__)

# Request single-threaded execution: both thread-count environment variables and
# PyTorch's own thread count are set to 1.
os.environ.update({'OMP_NUM_THREADS': '1', 'MKL_NUM_THREADS': '1'})
torch.set_num_threads(1)

In [None]:
from models import yolo
from models.yolo import Model

# Replace the logger of models.yolo with one whose format is just the message text.
yolo.LOGGER = prepare_log(log_format='%(message)s')

In [None]:
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
def init_torch_cuda(detect_anomaly=False):
    """Set PyTorch's global CUDA/cuDNN flags for fast, non-deterministic execution.

    Enables TF32 for CUDA matmuls and cuDNN, turns on cuDNN benchmarking, disables
    cuDNN determinism and the global deterministic-algorithms switch, and sets
    autograd anomaly detection.

    Args:
        detect_anomaly: Passed to ``torch.autograd.set_detect_anomaly``. Defaults to
            ``False``. The notebook calls ``init_torch_cuda(detect_anomaly=False)``,
            which raised ``TypeError`` while this parameter did not exist.

    Returns:
        None.
    """
    # Imported locally: ``logging`` is not imported by the notebook's import cell, so the
    # warning branch below used to fail with NameError.
    import logging

    torch_backends = torch.backends

    if hasattr(torch_backends, 'cuda'):
        torch_backends.cuda.matmul.allow_tf32 = True

    if hasattr(torch_backends, 'cudnn'):
        cudnn = torch_backends.cudnn
        cudnn.benchmark = True
        cudnn.deterministic = False
        cudnn.allow_tf32 = True
        cudnn.enabled = True
    else:
        logging.warning('cudnn is not available')

    torch.use_deterministic_algorithms(False)
    torch.autograd.set_detect_anomaly(detect_anomaly)

In [None]:
# Size of the dummy model input and the device the model runs on.
image_size, batch_size = 640, 1
device = 'cpu'

# float16 is used only off-CPU; on the CPU the dtype is float32.
if device == 'cpu':
    dtype = torch.float32
else:
    dtype = torch.float16

In [None]:
# Run the nvidia-smi shell command and show its output in the notebook.
!nvidia-smi

In [None]:
# Touch the CUDA runtime only when a CUDA device is configured, and honour `device`
# instead of a hard-coded 'cuda:1' (which failed on CPU-only and single-GPU machines).
if 'cuda' in device:
    # Called with no arguments so the call does not depend on optional keyword parameters.
    init_torch_cuda()
    # torch.cuda.set_device needs an explicit index; for a bare 'cuda' string the
    # current CUDA device is kept.
    if torch.device(device).index is not None:
        torch.cuda.set_device(device)

## Select model to profile

In [None]:
# Model configuration to profile. Alternatives that can be selected instead:
#   'models/hub/yolov5l6.yaml'
#   'models/yolov5lss_v2.yaml'
#   'models/yolov5s.yaml'
model_config = 'models/yolov5sss_v1.yaml'

## Instantiate model

In [None]:
# Resolve the configuration file relative to the current working directory.
root_directory = Path('.').absolute()
model_config_file = root_directory / model_config

# Build the model on the target device, switch it to eval mode and fuse its layers.
model = Model(model_config_file).to(device)
model.eval()
model.fuse()

# Wrap the model in a search space only if it contains at least one SearchVariantsContainer.
search_space = (
    SearchSpaceModel(model)
    if any(isinstance(layer, SearchVariantsContainer) for layer in model.modules())
    else None
)

## Optionally save test search space checkpoint

In [None]:
# Export one test checkpoint per architecture index taken from the search space.
if search_space is None:
    # A plain model has no get_network_by_indexes(); this cell used to fail with
    # AttributeError in that case.
    print('Model has no search variants; skipping search space checkpoint export.')
else:
    # torch.save does not create missing directories.
    Path('weights').mkdir(parents=True, exist_ok=True)

    # NOTE(review): assumes 8 search blocks with at least 12 options each — confirm
    # against the selected model configuration.
    for i in range(12):
        # Deep-copy the extracted network before converting it, so the search space itself
        # keeps its device and dtype. Previously the CPU/float32 copy was created once
        # outside the loop and immediately overwritten, so it was never what got saved.
        model_save = deepcopy(search_space.get_network_by_indexes([i] * 8)).cpu().float()
        ckpt = {
            'model': model_save,
            'ema': model_save,
            'phase_name': 'tune',
        }

        torch.save(ckpt, f'weights/yolov5sss_{i}.pt')
        # Drop both references so the copy can be freed before the next iteration.
        del model_save, ckpt

In [None]:
# All-zeros input batch built from the configured batch size, image size, dtype and device.
model_input = torch.zeros(
    (batch_size, 3, image_size, image_size),
    dtype=dtype,
    device=device,
)
# model_input = model_input.to(memory_format=torch.channels_last)

# Cast/move the model to the same dtype and device; the trailing semicolon suppresses
# the cell output.
model.to(dtype=dtype, device=device);
# model = model.to(memory_format=torch.channels_last)

In [None]:
# Measure search space latency for square inputs from 704 down to 224 pixels in steps of 32.
# Sweep-specific names are used so that the global `image_size`, `batch_size` and
# `model_input` stay as configured for the cells below. The bare `raise` that used to
# end this cell (RuntimeError: no active exception) is removed so the notebook can run
# top to bottom.
latency_sweep_dir = Path('latency/yolov5lss_v2_cpu_bs1')
latency_sweep_dir.mkdir(parents=True, exist_ok=True)  # make sure the output directory exists

# Loop-invariant setup, done once.
model.to(dtype=dtype, device=device)
# model = model.to(memory_format=torch.channels_last)
estimator_class = CPUEstimator if device == 'cpu' else CUDAEstimator

for sweep_image_size in range(704, 223, -32):
    # Batch size is fixed to 1 for the sweep.
    sweep_input = torch.zeros(1, 3, sweep_image_size, sweep_image_size, dtype=dtype, device=device)
    # sweep_input = sweep_input.to(memory_format=torch.channels_last)

    estimator = estimator_class(search_space, warmup_iterations=1, run_iterations=1)
    with torch.no_grad():
        latency_container = estimator.compute((sweep_input, ))

    latency_container.save_to_file(f'latency/yolov5lss_v2_cpu_bs1/r{sweep_image_size}.pkl')

In [None]:
import glob
from enot.latency import min_latency, max_latency, mean_latency, sample_latencies

# Print min / mean / max latency for every container file found under
# latency/yolov5lss_v2_cpu_bs1, in sorted file-name order.
for lc_file in sorted(glob.glob('latency/yolov5lss_v2_cpu_bs1/r*.pkl')):
    latency_container = SearchSpaceLatencyContainer.load_from_file(lc_file)

    min_l = min_latency(latency_container)
    max_l = max_latency(latency_container)
    mean_l = mean_latency(latency_container)

    print(lc_file, f'{min_l:.2f} {mean_l:.2f} {max_l:.2f}', sep='\n')

In [None]:
# Build a latency container of type 'mmac.fvcore' for the search space and store it on disk.
latency_container = initialize_latency(
    'mmac.fvcore',
    search_space,
    (model_input,),
)
latency_container.save_to_file('latency/yolov5sss_v3_latency_flops.pkl')

In [None]:
# NOTE(review): the effect of normal_forward(True) is not visible in this notebook —
# confirm against the SearchSpaceModel documentation.
search_space.normal_forward(True)

# Compute the search space latency with the CPU estimator (10 warm-up and 80 measured
# iterations), with gradient tracking disabled.
estimator = CPUEstimator(
    search_space,
    warmup_iterations=10,
    run_iterations=80,
)
with torch.no_grad():
    latency_container = estimator.compute((model_input,))

latency_container.save_to_file('latency/yolov5sss_v3_latency_cpu.pkl')

In [None]:
# Load the CPU latency container from disk into `latency_container`.
latency_container = SearchSpaceLatencyContainer.load_from_file('latency/yolov5sss_v3_latency_cpu.pkl')

In [None]:
# Load the container whose statistics are printed and plotted in this cell.
latency_container = SearchSpaceLatencyContainer.load_from_file('latency/yolov5sss_v3_latency_flops.pkl')

min_l = min_latency(latency_container)
max_l = max_latency(latency_container)
mean_l = mean_latency(latency_container)
# Baseline: the constant part plus the first entry of every operation's latency list.
baseline_latency = latency_container.constant_latency + sum(
    op_latencies[0] for op_latencies in latency_container.operations_latencies
)

print(f'baseline={baseline_latency:.2f}, min={min_l:.2f}, mean={mean_l:.2f} max={max_l:.2f}')

latencies = np.array(sample_latencies(latency_container, n=200000))

# Histogram of the sampled latencies with dashed reference lines.
fig, ax = plt.subplots(figsize=(16, 5))
ax.set_xlabel('Latency, ms')
ax.set_ylabel('# cases')
ax.hist(latencies, bins=100, color='k', edgecolor='k', alpha=0.2)

for position, colour, label in (
    (min_l, 'r', 'Min latency'),
    (max_l, 'r', 'Max latency'),
    (latencies.mean(), 'g', 'Average latency'),  # mean of the sample, not mean_l
    (baseline_latency, 'k', 'Baseline latency'),
):
    ax.axvline(position, color=colour, linestyle='dashed', linewidth=2, label=label)

ax.legend()
plt.show()

plot_latency_heatmap(latency_container, annotate_values=True, figsize=(16, 5))
plt.show()

In [None]:
# Collect measured ("real") and container-predicted ("pred") latencies for 100 randomly
# sampled architectures.
# NOTE(review): `get_operation_latency` is not imported anywhere in the visible notebook —
# confirm where it should come from before running this cell.
real, pred = [], []

for _ in tqdm(range(100)):

    # One random option index in [0, 8) for each of 8 positions.
    sampled_indexes = np.random.randint(8, size=8)

    # Prediction: constant part plus the selected entry of each operation's latency list.
    predicted = latency_container.constant_latency + sum(
        op_latencies[index]
        for op_latencies, index in zip(latency_container.operations_latencies, sampled_indexes)
    )

    sampled_network = search_space.get_network_by_indexes(sampled_indexes)
    measured = get_operation_latency(
        sampled_network,
        (model_input,),
        operation_kwargs={},
        n_iterations=5,
        warmup_iterations=3,
        min_iterations=5,
        max_compute_time=20.0,
    )

    real.append(measured)
    pred.append(predicted)

In [None]:
# Measured values scaled by 1000 and divided by the batch size.
x = np.array(real) * 1000 / batch_size
# NOTE(review): the (1.93 - 1.39) offset is an unexplained constant — confirm its origin.
y = np.array(pred) - (1.93 - 1.39)

plt.scatter(x, y)

# y = x reference line spanning the joint range of both arrays.
z = np.linspace(min(x.min(), y.min()), max(x.max(), y.max()), 2)
plt.plot(z, z)
plt.show()

In [None]:
# Draw the latency heatmap of the current container with annotated values; the trailing
# semicolon suppresses the cell's return value.
plot_latency_heatmap(latency_container, annotate_values=True, figsize=(16, 8));

In [None]:
# Note: this rebinds the name `profile` (imported from thop in the import cell) to
# torch.profiler.profile.
from torch.profiler import ProfilerActivity, profile, record_function

# Record CUDA activity and synchronize only when the model runs on an available CUDA
# device. The previous unconditional torch.cuda.synchronize() fails on machines without
# CUDA even though the notebook is configured with device == 'cpu'.
profile_cuda = 'cuda' in device and torch.cuda.is_available()
profiler_activities = [ProfilerActivity.CPU]
if profile_cuda:
    profiler_activities.append(ProfilerActivity.CUDA)

with torch.no_grad():

    # Warm up the model.
    for _ in range(3):
        model(model_input)

    # Warm up the profiler; each of these traces is overwritten by the next one.
    for _ in range(3):
        with profile(activities=profiler_activities, record_shapes=True) as prof:
            with record_function("model_inference"):
                model(model_input)

    # Final trace, kept in `prof` for the cells below.
    with profile(activities=profiler_activities, record_shapes=True) as prof:
        with record_function("model_inference"):
            model(model_input)
            if profile_cuda:
                # Wait for queued CUDA work so it is captured inside the profiled region.
                torch.cuda.synchronize()

## Model profiling for selecting entities to replace with search blocks

In [None]:
# Sort by the self-time column that matches the configured device; sorting by CUDA time
# gives no useful order when the model ran on the CPU.
profile_sort_key = "self_cuda_time_total" if 'cuda' in device else "self_cpu_time_total"
print(prof.key_averages().table(sort_by=profile_sort_key, row_limit=100, max_src_column_width=150))

In [None]:
from enot.logging import prepare_log
from models import yolo

# Rebuild the model from the selected configuration file.
root_directory = Path('.').absolute()
model_config_file = root_directory / model_config

model = Model(model_config_file)
model.eval()
model.fuse()
model.to(dtype=dtype, device=device)

# Wrap the model in a search space only if it contains at least one SearchVariantsContainer.
search_space = (
    SearchSpaceModel(model)
    if any(isinstance(layer, SearchVariantsContainer) for layer in model.modules())
    else None
)

# Give models.yolo a logger whose format is just the message text.
yolo.LOGGER = prepare_log(log_format='%(message)s')

# Forward pass with the model's profile flag enabled, without gradient tracking.
with torch.no_grad():
    model(model_input, profile=True)

In [None]:
# Compute the fvcore MAC figure for the plain yolov5s configuration.
# A dedicated name is used so the global `model` (the one `search_space` was built from)
# is not replaced by a different architecture.
baseline_model = Model('models/yolov5s.yaml').float()
baseline_model.eval()
baseline_model.fuse()

# The calculator is fed a float32 CPU copy of the input.
baseline_latency = MacCalculatorFvcore().calculate(baseline_model, (model_input.float().cpu(), ))
print(baseline_latency)

In [None]:
# Write the trace held in `prof` to trace3.json via export_chrome_trace.
prof.export_chrome_trace("trace3.json")