In [None]:
# Physical GPU indices (as numbered on the host) that this notebook may use.
SELECTED_GPUS = [6, 7]

import os

# Expose only the selected GPUs to CUDA. This runs before TensorFlow is imported below;
# the strategy setup further down then addresses the visible devices as
# '/gpu:0' .. '/gpu:<len(SELECTED_GPUS) - 1>'.
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(gpu_number) for gpu_number in SELECTED_GPUS])

import tensorflow as tf 

"""
https://github.com/tensorflow/tensorflow/issues/34415#issuecomment-895336269
https://stackoverflow.com/questions/59616436/how-to-reset-initialization-in-tensorflow-2
"""
# Use the same upper bound for both TensorFlow CPU thread pools (intra-op and inter-op).
MAX_CPU_THREADS = 16
tf.config.threading.set_intra_op_parallelism_threads(MAX_CPU_THREADS)
tf.config.threading.set_inter_op_parallelism_threads(MAX_CPU_THREADS)

tf.get_logger().setLevel('INFO')

# Stop right away when TensorFlow does not see any GPU.
assert len(tf.config.list_physical_devices('GPU')) > 0

# Turn on memory growth for every visible GPU before the strategy below is created.
GPUS = tf.config.experimental.list_physical_devices('GPU')
for gpu in GPUS:
    tf.config.experimental.set_memory_growth(gpu, True)

# Mirrored strategy with NCCL all-reduce over one '/gpu:<i>' device per entry of SELECTED_GPUS.
# NOTE(review): nothing else in the visible cells enters DISTRIBUTED_STRATEGY.scope();
# only NUM_GPUS is derived from it here - confirm whether the strategy is still needed.
DISTRIBUTED_STRATEGY = tf.distribute.MirroredStrategy(
    cross_device_ops=tf.distribute.NcclAllReduce(),
    devices=['/gpu:%d' % index for index in range(len(SELECTED_GPUS))]
)

NUM_GPUS = DISTRIBUTED_STRATEGY.num_replicas_in_sync

print('Number of devices: {}'.format(NUM_GPUS))

import effnetv2_model
import json
import matplotlib.pyplot as plt
import numpy as np
import random
import seaborn as sns;
import sys
import time

from matplotlib.colors import ListedColormap
plt.rcParams.update({'font.size': 22})
from pprint import pprint
from tensorflow.python.framework.convert_to_constants import  convert_variables_to_constants_v2_as_graph
from vit_keras import vit

# Version tag that is embedded in the batch-table file names and the plot directory/file names.
VERSION = 5

# Output locations (relative to the working directory of the notebook).
BATCH_TABLE_DIR = 'batch_table'
INFERENCE_PLOT_DIR = os.path.join('inference_plot', str(VERSION))
# exist_ok=True replaces the separate "exists?" check: it is free of the check-then-create
# race and keeps this cell safe to re-run.
os.makedirs(BATCH_TABLE_DIR, exist_ok=True)
os.makedirs(INFERENCE_PLOT_DIR, exist_ok=True)

# Styling shared by all figures.
PLOT_LINESTYLES = ['-', '--', '-.', ':']
PLOT_COLORS = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
PLOT_MARKERS = ['o', 'v', 'P', 'X', 'D', '^', 's']
FIGURE_SIZE = (15, 5)

In [None]:
def get_model(config):
    """
    Build the Keras model selected by config['model_name'].

    'vit-b32' / 'vit-l32' are created through vit_keras, 'vgg-16' / 'vgg-19' through
    tf.keras.applications; every other name is handed to effnetv2_model.get_model and
    wrapped in a functional tf.keras.Model with a fixed square RGB input.

    :param config: experiment configuration; reads 'model_name' and (except for VGG) 'image_size'.
    :return: the constructed model.
    """
    model_name = config['model_name']

    if model_name in ('vit-b32', 'vit-l32'):
        build_vit = vit.vit_b32 if model_name == 'vit-b32' else vit.vit_l32
        return build_vit(
            image_size=config['image_size'],
            activation='sigmoid',
            pretrained=True,
            include_top=True,
            pretrained_top=True
        )

    if model_name in ('vgg-16', 'vgg-19'):
        if model_name == 'vgg-16':
            build_vgg = tf.keras.applications.vgg16.VGG16
        else:
            build_vgg = tf.keras.applications.vgg19.VGG19
        return build_vgg(
            include_top=True,
            weights='imagenet',
            classes=1000,
            classifier_activation='softmax'
        )

    # Everything else is treated as an EfficientNet(V2) name, see
    # https://github.com/google/automl/blob/master/efficientnetv2/effnetv2_model.py#L578
    side = config['image_size']
    image_input = tf.keras.Input(shape=(side, side, 3))
    backbone = effnetv2_model.get_model(model_name)
    # NOTE(review): the backbone is traced with training=True although the model is only
    # used for inference timing - confirm this is intended.
    backbone_output = backbone.call(image_input, training=True)
    return tf.keras.Model(inputs=[image_input], outputs=backbone_output)

In [None]:
def get_natural_bottlenecks(model, config, compressive_only=True):
    """
    Collect the layers whose output has fewer elements than the input image.

    Only block layers are considered: names starting with 'blocks_' for efficientnet
    models, 'block' for vgg models and 'Transformer/encoderblock_' for vit models.

    :param model: object exposing ``layers``; each layer needs ``name`` and ``output_shape``.
    :param config: experiment configuration; reads 'model_name' and 'image_size'.
    :param compressive_only: when True, a layer is kept only if its ratio is strictly
        smaller than the ratio of the most recently kept layer (starting from 1.0).
    :return: list of {'layer_name', 'compression'} dicts in layer order, where
        compression = output elements / (image_size ** 2 * 3).
    """
    model_name = config['model_name']
    input_elements = (config['image_size'] ** 2) * 3
    last_kept_ratio = 1.0
    bottlenecks = []

    for layer in model.layers:
        is_conv_block = (
            (model_name.startswith('efficientnet') and layer.name.startswith('blocks_'))
            or (model_name.startswith('vgg') and layer.name.startswith('block'))
        )
        if is_conv_block:
            # (batch, d1, d2, d3) output
            shape = layer.output_shape
            output_elements = shape[1] * shape[2] * shape[3]
        elif model_name.startswith('vit') and layer.name.startswith('Transformer/encoderblock_'):
            # the first entry of the encoder block's output shapes is used
            shape = layer.output_shape[0]
            output_elements = shape[1] * shape[2]
        else:
            continue

        if not output_elements < input_elements:
            continue

        ratio = output_elements / input_elements
        if compressive_only and not ratio < last_kept_ratio:
            continue

        bottlenecks.append({
            'layer_name': layer.name,
            'compression': ratio,
        })
        last_kept_ratio = ratio

    return bottlenecks

In [None]:
def get_inference_time(model, batch_size, repetitions=10, input_shape=None):
    """
    Time one forward pass of ``model`` on an all-ones batch, averaged over ``repetitions``.

    Based on https://github.com/google/automl/blob/master/efficientnetv2/infer.py#L89

    :param model: callable that accepts the input tensor.
    :param batch_size: size of the leading (batch) dimension of the input.
    :param repetitions: number of warm-up calls and, separately, of timed calls.
    :param input_shape: shape whose first entry is replaced by ``batch_size``; when None,
        (batch_size, image_size, image_size, 3) is built from the module-level ``config``
        (this function does not receive the configuration as an argument).
    :return: average wall-clock duration of one call, multiplied by 100.
    """
    if input_shape is not None:
        dims = list(input_shape)
        dims[0] = batch_size
        input_shape = tuple(dims)
    else:
        side = config['image_size']
        input_shape = (batch_size, side, side, 3)
    # the original code uses dtype=tf.float16, which would be 2 bytes
    imgs = tf.ones(input_shape)

    # warm-up calls, not timed
    for _ in range(repetitions):
        model(imgs)

    started_at = time.perf_counter()
    for _ in range(repetitions):
        model(imgs)
    elapsed = time.perf_counter() - started_at

    # NOTE(review): perf_counter differences are seconds, so the factor 100 gives
    # centiseconds although the result is treated as milliseconds. create_inference_plots
    # applies the same factor to the transfer time, so change both together (and rebuild
    # stored batch tables) if this is corrected.
    return elapsed / repetitions * 100

In [None]:
def get_next_layer_index(model, layer_name):
    """
    Return the index that follows the first layer named ``layer_name``.

    :param model: object exposing ``layers``; each layer needs a ``name``.
    :param layer_name: name to look for.
    :return: position of the first match plus one, or None when no layer has that name.
    """
    following_indices = (
        position + 1
        for position, layer in enumerate(model.layers)
        if layer.name == layer_name
    )
    return next(following_indices, None)

def get_tail_model(model, layer_index):
    """
    Build a model that runs ``model.layers[layer_index:]`` one after another on a fresh input.

    Based on
    https://stackoverflow.com/questions/52800025/keras-give-input-to-intermediate-layer-and-get-final-output

    The new input takes its shape (without the leading dimension) from the input shape of
    the first tail layer. Whenever the running value is a tuple, only its first element is
    passed on to the next layer.

    :param model: source model exposing ``layers``.
    :param layer_index: index of the first layer of the tail.
    :return: tf.keras model mapping the new input to the output of the last layer.
    """
    first_tail_layer = model.layers[layer_index]
    feature_shape = tuple(first_tail_layer.get_input_shape_at(0))[1:]
    tail_input = tf.keras.Input(shape=feature_shape)

    tensor = tail_input
    for layer in model.layers[layer_index:]:
        tensor = layer(tensor[0] if isinstance(tensor, tuple) else tensor)

    return tf.keras.models.Model(tail_input, tensor)

def get_batch_table_path(config):
    """
    Return the JSON path of the batch table for ``config`` inside BATCH_TABLE_DIR.

    The file name contains the model name and VERSION. If the weak processor string
    contains 'CPU', the processor (with '/' removed) is part of the name as well;
    otherwise the shorter legacy name is used.

    :param config: experiment configuration; reads 'model_name' and 'processors' -> 'weak'.
    :return: path string.
    """
    weak_processor = config['processors']['weak']
    if 'CPU' not in weak_processor:
        # legacy name
        file_name = '%s_v%d.json' % (config['model_name'], VERSION)
    else:
        file_name = '%s_%s_v%d.json' % (
            config['model_name'],
            weak_processor.replace('/', ''),
            VERSION
        )
    return os.path.join(BATCH_TABLE_DIR, file_name)

def save_batch_table(batch_table, config):
    """
    Write ``batch_table`` as JSON to the path given by get_batch_table_path(config).

    The table is serialized before any file is opened and is written to a temporary
    sibling file ('<path>.tmp') that then replaces the target. A serialization error or
    an interrupted write therefore cannot leave a truncated table behind, which matters
    because run_experiment loads whatever file exists at that path.

    :param batch_table: JSON-serializable table (non-string keys are stored as strings).
    :param config: experiment configuration, forwarded to get_batch_table_path.
    :raises TypeError: if ``batch_table`` contains values json cannot serialize.
    """
    batch_table_path = get_batch_table_path(config)
    serialized_table = json.dumps(batch_table)
    temporary_path = batch_table_path + '.tmp'
    with open(temporary_path, 'w') as batch_table_file:
        batch_table_file.write(serialized_table)
    # swap the finished file into place in a single step
    os.replace(temporary_path, batch_table_path)

def load_batch_table(config):
    """
    Read the JSON batch table stored at get_batch_table_path(config).

    :param config: experiment configuration, forwarded to get_batch_table_path.
    :return: the decoded table (object keys are strings, as JSON stores them).
    """
    with open(get_batch_table_path(config), 'r') as stored_table:
        return json.load(stored_table)

def create_batch_table(config):
    """
    Measure the timings for every batch size in config['batch_sizes'].

    For each batch size the resulting entry contains:
      * 'whole_device': the full model timed on the weak processor,
      * 'whole_edge': the full model timed on the strong processor,
      * one entry per natural bottleneck with its 'compression', the 'head' timing
        (input up to the bottleneck, weak processor) and the 'tail' timing
        (layers after the bottleneck, strong processor).

    The table is saved after every bottleneck measurement.

    :param config: experiment configuration; reads 'batch_sizes', 'model_name' and
        'processors' ('weak' / 'strong' device strings) and is forwarded to the helpers.
    :return: dict mapping batch size -> entry as described above.
    """
    model = get_model(config)

    natural_bottlenecks = get_natural_bottlenecks(model, config)
    # two whole-model measurements plus one step per bottleneck
    total_steps = len(natural_bottlenecks) + 2
    weak_processor = config['processors']['weak']
    strong_processor = config['processors']['strong']

    batch_table = {}

    for batch_size in config['batch_sizes']:
        print('Batch Size:', batch_size)
        batch_table[batch_size] = {}

        with tf.device(weak_processor):
            sys.stdout.write('\r%d/%d' % (1, total_steps))
            sys.stdout.flush()

            batch_table[batch_size]['whole_device'] = get_inference_time(model, batch_size)
        with tf.device(strong_processor):
            sys.stdout.write('\r%d/%d' % (2, total_steps))
            sys.stdout.flush()

            # This step used to report progress without measuring anything, so the
            # 'whole_edge' (full offloading) entry that create_inference_plots handles
            # was never produced.
            batch_table[batch_size]['whole_edge'] = get_inference_time(model, batch_size)

        for i, natural_bottleneck in enumerate(natural_bottlenecks):
            sys.stdout.write('\r%d/%d' % (i + 3, total_steps))
            sys.stdout.flush()

            layer_name = natural_bottleneck['layer_name']
            if config['model_name'].startswith('efficientnet') or config['model_name'].startswith('vit'):
                # zero-pad the trailing block number, e.g. 'blocks_3' -> 'blocks_03'
                pretty_layer_name = '%s_%02d' % (
                    layer_name.split('_')[0],
                    int(layer_name.split('_')[-1])
                )
            else:
                pretty_layer_name = layer_name

            head_model = tf.keras.models.Model(
                inputs=model.get_layer(index=0).input,
                outputs=model.get_layer(layer_name).output
            )
            with tf.device(weak_processor):
                batch_table[batch_size][pretty_layer_name] = {
                    'compression': natural_bottleneck['compression'],
                    'head': get_inference_time(head_model, batch_size),
                }

            next_layer_index = get_next_layer_index(model, layer_name)
            tail_model = get_tail_model(model, next_layer_index)
            with tf.device(strong_processor):
                batch_table[batch_size][pretty_layer_name]['tail'] = get_inference_time(
                    tail_model,
                    batch_size,
                    input_shape=model.layers[next_layer_index].get_input_shape_at(0)
                )

            # checkpoint after every measurement
            save_batch_table(batch_table, config)
        print()  # newline
    return batch_table

In [None]:
def get_load(compression, batch_size, config, full_offloading):
    """
    Size of the payload transferred for one batch.

    :param compression: fraction of the input element count that is transferred.
    :param batch_size: number of images in the batch.
    :param config: experiment configuration; reads 'image_size'.
    :param full_offloading: when True each value counts once; otherwise each value
        counts four times (float32).
    :return: batch_size * image_size ** 2 * 3 * compression, times 4 unless full_offloading.
    """
    channels = 3
    bytes_per_value = 1 if full_offloading else 4  # float32 when not fully offloading
    input_elements = batch_size * (config['image_size'] ** 2) * channels
    return input_elements * compression * bytes_per_value

def fix_legend_name(name):
    """
    Turn a split point key into the label shown in plot legends.

    :param name: 'whole_device', 'whole_edge' or a layer name.
    :return: 'No Offloading', 'Full Offloading', or 'Split at' followed by a newline and
        the part of the layer name after its last '/'.
    """
    whole_model_labels = {
        'whole_device': 'No Offloading',
        'whole_edge': 'Full Offloading',
    }
    if name in whole_model_labels:
        return whole_model_labels[name]
    return 'Split at\n%s' % name.rsplit('/', 1)[-1]

def create_inference_plots(batch_table, config, create_individual=True):
    """
    Render and save the inference-time plots for one batch table.

    The following PNG files are written into INFERENCE_PLOT_DIR:
      * (only if ``create_individual``) one line plot per batch size showing the estimated
        end-to-end time of every split point over the bandwidth range,
      * one colour map showing, per batch size and bandwidth, which split point is fastest,
      * one heatmap per split point with its gap to the fastest split point, in percent
        of its own time and clipped from above at the mean gap.

    :param batch_table: mapping batch size -> {split point -> timing}; keys may be ints
        or strings. The split points are taken from the first entry.
    :param config: experiment configuration; reads 'min_bandwidth', 'max_bandwidth',
        'bandwidth_step', 'model_name', 'processors' -> 'weak' and, through get_load,
        'image_size'.
    :param create_individual: whether the per-batch-size line plots are written too.
    :return: None. Prints the share of (batch size, bandwidth) points whose fastest
        option is neither 'whole_device' nor 'whole_edge'.
    """
    split_points = list(batch_table[list(batch_table.keys())[0]].keys())
    bandwidths = np.arange(config['min_bandwidth'], config['max_bandwidth'], config['bandwidth_step'])
    weak_processor_tag = config['processors']['weak'].replace('/', '')
    best_splits = {}
    gains = {}
    for batch_size in batch_table.keys():
        if create_individual:
            plt.figure(figsize=FIGURE_SIZE)
        entry = batch_table[batch_size]
        inference_times_list = []
        for i, split_point in enumerate(split_points):
            # NOTE(review): load / bandwidths is in seconds, so the factor 100 below gives
            # centiseconds although the axis is labelled in ms. get_inference_time uses the
            # same factor, so compute and transfer times are consistent with each other;
            # change both together if the unit is corrected.
            if split_point == 'whole_device':
                # no transfer: constant over all bandwidths
                inference_times = np.repeat(entry[split_point], bandwidths.shape[0])
                if create_individual:
                    plt.ylim(0, entry[split_point] * 2)
            elif split_point == 'whole_edge':
                load = get_load(1, int(batch_size), config, True)
                inference_times = entry[split_point] + load / bandwidths * 100  # in milliseconds
            else:
                load = get_load(entry[split_point]['compression'], int(batch_size), config, False)
                inference_times = entry[split_point]['head'] + entry[split_point]['tail'] + load / bandwidths * 100  # in milliseconds
            linestyle = PLOT_LINESTYLES[i % len(PLOT_LINESTYLES)]
            if create_individual:
                plt.plot(
                    bandwidths / 10 ** 6,
                    inference_times,
                    label=fix_legend_name(split_point),
                    linestyle=linestyle
                )
            inference_times_list.append(inference_times)
        if create_individual:
            plt.xlabel('Data Rate (MBps)')
            plt.ylabel('Inference Time (ms)')
            plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
            save_path = os.path.join(INFERENCE_PLOT_DIR, '%s_%s_%s_v%d.png' % (
                config['model_name'],
                batch_size,
                weak_processor_tag,
                VERSION
            ))
            plt.savefig(save_path, bbox_inches='tight')
            plt.close()

        # rows: split points, columns: bandwidths
        times_matrix = np.array(inference_times_list)
        fastest_times = np.min(times_matrix, axis=0)
        best_splits[batch_size] = np.argmin(times_matrix, axis=0)
        gains[batch_size] = {}
        for split_point_index, split_point in enumerate(split_points):
            absolute_diff = inference_times_list[split_point_index] - fastest_times
            relative_diff = absolute_diff / inference_times_list[split_point_index]
            diff_percent = relative_diff * 100
            gains[batch_size][split_point] = np.clip(
                diff_percent,
                None,
                np.mean(diff_percent)
            )

    total_points = 0
    useful_split_points = 0
    color_mapped_values = []
    plt.figure(figsize=FIGURE_SIZE)
    for batch_size, best_split in sorted(best_splits.items(), key=lambda x: int(x[0])):
        # best_split already holds indices into split_points (argmin result)
        winners = [int(split_point_index) for split_point_index in best_split]
        color_mapped_values.append(winners)
        total_points += len(winners)
        useful_split_points += sum(
            1 for split_point_index in winners
            if split_points[split_point_index] not in ('whole_edge', 'whole_device')
        )
    used_split_indices = np.sort(np.unique(np.array(color_mapped_values))).tolist()
    # wrap around so that more split points than colours cannot raise an IndexError
    used_colors = [PLOT_COLORS[used_split_index % len(PLOT_COLORS)] for used_split_index in used_split_indices]
    color_map = ListedColormap(used_colors)

    # replace distinct values with their index of discovery
    previous_shape = np.array(color_mapped_values).shape
    _, color_mapped_values = np.unique(np.array(color_mapped_values), return_inverse=True)
    color_mapped_values = np.reshape(color_mapped_values, previous_shape)
    colormesh = plt.pcolormesh(color_mapped_values, cmap=color_map)

    # legend
    cbar = plt.colorbar(colormesh)
    cbar.ax.get_yaxis().set_ticks([])
    max_value = np.amax(color_mapped_values)
    for j, used_split_index in enumerate(used_split_indices):
        cbar.ax.text(
            max_value + 0.5,
            max_value / len(used_split_indices) * (j + 0.5),
            fix_legend_name(split_points[used_split_index]),
            ha='left',
            va='center'
        )

    plt.xlabel('Data Rate (MBps)')
    plt.ylabel('Batch Size')
    save_path = os.path.join(INFERENCE_PLOT_DIR, '%s_all_%s_v%d.png' % (
        config['model_name'],
        weak_processor_tag,
        VERSION
    ))
    plt.savefig(save_path, bbox_inches='tight')
    plt.close()
    print('Percent of scenarios where split computing is useful: %.2f%% (%d/%d)' % (
        useful_split_points / total_points * 100,
        useful_split_points,
        total_points
    ))

    sorted_batch_sizes = sorted(gains.keys(), key=lambda x: int(x))
    for split_point in split_points:
        heatmap_data = [gains[batch_size][split_point] for batch_size in sorted_batch_sizes]
        fig, main_ax = plt.subplots()
        fig.set_size_inches(FIGURE_SIZE[0], FIGURE_SIZE[1])
        ax = sns.heatmap(np.array(heatmap_data), cbar_kws={'label': 'Gain %'}, ax=main_ax)
        ax.set_xlabel('Data Rate (MBps)')
        ax.set_ylabel('Batch Size')
        ax.invert_yaxis()
        save_path = os.path.join(INFERENCE_PLOT_DIR, '%s_gain_over_%s_%s_v%d.png' % (
            config['model_name'],
            split_point.replace('/', ''),  # ViT has / in block names
            weak_processor_tag,
            VERSION
        ))
        ax.figure.savefig(save_path, bbox_inches='tight')
        plt.close(fig)

In [None]:
def run_experiment(config, recreate=False):
    """
    Obtain the batch table for ``config`` and create the inference plots from it.

    The table is measured with create_batch_table when ``recreate`` is True or no file
    exists at get_batch_table_path(config); otherwise the stored table is loaded.

    :param config: experiment configuration, forwarded to the helpers.
    :param recreate: force a new measurement even if a stored table exists.
    """
    must_measure = recreate or not os.path.exists(get_batch_table_path(config))
    batch_table = create_batch_table(config) if must_measure else load_batch_table(config)
    create_inference_plots(batch_table, config)

In [None]:
# model names here: https://github.com/google/automl/blob/master/efficientnetv2/effnetv2_model.py#L693
# Also vit-b32, vit-l32, vgg-16, vgg-19
# Note that vgg-16 and vgg-19 are only compatible with 224 input size

# Limiting GPU clock
# sudo nvidia-smi -i 7 -pm 1
# sudo nvidia-smi -i 7 --lock-gpu-clocks=300
# sudo nvidia-smi -i 7 --reset-gpu-clocks

# Experiment configuration. It is passed to run_experiment below; get_inference_time
# additionally reads this module-level name directly when it is called without input_shape.
config = {
    # TensorFlow device strings: 'weak' times the whole model ('whole_device') and the
    # head models, 'strong' times the tail models.
    'processors': {
        'weak': '/GPU:1',
        'strong': '/GPU:0',
    },
    'model_name': 'efficientnet-b0',
    # side length of the square RGB input
    'image_size': 384,
    # batch sizes 1 .. 62
    'batch_sizes': list(range(1, 63)),
    # bandwidth sweep used for the plots: np.arange(min, max, step), so max itself is excluded
    'max_bandwidth': 128 * 10 ** 6,  # Bytes per second
    'min_bandwidth': 1 * 10 ** 6,  # Bytes per second
    'bandwidth_step': 1 * 10 ** 6,  # Bytes per second
}

# recreate=True always measures a new batch table instead of loading a stored one.
run_experiment(config, recreate=True)