In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes and edits to the local
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [17]:
import os
import sys
sys.dont_write_bytecode = True
import json
import time
from datetime import datetime
import single_node_profiles_cpp as snp
import profiler
import numpy as np
from optimizer import GreedyOptimizer
from IPython.display import display
import matplotlib.pyplot as plt
import utils
%matplotlib inline


In [4]:
# Load the single-node profiles for the four models used in this notebook.
profs = snp.load_single_node_profiles(
    models=[
        "tf-resnet-feats",
        "inception",
        "tf-kernel-svm",
        "tf-log-reg",
    ]
)

In [5]:
# Sanity check: show which model names have a loaded profile.
profs.keys()

[u'inception', u'tf-kernel-svm', u'tf-resnet-feats', u'tf-log-reg']

In [7]:
# Logical pipeline definition used by the optimizer later in the notebook.
dag = profiler.get_logical_pipeline("pipeline_one")

# One recorded end-to-end run; it is only used to recover the per-node configs.
with open(
    os.path.abspath(
        "../results_python_benchmarker/e2e_profs_new_metrics/incep_1-logreg_1-ksvm_1-resnet_1-180207_063416.json"
    )
) as f:
    sample_run = json.load(f)
print(dag.reference_node)
node_configs = profiler.get_node_configs_from_experiment(sample_run)

# Uniform scale factor of 1.0 for every model in the pipeline.
scale_factors = {
    'inception': 1.0,
    'tf-log-reg': 1.0,
    'tf-kernel-svm': 1.0,
    'tf-resnet-feats': 1.0,
}
def which_stage(model_name):
    """Return the profile stage name to use for ``model_name``.

    "tf-kernel-svm" and "tf-log-reg" map to "latency_stage"; every other
    model name maps to "thru_stage".
    """
    latency_stage_models = ("tf-kernel-svm", "tf-log-reg")
    return "latency_stage" if model_name in latency_stage_models else "thru_stage"
# Build one NodeProfile per node named in the sample run, pairing each model's
# single-node profile with the stage chosen by which_stage().
node_profs = {
    name: profiler.NodeProfile(name, profs[name], which_stage(name))
    for name in node_configs
}

inception


In [14]:
resnet = profiler.NodeProfile("tf-resnet-feats", profs["tf-resnet-feats"], "thru_stage")
# Columns: mean batch size, p99 latency, thru-stage mean throughput (qps).
resnet_array = resnet.profile[["mean_batch_size", "p99_latency", "thru_stage_mean_throughput_qps"]].values
# Reorder whole rows by batch size (column 0). Do NOT use np.sort(..., axis=0)
# here: it sorts every column independently, which would pair a batch size
# with latency/throughput values taken from different measurements.
# mergesort keeps the ordering stable if two rows share a batch size.
resnet_array = resnet_array[np.argsort(resnet_array[:, 0], kind="mergesort")]
resnet_array

array([[1.0000000e+00, 1.3254010e-01, 1.0000000e+01],
       [2.0000000e+00, 1.3424957e-01, 1.9700000e+01],
       [4.0000000e+00, 1.3839000e-01, 3.8600000e+01],
       [8.0000000e+00, 1.4921285e-01, 7.6100000e+01],
       [1.2000000e+01, 1.5238963e-01, 1.0520000e+02],
       [1.6000000e+01, 1.7264560e-01, 1.2620000e+02],
       [2.4000000e+01, 2.1201610e-01, 1.6890000e+02],
       [3.2000000e+01, 2.5663817e-01, 1.9150000e+02],
       [4.2870000e+01, 5.3025450e-01, 2.0900000e+02]])

In [15]:
inception = profiler.NodeProfile("inception", profs["inception"], "thru_stage")
# Columns: mean batch size, p99 latency, thru-stage mean throughput (qps).
inception_array = inception.profile[["mean_batch_size", "p99_latency", "thru_stage_mean_throughput_qps"]].values
# Reorder whole rows by batch size (column 0). Do NOT use np.sort(..., axis=0)
# here: it sorts every column independently, which would pair a batch size
# with latency/throughput values taken from different measurements.
# mergesort keeps the ordering stable if two rows share a batch size.
inception_array = inception_array[np.argsort(inception_array[:, 0], kind="mergesort")]
inception_array

array([[1.00000000e+00, 6.81030500e-02, 2.63000000e+01],
       [2.00000000e+00, 7.28235000e-02, 4.58000000e+01],
       [4.00000000e+00, 7.38842600e-02, 8.15000000e+01],
       [8.00000000e+00, 1.05866700e-01, 1.32800000e+02],
       [1.20000000e+01, 1.12031640e-01, 1.79500000e+02],
       [1.60000000e+01, 1.49952640e-01, 1.99500000e+02],
       [2.40000000e+01, 1.95421250e-01, 2.13000000e+02],
       [2.54766667e+01, 9.03275120e-01, 2.50200000e+02],
       [3.19866667e+01, 1.05310206e+00, 2.54300000e+02]])

In [20]:
# Named after the model it holds (tf-kernel-svm) so the `inception` /
# `inception_array` variables from the inception cell are not silently
# overwritten with a different model's data.
ksvm = profiler.NodeProfile("tf-kernel-svm", profs["tf-kernel-svm"], "thru_stage")
# Columns: mean batch size, p99 latency, thru-stage mean throughput (qps).
ksvm_array = ksvm.profile[["mean_batch_size", "p99_latency", "thru_stage_mean_throughput_qps"]].values
# Reorder whole rows by batch size (column 0). Do NOT use np.sort(..., axis=0)
# here: it sorts every column independently, which would pair a batch size
# with latency/throughput values taken from different measurements.
# mergesort keeps the ordering stable if two rows share a batch size.
ksvm_array = ksvm_array[np.argsort(ksvm_array[:, 0], kind="mergesort")]
ksvm_array

array([[1.0000000e+00, 4.5305130e-02, 4.2300000e+01],
       [2.0000000e+00, 4.5864000e-02, 7.4300000e+01],
       [4.0000000e+00, 5.1040710e-02, 1.3840000e+02],
       [8.0000000e+00, 5.4351710e-02, 2.7690000e+02],
       [1.2000000e+01, 5.9511130e-02, 3.9000000e+02],
       [1.6000000e+01, 6.5195470e-02, 4.9870000e+02],
       [2.4000000e+01, 7.5762700e-02, 6.5550000e+02],
       [4.8000000e+01, 9.3624520e-02, 8.1150000e+02],
       [6.4000000e+01, 1.2436674e-01, 9.0780000e+02]])

In [21]:
# Named after the model it holds (tf-log-reg) so the `inception` /
# `inception_array` variables from the inception cell are not silently
# overwritten with a different model's data.
logreg = profiler.NodeProfile("tf-log-reg", profs["tf-log-reg"], "thru_stage")
# Columns: mean batch size, p99 latency, thru-stage mean throughput (qps).
logreg_array = logreg.profile[["mean_batch_size", "p99_latency", "thru_stage_mean_throughput_qps"]].values
# Reorder whole rows by batch size (column 0). Do NOT use np.sort(..., axis=0)
# here: it sorts every column independently, which would pair a batch size
# with latency/throughput values taken from different measurements.
# mergesort keeps the ordering stable if two rows share a batch size.
logreg_array = logreg_array[np.argsort(logreg_array[:, 0], kind="mergesort")]
logreg_array

array([[1.000000e+00, 2.928447e-02, 5.315000e+02],
       [2.000000e+00, 3.027719e-02, 6.086000e+02],
       [4.000000e+00, 3.197647e-02, 1.038900e+03],
       [8.000000e+00, 3.615806e-02, 1.094800e+03]])

In [19]:
# `cloud` is otherwise only assigned in the optimizer cell further down, so on a
# fresh kernel run top-to-bottom this cell would raise NameError. Define it
# here (same value) so the cell is self-contained.
cloud = "aws"
# Combined cost of 8 CPUs plus 4 V100 GPUs on the selected cloud.
cost = utils.get_cpu_cost(cloud, 8) + utils.get_gpu_cost(cloud, "v100", num_gpus=4)
cost

10.644

In [18]:
%config Application.log_level="INFO"
from optimizer import GreedyOptimizer
opt = GreedyOptimizer(dag, scale_factors, node_profs)
cloud = "aws"
initial_config = {"tf-resnet-feats": profiler.NodeConfig(name="tf-resnet-feats",
                                                          num_cpus=1,
                                                          gpu_type="v100",
                                                          batch_size=1,
                                                          num_replicas=1,
                                                          cloud=cloud),
                  "inception": profiler.NodeConfig(name="inception",
                                                      num_cpus=1,
                                                      gpu_type="v100",
                                                      batch_size=1,
                                                      num_replicas=1,
                                                      cloud=cloud),
                  "tf-log-reg": profiler.NodeConfig(name="tf-log-reg",
                                                      num_cpus=1,
                                                      gpu_type="none",
                                                      batch_size=1,
                                                      num_replicas=1,
                                                      cloud=cloud),
                  "tf-kernel-svm": profiler.NodeConfig(name="tf-kernel-svm",
                                                      num_cpus=1,
                                                      gpu_type="none",
                                                      batch_size=1,
                                                      num_replicas=1,
                                                      cloud=cloud),
                 }

cost = utils.get_cpu_cost(cloud, 8) + utils.get_gpu_cost(cloud, "v100", num_gpus=4)
# 24 ms mean inter-arrival time
with open("../experiments/cached_arrival_processes/58_4.0.deltas", 'r') as f:
    deltas = np.array([float(l.strip()) for l in f]).flatten()
arrival_cached = np.cumsum(deltas)
cost = utils.get_cpu_cost(cloud, 8) + utils.get_gpu_cost(cloud, "v100", num_gpus=4)
opt.select_optimal_config(cloud, 0.5, cost, initial_config, arrival_history=arrival_cached)

[optimizer.py:253] 

Evaluating step gpu
[optimizer.py:309] 	Action configuration returned as None
[optimizer.py:253] 

Evaluating step replication_factor
[optimizer.py:257] 	Old config: NodeConfig(tf-resnet-feats, 1, v100, 1, 1, aws)
New config: NodeConfig(tf-resnet-feats, 1, v100, 1, 2, aws)
[optimizer.py:265] 	New estimated perf: {'latency': 0.1795547, 'cost': 7.916499999999999, 'throughput': 20.0}
[optimizer.py:270] 	Doing network calc
[optimizer.py:282] 	Response time: inf, T_s=0.1795547, T_q=inf
[optimizer.py:307] 	Setting best action response time to inf
[optimizer.py:253] 

Evaluating step batch_size
[optimizer.py:257] 	Old config: NodeConfig(tf-resnet-feats, 1, v100, 1, 1, aws)
New config: NodeConfig(tf-resnet-feats, 1, v100, 2.0, 1, aws)
[optimizer.py:265] 	New estimated perf: {'latency': 0.18369512999999998, 'cost': 5.321999999999999, 'throughput': 19.7}
[optimizer.py:270] 	Doing network calc
[optimizer.py:282] 	Response time: inf, T_s=0.18369513, T_q=inf
[optimizer.py:321] 

({'inception': NodeConfig(inception, 1, v100, 24.0, 1, aws),
  'tf-kernel-svm': NodeConfig(tf-kernel-svm, 1, none, 2.0, 3, aws),
  'tf-log-reg': NodeConfig(tf-log-reg, 1, none, 1, 1, aws),
  'tf-resnet-feats': NodeConfig(tf-resnet-feats, 1, v100, 8.0, 3, aws)},
 {'cost': 10.644, 'latency': 0.22569844, 'throughput': 213.0},
 0.3051282651822493)

In [1]:
# Logging smoke test: lower the root logger's threshold to DEBUG and emit one
# record to confirm that debug output is visible in the notebook.
import logging
logger = logging.getLogger()  # no name given -> the root logger
logger.setLevel(logging.DEBUG)
# Module-level logging.debug() logs on the root logger.
logging.debug("test")

DEBUG:root:test
