In [1]:
import numpy as np

import tvm
from tvm import relay, auto_scheduler
import tvm.relay.testing
from tvm.contrib import graph_executor

In [2]:
def get_network(name, batch_size, layout="NHWC", dtype="float32"):
    """Build one of the ``relay.testing`` workloads and report its I/O shapes.

    Parameters
    ----------
    name : str
        Network to build: ``"resnet-<N>"``, ``"resnet3d-<N>"``, ``"mobilenet"``,
        ``"squeezenet_v1.1"`` or ``"inception_v3"``. ``<N>`` is parsed as the
        layer count.
    batch_size : int
        Leading dimension of the reported input and output shapes; also
        forwarded to the workload builder.
    layout : str
        ``"NHWC"`` or ``"NCHW"``; selects the 224x224 image shape.
    dtype : str
        Data type forwarded to the workload builder.

    Returns
    -------
    tuple
        ``(mod, params, input_shape, output_shape)`` where ``mod`` and
        ``params`` come from the builder's ``get_workload`` call.

    Raises
    ------
    ValueError
        If ``layout`` is neither ``"NHWC"`` nor ``"NCHW"``, or if ``name`` does
        not match any supported network.
    AssertionError
        If ``"squeezenet_v1.1"`` is requested with a layout other than NCHW.
    """
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)

    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)

    if name.startswith("resnet-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name.startswith("resnet3d-"):
        # NOTE(review): this branch calls the same 2D `resnet` builder as the
        # branch above, with a 2D image shape. Presumably a 3D builder was
        # intended -- confirm before relying on "resnet3d-*" names.
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "mobilenet":
        mod, params = relay.testing.mobilenet.get_workload(
            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
        )
    elif name == "squeezenet_v1.1":
        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
        mod, params = relay.testing.squeezenet.get_workload(
            version="1.1",
            batch_size=batch_size,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "inception_v3":
        # Inception v3 uses 299x299 images, so the reported input shape is overridden.
        # NOTE(review): `layout` is not forwarded to the builder here; only the
        # reported shape changes. Verify the built module matches it for NHWC.
        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    else:
        # Without this branch an unknown name fell through to the return
        # statement and failed with an UnboundLocalError on `mod`.
        raise ValueError("Unsupported network: " + name)

    return mod, params, input_shape, output_shape
    
# Workload to tune: model name, batch size, tensor layout and element type.
network = "resnet-18"
batch_size = 1
layout = "NHWC"
dtype = "float32"

# Compilation target for task extraction, tuning and the final build.
target = tvm.target.Target("cuda")

# File that tuning records are written to and later read back from;
# its name encodes the network, layout, batch size and target kind.
log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)

In [4]:
# Build the Relay workload, then split it into auto-scheduler tasks and their weights.
print("Extract tasks...")
mod, params, input_shape, output_shape = get_network(
    network, batch_size, layout, dtype=dtype
)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

# For every task print a header line with its workload key, then its compute DAG.
for idx, task in enumerate(tasks):
    print(
        "========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key),
        task.compute_dag,
        sep="\n",
    )
print(task_weights)

Extract tasks...
placeholder = PLACEHOLDER [1, 56, 56, 64]
PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
placeholder = PLACEHOLDER [3, 3, 64, 128]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
placeholder = PLACEHOLDER [1, 1, 1, 128]
T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)

placeholder = PLACEHOLDER [1, 14, 14, 256]
PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
placeholder = PLACEHOLDER [3, 3, 256, 512]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
placeholder = PLACEHOLDER [1, 1, 1, 512]
T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, a

In [6]:
def run_tuning(num_measure_trials=200):
    """Tune the extracted tasks and record every measurement to ``log_file``.

    Reads the notebook-level ``tasks``, ``task_weights`` and ``log_file``.

    Parameters
    ----------
    num_measure_trials : int, optional
        Value passed to ``auto_scheduler.TuningOptions(num_measure_trials=...)``.
        Defaults to 200, the value this notebook originally hard-coded; the
        original inline note suggests 20000 to achieve the best performance.
    """
    print("Begin tuning...")
    # The measure context supplies the runner used to measure candidate programs.
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=num_measure_trials,
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)

# Start tuning with run_tuning's default settings; measurements are recorded to log_file.
run_tuning()

Begin tuning...
Get devices for measurement successfully!
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |----------------------------------------------------------------------

-------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    0 |            - |              - |      0 |
|    1 |            - |              - |      0 |
|    2 |            - |              - |      0 |
|    3 |            - |              - |      0 |
|    4 |            - |              - |      0 |
|    5 |            - |              - |      0 |
|    6 |            - |              - |      0 |
|    7 |            - |              - |      0 |
|    8 |            - |              - |      0 |
|    9 |            - |              - |      0 |
|   10 |            - |              - |      0 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |



------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 86	fail_ct: 1962	Time elapsed: 0.89
GA Iter: 0	Max score: 0.9899	Min score: 0.8054	#Pop: 16	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9981	#Pop: 16	#M+: 1389	#M-: 0
EvolutionarySearch		#s: 16	Time elapsed: 12.59
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 8 programs to measure:
.E.E.E.E.E.E.E.E
Time elapsed for measurement: 1.54 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.10 s
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |------------------------------------------------------------




Time elapsed for training: |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
0.42 s
----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
-------------------------------------------------
|    0 |            - |              - |      8 |
|    1 |            - |              - |      8 |
|    2 |            - |              - |      8 |
|    3 |            - |              - |      8 |
|    4 |            - |              - |      8 |
|    5 |            - |              - |      8 |
|    6 |            - |              - |      8 |
|    7 |            - |              - |      8 |
|    8 |            - |              - |      8 |
|    9 |            - |              - |      8 |
|   10 |            - |              - |      8 |
|   11 |            - |              - |      8 |
|   12 |            - |              - |      8 |
|   13 |            - |




Time elapsed for training: |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |            - |              - |      8 |
|    1 |            - |              - |      8 |
|    2 |            - |              - |      8 |
0.49 s
----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    3 |            - |              - |      8 |
|    4 |            - |              - |      8 |
|    5 |            - |              - |      8 |
|    6 |            - |              - |      8 |
|    7 |            - |              - |      8 |
|    8 |            - |              - |      8 |
|    9 |            - |              - |      8 |
|   10 |            - |              - |      8 |
|   11 |            - |              - |      8 |
|   12 |            - |              - |      8 |
|   13 |            - |




Time elapsed for training: 0.53 s
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |            - |              - |      8 |
|    1 |            - |              - |      8 |
|    2 |            - |              - |      8 |
|    3 |            - |              - |      8 |
|    4 |            - |              - |      8 |
|    5 |            - |              - |      8 |
|    6 |            - |              - |      8 |
|    7 |            - |              - |      8 |
|    8 |            - |              - |      8 |
|    9 |            - |              - |      8 |
|   10 |            - |              - |      8 |
|   11 |            - |              - |      8 |
|   12 |            - |              - |      8 |
|   13 |            - |              - |      8 |
|   14 |            - |              - |      8 |
|   15 |            - |              - |      8 |
|   16 |            - |              - |      8 |
|   17 |        




Time elapsed for training: 0.54 s
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
-------------------------------------------------
|    0 |            - |              - |      8 |
|    1 |            - |              - |      8 |
|    2 |            - |              - |      8 |
|    3 |            - |              - |      8 |
|    4 |            - |              - |      8 |
|    5 |            - |              - |      8 |
|    6 |            - |              - |      8 |
|    7 |            - |              - |      8 |
|    8 |            - |              - |      8 |
|    9 |            - |              - |      8 |
|   10 |            - |              - |      8 |
|   11 |            - |              - |      8 |
|   12 |            - |              - |      8 |
|   13 |            - |




Time elapsed for training: 0.48 s
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
----------------------------------------------------------------------
------------------------------  [ -------------------------------------------------
|    0 |            - |              - |      8 |
|    1 |            - |              - |      8 |
|    2 |            - |              - |      8 |
|    3 |            - |              - |      8 |
|    4 |            - |              - |      8 |
|    5 |            - |              - |      8 |
|    6 |            - |              - |      8 |
|    7 |            - |              - |      8 |
|    8 |            - |              - |      8 |
|    9 |            - |              - |      8 |
|   10 |            - |              - |      8 |
|   11 |            - |              - |      8 |
|   12 |            - |              - |      8 |
|   13 |            - |              - |      8 |
|   14 |            - |              - |      8 |
|   15 |   




Time elapsed for training: |  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |            - |              - |      8 |
|    1 |            - |              - |      8 |
|    2 |            - |              - |      8 |
|    3 |            - |              - |      8 |
|    4 |            - |              - |      8 |
|    5 |            - |              - |      8 |
|    6 |            - |              - |      8 |
|    7 |            - |              - |      8 |
|    8 |            - |              - |      8 |
|    9 |            - |              - |      8 |
|   10 |            - |              - |      8 |
|   11 |            - |              - |      8 |
|   12 |            - |              - |      8 |
|   13 |            - |              - |      8 |
|   14 |            - |              - |      8 |
|   15 |            - |              - |      8 |
|   16 |            - |              - |      8 |0.52 s
-----------------




Time elapsed for training: 0.51 s


In [9]:
# NOTE(review): in the run captured in this notebook, this cell failed inside
# relay.build with "Memory verification failed" ("Did you forget to bind?").
# The tuning output shows every measured program erroring (".E") and no
# latency for any task, so presumably the log held no usable schedule --
# confirm that tuning produced valid records before re-running this cell.

# Build the module, applying the best records found in the tuning log.
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file), tvm.transform.PassContext(
    opt_level=3, config={"relay.backend.use_auto_scheduler": True}
):
    lib = relay.build(mod, target=target, params=params)

# Create a graph executor on device 0 of the target and feed it random input.
dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))
data_tvm = tvm.nd.array(np.random.uniform(size=input_shape).astype(dtype))
module.set_input("data", data_tvm)

# Time inference and print the benchmark result.
print("Evaluate inference time cost...")
print(module.benchmark(dev, repeat=3, min_repeat_ms=500))

Compile...


TVMError: Traceback (most recent call last):
  12: TVMFuncCall
  11: _ZNSt17_Function_handlerIFvN
  10: tvm::relay::backend::RelayBuildModule::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#3}::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
  9: tvm::relay::backend::RelayBuildModule::BuildRelay(tvm::IRModule, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::NDArray, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, tvm::runtime::NDArray> > > const&, tvm::runtime::String)
  8: tvm::build(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target const&)
  7: tvm::SplitMixedModule(tvm::IRModule, tvm::Target const&, tvm::Target const&)
  6: tvm::ApplyPasses(tvm::IRModule, tvm::transform::Sequential)
  5: tvm::transform::Pass::operator()(tvm::IRModule) const
  4: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  3: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  2: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  1: tvm::transform::ModulePassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  0: std::_Function_handler<void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::IRModule, tvm::transform::PassContext)>::AssignTypedLambda<tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1}>(tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1})::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)
  Did you forget to bind?
    Variable `placeholder` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `T_add` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `placeholder` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `placeholder` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
  File "/home/boyuan/.anaconda3/envs/tvm-build/conda-bld/tvm-cu102-package_1636400145961/work/src/tir/analysis/verify_memory.cc", line 206
RuntimeError: Memory verification failed with the following errors:
PrimFunc([placeholder, placeholder, placeholder, T_add]) attrs={"target": cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32, "tir.noalias": (bool)1, "global_symbol": "tvmgen_default_fused_nn_contrib_conv2d_winograd_without_weight_transform_add_1", "from_legacy_te_schedule": (bool)1} {
  allocate data_pad[float32 * 200704], storage_scope = global
  allocate input_tile[float32 * 200704], storage_scope = global
  allocate B[float32 * 16], storage_scope = global
  for (i1, 0, 16) {
    for (i2, 0, 16) {
      for (i3, 0, 256) {
        data_pad[(((i1*4096) + (i2*256)) + i3)] = tir.if_then_else(((((1 <= i1) && (i1 < 15)) && (1 <= i2)) && (i2 < 15)), placeholder[((((i1*3584) + (i2*256)) + i3) - 3840)], 0f)
      }
    }
  }
  for (eps, 0, 4) {
    for (nu, 0, 4) {
      for (p, 0, 49) {
        for (ci, 0, 256) {
          input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] = data_pad[(((((floordiv(p, 7)*8192) + (eps*4096)) + (floormod(p, 7)*512)) + (nu*256)) + ci)]
        }
      }
    }
  }
  for (i, 0, 4) {
    for (j, 0, 4) {
      B[((i*4) + j)] = select(((i == 3) && (j == 3)), 1f, select(((i == 3) && (j == 2)), 0f, select(((i == 3) && (j == 1)), 0f, select(((i == 3) && (j == 0)), 0f, select(((i == 2) && (j == 3)), 0f, select(((i == 2) && (j == 2)), 1f, select(((i == 2) && (j == 1)), 1f, select(((i == 2) && (j == 0)), -1f, select(((i == 1) && (j == 3)), -1f, select(((i == 1) && (j == 2)), 1f, select(((i == 1) && (j == 1)), -1f, select(((i == 1) && (j == 0)), 0f, select(((i == 0) && (j == 3)), 0f, select(((i == 0) && (j == 2)), 0f, select(((i == 0) && (j == 1)), 0f, select(((i == 0) && (j == 0)), 1f, 0f))))))))))))))))
    }
  }
  for (eps, 0, 4) {
    for (nu, 0, 4) {
      for (p, 0, 49) {
        for (ci, 0, 256) {
          data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] = 0f
          for (r_a, 0, 4) {
            for (r_b, 0, 4) {
              data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] = (data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)] + ((input_tile[((((r_a*50176) + (r_b*12544)) + (p*256)) + ci)]*B[((r_a*4) + eps)])*B[((r_b*4) + nu)]))
            }
          }
        }
      }
    }
  }
  for (eps, 0, 4) {
    for (nu, 0, 4) {
      for (p, 0, 49) {
        for (co, 0, 256) {
          input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + co)] = 0f
          for (ci, 0, 256) {
            input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + co)] = (input_tile[((((eps*50176) + (nu*12544)) + (p*256)) + co)] + (data_pad[((((eps*50176) + (nu*12544)) + (p*256)) + ci)]*placeholder[((((eps*262144) + (nu*65536)) + (co*256)) + ci)]))
          }
        }
      }
    }
  }
  for (i, 0, 4) {
    for (j, 0, 2) {
      B[((i*2) + j)] = select(((i == 3) && (j == 1)), 1f, select(((i == 3) && (j == 0)), 0f, select(((i == 2) && (j == 1)), 1f, select(((i == 2) && (j == 0)), 1f, select(((i == 1) && (j == 1)), -1f, select(((i == 1) && (j == 0)), 1f, select(((i == 0) && (j == 1)), 0f, select(((i == 0) && (j == 0)), 1f, 0f))))))))
    }
  }
  for (vh, 0, 2) {
    for (vw, 0, 2) {
      for (p, 0, 49) {
        for (co, 0, 256) {
          data_pad[((((vh*25088) + (vw*12544)) + (p*256)) + co)] = 0f
          for (r_a, 0, 4) {
            for (r_b, 0, 4) {
              data_pad[((((vh*25088) + (vw*12544)) + (p*256)) + co)] = (data_pad[((((vh*25088) + (vw*12544)) + (p*256)) + co)] + ((input_tile[((((r_a*50176) + (r_b*12544)) + (p*256)) + co)]*B[((r_a*2) + vh)])*B[((r_b*2) + vw)]))
            }
          }
        }
      }
    }
  }
  for (h, 0, 14) {
    for (w, 0, 14) {
      for (co, 0, 256) {
        input_tile[(((h*3584) + (w*256)) + co)] = data_pad[(((((floormod(h, 2)*25088) + (floormod(w, 2)*12544)) + (floordiv(h, 2)*1792)) + (floordiv(w, 2)*256)) + co)]
      }
    }
  }
  for (ax1, 0, 14) {
    for (ax2, 0, 14) {
      for (ax3, 0, 256) {
        T_add[(((ax1*3584) + (ax2*256)) + ax3)] = (input_tile[(((ax1*3584) + (ax2*256)) + ax3)] + placeholder[(((ax1*3584) + (ax2*256)) + ax3)])
      }
    }
  }
}
