In [2]:
import os

import numpy as np
import tvm
from tvm import te, auto_scheduler

import os
# Restrict CUDA to a single device index for this kernel session.
os.environ["CUDA_VISIBLE_DEVICES"]='0' # 0 for A6000 on winnie, 1 for P6000 on winnie.
from tvm.autotvm.measure.measure_methods import set_cuda_target_arch
# Pin the architecture TVM passes to the CUDA compiler.
# NOTE(review): 'sm_75' denotes compute capability 7.5 -- confirm this is the
# intended value for whichever GPU is selected above.
set_cuda_target_arch('sm_75')

In [3]:
# Note that fusing all computation into one graph is not supported yet on Ansor.
# Check: https://discuss.tvm.apache.org/t/assertion-triggered-when-auto-scheduling/9613/4
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_not_supported_yet(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Fully fused bound-propagation workload through one weight matrix.

    W is split into W_pos (entries > 0 kept, the rest zeroed) and W_neg
    (entries <= 0 kept, the rest zeroed). Every output pairs the same-side
    input with W_pos and the opposite-side input with W_neg:

        y_lb = x_lb @ W_pos + x_ub @ W_neg
        y_ub = x_ub @ W_pos + x_lb @ W_neg
        y_lw = x_lw @ W_pos + x_uw @ W_neg
        y_uw = x_uw @ W_pos + x_lw @ W_neg

    Args:
        length: leading extent of every x_*/y_* tensor.
        dim_in: middle extent of the 3-D x_lw/x_uw/y_lw/y_uw tensors.
        dim_out: contraction extent (rows of W, last axis of the x_* tensors).
        dim_Y_out: columns of W and last extent of every output.
        dtype: element type of all placeholders.

    Returns:
        [W, x_lw, x_uw, x_lb, x_ub, y_lw, y_uw, y_lb, y_ub] -- the five input
        placeholders followed by the four computed outputs.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    x_lw = te.placeholder((length, dim_in, dim_out), name="x_lw", dtype=dtype)
    x_uw = te.placeholder((length, dim_in, dim_out), name="x_uw", dtype=dtype)
    x_lb = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)
    x_ub = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)

    # Output shapes (previously taken from throw-away placeholders).
    vec_shape = (length, dim_Y_out)
    mat_shape = (length, dim_in, dim_Y_out)

    W_pos = te.compute(
        W.shape,
        lambda i, j: te.if_then_else(W[i, j] > 0, W[i, j], 0.),
        name='w_pos'
    )
    W_neg = te.compute(
        W.shape,
        lambda i, j: te.if_then_else(W[i, j] <= 0, W[i, j], 0.),
        name='w_neg'
    )

    # A single reduction axis is shared by all contractions, as in the original.
    dout = te.reduce_axis((0, dim_out), "dout")

    def vec_matmul(x, w, name):
        # (length, dim_out) x (dim_out, dim_Y_out) -> (length, dim_Y_out)
        return te.compute(
            vec_shape,
            lambda l, i: te.sum(x[l, dout] * w[dout, i], axis=dout),
            name=name
        )

    def mat_matmul(x, w, name):
        # (length, dim_in, dim_out) x (dim_out, dim_Y_out) -> (length, dim_in, dim_Y_out)
        return te.compute(
            mat_shape,
            lambda l, j, i: te.sum(x[l, j, dout] * w[dout, i], axis=dout),
            name=name
        )

    y_lb_1 = vec_matmul(x_lb, W_pos, 'y_lb_1')
    y_lb_2 = vec_matmul(x_ub, W_neg, 'y_lb_2')
    y_lb = te.compute(vec_shape, lambda l, i: y_lb_1[l, i] + y_lb_2[l, i], name="y_lb")

    y_ub_1 = vec_matmul(x_ub, W_pos, 'y_ub_1')
    # Fixed: this term previously multiplied x_lb by W_pos; the opposite-side
    # input must be paired with W_neg, mirroring y_lb_2 and y_uw_2.
    y_ub_2 = vec_matmul(x_lb, W_neg, 'y_ub_2')
    y_ub = te.compute(vec_shape, lambda l, i: y_ub_1[l, i] + y_ub_2[l, i], name="y_ub")

    y_lw_1 = mat_matmul(x_lw, W_pos, 'y_lw_1')
    y_lw_2 = mat_matmul(x_uw, W_neg, 'y_lw_2')
    y_lw = te.compute(
        mat_shape,
        lambda l, j, i: y_lw_1[l, j, i] + y_lw_2[l, j, i],
        name="y_lw"
    )

    y_uw_1 = mat_matmul(x_uw, W_pos, 'y_uw_1')
    y_uw_2 = mat_matmul(x_lw, W_neg, 'y_uw_2')
    y_uw = te.compute(
        mat_shape,
        lambda l, j, i: y_uw_1[l, j, i] + y_uw_2[l, j, i],
        name="y_uw"
    )

    return [W, x_lw, x_uw, x_lb, x_ub, y_lw, y_uw, y_lb, y_ub]

In [4]:
# Ansor was observed not to support this type of kernel yet.
# In particular, the y_lb = y_lb_1+y_lb_2 stage was reported as unsupported in the computation graph.
# NOTE(review): this workload used to return [W, x_lb, y_lb], omitting the x_ub
# input that y_lb_2 reads. x_ub is now returned; re-check whether the failure
# above still reproduces.
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_not_supported(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Fused lower-bound workload: y_lb = x_lb @ W_pos + x_ub @ W_neg.

    W_pos keeps the entries of W that are > 0 (others become 0); W_neg keeps
    the entries that are <= 0 (others become 0).

    Args:
        length: leading extent of x_lb, x_ub and y_lb.
        dim_in: unused by this workload (signature matches the other
            verify_matmul_* workloads).
        dim_out: contraction extent (rows of W).
        dim_Y_out: columns of W and last extent of y_lb.
        dtype: element type of the placeholders.

    Returns:
        [W, x_lb, x_ub, y_lb] -- every input placeholder, then the output.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    x_lb = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)
    x_ub = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)
    out_shape = (length, dim_Y_out)

    W_pos = te.compute(W.shape, lambda i,j: te.if_then_else(W[i,j]>0, W[i,j], 0.), name='w_pos')
    W_neg = te.compute(W.shape, lambda i,j: te.if_then_else(W[i,j]<=0, W[i,j], 0.), name='w_neg')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_lb_1 = te.compute(
        out_shape,
        lambda l, i:
            te.sum(x_lb[l,dout] * W_pos[dout,i], axis=dout),
        name='y_lb_1'
    )
    y_lb_2 = te.compute(
        out_shape,
        lambda l, i:
            te.sum(x_ub[l,dout] * W_neg[dout,i], axis=dout),
        name='y_lb_2'
    )
    y_lb = te.compute(
        out_shape,
        lambda l, i: y_lb_1[l,i]+y_lb_2[l,i],
        name="y_lb"
    )

    # x_ub is an input of the graph, so it must be part of the returned
    # argument list alongside W and x_lb.
    return [W, x_lb, x_ub, y_lb]

In [5]:
# Ansor does not report an error for this version.
# However, it ran for 25 minutes without generating a single schedule.
# NOTE(review): this workload used to return [W, x_lb, y_lb_1, y_lb_2], omitting
# the x_ub input that y_lb_2 reads. x_ub is now returned; re-check whether the
# search still gets stuck.
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_stuck(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Two-output workload: y_lb_1 = x_lb @ W_pos and y_lb_2 = x_ub @ W_neg.

    W_pos keeps the entries of W that are > 0 (others become 0); W_neg keeps
    the entries that are <= 0 (others become 0). The final
    y_lb = y_lb_1 + y_lb_2 stage is left out in this variant, so both partial
    products are returned as outputs.

    Args:
        length: leading extent of x_lb, x_ub and both outputs.
        dim_in: unused by this workload (signature matches the other
            verify_matmul_* workloads).
        dim_out: contraction extent (rows of W).
        dim_Y_out: columns of W and last extent of the outputs.
        dtype: element type of the placeholders.

    Returns:
        [W, x_lb, x_ub, y_lb_1, y_lb_2] -- every input placeholder, then the
        two outputs.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    x_lb = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)
    x_ub = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)
    # The original created y_lb_1/y_lb_2 placeholders only to read their
    # shape and then shadowed them; a plain shape tuple is equivalent.
    out_shape = (length, dim_Y_out)

    W_pos = te.compute(W.shape, lambda i,j: te.if_then_else(W[i,j]>0, W[i,j], 0.), name='w_pos')
    W_neg = te.compute(W.shape, lambda i,j: te.if_then_else(W[i,j]<=0, W[i,j], 0.), name='w_neg')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_lb_1 = te.compute(
        out_shape,
        lambda l, i:
            te.sum(x_lb[l,dout] * W_pos[dout,i], axis=dout),
        name='y_lb_1'
    )
    y_lb_2 = te.compute(
        out_shape,
        lambda l, i:
            te.sum(x_ub[l,dout] * W_neg[dout,i], axis=dout),
        name='y_lb_2'
    )

    # x_ub is an input of the graph, so it must be part of the returned
    # argument list alongside W and x_lb.
    return [W, x_lb, x_ub, y_lb_1, y_lb_2]

In [6]:
# Ansor keeps reporting errors when compiling this kernel:
#    Target has been reduced to 1 due to too many failures or duplications
#    See: https://discuss.tvm.apache.org/t/autoscheduler-prints-target-has-been-reduced-to-1-due-to-too-many-failures-or-duplications-and-fails-to-tune/10774/4
#    Renaming the operators was also tried, but the search still gets stuck.
# NOTE(review): this workload used to return [W, x_lb, y_lb], omitting the x_ub
# input that the reduction reads. x_ub is now returned; re-check whether the
# failures above still reproduce.
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_stuck2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Single-reduction workload: y_lb = sum_d(x_lb*W_pos + x_ub*W_neg).

    Both products are accumulated inside one te.sum over the shared `dout`
    axis. W_pos keeps the entries of W that are > 0 (others become 0); W_neg
    keeps the entries that are <= 0 (others become 0). Tensors carry the
    generic names I_1..I_6 / Y_7 from the renaming experiment.

    Args:
        length: leading extent of x_lb, x_ub and y_lb.
        dim_in: unused by this workload (signature matches the other
            verify_matmul_* workloads).
        dim_out: contraction extent (rows of W).
        dim_Y_out: columns of W and last extent of y_lb.
        dtype: element type of the placeholders.

    Returns:
        [W, x_lb, x_ub, y_lb] -- every input placeholder, then the output.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="I_1", dtype=dtype)
    x_lb = te.placeholder((length, dim_out), name="I_2", dtype=dtype)
    x_ub = te.placeholder((length, dim_out), name="I_3", dtype=dtype)

    W_pos = te.compute(W.shape, lambda i,j: te.if_then_else(W[i,j]>0, W[i,j], 0.), name='I_5')
    W_neg = te.compute(W.shape, lambda i,j: te.if_then_else(W[i,j]<=0, W[i,j], 0.), name='I_6')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_lb = te.compute(
        (length, dim_Y_out),
        lambda l, i:
            te.sum(x_lb[l,dout] * W_pos[dout,i] + x_ub[l,dout]*W_neg[dout,i], axis=dout),
        name='Y_7'
    )

    # x_ub is an input of the graph, so it must be part of the returned
    # argument list alongside W and x_lb.
    return [W, x_lb, x_ub, y_lb]

In [7]:
# This is the largest subgraph that is supported by Ansor.
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_lb_1(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """First half of the lower bound: y_lb = x_lb @ W_pos.

    W_pos is W with every entry that is not > 0 replaced by 0.
    `dim_in` is not used by this workload.

    Returns:
        [W, x_lb, y_lb] -- the two input placeholders and the output.
    """
    weight = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    lower = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)

    def keep_positive(i, j):
        # Pass positive weights through, zero everything else.
        return te.if_then_else(weight[i, j] > 0, weight[i, j], 0.)

    weight_pos = te.compute(weight.shape, keep_positive, name='W_pos')

    dout = te.reduce_axis((0, dim_out), "dout")

    def contract(l, i):
        # Dot product of row l of x_lb with column i of W_pos.
        return te.sum(lower[l, dout] * weight_pos[dout, i], axis=dout)

    result = te.compute((length, dim_Y_out), contract, name='y_lb')
    return [weight, lower, result]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_lb_2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Second half of the lower bound: y_lb = y_lb_1 + x_ub @ W_neg.

    W_neg is W with every entry that is not <= 0 replaced by 0. y_lb_1 is
    taken as an input placeholder (the caller supplies the first half).

    Args:
        length: leading extent of x_ub, y_lb_1 and y_lb.
        dim_in: unused by this workload.
        dim_out: contraction extent (rows of W).
        dim_Y_out: columns of W and last extent of y_lb_1 / y_lb.
        dtype: element type of the placeholders.

    Returns:
        [W, x_ub, y_lb_1, y_lb] -- the three input placeholders and the output.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    # Fixed: the placeholder was labelled "x_lb", which mislabelled this input
    # in the printed schedules; it is the upper-bound tensor.
    x_ub = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)
    y_lb_1 = te.placeholder((length, dim_Y_out), name="y_lb_1", dtype=dtype)

    W_neg = te.compute(W.shape, lambda i,j: te.if_then_else(W[i,j]<=0, W[i,j], 0.), name='W_neg')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_lb_2 = te.compute(
        y_lb_1.shape,
        lambda l, i:
            te.sum(x_ub[l,dout] * W_neg[dout,i], axis=dout),
        name='y_lb_2'
    )
    # Element-wise add of the supplied first half and the freshly computed second half.
    y_lb = te.compute(y_lb_1.shape, lambda l, i: y_lb_1[l,i]+y_lb_2[l,i], name="y_lb")

    return [W, x_ub, y_lb_1, y_lb]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_ub_1(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """First half of the upper bound: y_ub_1 = x_ub @ W_pos.

    W_pos is W with every entry that is not > 0 replaced by 0.
    `dim_in` is not used by this workload.

    Returns:
        [W, x_ub, y_ub_1] -- the two input placeholders and the output.
    """
    weight = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    upper = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)

    def keep_positive(i, j):
        # Pass positive weights through, zero everything else.
        return te.if_then_else(weight[i, j] > 0, weight[i, j], 0.)

    weight_pos = te.compute(weight.shape, keep_positive, name='W_pos')

    dout = te.reduce_axis((0, dim_out), "dout")

    def contract(l, i):
        # Dot product of row l of x_ub with column i of W_pos.
        return te.sum(upper[l, dout] * weight_pos[dout, i], axis=dout)

    partial = te.compute((length, dim_Y_out), contract, name='y_ub_1')
    return [weight, upper, partial]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_ub_2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Second half of the upper bound: y_ub = y_ub_1 + x_lb @ W_neg.

    W_neg is W with every entry that is not <= 0 replaced by 0. y_ub_1 is
    taken as an input placeholder (the caller supplies the first half).

    Args:
        length: leading extent of x_lb, y_ub_1 and y_ub.
        dim_in: unused by this workload.
        dim_out: contraction extent (rows of W).
        dim_Y_out: columns of W and last extent of y_ub_1 / y_ub.
        dtype: element type of the placeholders.

    Returns:
        [W, x_lb, y_ub_1, y_ub] -- the three input placeholders and the output.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    # Fixed: the placeholder was labelled "x_ub", which mislabelled this input
    # in the printed schedules; it is the lower-bound tensor.
    x_lb = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)
    y_ub_1 = te.placeholder((length, dim_Y_out), name="y_ub_1", dtype=dtype)

    W_neg = te.compute(W.shape, lambda i,j: te.if_then_else(W[i,j]<=0, W[i,j], 0.), name='W_neg')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_ub_2 = te.compute(
        y_ub_1.shape,
        lambda l, i:
            te.sum(x_lb[l,dout] * W_neg[dout,i], axis=dout),
        name='y_ub_2'
    )
    # Element-wise add of the supplied first half and the freshly computed second half.
    y_ub = te.compute(y_ub_1.shape, lambda l, i: y_ub_1[l,i]+y_ub_2[l,i], name="y_ub")

    return [W, x_lb, y_ub_1, y_ub]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_lw_1(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """First half of the lower weight bound: y_lw_1 = x_lw @ W_pos.

    x_lw is 3-D (length, dim_in, dim_out); the contraction runs over its last
    axis. W_pos is W with every entry that is not > 0 replaced by 0.

    Returns:
        [W, x_lw, y_lw_1] -- the two input placeholders and the output.
    """
    weight = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    lower_w = te.placeholder((length, dim_in, dim_out), name="x_lw", dtype=dtype)

    def keep_positive(i, j):
        # Pass positive weights through, zero everything else.
        return te.if_then_else(weight[i, j] > 0, weight[i, j], 0.)

    weight_pos = te.compute(weight.shape, keep_positive, name='w_pos')

    dout = te.reduce_axis((0, dim_out), "dout")

    def contract(l, j, i):
        # Dot product of x_lw[l, j, :] with column i of W_pos.
        return te.sum(lower_w[l, j, dout] * weight_pos[dout, i], axis=dout)

    partial = te.compute((length, dim_in, dim_Y_out), contract, name='y_lw_1')
    return [weight, lower_w, partial]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_lw_2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Second half of the lower weight bound: y_lw = y_lw_1 + x_uw @ W_neg.

    W_neg is W with every entry that is not <= 0 replaced by 0. y_lw_1 is
    taken as an input placeholder (the caller supplies the first half).

    Args:
        length: leading extent of x_uw, y_lw_1 and y_lw.
        dim_in: middle extent of x_uw, y_lw_1 and y_lw.
        dim_out: contraction extent (rows of W, last axis of x_uw).
        dim_Y_out: columns of W and last extent of y_lw_1 / y_lw.
        dtype: element type of the placeholders.

    Returns:
        [W, x_uw, y_lw_1, y_lw] -- the three input placeholders and the output.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    x_uw = te.placeholder((length, dim_in, dim_out), name="x_uw", dtype=dtype)
    y_lw_1 = te.placeholder((length, dim_in, dim_Y_out), name="y_lw_1", dtype=dtype)

    # Fixed: the mask previously tested W > 0, which produced W_pos under the
    # W_neg name. The negative part keeps entries <= 0, as in the
    # verify_matmul_lb_2 / verify_matmul_ub_2 workloads.
    W_neg = te.compute(
        W.shape,
        lambda i,j: te.if_then_else(W[i,j]<=0, W[i,j], 0.),
        name='w_neg'
    )

    dout = te.reduce_axis((0, dim_out), "dout")
    y_lw_2 = te.compute(
        y_lw_1.shape,
        lambda l, j, i:
            te.sum(x_uw[l,j,dout] * W_neg[dout,i], axis=dout),
        name='y_lw_2'
    )
    # Element-wise add of the supplied first half and the freshly computed second half.
    y_lw = te.compute(
        y_lw_1.shape,
        lambda l, j, i: y_lw_1[l,j,i] + y_lw_2[l,j,i],
        name="y_lw"
    )

    return [W, x_uw, y_lw_1, y_lw]


@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_uw_1(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """First half of the upper weight bound: y_uw_1 = x_uw @ W_pos.

    x_uw is 3-D (length, dim_in, dim_out); the contraction runs over its last
    axis. W_pos is W with every entry that is not > 0 replaced by 0.

    Returns:
        [W, x_uw, y_uw_1] -- the two input placeholders and the output.
    """
    weight = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    upper_w = te.placeholder((length, dim_in, dim_out), name="x_uw", dtype=dtype)

    def keep_positive(i, j):
        # Pass positive weights through, zero everything else.
        return te.if_then_else(weight[i, j] > 0, weight[i, j], 0.)

    weight_pos = te.compute(weight.shape, keep_positive, name='w_pos')

    dout = te.reduce_axis((0, dim_out), "dout")

    def contract(l, j, i):
        # Dot product of x_uw[l, j, :] with column i of W_pos.
        return te.sum(upper_w[l, j, dout] * weight_pos[dout, i], axis=dout)

    partial = te.compute((length, dim_in, dim_Y_out), contract, name='y_uw_1')
    return [weight, upper_w, partial]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_uw_2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Second half of the upper weight bound: y_uw = y_uw_1 + x_lw @ W_neg.

    W_neg is W with every entry that is not <= 0 replaced by 0. y_uw_1 is
    taken as an input placeholder (the caller supplies the first half).

    Args:
        length: leading extent of x_lw, y_uw_1 and y_uw.
        dim_in: middle extent of x_lw, y_uw_1 and y_uw.
        dim_out: contraction extent (rows of W, last axis of x_lw).
        dim_Y_out: columns of W and last extent of y_uw_1 / y_uw.
        dtype: element type of the placeholders.

    Returns:
        [W, x_lw, y_uw_1, y_uw] -- the three input placeholders and the output.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    x_lw = te.placeholder((length, dim_in, dim_out), name="x_lw", dtype=dtype)
    y_uw_1 = te.placeholder((length, dim_in, dim_Y_out), name="y_uw_1", dtype=dtype)

    # Fixed: the mask previously tested W > 0, which produced W_pos under the
    # W_neg name. The negative part keeps entries <= 0, as in the
    # verify_matmul_lb_2 / verify_matmul_ub_2 workloads.
    W_neg = te.compute(
        W.shape,
        lambda i,j: te.if_then_else(W[i,j]<=0, W[i,j], 0.),
        name='w_neg'
    )

    dout = te.reduce_axis((0, dim_out), "dout")
    # Fixed: this stage was labelled 'y_lw_2' (copy-paste from the lw workload).
    y_uw_2 = te.compute(
        y_uw_1.shape,
        lambda l, j, i:
            te.sum(x_lw[l,j,dout] * W_neg[dout,i], axis=dout),
        name='y_uw_2'
    )
    # Element-wise add of the supplied first half and the freshly computed second half.
    y_uw = te.compute(
        y_uw_1.shape,
        lambda l, j, i: y_uw_1[l,j,i] + y_uw_2[l,j,i],
        name="y_uw"
    )

    return [W, x_lw, y_uw_1, y_uw]

In [8]:
def ansor_tuner(func_pointer, func_args, log_file="ansor_autotuning.json", target=tvm.target.Target("llvm"), num_measure_trials=1000):
    """Auto-schedule a registered workload and return its best schedule.

    Args:
        func_pointer: workload function registered with
            auto_scheduler.register_workload.
        func_args: arguments forwarded to the workload; the workloads in this
            notebook take (length, dim_in, dim_out, dim_Y_out).
        log_file: path that tuning records are written to and that the best
            schedule is afterwards read from.
        target: TVM target to tune for (defaults to "llvm").
        num_measure_trials: measurement budget handed to TuningOptions.
            Defaults to 1000, the value that used to be hard-coded.

    Returns:
        (sch, args) as produced by SearchTask.apply_best, ready to be passed
        to tvm.build.
    """
    task = auto_scheduler.SearchTask(func=func_pointer, args=func_args, target=target)

    # To inspect the computational graph while debugging:
    #     print(task.compute_dag)

    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=num_measure_trials,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
    )
    # Run auto-tuning (search); records go to log_file through the callback.
    task.tune(tune_option)
    # Apply the best schedule found in the log.
    sch, args = task.apply_best(log_file)
    return sch, args

In [9]:
# Problem size shared by every workload below.
length = 2
dim_in = dim_out = dim_Y_out = 1024

# target = tvm.target.Target("llvm")
target = tvm.target.Target("cuda")

# Tensor shapes, named once so inputs and outputs stay in sync.
weight_shape = (dim_out, dim_Y_out)
x_vec_shape = (length, dim_out)
x_mat_shape = (length, dim_in, dim_out)
y_vec_shape = (length, dim_Y_out)
y_mat_shape = (length, dim_in, dim_Y_out)

# Random float32 host inputs (drawn in the order W, x_lb, x_ub, x_lw, x_uw).
W_np = np.random.uniform(size=weight_shape).astype(np.float32)
x_lb_np = np.random.uniform(size=x_vec_shape).astype(np.float32)
x_ub_np = np.random.uniform(size=x_vec_shape).astype(np.float32)
x_lw_np = np.random.uniform(size=x_mat_shape).astype(np.float32)
x_uw_np = np.random.uniform(size=x_mat_shape).astype(np.float32)

# dev = tvm.cpu()
dev = tvm.cuda()

# Copy the inputs onto the device.
W_tvm = tvm.nd.array(W_np, device=dev)
x_lb_tvm = tvm.nd.array(x_lb_np, device=dev)
x_ub_tvm = tvm.nd.array(x_ub_np, device=dev)
x_lw_tvm = tvm.nd.array(x_lw_np, device=dev)
x_uw_tvm = tvm.nd.array(x_uw_np, device=dev)

# Uninitialised device buffers for the 2-D (bias-bound) outputs ...
y_lb_1_tvm, y_lb_2_tvm, y_lb_tvm = (
    tvm.nd.empty(y_vec_shape, device=dev) for _ in range(3)
)
y_ub_1_tvm, y_ub_2_tvm, y_ub_tvm = (
    tvm.nd.empty(y_vec_shape, device=dev) for _ in range(3)
)
# ... and for the 3-D (weight-bound) outputs.
y_lw_1_tvm, y_lw_2_tvm, y_lw_tvm = (
    tvm.nd.empty(y_mat_shape, device=dev) for _ in range(3)
)
y_uw_1_tvm, y_uw_2_tvm, y_uw_tvm = (
    tvm.nd.empty(y_mat_shape, device=dev) for _ in range(3)
)

# Evaluate execution time.
def profile(func, func_args):
    """Time a built function on the module-level device `dev`.

    Builds a time evaluator for the function's entry point with
    min_repeat_ms=500, runs it on `func_args`, and returns the median of the
    reported results multiplied by 1000 (milliseconds, per the original
    comment).
    """
    timer = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    measurements = timer(*func_args).results
    # "Execution time of this operator in ms"
    return np.median(measurements) * 1000

In [10]:
# Tune verify_matmul_lb_1 (records in verify_matmul_lb_1.json), then build the best schedule.
lb_1_sch, lb_1_args = ansor_tuner(
    verify_matmul_lb_1,
    (length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_lb_1.json",
    target=target,
)
func_lb1 = tvm.build(lb_1_sch, lb_1_args, target)

----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 77	fail_ct: 1971	Time elapsed: 0.62
GA Iter: 0	Max score: 0.9607	Min score: 0.0172	#Pop: 77	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9819	#Pop: 128	#M+: 1401	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 6.37
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 1	GFLOPS: 46.37 / 46.37	results: MeasureResult(cost:[0.0001], error_no:0, all_cost:1.01, Tstamp:1637018905.86)
Placeholder: x_lb, W
blockIdx.x l.0@i.0@ (0,8)
  threadIdx.x l.2@i.2@ (0,128)
    y_lb.local auto_unroll: 512
    for dout.0 (0,1024)
     



Time elapsed for training: 0.36 s
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 79	fail_ct: 1969	Time elapsed: 0.53
GA Iter: 0	Max score: 0.9875	Min score: 0.0202	#Pop: 79	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9826	#Pop: 128	#M+: 1399	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 6.48
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 65	GFLOPS: 67.24 / 221.46	results: MeasureResult(cost:[0.0001], error_no:0, all_cost:1.13, Tstamp:1637018953.38)
Placeholder: x_lb, W
blockIdx.x l.0@i.0@ (0,8)
  vthread l.1@i.1@ (0,4)
    threadIdx.x l.2@i.2@ (0,32)
      y_lb.local auto_un

In [11]:
# Tune verify_matmul_lb_2 (records in verify_matmul_lb_2.json), then build the best schedule.
lb_2_sch, lb_2_args = ansor_tuner(
    verify_matmul_lb_2,
    (length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_lb_2.json",
    target=target,
)
func_lb2 = tvm.build(lb_2_sch, lb_2_args, target)

----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 75	fail_ct: 1973	Time elapsed: 0.54
GA Iter: 0	Max score: 0.9916	Min score: 0.0042	#Pop: 75	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9807	#Pop: 128	#M+: 1403	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 6.87
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 1	GFLOPS: 5.57 / 5.57	results: MeasureResult(cost:[0.0009], error_no:0, all_cost:1.21, Tstamp:1637019716.18)
Placeholder: y_lb_1, x_lb, W
vthread l.1@i.1@ (0,4)
  threadIdx.x l.2@i.2@ (0,32)
    for dout.0 (0,256)
      for ax0@ax1@.0.0 (0,32)
      



Time elapsed for training: 0.28 s
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 100	fail_ct: 1948	Time elapsed: 0.68
GA Iter: 0	Max score: 0.9959	Min score: 0.0017	#Pop: 100	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9815	#Pop: 128	#M+: 1395	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 7.33
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 65	GFLOPS: 52.94 / 295.40	results: MeasureResult(cost:[0.0001], error_no:0, all_cost:1.96, Tstamp:1637019760.64)
Placeholder: y_lb_1, x_lb, W
blockIdx.x l.0@i.0@ (0,2)
  vthread l.1@i.1@ (0,2)
    threadIdx.x l.2@i.2@ (0,64)
      y_lb_2 a

In [12]:
# Tune verify_matmul_ub_1 (records in verify_matmul_ub_1.json), then build the best schedule.
ub_1_sch, ub_1_args = ansor_tuner(
    verify_matmul_ub_1,
    (length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_ub_1.json",
    target=target,
)
func_ub1 = tvm.build(ub_1_sch, ub_1_args, target)

----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 101	fail_ct: 1947	Time elapsed: 0.56
GA Iter: 0	Max score: 0.9960	Min score: 0.0007	#Pop: 101	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9814	#Pop: 128	#M+: 1400	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 6.59
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 1	GFLOPS: 19.58 / 19.58	results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.32, Tstamp:1637020474.31)
Placeholder: x_ub, W
threadIdx.x l.2@i.2@ (0,64)
  y_ub_1.local auto_unroll: 1024
  for dout.0 (0,256)
    for ax0@ax1@.0.0 (0,64)
      



Time elapsed for training: 0.25 s
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 90	fail_ct: 1958	Time elapsed: 0.67
GA Iter: 0	Max score: 0.9787	Min score: 0.0089	#Pop: 90	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9787	#Pop: 128	#M+: 1386	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 7.48
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 65	GFLOPS: 56.11 / 149.90	results: MeasureResult(cost:[0.0001], error_no:0, all_cost:1.17, Tstamp:1637020528.10)
Placeholder: x_ub, W
blockIdx.x l.0@i.0@ (0,4)
  vthread l.1@i.1@ (0,2)
    threadIdx.x l.2@i.2@ (0,128)
      y_ub_1.local auto

In [13]:
# Tune verify_matmul_ub_2 (records in verify_matmul_ub_2.json), then build the best schedule.
ub_2_sch, ub_2_args = ansor_tuner(
    verify_matmul_ub_2,
    (length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_ub_2.json",
    target=target,
)
func_ub2 = tvm.build(ub_2_sch, ub_2_args, target)

----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 72	fail_ct: 1976	Time elapsed: 0.65
GA Iter: 0	Max score: 0.9944	Min score: 0.0007	#Pop: 72	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9995	Min score: 0.9789	#Pop: 128	#M+: 1393	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 7.64
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 1	GFLOPS: 14.45 / 14.45	results: MeasureResult(cost:[0.0004], error_no:0, all_cost:1.25, Tstamp:1637021226.00)
Placeholder: y_ub_1, x_ub, W
blockIdx.x l.0@i.0@ (0,2)
  vthread l.1@i.1@ (0,2)
    threadIdx.x l.2@i.2@ (0,32)
      y_ub_2 auto_unroll: 1



Time elapsed for training: 0.26 s
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 70	fail_ct: 1978	Time elapsed: 0.67
GA Iter: 0	Max score: 0.9848	Min score: 0.0010	#Pop: 70	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9820	#Pop: 128	#M+: 1392	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 7.63
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 65	GFLOPS: 18.33 / 357.56	results: MeasureResult(cost:[0.0003], error_no:0, all_cost:1.14, Tstamp:1637021283.50)
Placeholder: y_ub_1, x_ub, W
blockIdx.x l.0@i.0@ (0,2)
  vthread l.1@i.1@ (0,2)
    threadIdx.x l.2@i.2@ (0,512)
      y_ub_2 au

In [14]:
# Tune verify_matmul_lw_1 (records in verify_matmul_lw_1.json), then build the best schedule.
lw_1_sch, lw_1_args = ansor_tuner(
    verify_matmul_lw_1,
    (length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_lw_1.json",
    target=target,
)
func_lw1 = tvm.build(lw_1_sch, lw_1_args, target)

----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 94	fail_ct: 1954	Time elapsed: 1.14
GA Iter: 0	Max score: 0.9984	Min score: 0.0225	#Pop: 94	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9803	#Pop: 128	#M+: 1397	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 9.79
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................**********************E**********
No: 1	GFLOPS: 413.87 / 413.87	results: MeasureResult(cost:[0.0104], error_no:0, all_cost:1.42, Tstamp:1637021951.29)
Placeholder: x_lw, W
blockIdx.x l.0@j.0@i.0@ (0,32)
  vthread l.1@j.1@i.1@ (0,8)
    threadIdx.x l.2@j.2@i.2@ (0,1024)
      y_lw_1.loca



Time elapsed for training: 0.31 s
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 80	fail_ct: 1968	Time elapsed: 1.09
GA Iter: 0	Max score: 0.9807	Min score: 0.0103	#Pop: 80	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9997	Min score: 0.9815	#Pop: 128	#M+: 1387	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 10.69
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 65	GFLOPS: 12194.78 / 13710.16	results: MeasureResult(cost:[0.0004], error_no:0, all_cost:3.02, Tstamp:1637022029.35)
Placeholder: x_lw, W
blockIdx.x l.0@j.0@i.0@ (0,256)
  vthread l.1@j.1@i.1@ (0,4)
    threadIdx.x l.2@j.2@i.2@ (0,256)
   

In [15]:
# Tune verify_matmul_lw_2 (records in verify_matmul_lw_2.json), then build the best schedule.
lw_2_sch, lw_2_args = ansor_tuner(
    verify_matmul_lw_2,
    (length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_lw_2.json",
    target=target,
)
func_lw2 = tvm.build(lw_2_sch, lw_2_args, target)

----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 77	fail_ct: 1971	Time elapsed: 1.13
GA Iter: 0	Max score: 0.9752	Min score: 0.0037	#Pop: 77	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9998	Min score: 0.9795	#Pop: 128	#M+: 1387	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 10.57
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 1	GFLOPS: 153.05 / 153.05	results: MeasureResult(cost:[0.0281], error_no:0, all_cost:2.34, Tstamp:1637023157.47)
Placeholder: y_lw_1, x_uw, W
blockIdx.x l.0@j.0@i.0@ (0,16)
  vthread l.1@j.1@i.1@ (0,8)
    threadIdx.x l.2@j.2@i.2@ (0,64)
      y_lw_



Time elapsed for training: 0.29 s
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 85	fail_ct: 1963	Time elapsed: 1.13
GA Iter: 0	Max score: 0.9994	Min score: 0.0149	#Pop: 85	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9812	#Pop: 128	#M+: 1385	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 10.89
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 65	GFLOPS: 123.76 / 13348.03	results: MeasureResult(cost:[0.0347], error_no:0, all_cost:10.00, Tstamp:1637023235.69)
Placeholder: y_lw_1, x_uw, W
blockIdx.x l.0@j.0@i.0@ (0,128)
  vthread l.1@j.1@i.1@ (0,2)
    threadIdx.x l.2@j.2@i.2@ (0,1

In [16]:
# Tune verify_matmul_uw_1 (records in verify_matmul_uw_1.json), then build the best schedule.
uw_1_sch, uw_1_args = ansor_tuner(
    verify_matmul_uw_1,
    (length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_uw_1.json",
    target=target,
)
func_uw1 = tvm.build(uw_1_sch, uw_1_args, target)

----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 93	fail_ct: 1955	Time elapsed: 1.17
GA Iter: 0	Max score: 0.9970	Min score: 0.0041	#Pop: 93	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9788	#Pop: 128	#M+: 1392	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 9.98
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 1	GFLOPS: 472.04 / 472.04	results: MeasureResult(cost:[0.0091], error_no:0, all_cost:3.03, Tstamp:1637024350.49)
Placeholder: x_uw, W
blockIdx.x l.0@j.0@i.0@ (0,16)
  vthread l.1@j.1@i.1@ (0,4)
    threadIdx.x l.2@j.2@i.2@ (0,128)
      y_uw_1.local 



Time elapsed for training: 0.28 s
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 98	fail_ct: 1950	Time elapsed: 1.09
GA Iter: 0	Max score: 0.9832	Min score: 0.0170	#Pop: 98	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9793	#Pop: 128	#M+: 1398	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 10.72
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................********************************
No: 65	GFLOPS: 2496.29 / 13689.85	results: MeasureResult(cost:[0.0017], error_no:0, all_cost:1.39, Tstamp:1637024439.48)
Placeholder: x_uw, W
blockIdx.x l.0@j.0@i.0@ (0,256)
  threadIdx.x l.2@j.2@i.2@ (0,256)
    y_uw_1.local auto_unroll: 1024


In [17]:
# Tune verify_matmul_uw_2 (records in verify_matmul_uw_2.json), then build the best schedule.
uw_2_sch, uw_2_args = ansor_tuner(
    verify_matmul_uw_2,
    (length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_uw_2.json",
    target=target,
)
func_uw2 = tvm.build(uw_2_sch, uw_2_args, target)

----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 106	fail_ct: 1942	Time elapsed: 1.17
GA Iter: 0	Max score: 0.9971	Min score: 0.0068	#Pop: 106	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9998	Min score: 0.9810	#Pop: 128	#M+: 1391	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 10.70
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................*******E*************************
No: 1	GFLOPS: 304.34 / 304.34	results: MeasureResult(cost:[0.0141], error_no:0, all_cost:1.94, Tstamp:1637025844.61)
Placeholder: y_uw_1, x_lw, W
blockIdx.x l.0@j.0@i.0@ (0,16)
  threadIdx.x l.2@j.2@i.2@ (0,128)
    y_lw_2 auto_unroll: 512
    for do



Time elapsed for training: 0.27 s
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 96	fail_ct: 1952	Time elapsed: 1.02
GA Iter: 0	Max score: 0.9927	Min score: 0.0147	#Pop: 96	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9827	#Pop: 128	#M+: 1380	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 11.51
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 64 programs to measure:
................................**************E**************E****
No: 65	GFLOPS: 75.03 / 13829.19	results: MeasureResult(cost:[0.0573], error_no:0, all_cost:3.46, Tstamp:1637025920.18)
Placeholder: y_uw_1, x_lw, W
blockIdx.x l.0@j.0@i.0@ (0,4)
  vthread l.1@j.1@i.1@ (0,4)
    threadIdx.x l.2@j.2@i.2@ (0,32)

In [18]:
# Time every tuned kernel. Each "_2" kernel takes the matching "_1" output
# buffer as its third argument and writes the combined bound to the last one.
verify_matmul_lb_1_latency = profile(func_lb1, (W_tvm, x_lb_tvm, y_lb_1_tvm))
verify_matmul_lb_2_latency = profile(func_lb2, (W_tvm, x_ub_tvm, y_lb_1_tvm, y_lb_tvm))
verify_matmul_ub_1_latency = profile(func_ub1, (W_tvm, x_ub_tvm, y_ub_1_tvm))
verify_matmul_ub_2_latency = profile(func_ub2, (W_tvm, x_lb_tvm, y_ub_1_tvm, y_ub_tvm))
verify_matmul_lw_1_latency = profile(func_lw1, (W_tvm, x_lw_tvm, y_lw_1_tvm))
verify_matmul_lw_2_latency = profile(func_lw2, (W_tvm, x_uw_tvm, y_lw_1_tvm, y_lw_tvm))
verify_matmul_uw_1_latency = profile(func_uw1, (W_tvm, x_uw_tvm, y_uw_1_tvm))
verify_matmul_uw_2_latency = profile(func_uw2, (W_tvm, x_lw_tvm, y_uw_1_tvm, y_uw_tvm))

# (label, value) pairs in print order; the total adds them in this same order.
latency_report = [
    ("verify_matmul_lb_1_latency: ", verify_matmul_lb_1_latency),
    ("verify_matmul_lb_2_latency: ", verify_matmul_lb_2_latency),
    ("verify_matmul_ub_1_latency: ", verify_matmul_ub_1_latency),
    ("verify_matmul_ub_2_latency: ", verify_matmul_ub_2_latency),
    ("verify_matmul_lw_1_latency: ", verify_matmul_lw_1_latency),
    ("verify_matmul_lw_2_latency: ", verify_matmul_lw_2_latency),
    ("verify_matmul_uw_1_latency: ", verify_matmul_uw_1_latency),
    ("verify_matmul_uw_2_latency: ", verify_matmul_uw_2_latency),
]
total_latency = sum(value for _, value in latency_report)

print(
    *[field for entry in latency_report for field in entry],
    "total time: ", total_latency
)

verify_matmul_lb_1_latency:  0.008919071026524108 verify_matmul_lb_2_latency:  0.00872938604737549 verify_matmul_ub_1_latency:  0.008837150053214648 verify_matmul_ub_2_latency:  0.009033510423060867 verify_matmul_lw_1_latency:  0.23663836219459924 verify_matmul_lw_2_latency:  0.26998218702865756 verify_matmul_uw_1_latency:  0.2513553490384615 verify_matmul_uw_2_latency:  0.2694436762275158 total time:  1.0629386920394093
