In [1]:
import os

import numpy as np
import tvm
from tvm import te, auto_scheduler

import os
# Restrict this process to a single physical GPU.
# NOTE(review): presumably this must run before any library creates a CUDA
# context for it to take effect -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"]='1' # 0 for A6000 on winnie, 1 for P6000 on winnie.
from tvm.autotvm.measure.measure_methods import set_cuda_target_arch
# Optional override of the CUDA compute architecture (currently disabled).
# set_cuda_target_arch('sm_75')

In [2]:
# Note that fusing all computation into one graph is not supported yet on Ansor.
# Check: https://discuss.tvm.apache.org/t/assertion-triggered-when-auto-scheduling/9613/4
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_not_supported_yet(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Fused workload: push all four linear bounds through a matmul with ``W``.

    ``W`` is split into ``w_pos`` (entries > 0, others zeroed) and ``w_neg``
    (entries <= 0, others zeroed). Lower outputs pair the lower inputs with
    ``w_pos`` and the upper inputs with ``w_neg``; upper outputs do the
    opposite:

        y_lb = x_lb @ w_pos + x_ub @ w_neg
        y_ub = x_ub @ w_pos + x_lb @ w_neg
        y_lw = x_lw @ w_pos + x_uw @ w_neg
        y_uw = x_uw @ w_pos + x_lw @ w_neg

    Args:
        length: Leading dimension of every bound tensor.
        dim_in: Middle dimension of the weight bounds ``x_lw`` / ``x_uw``.
        dim_out: Contracted dimension (rows of ``W``).
        dim_Y_out: Output dimension (columns of ``W``).
        dtype: Element type of all placeholders.

    Returns:
        ``[W, x_lw, x_uw, x_lb, x_ub, y_lw, y_uw, y_lb, y_ub]`` -- the five
        input placeholders followed by the four output tensors.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    x_lw = te.placeholder((length, dim_in, dim_out), name="x_lw", dtype=dtype)
    x_uw = te.placeholder((length, dim_in, dim_out), name="x_uw", dtype=dtype)
    x_lb = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)
    x_ub = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)

    # Output shapes are given directly; no placeholder is needed for an output.
    bias_shape = (length, dim_Y_out)
    weight_shape = (length, dim_in, dim_Y_out)

    W_pos = te.compute(
        W.shape,
        lambda i, j: te.if_then_else(W[i, j] > 0, W[i, j], 0.),
        name='w_pos'
    )
    W_neg = te.compute(
        W.shape,
        lambda i, j: te.if_then_else(W[i, j] <= 0, W[i, j], 0.),
        name='w_neg'
    )

    dout = te.reduce_axis((0, dim_out), "dout")

    def bias_term(x, w_part, name):
        # (length, dim_out) @ (dim_out, dim_Y_out) contraction over dout.
        return te.compute(
            bias_shape,
            lambda l, i: te.sum(x[l, dout] * w_part[dout, i], axis=dout),
            name=name
        )

    def weight_term(x, w_part, name):
        # (length, dim_in, dim_out) @ (dim_out, dim_Y_out) contraction over dout.
        return te.compute(
            weight_shape,
            lambda l, j, i: te.sum(x[l, j, dout] * w_part[dout, i], axis=dout),
            name=name
        )

    y_lb_1 = bias_term(x_lb, W_pos, 'y_lb_1')
    y_lb_2 = bias_term(x_ub, W_neg, 'y_lb_2')
    y_lb = te.compute(bias_shape, lambda l, i: y_lb_1[l, i] + y_lb_2[l, i], name="y_lb")

    y_ub_1 = bias_term(x_ub, W_pos, 'y_ub_1')
    # The upper bound pairs the *lower* input with the non-positive weights.
    # (Using W_pos here would just recompute y_lb_1.)
    y_ub_2 = bias_term(x_lb, W_neg, 'y_ub_2')
    y_ub = te.compute(bias_shape, lambda l, i: y_ub_1[l, i] + y_ub_2[l, i], name="y_ub")

    y_lw_1 = weight_term(x_lw, W_pos, 'y_lw_1')
    y_lw_2 = weight_term(x_uw, W_neg, 'y_lw_2')
    y_lw = te.compute(
        weight_shape, lambda l, j, i: y_lw_1[l, j, i] + y_lw_2[l, j, i], name="y_lw"
    )

    y_uw_1 = weight_term(x_uw, W_pos, 'y_uw_1')
    y_uw_2 = weight_term(x_lw, W_neg, 'y_uw_2')
    y_uw = te.compute(
        weight_shape, lambda l, j, i: y_uw_1[l, j, i] + y_uw_2[l, j, i], name="y_uw"
    )

    return [W, x_lw, x_uw, x_lb, x_ub, y_lw, y_uw, y_lb, y_ub]

In [3]:
# Observed: Ansor failed on this kernel as originally written.
# NOTE(review): the original returned [W, x_lb, y_lb] and left out the x_ub
# placeholder that y_lb_2 reads. verify_matmul_lb_2 contains the same final
# add (y_lb = y_lb_1 + y_lb_2) and tunes fine, so the missing input -- not the
# add -- is the likely cause. Re-run to confirm.
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_not_supported(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Lower-bias bound in one graph: ``y_lb = x_lb @ w_pos + x_ub @ w_neg``.

    Args:
        length: Leading dimension of the bias bounds.
        dim_in: Unused here; kept so all workloads share one signature.
        dim_out: Contracted dimension (rows of ``W``).
        dim_Y_out: Output dimension (columns of ``W``).
        dtype: Element type of all placeholders.

    Returns:
        ``[W, x_lb, x_ub, y_lb]`` -- every input placeholder the graph reads,
        followed by the output tensor.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    x_lb = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)
    x_ub = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)
    out_shape = (length, dim_Y_out)

    W_pos = te.compute(W.shape, lambda i, j: te.if_then_else(W[i, j] > 0, W[i, j], 0.), name='w_pos')
    W_neg = te.compute(W.shape, lambda i, j: te.if_then_else(W[i, j] <= 0, W[i, j], 0.), name='w_neg')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_lb_1 = te.compute(
        out_shape,
        lambda l, i: te.sum(x_lb[l, dout] * W_pos[dout, i], axis=dout),
        name='y_lb_1'
    )
    y_lb_2 = te.compute(
        out_shape,
        lambda l, i: te.sum(x_ub[l, dout] * W_neg[dout, i], axis=dout),
        name='y_lb_2'
    )
    y_lb = te.compute(
        out_shape,
        lambda l, i: y_lb_1[l, i] + y_lb_2[l, i],
        name="y_lb"
    )

    # x_ub is an input of y_lb_2, so it has to be part of the argument list.
    return [W, x_lb, x_ub, y_lb]

In [4]:
# Observed: Ansor reported no error for this version, but ran for 25 minutes
# without producing a single schedule.
# NOTE(review): the original returned [W, x_lb, y_lb_1, y_lb_2] and left out
# the x_ub placeholder that y_lb_2 reads; that is a likely cause. Returning
# two outputs may be a separate limitation. Re-run to confirm.
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_stuck(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Both lower-bias partial products, without the final add.

    Computes ``y_lb_1 = x_lb @ w_pos`` and ``y_lb_2 = x_ub @ w_neg`` and
    returns them as two separate outputs.

    Args:
        length: Leading dimension of the bias bounds.
        dim_in: Unused here; kept so all workloads share one signature.
        dim_out: Contracted dimension (rows of ``W``).
        dim_Y_out: Output dimension (columns of ``W``).
        dtype: Element type of all placeholders.

    Returns:
        ``[W, x_lb, x_ub, y_lb_1, y_lb_2]`` -- every input placeholder the
        graph reads, followed by the two output tensors.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    x_lb = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)
    x_ub = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)
    # Outputs only need a shape; no placeholder is created for them.
    out_shape = (length, dim_Y_out)

    W_pos = te.compute(W.shape, lambda i, j: te.if_then_else(W[i, j] > 0, W[i, j], 0.), name='w_pos')
    W_neg = te.compute(W.shape, lambda i, j: te.if_then_else(W[i, j] <= 0, W[i, j], 0.), name='w_neg')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_lb_1 = te.compute(
        out_shape,
        lambda l, i: te.sum(x_lb[l, dout] * W_pos[dout, i], axis=dout),
        name='y_lb_1'
    )
    y_lb_2 = te.compute(
        out_shape,
        lambda l, i: te.sum(x_ub[l, dout] * W_neg[dout, i], axis=dout),
        name='y_lb_2'
    )

    # x_ub is an input of y_lb_2, so it has to be part of the argument list.
    return [W, x_lb, x_ub, y_lb_1, y_lb_2]

In [5]:
# Observed: Ansor kept reporting the following while compiling this kernel:
#    Target has been reduced to 1 due to too many failures or duplications
#    See: https://discuss.tvm.apache.org/t/autoscheduler-prints-target-has-been-reduced-to-1-due-to-too-many-failures-or-duplications-and-fails-to-tune/10774/4
#    Renaming the operators did not help; the search still got stuck.
# NOTE(review): the original returned [W, x_lb, y_lb] and left out the x_ub
# placeholder that the reduction reads; that is a likely cause of the
# failures. Re-run to confirm.
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_stuck2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Lower-bias bound as a single fused reduction.

    Computes ``sum_dout(x_lb * w_pos + x_ub * w_neg)`` in one stage instead of
    two partial products plus an add. Tensor names are the generic ``I_*`` /
    ``Y_*`` labels from the renaming experiment.

    Args:
        length: Leading dimension of the bias bounds.
        dim_in: Unused here; kept so all workloads share one signature.
        dim_out: Contracted dimension (rows of ``W``).
        dim_Y_out: Output dimension (columns of ``W``).
        dtype: Element type of all placeholders.

    Returns:
        ``[W, x_lb, x_ub, y_lb]`` -- every input placeholder the graph reads,
        followed by the output tensor.
    """
    W = te.placeholder((dim_out, dim_Y_out), name="I_1", dtype=dtype)
    x_lb = te.placeholder((length, dim_out), name="I_2", dtype=dtype)
    x_ub = te.placeholder((length, dim_out), name="I_3", dtype=dtype)

    W_pos = te.compute(W.shape, lambda i, j: te.if_then_else(W[i, j] > 0, W[i, j], 0.), name='I_5')
    W_neg = te.compute(W.shape, lambda i, j: te.if_then_else(W[i, j] <= 0, W[i, j], 0.), name='I_6')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_lb = te.compute(
        (length, dim_Y_out),
        lambda l, i:
            te.sum(x_lb[l, dout] * W_pos[dout, i] + x_ub[l, dout] * W_neg[dout, i], axis=dout),
        name='Y_7'
    )

    # x_ub is read inside the reduction, so it has to be part of the argument list.
    return [W, x_lb, x_ub, y_lb]

In [6]:
# This is the largest subgraph that is supported by Ansor.
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_lb_1(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """First half of the lower-bias bound: ``y_lb1 = x_lb @ W_pos``.

    ``W_pos`` keeps the entries of ``W`` that are > 0 and zeroes the rest.
    ``dim_in`` is not used by this workload.

    Returns:
        ``[W, x_lb, y_lb1]`` (two inputs, one output).
    """
    weight = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    lower_bias = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)

    def positive_part(i, j):
        # Strictly positive weights pass through; everything else becomes 0.
        return te.if_then_else(weight[i, j] > 0, weight[i, j], 0.)

    weight_pos = te.compute(weight.shape, positive_part, name='W_pos')

    red = te.reduce_axis((0, dim_out), "dout")

    def contract(l, i):
        # Contract the shared dim_out axis.
        return te.sum(lower_bias[l, red] * weight_pos[red, i], axis=red)

    partial = te.compute((length, dim_Y_out), contract, name='y_lb1')

    return [weight, lower_bias, partial]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_lb_2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Second half of the lower-bias bound: ``y_lb = y_lb_1 + x_ub @ W_neg``.

    ``y_lb_1`` is an input here (the result of ``verify_matmul_lb_1``).
    ``W_neg`` keeps the entries of ``W`` that are <= 0 and zeroes the rest.

    Args:
        length: Leading dimension of the bias bounds.
        dim_in: Unused here; kept so all workloads share one signature.
        dim_out: Contracted dimension (rows of ``W``).
        dim_Y_out: Output dimension (columns of ``W``).
        dtype: Element type of all placeholders.

    Returns:
        ``[W, x_ub, y_lb_1, y_lb]`` (three inputs, one output).
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    # Labelled "x_ub" to match the tensor it holds (it was mislabelled "x_lb").
    x_ub = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)
    y_lb_1 = te.placeholder((length, dim_Y_out), name="y_lb_1", dtype=dtype)

    W_neg = te.compute(W.shape, lambda i, j: te.if_then_else(W[i, j] <= 0, W[i, j], 0.), name='W_neg')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_lb_2 = te.compute(
        y_lb_1.shape,
        lambda l, i:
            te.sum(x_ub[l, dout] * W_neg[dout, i], axis=dout),
        name='y_lb_2'
    )
    y_lb = te.compute(y_lb_1.shape, lambda l, i: y_lb_1[l, i] + y_lb_2[l, i], name="y_lb")

    return [W, x_ub, y_lb_1, y_lb]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_ub_1(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """First half of the upper-bias bound: ``y_ub_1 = x_ub @ W_pos``.

    ``W_pos`` keeps the entries of ``W`` that are > 0 and zeroes the rest.
    ``dim_in`` is not used by this workload.

    Returns:
        ``[W, x_ub, y_ub_1]`` (two inputs, one output).
    """
    weight = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    upper_bias = te.placeholder((length, dim_out), name="x_ub", dtype=dtype)

    def positive_part(i, j):
        # Strictly positive weights pass through; everything else becomes 0.
        return te.if_then_else(weight[i, j] > 0, weight[i, j], 0.)

    weight_pos = te.compute(weight.shape, positive_part, name='W_pos')

    red = te.reduce_axis((0, dim_out), "dout")

    def contract(l, i):
        # Contract the shared dim_out axis.
        return te.sum(upper_bias[l, red] * weight_pos[red, i], axis=red)

    partial = te.compute((length, dim_Y_out), contract, name='y_ub_1')

    return [weight, upper_bias, partial]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_ub_2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Second half of the upper-bias bound: ``y_ub = y_ub_1 + x_lb @ W_neg``.

    ``y_ub_1`` is an input here (the result of ``verify_matmul_ub_1``).
    ``W_neg`` keeps the entries of ``W`` that are <= 0 and zeroes the rest.

    Args:
        length: Leading dimension of the bias bounds.
        dim_in: Unused here; kept so all workloads share one signature.
        dim_out: Contracted dimension (rows of ``W``).
        dim_Y_out: Output dimension (columns of ``W``).
        dtype: Element type of all placeholders.

    Returns:
        ``[W, x_lb, y_ub_1, y_ub]`` (three inputs, one output).
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    # Labelled "x_lb" to match the tensor it holds (it was mislabelled "x_ub").
    x_lb = te.placeholder((length, dim_out), name="x_lb", dtype=dtype)
    y_ub_1 = te.placeholder((length, dim_Y_out), name="y_ub_1", dtype=dtype)

    W_neg = te.compute(W.shape, lambda i, j: te.if_then_else(W[i, j] <= 0, W[i, j], 0.), name='W_neg')

    dout = te.reduce_axis((0, dim_out), "dout")
    y_ub_2 = te.compute(
        y_ub_1.shape,
        lambda l, i:
            te.sum(x_lb[l, dout] * W_neg[dout, i], axis=dout),
        name='y_ub_2'
    )
    y_ub = te.compute(y_ub_1.shape, lambda l, i: y_ub_1[l, i] + y_ub_2[l, i], name="y_ub")

    return [W, x_lb, y_ub_1, y_ub]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_lw_1(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """First half of the lower-weight bound: ``y_lw_1 = x_lw @ w_pos``.

    ``w_pos`` keeps the entries of ``W`` that are > 0 and zeroes the rest.
    The contraction runs over ``dim_out`` for every ``(length, dim_in)`` row.

    Returns:
        ``[W, x_lw, y_lw_1]`` (two inputs, one output).
    """
    weight = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    lower_w = te.placeholder((length, dim_in, dim_out), name="x_lw", dtype=dtype)

    def positive_part(i, j):
        # Strictly positive weights pass through; everything else becomes 0.
        return te.if_then_else(weight[i, j] > 0, weight[i, j], 0.)

    weight_pos = te.compute(weight.shape, positive_part, name='w_pos')

    red = te.reduce_axis((0, dim_out), "dout")

    def contract(l, j, i):
        # Contract the shared dim_out axis.
        return te.sum(lower_w[l, j, red] * weight_pos[red, i], axis=red)

    partial = te.compute((length, dim_in, dim_Y_out), contract, name='y_lw_1')

    return [weight, lower_w, partial]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_lw_2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Second half of the lower-weight bound: ``y_lw = y_lw_1 + x_uw @ w_neg``.

    ``y_lw_1`` is an input here. ``w_neg`` keeps the entries of ``W`` that are
    <= 0 and zeroes the rest.

    Returns:
        ``[W, x_uw, y_lw_1, y_lw]`` (three inputs, one output).
    """
    weight = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    upper_w = te.placeholder((length, dim_in, dim_out), name="x_uw", dtype=dtype)
    first_half = te.placeholder((length, dim_in, dim_Y_out), name="y_lw_1", dtype=dtype)

    def non_positive_part(i, j):
        # Weights <= 0 pass through; everything else becomes 0.
        return te.if_then_else(weight[i, j] <= 0, weight[i, j], 0.)

    weight_neg = te.compute(weight.shape, non_positive_part, name='w_neg')

    red = te.reduce_axis((0, dim_out), "dout")

    def contract(l, j, i):
        # Contract the shared dim_out axis.
        return te.sum(upper_w[l, j, red] * weight_neg[red, i], axis=red)

    second_half = te.compute(first_half.shape, contract, name='y_lw_2')

    def combine(l, j, i):
        # Element-wise sum of the two partial products.
        return first_half[l, j, i] + second_half[l, j, i]

    total = te.compute(first_half.shape, combine, name="y_lw")

    return [weight, upper_w, first_half, total]


@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_uw_1(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """First half of the upper-weight bound: ``y_uw_1 = x_uw @ w_pos``.

    ``w_pos`` keeps the entries of ``W`` that are > 0 and zeroes the rest.
    The contraction runs over ``dim_out`` for every ``(length, dim_in)`` row.

    Returns:
        ``[W, x_uw, y_uw_1]`` (two inputs, one output).
    """
    weight = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    upper_w = te.placeholder((length, dim_in, dim_out), name="x_uw", dtype=dtype)

    def positive_part(i, j):
        # Strictly positive weights pass through; everything else becomes 0.
        return te.if_then_else(weight[i, j] > 0, weight[i, j], 0.)

    weight_pos = te.compute(weight.shape, positive_part, name='w_pos')

    red = te.reduce_axis((0, dim_out), "dout")

    def contract(l, j, i):
        # Contract the shared dim_out axis.
        return te.sum(upper_w[l, j, red] * weight_pos[red, i], axis=red)

    partial = te.compute((length, dim_in, dim_Y_out), contract, name='y_uw_1')

    return [weight, upper_w, partial]

@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def verify_matmul_uw_2(length, dim_in, dim_out, dim_Y_out, dtype="float32"):
    """Second half of the upper-weight bound: ``y_uw = y_uw_1 + x_lw @ w_neg``.

    ``y_uw_1`` is an input here (the result of ``verify_matmul_uw_1``).
    ``w_neg`` keeps the entries of ``W`` that are <= 0 and zeroes the rest.

    Args:
        length: Leading dimension of the weight bounds.
        dim_in: Middle dimension of the weight bounds.
        dim_out: Contracted dimension (rows of ``W``).
        dim_Y_out: Output dimension (columns of ``W``).
        dtype: Element type of all placeholders.

    Returns:
        ``[W, x_lw, y_uw_1, y_uw]`` (three inputs, one output).
    """
    W = te.placeholder((dim_out, dim_Y_out), name="W", dtype=dtype)
    x_lw = te.placeholder((length, dim_in, dim_out), name="x_lw", dtype=dtype)
    y_uw_1 = te.placeholder((length, dim_in, dim_Y_out), name="y_uw_1", dtype=dtype)

    W_neg = te.compute(
        W.shape,
        lambda i, j: te.if_then_else(W[i, j] <= 0, W[i, j], 0.),
        name='w_neg'
    )

    dout = te.reduce_axis((0, dim_out), "dout")
    y_uw_2 = te.compute(
        y_uw_1.shape,
        lambda l, j, i:
            te.sum(x_lw[l, j, dout] * W_neg[dout, i], axis=dout),
        # Stage label matches the variable (it was mislabelled 'y_lw_2').
        name='y_uw_2'
    )
    y_uw = te.compute(
        y_uw_1.shape,
        lambda l, j, i: y_uw_1[l, j, i] + y_uw_2[l, j, i],
        name="y_uw"
    )

    return [W, x_lw, y_uw_1, y_uw]

In [7]:
def ansor_tuner(func_pointer, func_args, log_file="ansor_autotuning.json", target=tvm.target.Target("llvm"), num_measure_trials=1, verbose=2):
    """Auto-tune a registered workload with Ansor and return its best schedule.

    Args:
        func_pointer: Workload function decorated with
            ``auto_scheduler.register_workload``.
        func_args: Arguments forwarded to the workload; for the workloads in
            this notebook that is ``(length, dim_in, dim_out, dim_Y_out)``.
        log_file: File the measurement records are written to and from which
            the best schedule is read back.
        target: Compilation target of the search task.
        num_measure_trials: Number of measurement trials for the search. The
            default of 1 is only a smoke test; raise it for real tuning.
        verbose: Verbosity level passed to ``TuningOptions``.

    Returns:
        ``(sch, args)`` as returned by ``task.apply_best(log_file)``.

    NOTE(review): records from earlier runs are presumably appended to
    ``log_file``, so ``apply_best`` may pick up a schedule from a previous
    session -- confirm, and delete the file when a clean run is needed.
    """
    task = tvm.auto_scheduler.SearchTask(func=func_pointer, args=func_args, target=target)

    # To inspect the computational graph, print ``task.compute_dag`` here.

    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=num_measure_trials,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=verbose,
    )
    # Run auto-tuning (search); measurements are recorded in log_file.
    task.tune(tune_option)
    # Apply the best schedule found in log_file.
    sch, args = task.apply_best(log_file)
    return sch, args

In [8]:
# Problem size shared by every tuning / profiling cell below.
length=2
# Alternative (smaller) problem size:
# dim_in = 512
# dim_out = 512
# dim_Y_out = 512
dim_in = dim_out = dim_Y_out = 1024

# Compilation target; switch both `target` and `dev` together to run on CPU.
# target = tvm.target.Target("llvm")
target = tvm.target.Target("cuda")

# Random float32 host inputs, drawn from np.random.uniform with default bounds.
# No NumPy seed is set, so the values differ between runs.
W_np = np.random.uniform(size=(dim_out, dim_Y_out)).astype(np.float32)
x_lb_np = np.random.uniform(size=(length, dim_out)).astype(np.float32)
x_ub_np = np.random.uniform(size=(length, dim_out)).astype(np.float32)
x_lw_np = np.random.uniform(size=(length, dim_in, dim_out)).astype(np.float32)
x_uw_np = np.random.uniform(size=(length, dim_in, dim_out)).astype(np.float32)

# Device that holds the TVM arrays; also read by profile() below.
# dev = tvm.cpu()
dev = tvm.cuda()
# Copy the inputs to the device.
W_tvm = tvm.nd.array(W_np, device=dev)
x_lb_tvm = tvm.nd.array(x_lb_np, device=dev)
x_ub_tvm = tvm.nd.array(x_ub_np, device=dev)
x_lw_tvm = tvm.nd.array(x_lw_np, device=dev)
x_uw_tvm = tvm.nd.array(x_uw_np, device=dev)
# Device buffers for intermediates (*_1, *_2) and final outputs.
# The *_2 buffers are not passed to any kernel in the cells shown here.
y_lb_1_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_lb_2_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_lb_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_ub_1_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_ub_2_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_ub_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_lw_1_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)
y_lw_2_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)
y_lw_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)
y_uw_1_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)
y_uw_2_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)
y_uw_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)

# Evaluate execution time.
def profile(func, func_args):
    """Return the mean execution time of a built kernel in milliseconds.

    Args:
        func: Built module exposing ``time_evaluator`` and ``entry_name``.
        func_args: Arguments the kernel is invoked with.

    The kernel is timed on the notebook-level device ``dev`` with
    ``min_repeat_ms=500``.
    """
    timer = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500)
    timings = timer(*func_args).results
    # Scale by 1000 so the reported figure is in milliseconds.
    return np.mean(timings) * 1000

In [9]:
# Tune the lb_1 sub-kernel, then compile the best schedule that was found.
lb_1_sch, lb_1_args = ansor_tuner(
    func_pointer=verify_matmul_lb_1,
    func_args=(length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_lb_1.json",
    target=target,
)
func_lb1 = tvm.build(lb_1_sch, lb_1_args, target=target)

Generate Sketches		#s: 1
Sample Initial Population	#s: 99	fail_ct: 1949	Time elapsed: 0.65
GA Iter: 0	Max score: 0.9904	Min score: 0.0070	#Pop: 99	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9803	#Pop: 128	#M+: 1388	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 6.37


In [10]:
# Tune the lb_2 sub-kernel, then compile the best schedule that was found.
lb_2_sch, lb_2_args = ansor_tuner(
    func_pointer=verify_matmul_lb_2,
    func_args=(length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_lb_2.json",
    target=target,
)
func_lb2 = tvm.build(lb_2_sch, lb_2_args, target=target)

Generate Sketches		#s: 1
Sample Initial Population	#s: 80	fail_ct: 1968	Time elapsed: 0.56
GA Iter: 0	Max score: 0.9968	Min score: 0.0168	#Pop: 80	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9818	#Pop: 128	#M+: 1386	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 6.64


In [11]:
# Tune the ub_1 sub-kernel, then compile the best schedule that was found.
ub_1_sch, ub_1_args = ansor_tuner(
    func_pointer=verify_matmul_ub_1,
    func_args=(length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_ub_1.json",
    target=target,
)
func_ub1 = tvm.build(ub_1_sch, ub_1_args, target=target)

Generate Sketches		#s: 1
Sample Initial Population	#s: 99	fail_ct: 1949	Time elapsed: 0.56
GA Iter: 0	Max score: 0.9904	Min score: 0.0020	#Pop: 99	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9792	#Pop: 128	#M+: 1391	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 6.54


In [12]:
# Tune the ub_2 sub-kernel, then compile the best schedule that was found.
ub_2_sch, ub_2_args = ansor_tuner(
    func_pointer=verify_matmul_ub_2,
    func_args=(length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_ub_2.json",
    target=target,
)
func_ub2 = tvm.build(ub_2_sch, ub_2_args, target=target)

Generate Sketches		#s: 1
Sample Initial Population	#s: 72	fail_ct: 1976	Time elapsed: 0.58
GA Iter: 0	Max score: 0.9991	Min score: 0.0038	#Pop: 72	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9996	Min score: 0.9799	#Pop: 128	#M+: 1393	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 6.58


In [13]:
# Tune the lw_1 sub-kernel, then compile the best schedule that was found.
lw_1_sch, lw_1_args = ansor_tuner(
    func_pointer=verify_matmul_lw_1,
    func_args=(length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_lw_1.json",
    target=target,
)
func_lw1 = tvm.build(lw_1_sch, lw_1_args, target=target)

Generate Sketches		#s: 1
Sample Initial Population	#s: 92	fail_ct: 1956	Time elapsed: 0.96
GA Iter: 0	Max score: 0.9919	Min score: 0.0211	#Pop: 92	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9814	#Pop: 128	#M+: 1388	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 9.24


In [14]:
# Tune the lw_2 sub-kernel, then compile the best schedule that was found.
lw_2_sch, lw_2_args = ansor_tuner(
    func_pointer=verify_matmul_lw_2,
    func_args=(length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_lw_2.json",
    target=target,
)
func_lw2 = tvm.build(lw_2_sch, lw_2_args, target=target)

Generate Sketches		#s: 1
Sample Initial Population	#s: 103	fail_ct: 1945	Time elapsed: 0.97
GA Iter: 0	Max score: 0.9920	Min score: 0.0326	#Pop: 103	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9998	Min score: 0.9814	#Pop: 128	#M+: 1388	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 9.80


In [15]:
# Tune the uw_1 sub-kernel, then compile the best schedule that was found.
uw_1_sch, uw_1_args = ansor_tuner(
    func_pointer=verify_matmul_uw_1,
    func_args=(length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_uw_1.json",
    target=target,
)
func_uw1 = tvm.build(uw_1_sch, uw_1_args, target=target)

Generate Sketches		#s: 1
Sample Initial Population	#s: 82	fail_ct: 1966	Time elapsed: 0.92
GA Iter: 0	Max score: 0.9696	Min score: 0.0090	#Pop: 82	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9792	#Pop: 128	#M+: 1385	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 9.50


In [16]:
# Tune the uw_2 sub-kernel, then compile the best schedule that was found.
uw_2_sch, uw_2_args = ansor_tuner(
    func_pointer=verify_matmul_uw_2,
    func_args=(length, dim_in, dim_out, dim_Y_out),
    log_file="verify_matmul_uw_2.json",
    target=target,
)
func_uw2 = tvm.build(uw_2_sch, uw_2_args, target=target)

Generate Sketches		#s: 1
Sample Initial Population	#s: 97	fail_ct: 1951	Time elapsed: 0.95
GA Iter: 0	Max score: 0.9886	Min score: 0.0021	#Pop: 97	#M+: 0	#M-: 0
GA Iter: 4	Max score: 1.0000	Min score: 0.9822	#Pop: 128	#M+: 1400	#M-: 0
EvolutionarySearch		#s: 128	Time elapsed: 9.65


In [17]:
# Time each tuned kernel. Every *_latency is a mean run time in milliseconds.
# The *_2 kernels take the matching *_1 buffer as an extra input.
verify_matmul_lb_1_latency = profile(func=func_lb1, func_args=(W_tvm, x_lb_tvm, y_lb_1_tvm))
verify_matmul_lb_2_latency = profile(func=func_lb2, func_args=(W_tvm, x_ub_tvm, y_lb_1_tvm, y_lb_tvm))
verify_matmul_ub_1_latency = profile(func=func_ub1, func_args=(W_tvm, x_ub_tvm, y_ub_1_tvm))
verify_matmul_ub_2_latency = profile(func=func_ub2, func_args=(W_tvm, x_lb_tvm, y_ub_1_tvm, y_ub_tvm))
verify_matmul_lw_1_latency = profile(func=func_lw1, func_args=(W_tvm, x_lw_tvm, y_lw_1_tvm))
verify_matmul_lw_2_latency = profile(func=func_lw2, func_args=(W_tvm, x_uw_tvm, y_lw_1_tvm, y_lw_tvm))
verify_matmul_uw_1_latency = profile(func=func_uw1, func_args=(W_tvm, x_uw_tvm, y_uw_1_tvm))
verify_matmul_uw_2_latency = profile(func=func_uw2, func_args=(W_tvm, x_lw_tvm, y_uw_1_tvm, y_uw_tvm))

# Report the per-kernel latencies followed by their sum.
print(
    "verify_matmul_lb_1_latency: ", verify_matmul_lb_1_latency,
    "verify_matmul_lb_2_latency: ", verify_matmul_lb_2_latency,
    "verify_matmul_ub_1_latency: ", verify_matmul_ub_1_latency,
    "verify_matmul_ub_2_latency: ", verify_matmul_ub_2_latency,
    "verify_matmul_lw_1_latency: ", verify_matmul_lw_1_latency,
    "verify_matmul_lw_2_latency: ", verify_matmul_lw_2_latency,
    "verify_matmul_uw_1_latency: ", verify_matmul_uw_1_latency,
    "verify_matmul_uw_2_latency: ", verify_matmul_uw_2_latency,
    "total time: ",
    (
        verify_matmul_lb_1_latency
        + verify_matmul_lb_2_latency
        + verify_matmul_ub_1_latency
        + verify_matmul_ub_2_latency
        + verify_matmul_lw_1_latency
        + verify_matmul_lw_2_latency
        + verify_matmul_uw_1_latency
        + verify_matmul_uw_2_latency
    ),
)

verify_matmul_lb_1_latency:  0.026845218963939228 verify_matmul_lb_2_latency:  0.03377088577880027 verify_matmul_ub_1_latency:  0.032087939783818666 verify_matmul_ub_2_latency:  0.03510502681753386 verify_matmul_lw_1_latency:  0.9268270541474654 verify_matmul_lw_2_latency:  1.3374931604278075 verify_matmul_uw_1_latency:  0.8565199043570669 verify_matmul_uw_2_latency:  1.3420017754010696 total time:  4.590650965677502


In [20]:
# Check correctness of the Ansor kernels against the PyTorch implementation.
# dim_out, dim_in and dim_Y_out were set in an earlier cell.
import torch
import os
import sys
# Fix the torch RNG so the reference inputs generated below are reproducible.
torch.manual_seed(1)

# NOTE(review): the same variable is already set in the first cell; setting it
# again here presumably has no effect once CUDA is initialized -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"]='1' # 0 for A6000 on winnie, 1 for P6000 on winnie.
# Machine-specific absolute path to the hand-tuned reference kernels.
sys.path.append('/home/boyuan/Faith-NNVerificationCompiler/HandTunedKernels/kernel_test/')
import forward_test_bound

# Reload so edits to forward_test_bound are picked up without a kernel restart.
import importlib
importlib.reload(forward_test_bound)

# Arguments forwarded to forward_test_bound.Bounds; batch_size is only used in
# the message printed by profile_pytorch.
p = 2
eps = 0.5
batch_size = 1

# Helper function for pytorch profiling
def profile_pytorch(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type='relu', num_profile=100):
    """Time one ``forward_test_bound.Bounds`` operation and return its result.

    Args:
        p, eps, lw, lb, uw, ub: Forwarded unchanged to
            ``forward_test_bound.Bounds``.
        w_input: Weight passed to ``Bounds.matmul`` when ``op_type='matmul'``.
        other_bound: Operand passed to ``Bounds.dot_product`` when
            ``op_type='dot_product'``.
        op_type: One of ``'relu'``, ``'matmul'`` or ``'dot_product'``.
        num_profile: Number of timed repetitions; must be at least 1.

    Returns:
        The output of the last repetition of the selected operation.

    Raises:
        ValueError: If ``op_type`` is not supported or ``num_profile < 1``.
            Previously these cases failed later with ``UnboundLocalError`` or
            ``ZeroDivisionError``.

    The average time is printed in milliseconds together with the
    notebook-level ``batch_size``, ``length``, ``dim_in`` and ``dim_out``.
    """
    # Validate before touching the GPU so bad arguments fail fast and clearly.
    supported_ops = ('relu', 'matmul', 'dot_product')
    if op_type not in supported_ops:
        raise ValueError(
            "unsupported op_type {!r}; expected one of {}".format(op_type, supported_ops)
        )
    if num_profile < 1:
        raise ValueError("num_profile must be >= 1, got {!r}".format(num_profile))

    bound = forward_test_bound.Bounds(p, eps, lw, lb, uw, ub)

    # Resolve the operation once so the timed loop contains only the call.
    if op_type == 'relu':
        run_op = bound.relu
    elif op_type == 'matmul':
        run_op = lambda: bound.matmul(w_input)
    else:
        run_op = lambda: bound.dot_product(other_bound)

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for _ in range(num_profile):
        bound_output = run_op()
    end.record()

    # Wait for everything to finish running before reading the timer.
    torch.cuda.synchronize()

    average_time = start.elapsed_time(end) / num_profile  # Unit: millisecond
    print("\n\n\n pytorch. op_type: {}, batch_size: {}, length: {}, dim_in: {}, dim_out: {}, average_time (ms): {}\n\n\n".format(op_type, batch_size, length, dim_in, dim_out, average_time))
    return bound_output

# Get pytorch results
device="cuda"
# Reference inputs with a leading batch axis of size 1.
# ub is built as lb plus a non-negative random offset.
lb = torch.rand(1,length,dim_out).to(device)
ub = lb + torch.rand(1,length,dim_out).to(device)
lw = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5
uw = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5

# The weight is created as (dim_Y_out, dim_out) for the PyTorch implementation.
w_input = torch.rand(dim_Y_out, dim_out).to(device) - 0.5
other_bound = forward_test_bound.Bounds(p=2,eps=0.5,lw=lw,lb=lb,uw=uw,ub=ub)

# Reference result; a single repetition is enough for the correctness check.
bound = profile_pytorch(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type="matmul", num_profile=1)




# From here on w_input is the transposed (dim_out, dim_Y_out) weight, which is
# the layout of the W placeholder in the TVM workloads.
w_input = w_input.t().contiguous()
# Manual recomputation of the lower-bias bound, used only for the prints below.
pos_mask = torch.gt(w_input, 0)
w_pos = w_input * pos_mask
w_neg = w_input - w_pos
lb1 = lb.matmul(w_pos)
lb2 = ub.matmul(w_neg)
y_lb = lb1 + lb2


# NOTE: despite the "numpy" labels, lb1 / lb2 / y_lb are torch tensors.
print("w_pos: ", w_pos)
print("w_neg: ", w_neg)
print("numpy lb1: ", lb1)
print("numpy lb2: ", lb2)
print("numpy y_lb: ", y_lb)

# Prepare ansor inputs
# dev = tvm.cpu()
dev = tvm.cuda()
# Copy the torch inputs to NumPy; [0] drops the leading batch axis of size 1.
W_np = w_input.cpu().numpy()
x_lb_np = lb.cpu().numpy()[0]
x_ub_np = ub.cpu().numpy()[0]
x_lw_np = lw.cpu().numpy()[0]
x_uw_np = uw.cpu().numpy()[0]

# These overwrite the randomly initialised arrays created in the setup cell.
W_tvm = tvm.nd.array(W_np, device=dev)
x_lb_tvm = tvm.nd.array(x_lb_np, device=dev)
x_ub_tvm = tvm.nd.array(x_ub_np, device=dev)
x_lw_tvm = tvm.nd.array(x_lw_np, device=dev)
x_uw_tvm = tvm.nd.array(x_uw_np, device=dev)
y_lb_1_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_lb_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_ub_1_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_ub_tvm = tvm.nd.empty((length, dim_Y_out), device=dev)
y_lw_1_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)
y_lw_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)
y_uw_1_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)
y_uw_tvm = tvm.nd.empty((length, dim_in, dim_Y_out), device=dev)

# Inference with ansor kernel
# Each *_1 kernel fills the buffer that the matching *_2 kernel reads, so the
# two calls of a pair must run in this order.
func_lb1(W_tvm, x_lb_tvm, y_lb_1_tvm)
func_lb2(W_tvm, x_ub_tvm, y_lb_1_tvm, y_lb_tvm)
func_ub1(W_tvm, x_ub_tvm, y_ub_1_tvm)
func_ub2(W_tvm, x_lb_tvm, y_ub_1_tvm, y_ub_tvm)
func_lw1(W_tvm, x_lw_tvm, y_lw_1_tvm)
func_lw2(W_tvm, x_uw_tvm, y_lw_1_tvm, y_lw_tvm)
func_uw1(W_tvm, x_uw_tvm, y_uw_1_tvm)
func_uw2(W_tvm, x_lw_tvm, y_uw_1_tvm, y_uw_tvm)

# Compare the Ansor (TVM) kernel outputs against the PyTorch baseline
def check_tvm_ansor_baseline_diff(bound, y_lb_np, y_ub_np, y_lw_np, y_uw_np, device="cuda"):
    """Print the norm of the gap between reference bounds and Ansor outputs.

    Args:
        bound: Object exposing ``lb``, ``ub``, ``lw`` and ``uw`` torch tensors.
        y_lb_np, y_ub_np, y_lw_np, y_uw_np: NumPy arrays holding the
            corresponding Ansor results.
        device: Torch device the NumPy arrays are moved to before comparing.

    Nothing is returned; the reference and Ansor lower-bias values and the
    four norms are printed.
    """
    print("pytorch lb: ", bound.lb)
    print("ansor lb: ", y_lb_np)

    def gap_norm(reference, candidate_np):
        # Norm of (reference - candidate) computed on `device`.
        candidate = torch.from_numpy(candidate_np).to(device)
        return torch.norm(reference - candidate)

    lb_diff = gap_norm(bound.lb, y_lb_np)
    ub_diff = gap_norm(bound.ub, y_ub_np)
    lw_diff = gap_norm(bound.lw, y_lw_np)
    uw_diff = gap_norm(bound.uw, y_uw_np)
    print("lb_diff: ", lb_diff, ", ub_diff: ", ub_diff, ", lw_diff: ", lw_diff, ", uw_diff: ", uw_diff)

# Copy the Ansor outputs back to the host and compare them with the reference.
check_tvm_ansor_baseline_diff(
    bound, y_lb_tvm.numpy(), y_ub_tvm.numpy(), y_lw_tvm.numpy(), y_uw_tvm.numpy()
)


Using /home/boyuan/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module c_relu_verification, skipping build step...
Loading extension module c_relu_verification...
pytorch. self.lb.shape:  torch.Size([1, 2, 1024])
pytorch. w.shape:  torch.Size([1024, 1024])
pytorch. W_pos:  tensor([[-0.0000, -0.0000, 0.0181,  ..., -0.0000, 0.4000, -0.0000],
        [-0.0000, -0.0000, 0.3784,  ..., 0.4982, 0.3339, 0.3990],
        [0.1272, 0.0037, 0.1144,  ..., -0.0000, 0.1197, -0.0000],
        ...,
        [0.2021, 0.3102, 0.2176,  ..., -0.0000, -0.0000, 0.4763],
        [-0.0000, -0.0000, -0.0000,  ..., -0.0000, -0.0000, -0.0000],
        [0.4370, -0.0000, 0.2924,  ..., -0.0000, 0.2635, 0.3885]],
       device='cuda:0')
pytorch. W_neg:  tensor([[-0.4813, -0.4038,  0.0000,  ..., -0.4719,  0.0000, -0.1915],
        [-0.3211, -0.3797,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ..., -0.4999,  0.0000, -0.2483],
