In [1]:
import torch
import torch.nn as nn
import math, time
import sys
import tvm
from tvm import relay
from tvm.contrib import graph_executor
from tvm import auto_scheduler
import numpy as np
from tvm.contrib import graph_executor

sys.path.append('/home/boyuan/verification_tianqi/')

epsilon = 1e-12

from HandTunedKernels.kernel_test.cnn_forward_test_bound import Bounds

import os
os.environ["CUDA_VISIBLE_DEVICES"]='1' # 0 for A6000 on winnie, 1 for P6000 on winnie.

Using /home/boyuan/.cache/torch_extensions as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/boyuan/.cache/torch_extensions/c_relu_verification/build.ninja...
Building extension module c_relu_verification...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module c_relu_verification...


In [2]:
def Bounds2Tuple(x):
    return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))
def Tuple2Bounds(x):
    return Bounds(x[0], x[1], x[2], x[3], x[4], x[5])
# def Elements2BoundsDotProduct(x1, x2, x3, x4):
#     return BoundsDotProduct(args, p=2, eps=0.1, w=None, b=None, lw=x1, lb=x2, uw=x3, ub=x4)

class BoundsReLUWrapper(nn.Module):
    def __init__(self): #, ):
        super(BoundsReLUWrapper, self).__init__()
        
    def forward(self, p, eps, lw, lb, uw, ub):
        x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
        return Bounds2Tuple(x)

class BoundsMatMulWrapper(nn.Module):
    def __init__(self, W): 
        super(BoundsMatMulWrapper, self).__init__()
        self.W = W

    def forward(self, p, eps, lw, lb, uw, ub):
        x = Bounds(float(p), float(eps), lw, lb, uw, ub).matmul(self.W)
        return Bounds2Tuple(x) 

class BoundsDotProductWrapper(nn.Module):
    def __init__(self):
        super(BoundsDotProductWrapper, self).__init__()

    def forward(self, p0, eps0, lw0, lb0, uw0, ub0, p1, eps1, lw1, lb1, uw1, ub1):
        x = Bounds(float(p0), float(eps0), lw0, lb0, uw0, ub0)
        y = Bounds(float(p1), float(eps1), lw1, lb1, uw1, ub1)
        return Bounds2Tuple(x.dot_product(y))

In [3]:
torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
p = 2
eps = 0.5
batch_size, length, dim_in, dim_out, dim_y_out = 8, 4, 64, 32, 32
lb = torch.rand(batch_size,length,dim_out).to(device)
ub = lb + torch.rand(batch_size,length,dim_out).to(device)
lw = torch.rand(batch_size,length,dim_in,dim_out).to(device) - 0.5
uw = torch.rand(batch_size,length,dim_in,dim_out).to(device) - 0.5
W = torch.rand(dim_y_out, dim_out).to(device) - 0.5
bound = Bounds(p=2,eps=0.5,lw=lw,lb=lb,uw=uw,ub=ub)
bound1 = Bounds(p=2,eps=0.5,lw=lw,lb=lb,uw=uw,ub=ub)

bound_relu_wrapper = BoundsReLUWrapper()
bound_matmul_wrapper = BoundsMatMulWrapper(W)
bound_dot_product_wrapper = BoundsDotProductWrapper()

example_relu_inputs = Bounds2Tuple(bound)
example_matmul_inputs = Bounds2Tuple(bound)#(p, eps, lw, lb, uw, ub)
example_dot_product_inputs = (*Bounds2Tuple(bound), *Bounds2Tuple(bound1))


# test1.forward(p, eps, lw, lb, uw, ub)
# test2.forward(p, eps, lw, lb, uw, ub)
# test3.forward(p, eps, lw, lb, uw, ub, *Bounds2Tuple(bound1))

scripted_relu_model = torch.jit.trace(bound_relu_wrapper.eval(), example_relu_inputs).eval()
# scripted_relu_model = torch.jit.script(bound_relu_wrapper)
scripted_matmul_model = torch.jit.trace(bound_matmul_wrapper.eval(), example_matmul_inputs).eval()
scripted_dot_product_model = torch.jit.trace(bound_dot_product_wrapper.eval(), example_dot_product_inputs).eval()
# scripted_dot_product_model = torch.jit.script(bound_dot_product_wrapper)

  x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))
  x = Bounds(float(p), float(eps), lw, lb, uw, ub).matmul(self.W)
  x = Bounds(float(p0), float(eps0), lw0, lb0, uw0, ub0)
  y = Bounds(float(p1), float(eps1), lw1, lb1, uw1, ub1)


In [4]:
def check_bound_wrapper_diff(bound, output_tuple):
    return torch.norm((bound.lw-output_tuple[2]).flatten())


def check_tvm_ansor_baseline_diff(bound, output_tuple):
    return torch.norm((bound.lw- (torch.from_numpy(output_tuple[2])).to(device)).flatten())

def profile_pytorch(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type='relu', num_profile=100):        
    bound = Bounds(p, eps, lw, lb, uw, ub)

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for i in range(num_profile):
        if op_type == 'relu':
            bound_output = bound.relu()
        elif op_type == 'matmul':
            bound_output = bound.matmul(w_input)
        elif op_type == 'dot_product':
            bound_output = bound.dot_product(other_bound)

    end.record()

    # Waits for everything to finish running
    torch.cuda.synchronize()

    average_time = start.elapsed_time(end)/num_profile # Unit: Millisecond
    print("\n\n\n pytorch. op_type: {}, batch_size: {}, length: {}, dim_in: {}, dim_out: {}, average_time (ms): {}\n\n\n".format(op_type, batch_size, length, dim_in, dim_out, average_time))
    return bound_output
    
def profile_nn_wrapper(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type='relu', num_profile=1000):    
    if op_type == 'relu':
        bound_nn_wrapper = BoundsReLUWrapper()
    elif op_type == 'matmul':
        bound_nn_wrapper = BoundsMatMulWrapper(w_input)
    elif op_type == 'dot_product':
        bound_nn_wrapper = BoundsDotProductWrapper()
        
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    if op_type == 'relu':
        start.record()
        for i in range(num_profile):
            bound_output = bound_nn_wrapper.forward(p, eps, lw, lb, uw, ub)
        end.record()
    elif op_type == 'matmul':
        start.record()
        for i in range(num_profile):
            bound_output = bound_nn_wrapper.forward(p, eps, lw, lb, uw, ub)
        end.record()
    elif op_type == 'dot_product':
        start.record()
        for i in range(num_profile):
            bound_output = bound_nn_wrapper.forward(p, eps, lw, lb, uw, ub, *Bounds2Tuple(other_bound))
        end.record()

    # Waits for everything to finish running
    torch.cuda.synchronize()

    average_time = start.elapsed_time(end)/num_profile # Unit: Millisecond
    print("\n\n\n nn wrapper. op_type: {}, batch_size: {}, length: {}, dim_in: {}, dim_out: {}, average_time (ms): {}\n\n\n".format(op_type, batch_size, length, dim_in, dim_out, average_time))
    return bound_output

def profile_tvm_baseline(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type='relu', num_profile=1000):    
    if op_type == 'relu':
        bound_wrapper = BoundsReLUWrapper()
        example_inputs = Bounds2Tuple(Bounds(p, eps, lw, lb, uw, ub))
        scripted_model = torch.jit.trace(bound_wrapper.eval(), example_inputs).eval()
    elif op_type == 'matmul':
        bound_wrapper = BoundsMatMulWrapper(w_input)
        example_inputs = Bounds2Tuple(Bounds(p, eps, lw, lb, uw, ub))
        scripted_model = torch.jit.trace(bound_wrapper.eval(), example_inputs).eval()
    elif op_type == 'dot_product':
        bound_wrapper = BoundsDotProductWrapper()
        example_inputs = (*Bounds2Tuple(Bounds(p, eps, lw, lb, uw, ub)), *Bounds2Tuple(other_bound))
        scripted_model = torch.jit.trace(bound_wrapper.eval(), example_inputs).eval()    
        
    input_name = "input%d"
    shape_list = []
    if type(example_inputs) == tuple:
        for i in range(len(example_inputs)):
            shape_list.append((input_name%(i), example_inputs[i].shape))
    else:
        shape_list.append((input_name%(0), example_inputs.shape))
    
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    # target = tvm.target.Target("llvm", host="llvm")
    target = tvm.target.Target('cuda')
    dev = tvm.cuda(0)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)
        
        
    dtype = "float32"
    m = graph_executor.GraphModule(lib["default"](dev))
    # Set inputs
    if type(example_inputs) == tuple:
        for i in range(2, len(example_inputs)):
            m.set_input(input_name%(i), tvm.nd.array(example_inputs[i].cpu().numpy().astype(dtype)))
    else:
        m.set_input(input_name%(0), tvm.nd.array(example_inputs.numpy().cpu().astype(dtype)))
    # Execute

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for i in range(num_profile):
        m.run()
        # Get outputs

    end.record()
    
    # Waits for everything to finish running
    torch.cuda.synchronize()

    average_time = start.elapsed_time(end)/num_profile # Unit: Millisecond
    tvm_output = m.get_output(0)
    print("\n\n\ntvm. op_type: {}, batch_size: {}, length: {}, dim_in: {}, dim_out: {}, average_time (ms): {}\n\n\n".format(op_type, batch_size, length, dim_in, dim_out, average_time))

    return (m.get_output(0).numpy(), m.get_output(1).numpy(), m.get_output(2).numpy(), m.get_output(3).numpy())
    
def profile_ansor_baseline(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type="relu", num_profile=1000):
    if op_type == 'relu':
        bound_wrapper = BoundsReLUWrapper()
        example_inputs = Bounds2Tuple(Bounds(p, eps, lw, lb, uw, ub))
        scripted_model = torch.jit.trace(bound_wrapper.eval(), example_inputs).eval()
    elif op_type == 'matmul':
        bound_wrapper = BoundsMatMulWrapper(w_input)
        example_inputs = Bounds2Tuple(Bounds(p, eps, lw, lb, uw, ub))
        scripted_model = torch.jit.trace(bound_wrapper.eval(), example_inputs).eval()
    elif op_type == 'dot_product':
        bound_wrapper = BoundsDotProductWrapper()
        example_inputs = (*Bounds2Tuple(Bounds(p, eps, lw, lb, uw, ub)), *Bounds2Tuple(other_bound))
        scripted_model = torch.jit.trace(bound_wrapper.eval(), example_inputs).eval()    
        
    input_name = "input%d"
    shape_list = []
    if type(example_inputs) == tuple:
        for i in range(len(example_inputs)):
            shape_list.append((input_name%(i), example_inputs[i].shape))
    else:
        shape_list.append((input_name%(0), example_inputs.shape))
    
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    # target = tvm.target.Target("llvm", host="llvm")
    target = tvm.target.Target('cuda')
    dev = tvm.device(str(target), 0)
    log_file = "ansor_autotuning_json/ansor_"+op_type+".json"

    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target, include_simple_tasks=False)
    # Enumerate the tasks
    # for idx, task in enumerate(tasks):
    #     print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
    #     print(task.compute_dag)

    # measure_ctx launches a different process for measurement to provide isolation
    # It protect the master process from GPU crashes
    measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)

    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=800 * len(tasks),  # change this to 800 & #task to achieve the best performance
        runner=measure_ctx.runner,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    tuner.tune(tune_option)

    # Compile with the history best
    print("Compile ...")
    with auto_scheduler.ApplyHistoryBest(log_file):
        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
            lib_ansor = relay.build(mod, target=target, params=params)

    # Create graph executer
    dtype = "float32"
    module_ansor = graph_executor.GraphModule(lib_ansor["default"](dev))
    if type(example_inputs) == tuple:
        for i in range(2, len(example_inputs)):
            module_ansor.set_input(input_name%(i), tvm.nd.array(example_inputs[i].cpu().numpy().astype(dtype)))
    else:
        module_ansor.set_input(input_name%(0), tvm.nd.array(example_inputs.numpy().cpu().astype(dtype)))

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for i in range(num_profile):
        module_ansor.run()
        # Get outputs

    end.record()
    
    # Waits for everything to finish running
    torch.cuda.synchronize()

    average_time = start.elapsed_time(end)/num_profile # Unit: Millisecond
    print("\n\n\n ansor\n\n\n")
    print("\n\n\nansor. op_type: {}, batch_size: {}, length: {}, dim_in: {}, dim_out: {}, average_time (ms): {}\n\n\n".format(op_type, batch_size, length, dim_in, dim_out, average_time))

    return (module_ansor.get_output(0).numpy(), module_ansor.get_output(1).numpy(), module_ansor.get_output(2).numpy(), module_ansor.get_output(3).numpy())

In [5]:
batch_size = 1
for op_type in [ 'relu']:
    for length in [2,4,8,16,32,64,128]:
        for dim_in in [64, 128, 256, 512, 1024]:
            dim_out=dim_in # Just an assumption for profiling
            dim_y_out = dim_out
            lb = torch.rand(1,length,dim_out).to(device)
            ub = lb + torch.rand(1,length,dim_out).to(device)
            lw = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5
            uw = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5

            lb1 = torch.rand(1,length,dim_out).to(device)
            ub1 = lb1 + torch.rand(1,length,dim_out).to(device)
            lw1 = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5
            uw1 = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5

            w_input = torch.rand(dim_y_out, dim_out).to(device) - 0.5
            other_bound = Bounds(p=2,eps=0.5,lw=lw,lb=lb,uw=uw,ub=ub)
            
            bound = profile_pytorch(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            wrapper_outpuet_tuple = profile_nn_wrapper(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            tvm_output_tuple =  profile_tvm_baseline(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            ansor_output_tuple = profile_ansor_baseline(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            print(check_bound_wrapper_diff(bound, wrapper_outpuet_tuple))
            print(check_tvm_ansor_baseline_diff(bound, tvm_output_tuple))
            print(check_tvm_ansor_baseline_diff(bound, ansor_output_tuple))
            print('--------------\n\n')
            





 pytorch. op_type: relu, batch_size: 1, length: 2, dim_in: 64, dim_out: 64, average_time (ms): 8.716328735351562






 nn wrapper. op_type: relu, batch_size: 1, length: 2, dim_in: 64, dim_out: 64, average_time (ms): 1.7544358825683595





  x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))





tvm. op_type: relu, batch_size: 1, length: 2, dim_in: 64, dim_out: 64, average_time (ms): 1.2006195068359375





  x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))


Get devices for measurement successfully!
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |----------------------------------------------------------------------
------------------------------  [ 
-------------------------------------------------
|    0 |            - |              - |      0 |
-------------------------------------------------
Estimated total latency: - ms	Trials: 0	Used time : 0 s	Next ID: 0	
Task Scheduler ]
----------------------------------------------------------------------
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 2
Sample Iter: 5	#Pop: 10	#Target: 50	fail_ct: 10230	Time elapsed: 3.76
#Target has been reduced to 25 due to too many failures or duplications
Sample Iter: 10	#Pop: 10	#Target: 25	fail_ct: 20470	Time elapsed: 7.78
#Target has been reduced to 12 due to too many failures or duplications
Sa



----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 10	fail_ct: 2038	Time elapsed: 0.56
GA Iter: 0	Max score: N/A	Min score: N/A	#Pop: 0	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.8775	Min score: 0.0753	#Pop: 6	#M+: 1394	#M-: 0
EvolutionarySearch		#s: 6	Time elapsed: 1.96
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 6 programs to measure:
......******
Time elapsed for measurement: 7.91 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.13 s
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |----------------------------

  x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))





tvm. op_type: relu, batch_size: 1, length: 2, dim_in: 128, dim_out: 128, average_time (ms): 0.9847090911865234





  x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))


Get devices for measurement successfully!
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------

-------------------------------------------------
|    0 |            - |              - |      0 |
-------------------------------------------------
Estimated total latency: - ms	Trials: 0	Used time : 0 s	Next ID: 0	
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 2
Sample Iter: 5	#Pop: 10	#Target: 50	fail_ct: 10230	Time elapsed: 3.80
#Target has been reduced to 25 due to too many failures or duplications
Sample Iter: 10	#Pop: 10	#Target: 25	fail_ct: 20470	Time elapsed: 7.93
#Target has been reduced to 12 due to too many failures or duplications
Sa



----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 10	fail_ct: 2038	Time elapsed: 0.55
GA Iter: 0	Max score: N/A	Min score: N/A	#Pop: 0	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9201	Min score: 0.2613	#Pop: 6	#M+: 1386	#M-: 0
EvolutionarySearch		#s: 6	Time elapsed: 1.94
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 6 programs to measure:
......******
Time elapsed for measurement: 8.12 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.15 s
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |----------------------------

  x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))





tvm. op_type: relu, batch_size: 1, length: 2, dim_in: 256, dim_out: 256, average_time (ms): 1.0053119659423828





  x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))


Get devices for measurement successfully!
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |----------------------------------------------------------------------
-------------------------------------------------
|    0 |            - |              - |      0 |
-------------------------------------------------
Estimated total latency: - ms	Trials: 0	Used time : 0 s	Next ID: 0	

------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 2
Sample Iter: 5	#Pop: 10	#Target: 50	fail_ct: 10230	Time elapsed: 3.76
#Target has been reduced to 25 due to too many failures or duplications
Sample Iter: 10	#Pop: 10	#Target: 25	fail_ct: 20470	Time elapsed: 7.01
#Target has been reduced to 12 due to too many failures or duplications
Sa



Time elapsed for training: 0.20 s
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        0.007 |          20.03 |     64 |
-------------------------------------------------
Estimated total latency: 0.013 ms	Trials: 64	Used time : 87 s	Next ID: 0	
----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Sample Initial Population	#s: 10	fail_ct: 2038	Time elapsed: 0.60
GA Iter: 0	Max score: N/A	Min score: N/A	#Pop: 0	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.6342	Min score: 0.1643	#Pop: 6	#M+: 1394	#M-: 0
EvolutionarySearch		#s: 6	Time elapsed: 2.01
--------------------------------------------------------------------

  x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))





tvm. op_type: relu, batch_size: 1, length: 2, dim_in: 512, dim_out: 512, average_time (ms): 1.2594483184814453





  x = Bounds(float(p), float(eps), lw, lb, uw, ub).relu()
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))


Get devices for measurement successfully!
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |----------------------------------------------------------------------

-------------------------------------------------
|    0 |            - |              - |      0 |
-------------------------------------------------
Estimated total latency: - ms	Trials: 0	Used time : 0 s	Next ID: 0	
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 2
Sample Iter: 5	#Pop: 10	#Target: 50	fail_ct: 10230	Time elapsed: 3.11
#Target has been reduced to 25 due to too many failures or duplications
Sample Iter: 10	#Pop: 10	#Target: 25	fail_ct: 20470	Time elapsed: 6.39
#Target has been reduced to 12 due to too many failures or duplications
Sa

KeyboardInterrupt: 

In [None]:
batch_size = 1
for op_type in ['matmul']:
    for length in [128]: # [2,4,8,16,32,64,128]:
        for dim_in in [64, 128, 256, 512, 1024]: # [64, 128, 256, 512, 1024]:
            dim_out=dim_in # Just an assumption for profiling
            dim_y_out = dim_out
            lb = torch.rand(1,length,dim_out).to(device)
            ub = lb + torch.rand(1,length,dim_out).to(device)
            lw = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5
            uw = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5

            lb1 = torch.rand(1,length,dim_out).to(device)
            ub1 = lb1 + torch.rand(1,length,dim_out).to(device)
            lw1 = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5
            uw1 = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5

            w_input = torch.rand(dim_y_out, dim_out).to(device) - 0.5
            other_bound = Bounds(p=2,eps=0.5,lw=lw,lb=lb,uw=uw,ub=ub)
            
            bound = profile_pytorch(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            wrapper_outpuet_tuple = profile_nn_wrapper(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            tvm_output_tuple =  profile_tvm_baseline(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            ansor_output_tuple = profile_ansor_baseline(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            #  print(type(bound.lw), type(bound_wrapper[0]), type(tvm_output[0]))
            print(check_bound_wrapper_diff(bound, wrapper_outpuet_tuple))
            print(check_tvm_ansor_baseline_diff(bound, tvm_output_tuple))
            print(check_tvm_ansor_baseline_diff(bound, ansor_output_tuple))
            print('---')


pytorch. op_type: matmul, batch_size: 1, length: 128, dim_in: 64, dim_out: 64, average_time (ms): 0.18743295669555665
nn wrapper. op_type: matmul, batch_size: 1, length: 128, dim_in: 64, dim_out: 64, average_time (ms): 0.19392383575439454


  x = Bounds(float(p), float(eps), lw, lb, uw, ub).matmul(self.W)
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))
One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


tvm. op_type: matmul, batch_size: 1, length: 128, dim_in: 64, dim_out: 64, average_time (ms): 3.5975372314453127


  x = Bounds(float(p), float(eps), lw, lb, uw, ub).matmul(self.W)
  return tuple((torch.Tensor([[x.p]]), torch.Tensor([[x.eps]]), x.lw, x.lb, x.uw, x.ub))


Get devices for measurement successfully!
|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------

-------------------------------------------------
|    0 |            - |              - |      0 |
|    1 |            - |              - |      0 |
-------------------------------------------------
Estimated total latency: - ms	Trials: 0	Used time : 0 s	Next ID: 0	
----------------------------------------------------------------------
------------------------------  [ Search ]
----------------------------------------------------------------------
Generate Sketches		#s: 1
Sample Initial Population	#s: 294	fail_ct: 1754	Time elapsed: 0.75
GA Iter: 0	Max score: 0.9982	Min score: 0.5667	#Pop: 128	#M+: 0	#M-: 0
GA Iter: 4	Max score: 0.9999	Min score: 0.9811	#Pop: 128	#M+: 1400	#M-: 0
EvolutionarySearch		#s: 1

KeyboardInterrupt: 

In [None]:
batch_size = 1
for op_type in ['dot_product']:
    for length in [2,4,8,16,32,64,128]:
        for dim_in in [64, 128, 256, 512, 1024]:
            dim_out=dim_in # Just an assumption for profiling
            dim_y_out = dim_out
            lb = torch.rand(1,length,dim_out).to(device)
            ub = lb + torch.rand(1,length,dim_out).to(device)
            lw = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5
            uw = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5

            lb1 = torch.rand(1,length,dim_out).to(device)
            ub1 = lb1 + torch.rand(1,length,dim_out).to(device)
            lw1 = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5
            uw1 = torch.rand(1,length,dim_in,dim_out).to(device) - 0.5

            w_input = torch.rand(dim_y_out, dim_out).to(device) - 0.5
            other_bound = Bounds(p=2,eps=0.5,lw=lw,lb=lb,uw=uw,ub=ub)
            
            bound = profile_pytorch(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            wrapper_outpuet_tuple = profile_nn_wrapper(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            tvm_output_tuple =  profile_tvm_baseline(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type) # May comment out this line
            ansor_output_tuple = profile_ansor_baseline(p, eps, lw, lb, uw, ub, w_input, other_bound, op_type)
            # print(type(bound.lw), type(bound_wrapper[0]), type(tvm_output[0]))
            print(check_bound_wrapper_diff(bound, wrapper_outpuet_tuple))
            print(check_tvm_ansor_baseline_diff(bound, tvm_output_tuple)) # May comment out this line
            print(check_tvm_ansor_baseline_diff(bound, ansor_output_tuple))
            print('---')