安装tvm, 参考官方文档: https://tvm.apache.org/docs/install/index.html

In [1]:
pip install apache-tvm

Note: you may need to restart the kernel to use updated packages.


In [2]:
from __future__ import absolute_import, print_function


import tvm
from tvm import te
import numpy as np

和tensorflow的静态图的概念类似，TVM通过compute描述tensor的操作，用schedule描述在硬件上的调度，最后通过build完成编译。简单的理解是compute影响计算的结果，schedule影响计算的性能。

# compute

使用compute描述一个tensor的加法和减法，并用tvm.lower打印出计算描述信息。
* 在定义TVM placeholders的时候指定数据类型

In [3]:
def add(shape, dtype):
    A = te.placeholder(shape, dtype=dtype, name="A")
    B = te.placeholder(shape, dtype=dtype, name="B")
    C = te.compute(shape, lambda *index: A[index] + B[index], name="C")
    s = te.create_schedule(C.op)
    print(tvm.lower(s, [A, B, C], simple_mode=True))
    return tvm.build(s, [A, B, C])
    
op = add((3, 4), "float16")
a = tvm.nd.array(np.arange(12, dtype='float16').reshape((3, 4)))
b = tvm.nd.array(np.ones((3, 4)).astype('float16'))
c = tvm.nd.array(np.empty((3, 4), dtype='float16'))
op(a, b, c)
print("a", a, '\nb', b, '\nc', c)

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float16), float16, [12], []),
             B: Buffer(B_2: Pointer(float16), float16, [12], []),
             C: Buffer(C_2: Pointer(float16), float16, [12], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float16, [3, 4], []), B_1: B_3: Buffer(B_2, float16, [3, 4], []), C_1: C_3: Buffer(C_2, float16, [3, 4], [])} {
  for (i0: int32, 0, 3) {
    for (i1: int32, 0, 4) {
      let cse_var_1: int32 = ((i0*4) + i1)
      C[cse_var_1] = (A[cse_var_1] + B[cse_var_1])
    }
  }
}


a [[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]] 
b [[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]] 
c [[ 1.  2.  3.  4.]
 [ 5.  6.  7.  8.]
 [ 9. 10. 11. 12.]]


* 在compute过程中用astype转换数据类型。

In [4]:
def sub(shape, dtype):
    A = te.placeholder(shape, name="A")
    B = te.placeholder(shape, name="B")
    C = te.compute(shape, lambda *index: A[index].astype(dtype) - B[index].astype(dtype), name="C")
    s = te.create_schedule(C.op)
    print(tvm.lower(s, [A, B, C], simple_mode=True))
    return tvm.build(s, [A, B, C])
    
op = sub((3, 4), "float16")
a = tvm.nd.array(np.arange(12, dtype='float32').reshape((3, 4)))
b = tvm.nd.array(np.ones((3, 4)).astype('float32'))
c = tvm.nd.array(np.empty((3, 4), dtype='float16'))
op(a, b, c)
print("a", a, '\nb', b, '\nc', c)

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [12], []),
             B: Buffer(B_2: Pointer(float32), float32, [12], []),
             C: Buffer(C_2: Pointer(float16), float16, [12], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [3, 4], []), B_1: B_3: Buffer(B_2, float32, [3, 4], []), C_1: C_3: Buffer(C_2, float16, [3, 4], [])} {
  for (i0: int32, 0, 3) {
    for (i1: int32, 0, 4) {
      let cse_var_1: int32 = ((i0*4) + i1)
      C[cse_var_1] = (cast(float16, A[cse_var_1]) - cast(float16, B[cse_var_1]))
    }
  }
}


a [[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]] 
b [[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]] 
c [[-1.  0.  1.  2.]
 [ 3.  4.  5.  6.]
 [ 7.  8.  9. 10.]]


* shape未知时用te.var()来表示tensor的维度大小.

In [5]:
def dynamic_add():
    # declare some variables for use later
    n = te.var("n")
    m = te.var("m")
    # declare a matrix element-wise multiply
    A = te.placeholder((m, n), name="A")
    B = te.placeholder((m, n), name="B")
    C = te.compute((m, n), lambda i, j: A[i, j] + B[i, j], name="C")
    s = te.create_schedule([C.op])
    print(tvm.lower(s, [A, B, C], simple_mode=True))
    return tvm.build(s, [A, B, C])
op = dynamic_add()

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto"),
             C: Buffer(C_2: Pointer(float32), float32, [(stride_2: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m, n: int32], [stride, stride_3: int32], type="auto"), B_1: B_3: Buffer(B_2, float32, [m, n], [stride_1, stride_4: int32], type="auto"), C_1: C_3: Buffer(C_2, float32, [m, n], [stride_2, stride_5: int32], type="auto")} {
  for (i: int32, 0, m) {
    for (j: int32, 0, n) {
      C[((i*stride_2) + (j*stride_5))] = (A[((i*stride) + (j*stride_3))] + B[((i*stride_1) + (j*stride_4))])
    }
  }
}




In [6]:
a = tvm.nd.array(np.arange(12, dtype='float32').reshape((3, 4)))
b = tvm.nd.array(np.ones((3, 4)).astype('float32'))
c = tvm.nd.array(np.empty((3, 4), dtype='float32'))
op(a, b, c)
print("a", a, '\nb', b, '\nc', c)

a [[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]] 
b [[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]] 
c [[ 1.  2.  3.  4.]
 [ 5.  6.  7.  8.]
 [ 9. 10. 11. 12.]]


* 变量的compute支持运行时传递任意shape

In [7]:
a = tvm.nd.array(np.arange(10, dtype='float32').reshape((2, 5)))
b = tvm.nd.array(np.ones((2, 5)).astype('float32'))
c = tvm.nd.array(np.empty((2, 5), dtype='float32'))
op(a, b, c)
print("a", a, '\nb', b, '\nc', c)

a [[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]] 
b [[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]] 
c [[ 1.  2.  3.  4.  5.]
 [ 6.  7.  8.  9. 10.]]


* 交换tensor的维度

In [8]:
def transpose():
    n = te.var("n")
    m = te.var("m")
    A = te.placeholder((m, n), name="A")
    B = te.compute((n, m), lambda i, j: A[j, i], name="B")
    s = te.create_schedule([B.op])
    print(tvm.lower(s, [A, B], simple_mode=True))
    return tvm.build(s, [A, B])
op = transpose()
a = tvm.nd.array(np.arange(10, dtype='float32').reshape((2, 5)))
b = tvm.nd.array(np.empty((5, 2)).astype('float32'))
op(a, b)
print("a", a, '\nb', b)

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*n: int32)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m, n], [stride, stride_2: int32], type="auto"), B_1: B_3: Buffer(B_2, float32, [n, m], [stride_1, stride_3: int32], type="auto")} {
  for (i: int32, 0, n) {
    for (j: int32, 0, m) {
      B[((i*stride_1) + (j*stride_3))] = A[((j*stride) + (i*stride_2))]
    }
  }
}


a [[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]] 
b [[0. 5.]
 [1. 6.]
 [2. 7.]
 [3. 8.]
 [4. 9.]]


* 改变tensor的shape

In [9]:
def reshape():
    n, m = te.var("n"), te.var("m")
    p, q = te.var('p'), te.var('q')
    A = te.placeholder((m, n), name="A")
    B = te.compute((p, q), lambda i, j: A[(i*q+j)//n, (i*q+j)%n], name="B")
    s = te.create_schedule([B.op])
    print(tvm.lower(s, [A, B], simple_mode=True))
    return tvm.build(s, [A, B])
op = reshape()
a = tvm.nd.array(np.arange((12), dtype='float32').reshape(4, 3))
b = tvm.nd.array(np.empty((4, 3)).astype('float32'))
op(a, b)
print("a", a, '\nb', b)

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*p: int32)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m, n: int32], [stride, stride_2: int32], type="auto"), B_1: B_3: Buffer(B_2, float32, [p, q: int32], [stride_1, stride_3: int32], type="auto")} {
  for (i: int32, 0, p) {
    for (j: int32, 0, q) {
      B[((i*stride_1) + (j*stride_3))] = A[((floordiv(((i*q) + j), n)*stride) + (floormod(((i*q) + j), n)*stride_2))]
    }
  }
}


a [[ 0.  1.  2.]
 [ 3.  4.  5.]
 [ 6.  7.  8.]
 [ 9. 10. 11.]] 
b [[ 0.  1.  2.]
 [ 3.  4.  5.]
 [ 6.  7.  8.]
 [ 9. 10. 11.]]


* 维度和下标都可以用变量来表示。

In [10]:
def slicing():
    n, m, bi, bj, si, sj = [te.var(name) for name in ['n', 'm', 'bi', 'bj', 'si', 'sj']]
    A = te.placeholder((n, m), name='A')
    B = te.compute(((n - bi) // si, (m - bj) // sj), lambda i, j: A[i * si + bi, j * sj + bj], name='B')
    s = te.create_schedule([B.op])
    print(tvm.lower(s, [A, B], simple_mode=True))
    return tvm.build(s, [A, B, bi, bj, si, sj])
op = slicing()
a = tvm.nd.array(np.arange((12), dtype='float32').reshape(3, 4))
b = tvm.nd.array(np.empty((1, 3)).astype('float32'))
op(a, b, 1, 1, 2, 1)
print("a", a, '\nb', b)

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*n: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(floordiv((n - bi: int32), si: int32)*floordiv((m: int32 - bj: int32), sj: int32))], [])}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [n, m], [stride, stride_1: int32], type="auto"), B_1: B_3: Buffer(B_2, float32, [floordiv((n - bi), si), floordiv((m - bj), sj)], [])} {
  for (i: int32, 0, floordiv((n - bi), si)) {
    for (j: int32, 0, floordiv((m - bj), sj)) {
      B[((i*floordiv((m - bj), sj)) + j)] = A[((((i*si) + bi)*stride) + (((j*sj) + bj)*stride_1))]
    }
  }
}


a [[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]] 
b [[5. 6. 7.]]


* reduce_axis

In [11]:
def reduction_sum():
    n, m = te.var('n'), te.var('m')
    A = te.placeholder((n, m), name='A')
    j = te.reduce_axis((0, m), name='j')
    B = te.compute((n,), lambda i: te.sum(A[i, j], axis=j), name='b')
    s = te.create_schedule(B.op)
    print(tvm.lower(s, [A, B], simple_mode=True))
    return tvm.build(s, [A, B])
op = sum()
a = tvm.nd.array(np.arange((12), dtype='float32').reshape(3, 4))
b = tvm.nd.array(np.empty((3,)).astype('float32'))
op(a, b)
print("a", a, '\nb', b)

TypeError: sum() takes at least 1 positional argument (0 given)

In [None]:
def matmul(dtype):
    m, n, l = [te.var(name) for name in ['m', 'n', 'l']]
    A = te.placeholder((m, l), name="A", dtype=dtype)
    B = te.placeholder((l, n), name="B", dtype=dtype)
    k = te.reduce_axis((0, l), name='k')
    C = te.compute((m, n), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    s = te.create_schedule(C.op)
    print(tvm.lower(s, [A, B, C], simple_mode=True))
    return tvm.build(s, [A, B, C])
op = matmul('int8')
a = tvm.nd.array(np.arange((6), dtype='int8').reshape(2, 3))
b = tvm.nd.array(np.arange((6), dtype='int8').reshape(3, 2))
c = tvm.nd.array(np.empty((2, 2), dtype='int8'))
op(a, b, c)
print("a", a, '\nb', b, '\nc', c)

* 条件表达式

In [12]:
def if_express():
    n, m = te.var('n'), te.var('m')
    A = te.placeholder((n, m))
    B = te.compute(A.shape, lambda i, j: te.if_then_else(i >= j, A[i, j], 0.0))
    s = te.create_schedule(B.op)
    print(tvm.lower(s, [A, B], simple_mode=True))
    return tvm.build(s, [A, B])
op = if_express()
a = tvm.nd.array(np.arange(12).reshape((3, 4)).astype('float32'))
b = tvm.nd.array(np.empty((3, 4), dtype='float32'))
op(a, b)
print("a", a, '\nb', b)

@main = primfn(placeholder_1: handle, compute_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {placeholder: Buffer(placeholder_2: Pointer(float32), float32, [(stride: int32*n: int32)], [], type="auto"),
             compute: Buffer(compute_2: Pointer(float32), float32, [(stride_1: int32*n)], [], type="auto")}
  buffer_map = {placeholder_1: placeholder, compute_1: compute}
  preflattened_buffer_map = {placeholder_1: placeholder_3: Buffer(placeholder_2, float32, [n, m: int32], [stride, stride_2: int32], type="auto"), compute_1: compute_3: Buffer(compute_2, float32, [n, m], [stride_1, stride_3: int32], type="auto")} {
  for (i: int32, 0, n) {
    for (j: int32, 0, m) {
      compute[((i*stride_1) + (j*stride_3))] = @tir.if_then_else((j <= i), placeholder[((i*stride) + (j*stride_2))], 0f32, dtype=float32)
    }
  }
}


a [[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]] 
b [[ 0.  0.  0.  0.]
 [ 4.  5.  0.  0.]
 [

* all和any表示条件

In [13]:
def padding(pad):
    n, m = te.var('n'), te.var('m')
    A = te.placeholder((n, m), name='A')
    B = te.compute((n + pad * 2, m + pad * 2),
                    lambda i, j: te.if_then_else(
                        te.any(i < pad, i >= n + pad, j < pad, j >= m + pad), 0, A[i - pad, j - pad]),
                    name='B')
    s = te.create_schedule(B.op)
    print(tvm.lower(s, [A, B], simple_mode=True))
    return tvm.build(s, [A, B])
op = padding(2)
a = tvm.nd.array(np.ones((2, 2), dtype='float32'))
b = tvm.nd.array(np.empty((6, 6), dtype='float32'))
op(a, b)
print('a', a, '\nb', b)

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*n: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [((n + 4)*(m: int32 + 4))], [])}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [n, m], [stride, stride_1: int32], type="auto"), B_1: B_3: Buffer(B_2, float32, [(n + 4), (m + 4)], [])} {
  for (i: int32, 0, (n + 4)) {
    for (j: int32, 0, (m + 4)) {
      B[((i*(m + 4)) + j)] = @tir.if_then_else(((((i < 2) || ((n + 2) <= i)) || (j < 2)) || ((m + 2) <= j)), 0f32, A[(((i - 2)*stride) + ((j - 2)*stride_1))], dtype=float32)
    }
  }
}


a [[1. 1.]
 [1. 1.]] 
b [[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 1. 0. 0.]
 [0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


# schedule

通常存在多种计算相同结果的方法，但是，不同的方法会导致不同的局部性和性能。因此 TVM 要求用户提供如何执行称为 Schedule 的计算。 时间表是一组计算转换，它转换程序中的计算循环。
可以从操作列表创建调度，默认情况下，调度以行优先顺序以串行方式计算张量。

## split

In [19]:
# split can split a specified axis into two axes by factor.
m, n = te.var("m"), te.var("n")
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] * 2, name="B")

s = te.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=32)
print(tvm.lower(s, [A, B], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m], [stride], type="auto"), B_1: B_3: Buffer(B_2, float32, [m], [stride_1], type="auto")} {
  for (i.outer: int32, 0, floordiv((m + 31), 32)) {
    for (i.inner: int32, 0, 32) {
      if @tir.likely((((i.outer*32) + i.inner) < m), dtype=bool) {
        let cse_var_1: int32 = ((i.outer*32) + i.inner)
        B[(cse_var_1*stride_1)] = (A[(cse_var_1*stride)]*2f32)
      }
    }
  }
}




In [20]:
# You can also split a axis by nparts, which splits the axis contrary with factor.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i], name="B")

s = te.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], nparts=32)
print(tvm.lower(s, [A, B], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m], [stride], type="auto"), B_1: B_3: Buffer(B_2, float32, [m], [stride_1], type="auto")} {
  for (i.outer: int32, 0, 32) {
    for (i.inner: int32, 0, floordiv((m + 31), 32)) {
      if @tir.likely(((i.inner + (i.outer*floordiv((m + 31), 32))) < m), dtype=bool) {
        B[((i.inner + (i.outer*floordiv((m + 31), 32)))*stride_1)] = A[((i.inner + (i.outer*floordiv((m + 31), 32)))*stride)]
      }
    }
  }
}




## tile

In [21]:
# tile help you execute the computation tile by tile over two axes.
A = te.placeholder((m, n), name="A")
B = te.compute((m, n), lambda i, j: A[i, j], name="B")

s = te.create_schedule(B.op)
xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
print(tvm.lower(s, [A, B], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m, n: int32], [stride, stride_2: int32], type="auto"), B_1: B_3: Buffer(B_2, float32, [m, n], [stride_1, stride_3: int32], type="auto")} {
  for (i.outer: int32, 0, floordiv((m + 9), 10)) {
    for (j.outer: int32, 0, floordiv((n + 4), 5)) {
      for (i.inner: int32, 0, 10) {
        if @tir.likely((((i.outer*10) + i.inner) < m), dtype=bool) {
          for (j.inner: int32, 0, 5) {
            if @tir.likely((((j.outer*5) + j.inner) < n), dtype=bool) {
              let cse_var_2: int32 = ((j.outer*5) + j.inner)
              let cse_var_1: int32 = ((i.outer*10) + i.

## fuse

In [22]:
# fuse can fuse two consecutive axes of one computation.
A = te.placeholder((m, n), name="A")
B = te.compute((m, n), lambda i, j: A[i, j], name="B")

s = te.create_schedule(B.op)
# tile to four axes first: (i.outer, j.outer, i.inner, j.inner)
xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
# then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)
fused = s[B].fuse(xi, yi)
print(tvm.lower(s, [A, B], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m, n: int32], [stride, stride_2: int32], type="auto"), B_1: B_3: Buffer(B_2, float32, [m, n], [stride_1, stride_3: int32], type="auto")} {
  for (i.outer: int32, 0, floordiv((m + 9), 10)) {
    for (j.outer: int32, 0, floordiv((n + 4), 5)) {
      for (i.inner.j.inner.fused: int32, 0, 50) {
        if @tir.likely((((i.outer*10) + floordiv(i.inner.j.inner.fused, 5)) < m), dtype=bool) {
          if @tir.likely((((j.outer*5) + floormod(i.inner.j.inner.fused, 5)) < n), dtype=bool) {
            let cse_var_2: int32 = ((j.outer*5) + floormod(i.inner.j.inner.fused, 5))
   

## reorder


In [23]:
# reorder can reorder the axes in the specified order.
A = te.placeholder((m, n), name="A")
B = te.compute((m, n), lambda i, j: A[i, j], name="B")

s = te.create_schedule(B.op)
# tile to four axes first: (i.outer, j.outer, i.inner, j.inner)
xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
# then reorder the axes: (i.inner, j.outer, i.outer, j.inner)
s[B].reorder(xi, yo, xo, yi)
print(tvm.lower(s, [A, B], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m, n: int32], [stride, stride_2: int32], type="auto"), B_1: B_3: Buffer(B_2, float32, [m, n], [stride_1, stride_3: int32], type="auto")} {
  for (i.inner: int32, 0, 10) {
    for (j.outer: int32, 0, floordiv((n + 4), 5)) {
      for (i.outer: int32, 0, floordiv((m + 9), 10)) {
        if @tir.likely((((i.outer*10) + i.inner) < m), dtype=bool) {
          for (j.inner: int32, 0, 5) {
            if @tir.likely((((j.outer*5) + j.inner) < n), dtype=bool) {
              let cse_var_2: int32 = ((j.outer*5) + j.inner)
              let cse_var_1: int32 = ((i.outer*10) + i.

## bind

In [24]:
# bind can bind a specified axis with a thread axis, often used in gpu programming.
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda i: A[i] * 2, name="B")

s = te.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))
print(tvm.lower(s, [A, B], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*n: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*n)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [n], [stride], type="auto"), B_1: B_3: Buffer(B_2, float32, [n], [stride_1], type="auto")} {
  attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = floordiv((n + 63), 64);
  attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
  if @tir.likely((((blockIdx.x*64) + threadIdx.x) < n), dtype=bool) {
    B[(((blockIdx.x*64) + threadIdx.x)*stride_1)] = (A[(((blockIdx.x*64) + threadIdx.x)*stride)]*2f32)
  }
}




## compute_at

In [25]:
# For a schedule that consists of multiple operators, TVM will compute tensors at the root separately by default.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] + 1, name="B")
C = te.compute((m,), lambda i: B[i] * 2, name="C")

s = te.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto"),
             C: Buffer(C_2: Pointer(float32), float32, [(stride_2: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m], [stride], type="auto"), B_1: B_3: Buffer(B_2, float32, [m], [stride_1], type="auto"), C_1: C_3: Buffer(C_2, float32, [m], [stride_2], type="auto")} {
  for (i: int32, 0, m) {
    B[(i*stride_1)] = (A[(i*stride)] + 1f32)
  }
  for (i_1: int32, 0, m) {
    C[(i_1*stride_2)] = (B[(i_1*stride_1)]*2f32)
  }
}




In [26]:
# compute_at can move computation of B into the first axis of computation of C.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] + 1, name="B")
C = te.compute((m,), lambda i: B[i] * 2, name="C")

s = te.create_schedule(C.op)
s[B].compute_at(s[C], C.op.axis[0])
print(tvm.lower(s, [A, B, C], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto"),
             C: Buffer(C_2: Pointer(float32), float32, [(stride_2: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m], [stride], type="auto"), B_1: B_3: Buffer(B_2, float32, [m], [stride_1], type="auto"), C_1: C_3: Buffer(C_2, float32, [m], [stride_2], type="auto")} {
  for (i: int32, 0, m) {
    B[(i*stride_1)] = (A[(i*stride)] + 1f32)
    C[(i*stride_2)] = (B[(i*stride_1)]*2f32)
  }
}




## compute_inline

In [27]:
# compute_inline can mark one stage as inline, then the body of computation will be expanded and inserted at the address where the tensor is required.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] + 1, name="B")
C = te.compute((m,), lambda i: B[i] * 2, name="C")

s = te.create_schedule(C.op)
s[B].compute_inline()
print(tvm.lower(s, [A, B, C], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto"),
             C: Buffer(C_2: Pointer(float32), float32, [(stride_2: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m], [stride], type="auto"), B_1: B_3: Buffer(B_2, float32, [m], [stride_1], type="auto"), C_1: C_3: Buffer(C_2, float32, [m], [stride_2], type="auto")} {
  for (i: int32, 0, m) {
    C[(i*stride_2)] = ((A[(i*stride)] + 1f32)*2f32)
  }
}




## compute_root

In [28]:
# compute_root can move computation of one stage to the root.
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] + 1, name="B")
C = te.compute((m,), lambda i: B[i] * 2, name="C")

s = te.create_schedule(C.op)
s[B].compute_at(s[C], C.op.axis[0])
s[B].compute_root()
print(tvm.lower(s, [A, B, C], simple_mode=True))

@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
             B: Buffer(B_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto"),
             C: Buffer(C_2: Pointer(float32), float32, [(stride_2: int32*m)], [], type="auto")}
  buffer_map = {A_1: A, B_1: B, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [m], [stride], type="auto"), B_1: B_3: Buffer(B_2, float32, [m], [stride_1], type="auto"), C_1: C_3: Buffer(C_2, float32, [m], [stride_2], type="auto")} {
  for (i: int32, 0, m) {
    B[(i*stride_1)] = (A[(i*stride)] + 1f32)
  }
  for (i_1: int32, 0, m) {
    C[(i_1*stride_2)] = (B[(i_1*stride_1)]*2f32)
  }
}




## Summary
为了获得性能良好的内核实现，一般的工作流程往往是：
* 通过一系列compute操作描述计算。 
* 尝试使用原语来schedule。 
* 编译运行看看性能差异。 
* 根据运行结果调整schedule。