# VTA 测试

## 2D 卷积
使用 NCHW 布局的卷积的数学定义：

$$
\text{Conv}[b, k, i, j] =
\sum_{d_i, d_j, q} A[b, q, \text{strides} * i + d_i, \text{strides} * j + d_j] * W[k, q, d_i, d_j],
$$

其中 $A$ 是输入张量，$W$ 是权重张量，$b$ 是批次索引，$k$ 是输出通道索引，$i$ 和 $j$ 是图像高度和宽度的索引，$d_i$ 和 $d_j$ 是权重的索引，$q$ 是输入通道索引，$\text{strides}$ 是卷积核的步幅。

下面考虑简单的情况：`stride=1, padding=0`。

In [1]:
import set_env

In [2]:
import numpy as np
import tvm
from tvm.ir.module import IRModule
from tvm.script import tir as T

In [22]:
# Problem size: batch, input channels, input height/width,
# output channels and (square) kernel size.
N, CI, H, W = 1, 1, 8, 8
CO, K = 2, 3
# Output extent of a stride-1 convolution without padding.
OUT_H = H - K + 1
OUT_W = W - K + 1
# Deterministic int32 test tensors filled with 0, 1, 2, ... in row-major order.
data = np.arange(N * CI * H * W, dtype=np.int32).reshape((N, CI, H, W))
weight = np.arange(CO * CI * K * K, dtype=np.int32).reshape((CO, CI, K, K))

In [23]:
# Reference result computed with PyTorch.
import torch

# Wrap the NumPy inputs as tensors.
data_torch, weight_torch = map(torch.from_numpy, (data, weight))
# conv2d is called with its defaults, i.e. no explicit stride or padding.
conv_torch = torch.nn.functional.conv2d(
    data_torch,
    weight_torch,
)
# Convert back to an int32 NumPy array so it can be compared with the TVM output.
conv_torch = conv_torch.numpy().astype(np.int32)
conv_torch

array([[[[ 474,  510,  546,  582,  618,  654],
         [ 762,  798,  834,  870,  906,  942],
         [1050, 1086, 1122, 1158, 1194, 1230],
         [1338, 1374, 1410, 1446, 1482, 1518],
         [1626, 1662, 1698, 1734, 1770, 1806],
         [1914, 1950, 1986, 2022, 2058, 2094]],

        [[1203, 1320, 1437, 1554, 1671, 1788],
         [2139, 2256, 2373, 2490, 2607, 2724],
         [3075, 3192, 3309, 3426, 3543, 3660],
         [4011, 4128, 4245, 4362, 4479, 4596],
         [4947, 5064, 5181, 5298, 5415, 5532],
         [5883, 6000, 6117, 6234, 6351, 6468]]]], dtype=int32)

In [None]:
# TensorIR module computing a 2D convolution in NCHW layout with stride 1 and
# no padding:
#   Y[b, k, i, j] = sum over (di, dj, q) of data[b, q, i + di, j + dj] * weight[k, q, di, dj]
# Shapes come from the notebook-level constants N, CI, H, W, CO, K, OUT_H, OUT_W.
# Only `#` comments are used here so that nothing extra reaches the TVMScript parser.
@tvm.script.ir_module
class MyConv:
  @T.prim_func
  def conv(data: T.Buffer[(N, CI, H, W), "int32"],
           weight: T.Buffer[(CO, CI, K, K), "int32"],
           Y: T.Buffer[(N, CO, OUT_H, OUT_W), "int32"]):
    T.func_attr({"global_symbol": "conv", "tir.noalias": True})
    # Four spatial loops over the output, three reduction loops over the
    # kernel window (di, dj) and the input channels (q).
    for b, k, i, j, di, dj, q in T.grid(N, CO, OUT_H, OUT_W, K, K, CI):
      with T.block("Y"):
        vb, vk, vi, vj, vdi, vdj, vq = T.axis.remap(
            "SSSSRRR", [b, k, i, j, di, dj, q])
        # Zero the accumulator before the first reduction step.
        with T.init():
          Y[vb, vk, vi, vj] = T.int32(0)
        Y[vb, vk, vi, vj] = Y[vb, vk, vi, vj] + \
            data[vb, vq, vi + vdi, vj + vdj] * weight[vk, vq, vdi, vdj]

In [None]:

# Compile the convolution module for the host CPU and run it.
rt_lib = tvm.build(MyConv, target="llvm")
data_tvm = tvm.nd.array(data)
weight_tvm = tvm.nd.array(weight)
# The output buffer must use int32 to match the "int32" dtype declared for Y
# in MyConv.conv; an int64 buffer would not match the declared buffer type.
conv_tvm = tvm.nd.array(np.empty((N, CO, OUT_H, OUT_W), dtype=np.int32))
rt_lib["conv"](data_tvm, weight_tvm, conv_tvm)
# Check the TVM result against the PyTorch reference.
np.testing.assert_allclose(conv_tvm.numpy(), conv_torch, rtol=1e-5)

In [9]:
# TensorIR module with a single primitive function `add` that writes the
# element-wise sum of two 4x4 int64 buffers A and B into C.
@tvm.script.ir_module
class Add:
    @T.prim_func
    def add(A: T.Buffer[(4, 4), "int64"],
            B: T.Buffer[(4, 4), "int64"],
            C: T.Buffer[(4, 4), "int64"]):
        # Exported symbol name; the built library is indexed with "add".
        T.func_attr({"global_symbol": "add"})
        # Iterate over every (row, column) position of the 4x4 buffers.
        for i, j in T.grid(4, 4):
            with T.block("C"):
                # Bind both loop variables to block axes ("SS": two spatial axes).
                vi, vj = T.axis.remap("SS", [i, j])
                C[vi, vj] = A[vi, vj] + B[vi, vj]

In [8]:
# Low-level NumPy version: explicit element-by-element addition.
def lnumpy_add(a: np.ndarray, b: np.ndarray, c: np.ndarray):
  """Store ``a + b`` into ``c`` for the fixed 4x4 index range.

  Only positions ``[0:4, 0:4]`` are visited; ``c`` is modified in place
  and nothing is returned.
  """
  # np.ndindex walks the 4x4 index space in row-major order,
  # the same order as two nested range(4) loops.
  for pos in np.ndindex(4, 4):
    c[pos] = a[pos] + b[pos]
# Allocate an uninitialised 4x4 int64 output and fill it with lnumpy_add.
# NOTE(review): `a` and `b` are not defined in the visible cells; they are
# presumably created in another cell -- confirm.
c_lnumpy = np.empty(shape=(4, 4), dtype=np.int64)
lnumpy_add(a, b, c_lnumpy)
c_lnumpy

array([[16, 16, 16, 16],
       [16, 16, 16, 16],
       [16, 16, 16, 16],
       [16, 16, 16, 16]])

In [10]:
# Compile the Add module for the host CPU.
rt_lib = tvm.build(Add, target="llvm")
# Wrap the NumPy inputs as TVM NDArrays and allocate the int64 output buffer.
# NOTE(review): `a`, `b` and `c_np` are not defined in the visible cells;
# presumably they come from another cell -- confirm.
a_tvm, b_tvm = tvm.nd.array(a), tvm.nd.array(b)
c_tvm = tvm.nd.array(
    np.empty((4, 4), dtype=np.int64)
)
# Invoke the compiled "add" function; the result is written into c_tvm.
rt_lib["add"](a_tvm, b_tvm, c_tvm)
np.testing.assert_allclose(c_tvm.numpy(), c_np, rtol=1e-5)

In [13]:
# Rebuild the Add module with the "c" target (rebinds rt_lib) so that the
# generated source can be printed in the next cell.
rt_lib = tvm.build(Add, target="c")

In [18]:
# Print the source code held by the built module.
print(rt_lib.get_source())

// tvm target: c -keys=cpu 
#define TVM_EXPORTS
#include "tvm/runtime/c_runtime_api.h"
#include "tvm/runtime/c_backend_api.h"
#include <math.h>
#ifdef __cplusplus
extern "C"
#endif
TVM_DLL int32_t add(void* args, int32_t* arg_type_ids, int32_t num_args, void* out_ret_value, int32_t* out_ret_tcode, void* resource_handle) {
  void* arg_A_handle = (((TVMValue*)args)[0].v_handle);
  int32_t arg_A_handle_code = arg_type_ids[0];
  void* arg_B_handle = (((TVMValue*)args)[1].v_handle);
  int32_t arg_B_handle_code = arg_type_ids[1];
  void* arg_C_handle = (((TVMValue*)args)[2].v_handle);
  int32_t arg_C_handle_code = arg_type_ids[2];
  void* A = (((DLTensor*)arg_A_handle)[0].data);
  void* arg_A_handle_shape = (((DLTensor*)arg_A_handle)[0].shape);
  void* arg_A_handle_strides = (((DLTensor*)arg_A_handle)[0].strides);
  int32_t dev_id = (((DLTensor*)arg_A_handle)[0].device.device_id);
  void* B = (((DLTensor*)arg_B_handle)[0].data);
  void* arg_B_handle_shape = (((DLTensor*)arg_B_handle)[0].shap

In [3]:
# Problem size: batch, input channels, input height/width,
# output channels and (square) kernel size.
N, CI, H, W = 1, 1, 8, 8
CO, K = 2, 3
# Output extent of a stride-1 convolution without padding.
OUT_H = H - K + 1
OUT_W = W - K + 1
# Test tensors filled with 0, 1, 2, ... using NumPy's default integer dtype.
data = np.arange(N * CI * H * W).reshape((N, CI, H, W))
weight = np.arange(CO * CI * K * K).reshape((CO, CI, K, K))