# 행렬 곱(Dot Product) 구현: CPU(NumPy)와 GPU(PyCUDA) 비교

### Use CPU

In [1]:
import numpy as np

# Side length of the square matrices used throughout the notebook.
N = 10

# Build the operands explicitly as 32-bit integers. Without a dtype,
# np.random.randint uses the platform default integer, which is 64-bit on
# many systems; the CUDA kernel in the GPU section reads its inputs through
# `int*` (32-bit), so a 64-bit buffer would be misinterpreted there.
a_mat = np.random.randint(5, size=[N, N], dtype=np.int32)
b_mat = np.random.randint(5, size=[N, N], dtype=np.int32)

# Reference result computed on the CPU; for 2-D inputs np.dot is the
# matrix product.
ret_mat_cpu = np.dot(a_mat, b_mat)

print("a_mat =")
print(a_mat)
print("\nb_mat =")
print(b_mat)
print("\nret_mat =")
print(ret_mat_cpu)

a_mat =
[[1 4 0 4 1 4 2 2 0 3]
 [3 1 0 4 1 3 0 3 4 1]
 [2 1 1 3 3 0 1 2 3 2]
 [1 2 2 2 2 0 1 0 4 2]
 [2 4 1 1 1 0 0 3 2 2]
 [3 3 3 4 0 4 1 4 3 1]
 [3 0 3 4 4 0 3 4 2 3]
 [2 4 4 3 3 0 3 3 2 3]
 [4 3 2 2 3 4 2 0 0 1]
 [3 3 2 4 2 3 2 2 2 3]]

b_mat =
[[3 2 1 0 1 0 4 0 2 4]
 [3 0 2 2 4 4 1 0 2 3]
 [2 2 1 1 1 2 1 0 3 0]
 [2 2 0 2 4 1 3 3 2 1]
 [1 2 0 2 1 3 3 4 0 3]
 [1 1 4 1 1 3 3 3 3 2]
 [3 3 4 4 0 3 1 3 1 2]
 [4 1 4 1 1 4 0 2 4 1]
 [0 4 0 1 1 3 4 0 0 3]
 [2 1 4 1 1 1 3 3 1 3]]

ret_mat =
[[48 27 53 35 43 52 46 47 43 46]
 [38 39 33 23 35 45 56 34 38 46]
 [35 37 25 26 29 40 47 34 26 42]
 [26 35 19 24 27 37 43 23 19 38]
 [39 23 31 20 31 42 33 19 31 39]
 [57 44 52 33 46 63 58 38 59 51]
 [58 52 46 40 35 56 59 54 45 53]
 [62 47 50 44 45 66 54 45 48 56]
 [44 33 40 31 34 46 53 39 39 51]
 [55 44 51 38 45 58 63 48 48 58]]


### Use GPU

In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda import driver, compiler

In [3]:
# Kernel code
#
# `mul` computes out_arr = in_arr1 x in_arr2 for square, row-major n x n
# matrices of 32-bit ints. Each thread produces one output element.
# The matrix size `n` lives in constant memory and is set from the host.
kernel_code = """
__constant__ int n;

__global__ void mul(const int* in_arr1, const int* in_arr2, int* out_arr)
{
  // Global element coordinates. With a single block this reduces to
  // threadIdx, so a (1, 1) grid launch behaves exactly as before.
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads that fall outside the matrix must neither read nor write.
  if ( col < n && row < n )
  {
    int sum = 0;

    for ( int i=0 ; i<n ; i++ )
    {
      sum += in_arr1[row * n + i] * in_arr2[i * n + col];
    }

    out_arr[row * n + col] = sum;
  }
}
"""

# Compile the kernel code
mod = compiler.SourceModule(kernel_code)

# Get kernel function
mul_func = mod.get_function("mul")

# Upload the matrix size into the `n` constant.
# The device symbol is a 4-byte `int`, so the host value must be exactly
# np.int32 (the old `np.int` alias was removed from NumPy and could be 8 bytes).
host_n = np.array([N], dtype=np.int32)
device_n = mod.get_global("n")[0]  # get_global returns (device pointer, size)
cuda.memcpy_htod(device_n, host_n)

# The kernel reads its inputs through `int*`, so hand it contiguous int32
# buffers regardless of how a_mat / b_mat were created.
a_dev_in = np.ascontiguousarray(a_mat, dtype=np.int32)
b_dev_in = np.ascontiguousarray(b_mat, dtype=np.int32)
ret_mat_gpu = np.zeros_like(a_dev_in)

# Run
# Tile the output with fixed-size blocks so N is not limited by the maximum
# number of threads in a single block; surplus threads are masked in-kernel.
BLOCK_DIM = 16
grid_dim = max(1, (N + BLOCK_DIM - 1) // BLOCK_DIM)
mul_func(
    cuda.In(a_dev_in),
    cuda.In(b_dev_in),
    cuda.Out(ret_mat_gpu),
    block=(BLOCK_DIM, BLOCK_DIM, 1),
    grid=(grid_dim, grid_dim),
)

# Print both results
print("\nGPU로 계산 =")
print(ret_mat_gpu)
print("\nCPU로 계산 =")
print(ret_mat_cpu)

# Fail loudly if the GPU result ever diverges from the CPU reference.
np.testing.assert_array_equal(ret_mat_gpu, ret_mat_cpu)


GPU로 계산 =
[[48 27 53 35 43 52 46 47 43 46]
 [38 39 33 23 35 45 56 34 38 46]
 [35 37 25 26 29 40 47 34 26 42]
 [26 35 19 24 27 37 43 23 19 38]
 [39 23 31 20 31 42 33 19 31 39]
 [57 44 52 33 46 63 58 38 59 51]
 [58 52 46 40 35 56 59 54 45 53]
 [62 47 50 44 45 66 54 45 48 56]
 [44 33 40 31 34 46 53 39 39 51]
 [55 44 51 38 45 58 63 48 48 58]]

CPU로 계산 =
[[48 27 53 35 43 52 46 47 43 46]
 [38 39 33 23 35 45 56 34 38 46]
 [35 37 25 26 29 40 47 34 26 42]
 [26 35 19 24 27 37 43 23 19 38]
 [39 23 31 20 31 42 33 19 31 39]
 [57 44 52 33 46 63 58 38 59 51]
 [58 52 46 40 35 56 59 54 45 53]
 [62 47 50 44 45 66 54 45 48 56]
 [44 33 40 31 34 46 53 39 39 51]
 [55 44 51 38 45 58 63 48 48 58]]
