# Dot Product 구현

### Use CPU

In [1]:
import numpy as np

# Side length of the square matrices used throughout the notebook.
N = 10

# Two N x N operands filled with random integers drawn from [0, 5).
mat_shape = (N, N)
a_mat = np.random.randint(5, size=mat_shape)
b_mat = np.random.randint(5, size=mat_shape)

# CPU reference result: the matrix product of the two operands.
ret_mat_cpu = a_mat.dot(b_mat)

# Show each operand and the product, each preceded by its heading.
for heading, matrix in (
    ("a_mat =", a_mat),
    ("\nb_mat =", b_mat),
    ("\nret_mat =", ret_mat_cpu),
):
    print(heading)
    print(matrix)

a_mat =
[[3 0 4 4 0 4 4 3 2 4]
 [2 4 2 0 4 2 2 3 2 2]
 [1 3 2 3 4 4 2 1 3 4]
 [0 0 1 1 3 4 3 2 0 1]
 [3 1 1 4 4 3 0 4 1 1]
 [0 4 2 3 2 0 2 0 0 2]
 [2 0 1 3 4 4 2 4 2 2]
 [0 2 3 0 3 0 3 3 2 2]
 [1 4 2 4 2 2 1 3 4 3]
 [4 3 2 3 1 1 0 0 1 2]]

b_mat =
[[1 2 3 0 3 3 1 1 4 4]
 [1 1 2 2 0 0 1 3 0 1]
 [1 2 4 1 3 3 0 3 0 3]
 [3 0 3 1 3 3 0 0 3 2]
 [3 0 2 1 0 0 0 4 4 3]
 [0 4 3 2 4 1 1 2 3 2]
 [3 4 2 2 1 4 4 3 0 0]
 [1 2 1 1 1 3 3 1 3 4]
 [3 3 1 0 1 2 4 4 1 3]
 [0 4 2 0 2 4 0 1 0 0]]

ret_mat =
[[40 74 70 27 66 82 40 50 47 58]
 [35 48 49 25 31 43 33 59 41 52]
 [43 60 62 28 48 55 31 63 47 52]
 [24 38 35 21 29 32 22 35 33 30]
 [36 36 51 21 43 45 23 40 62 61]
 [27 24 37 19 21 31 12 34 17 22]
 [40 52 53 24 46 54 34 49 59 59]
 [32 40 37 19 21 42 31 49 23 38]
 [43 52 56 25 43 56 36 56 43 56]
 [24 30 45 14 36 38 12 31 33 39]]


### Use GPU

In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda import driver, compiler

In [3]:
# Kernel code
# Each thread computes one element of the N x N product. The matrix side
# length is read from the constant `n`, which the host sets before launching.
kernel_code = """
__device__ __constant__ int n;

__global__ void mul(const int* in_arr1, const int* in_arr2, int* out_arr)
{
  // Global element coordinates; with a single block this reduces to threadIdx.
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads that fall outside the matrix must neither read nor write.
  if ( col < n && row < n )
  {
    int sum = 0;

    for ( int i=0 ; i<n ; i++ )
    {
      sum += in_arr1[row * n + i] * in_arr2[i * n + col];
    }

    out_arr[row * n + col] = sum;
  }
}
"""

# Compile the kernel code
mod = compiler.SourceModule(kernel_code)

# Get kernel function
mul_func = mod.get_function("mul")

# Set the constant `n`. The kernel declares it as a 32-bit `int`, so the host
# value must be exactly 4 bytes (`np.int` was removed from NumPy and was
# platform-dependent anyway).
host_n = np.array([N], dtype=np.int32)
device_n = mod.get_global("n")[0]  # get_global returns (device_ptr, size_in_bytes)
cuda.memcpy_htod(device_n, host_n)

# The kernel indexes its buffers as 32-bit ints, while NumPy's default integer
# is 64-bit on many platforms; convert explicitly so the memory layouts match.
a_mat_i32 = np.ascontiguousarray(a_mat, dtype=np.int32)
b_mat_i32 = np.ascontiguousarray(b_mat, dtype=np.int32)

# Run
# Use fixed-size thread blocks and enough blocks to cover the whole matrix,
# so N is not capped by the per-block thread limit.
threads_per_axis = 16
blocks_per_axis = (N + threads_per_axis - 1) // threads_per_axis
ret_mat_gpu = np.zeros((N, N), dtype=np.int32)
mul_func(
    cuda.In(a_mat_i32),
    cuda.In(b_mat_i32),
    cuda.Out(ret_mat_gpu),
    block=(threads_per_axis, threads_per_axis, 1),
    grid=(blocks_per_axis, blocks_per_axis),
)

# Output
print("\nGPU로 계산 =")
print(ret_mat_gpu)
print("\nCPU로 계산 =")
print(ret_mat_cpu)

# Fail loudly if the GPU result diverges from the CPU reference.
np.testing.assert_array_equal(ret_mat_gpu, ret_mat_cpu)


GPU로 계산 =
[[40 74 70 27 66 82 40 50 47 58]
 [35 48 49 25 31 43 33 59 41 52]
 [43 60 62 28 48 55 31 63 47 52]
 [24 38 35 21 29 32 22 35 33 30]
 [36 36 51 21 43 45 23 40 62 61]
 [27 24 37 19 21 31 12 34 17 22]
 [40 52 53 24 46 54 34 49 59 59]
 [32 40 37 19 21 42 31 49 23 38]
 [43 52 56 25 43 56 36 56 43 56]
 [24 30 45 14 36 38 12 31 33 39]]

CPU로 계산 =
[[40 74 70 27 66 82 40 50 47 58]
 [35 48 49 25 31 43 33 59 41 52]
 [43 60 62 28 48 55 31 63 47 52]
 [24 38 35 21 29 32 22 35 33 30]
 [36 36 51 21 43 45 23 40 62 61]
 [27 24 37 19 21 31 12 34 17 22]
 [40 52 53 24 46 54 34 49 59 59]
 [32 40 37 19 21 42 31 49 23 38]
 [43 52 56 25 43 56 36 56 43 56]
 [24 30 45 14 36 38 12 31 33 39]]
