# 배열의 값들을 전부 더하기
***
Reduction: 여러 값으로부터 하나의 값을 얻는 것 (예: sum, max, min)

In [1]:
import numpy as np

# Draw ten random integers from [0, 10) and reduce them to a single value
# on the CPU.
arr = np.random.randint(10, size=10)
sum_arr = arr.sum()

# Print the input array followed by its sum, each under its own label.
print("arr =", arr, sep="\n")
print("\nsum_arr", sum_arr, sep="\n")

arr =
[9 7 9 2 1 1 7 9 5 2]

sum_arr
52


### 잘못된 방법

In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda import driver, compiler

In [3]:
# Kernel code
# NOTE: this is the intentionally broken example. Every thread performs an
# unsynchronized read-modify-write on ret_arr[0], so concurrent updates
# overwrite each other and the GPU result is generally not the full sum.
kernel_code = """
__global__ void sum_all(int* in_arr, int* ret_arr)
{
  int idx = threadIdx.x;
  ret_arr[0] += in_arr[idx];
}
"""

# Compile the kernel code
mod = compiler.SourceModule(kernel_code)

# Get kernel function
sum_func = mod.get_function("sum_all")

# Run & Print
# The kernel works on 32-bit `int`, but NumPy's default integer type is
# platform dependent (64-bit on many systems). Use explicit int32 host
# buffers so the device sees the element layout it expects.
in_arr32 = arr.astype(np.int32)
result_arr = np.zeros(1, dtype=np.int32)
# Launch one thread per input element in a single block.
sum_func(cuda.In(in_arr32), cuda.InOut(result_arr), block=(in_arr32.size, 1, 1))

print("\nCPU로 계산 =")
print(sum_arr)
print("\nGPU로 계산 =")
print(result_arr[0])


CPU로 계산 =
52

GPU로 계산 =
9


### Atomic function 사용
* 코드가 간결하지만 느리다.

In [4]:
# Kernel code
# Each thread adds its element to ret_arr[0] with atomicAdd, so the
# read-modify-write of the accumulator cannot be interleaved between threads.
kernel_code = """
__global__ void sum_all(int* in_arr, int* ret_arr)
{
  int idx = threadIdx.x;
  atomicAdd(&ret_arr[0], in_arr[idx]);
}
"""

# Compile the kernel code
mod = compiler.SourceModule(kernel_code)

# Get kernel function
sum_func = mod.get_function("sum_all")

# Run & Print
# The kernel works on 32-bit `int`, but NumPy's default integer type is
# platform dependent (64-bit on many systems). Use explicit int32 host
# buffers so the device sees the element layout it expects.
in_arr32 = arr.astype(np.int32)
# The accumulator must start at zero; InOut copies it to the device and back.
result_arr = np.zeros(1, dtype=np.int32)
# Launch one thread per input element in a single block.
sum_func(cuda.In(in_arr32), cuda.InOut(result_arr), block=(in_arr32.size, 1, 1))

print("\nCPU로 계산 =")
print(sum_arr)
print("\nGPU로 계산 =")
print(result_arr[0])


CPU로 계산 =
52

GPU로 계산 =
52


### Parallel reduction with shared memory
* 코드가 복잡하지만 빠르다.

Optimizing Parallel Reduction in CUDA의 Reduction #1을 구현<br/>
해당 Paper는 http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf 을 참고

In [5]:
# Kernel code
# Each block copies its slice of the input into shared memory, reduces it
# in place with interleaved addressing (stride s = 1, 2, 4, ...), and thread 0
# writes the block's partial sum to ret_arr[blockIdx.x].
kernel_code = """
__global__ void sum_all(int* in_arr, int* ret_arr)
{
  extern __shared__ int sdata[];

  int tid = threadIdx.x;
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  sdata[tid] = in_arr[idx];
  __syncthreads();

  // Shared memory에서 reduction한다
  for ( int s=1 ; s<blockDim.x ; s*=2 )
  {
    // Skip partners beyond the block: blockDim.x need not be a power of two,
    // and sdata only holds blockDim.x elements.
    if ( 0 == tid % (2*s) && tid+s < blockDim.x )
    {
      sdata[tid] += sdata[tid+s];
    }
    __syncthreads();
  }

  // Global memory에 결과를 쓴다
  if ( 0 == tid )
  {
    ret_arr[blockIdx.x] = sdata[0];
  }
}
"""

# Compile the kernel code
mod = compiler.SourceModule(kernel_code)

# Get kernel function
sum_func = mod.get_function("sum_all")

# Run & Print
THREAD_CNT = 5
BLOCK_CNT = 2

# The kernel works on 32-bit `int`, but NumPy's default integer type is
# platform dependent (64-bit on many systems). Use explicit int32 host
# buffers so the device sees the element layout it expects.
in_arr32 = arr.astype(np.int32)

# The kernel has no bounds check on in_arr[idx], so the launch must cover
# the input exactly.
if THREAD_CNT * BLOCK_CNT != in_arr32.size:
    raise ValueError(
        "THREAD_CNT * BLOCK_CNT must equal the number of input elements"
    )

# One partial sum per block; the partial sums are added on the host below.
result_arr = np.zeros(BLOCK_CNT, dtype=np.int32)

# `shared` is a size in bytes, not an element count: one int per thread.
shared_bytes = THREAD_CNT * in_arr32.itemsize
sum_func(
    cuda.In(in_arr32),
    cuda.Out(result_arr),
    block=(THREAD_CNT, 1, 1),
    grid=(BLOCK_CNT, 1),
    shared=shared_bytes,
)

print("\nCPU로 계산 =")
print(sum_arr)
print("\nGPU로 계산 =")
print(result_arr)
print(result_arr.sum())


CPU로 계산 =
52

GPU로 계산 =
[28 24]
52
