# 배열의 값들을 전부 더하기
***
Reduction: 여러 값들로부터 하나의 값을 얻는 것 (예: sum, max, min)

In [1]:
import numpy as np

# Draw 10 random integers from [0, 10) and compute their sum on the CPU;
# this sum is the reference value printed next to the GPU results below.
arr = np.random.randint(0, 10, size=10)
sum_arr = arr.sum()

# Each label is followed by its value on the next line.
print("arr =", arr, sep="\n")
print("\nsum_arr", sum_arr, sep="\n")

arr =
[2 7 7 7 7 4 8 0 7 8]

sum_arr
57


### 잘못된 방법

In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda import driver, compiler

In [3]:
# Kernel code
# NOTE: this kernel is intentionally wrong. Every thread does an
# unsynchronized read-modify-write on ret_arr[0], so the updates race with
# each other and the result does not reliably equal the full sum.
kernel_code = """
__global__ void sum_all(int* in_arr, int* ret_arr)
{
  int idx = threadIdx.x;
  ret_arr[0] += in_arr[idx];
}
"""

# Compile the kernel code
mod = compiler.SourceModule(kernel_code)

# Get kernel function
sum_func = mod.get_function("sum_all")

# Run & Print
# The kernel takes 32-bit `int*` buffers, but NumPy's default integer type
# can be 64-bit depending on platform and NumPy version. Pass explicit int32
# arrays so the kernel reads one element per index instead of half-words.
arr_i32 = arr.astype(np.int32)
result_arr = np.zeros(1, dtype=np.int32)
# Launch one thread per input element, all in a single block.
sum_func(
    cuda.In(arr_i32),
    cuda.InOut(result_arr),
    block=(arr_i32.size, 1, 1),
)

print("\nCPU로 계산 =")
print(sum_arr)
print("\nGPU로 계산 =")
print(result_arr[0])


CPU로 계산 =
57

GPU로 계산 =
8


### Atomic function 사용
* 코드가 간결하지만 느리다.

In [4]:
# Kernel code
# Each thread adds its own element into ret_arr[0] with atomicAdd, so the
# concurrent updates no longer overwrite one another.
kernel_code = """
__global__ void sum_all(int* in_arr, int* ret_arr)
{
  int idx = threadIdx.x;
  atomicAdd(&ret_arr[0], in_arr[idx]);
}
"""

# Compile the kernel code
mod = compiler.SourceModule(kernel_code)

# Get kernel function
sum_func = mod.get_function("sum_all")

# Run & Print
# The kernel takes 32-bit `int*` buffers, but NumPy's default integer type
# can be 64-bit depending on platform and NumPy version. Pass explicit int32
# arrays so the kernel reads one element per index instead of half-words.
arr_i32 = arr.astype(np.int32)
result_arr = np.zeros(1, dtype=np.int32)
# Launch one thread per input element, all in a single block.
sum_func(
    cuda.In(arr_i32),
    cuda.InOut(result_arr),
    block=(arr_i32.size, 1, 1),
)

print("\nCPU로 계산 =")
print(sum_arr)
print("\nGPU로 계산 =")
print(result_arr[0])


CPU로 계산 =
57

GPU로 계산 =
57


### Parallel reduction with shared memory
* 코드가 복잡하지만 빠르다.