Импорт необходимых библиотек

In [178]:
import cupy as cp
import numpy as np
import time
import pandas as pd

Класс для взаимодействия и получения результатов вычислений

In [179]:
class CudaVecSum:
    def __init__(self, vec_size: int, vec: cp.ndarray, pars: dict):
        self.add_kernel = cp.RawKernel(r'''
            extern "C" 
            __global__ void vecsum(const float* input, float* output, int n) {
                int idx = blockIdx.x * blockDim.x + threadIdx.x;
                if (idx < n) {
                atomicAdd(&output[0], input[idx]);
                }
            }
            ''',
            "vecsum")
        self.N = vec_size
        self.vector = vec
        self.params = pars
        self.result = cp.zeros(1, dtype=cp.float32)

    def vec_sum(self):
        res = 0.0
        for el in self.vector: res += el
        return res

    def get_result(self):
        gstart = time.perf_counter()
        result_gpu = self.add_kernel((self.params["gridX"], 1),
                                    (self.params["blockX"], 1),
                                    (self.vector, self.result, self.N))
        gend = time.perf_counter()

        csumstart = time.perf_counter()
        result_cpu_sum = sum(self.vector)
        csumend = time.perf_counter()

        cfuncstart = time.perf_counter()
        result_cpu_func = self.vec_sum()
        cfuncend = time.perf_counter()

        cnpstart = time.perf_counter()
        result_cpu_np = np.sum(self.vector)
        cnpend = time.perf_counter()

        cupystart = time.perf_counter()
        result_cupy = cp.sum(self.vector)
        cupyend = time.perf_counter()

        return {
            "vector size": self.N,
            "parameters": str(self.params),
            "gpu matched cpu sum": str(self.result == result_cpu_sum) + " " + str(self.result) + " " + str(result_cpu_sum),
            "gpu matched cpu func": str(self.result == result_cpu_func) + " " + str(self.result) + " " + str(result_cpu_func),
            "gpu matched cpu numpy": str(self.result == result_cpu_np) + " " + str(self.result) + " " + str(result_cpu_np),
            "gpu matched cupy": str(self.result == result_cupy) + " " + str(self.result) + " " + str(result_cupy),
            "gpu time": (gend - gstart),
            "cpu sum time": (csumend - csumstart),
            "cpu func time": (cfuncend - cfuncstart),
            "cpu numpy time": (cnpend - cnpstart),
            "cupy time": (cupyend - cupystart),
        }

Задаваемые значения и полученные результаты для версии с подобранными значениями для grid и block

In [180]:
size_list = [1000, 5000, 10000, 50000, 100000, 500000, 1000000]
grid_block_list = [(32, 32), (128, 64), (128, 128), (256, 256), (512, 256), (1024, 512), (1024, 1024)]

res_list = []

for size, grid_block in zip(size_list, grid_block_list):
    obj = CudaVecSum(size,
                    cp.random.randn(size, dtype=cp.float32),
                    {
                        "gridX": grid_block[0],
                        "blockX": grid_block[1],
                    })
    res_list.append(obj.get_result())

df1 = pd.DataFrame.from_records(res_list)
print(df1.to_markdown())

|    |   vector size | parameters                      | gpu matched cpu sum              | gpu matched cpu func             | gpu matched cpu numpy           | gpu matched cupy                |    gpu time |   cpu sum time |   cpu func time |   cpu numpy time |   cupy time |
|---:|--------------:|:--------------------------------|:---------------------------------|:---------------------------------|:--------------------------------|:--------------------------------|------------:|---------------:|----------------:|-----------------:|------------:|
|  0 |          1000 | {'gridX': 32, 'blockX': 32}     | [False] [-23.980793] -23.980791  | [False] [-23.980793] -23.980791  | [False] [-23.980793] -23.98079  | [False] [-23.980793] -23.98079  | 7.73e-05    |      0.0143505 |       0.0096166 |      7.37e-05    | 1.59e-05    |
|  1 |          5000 | {'gridX': 128, 'blockX': 64}    | [False] [-57.918102] -57.91822   | [False] [-57.918102] -57.91822   | [False] [-57.918102] -57.91811  | [False] 

Задаваемые значения и полученные результаты для версии с одинаковыми максимальными значениями для grid и block

In [181]:
size_list = [1000, 5000, 10000, 50000, 100000, 500000, 1000000]
grid_block_list = [(1024, 1024), (1024, 1024), (1024, 1024), (1024, 1024), (1024, 1024), (1024, 1024), (1024, 1024)]

res_list = []

for size, grid_block in zip(size_list, grid_block_list):
    obj = CudaVecSum(size,
                    cp.random.randn(size, dtype=cp.float32),
                    {
                        "gridX": grid_block[0],
                        "blockX": grid_block[1],
                    })
    res_list.append(obj.get_result())

df2 = pd.DataFrame.from_records(res_list)
print(df2.to_markdown())

|    |   vector size | parameters                      | gpu matched cpu sum             | gpu matched cpu func            | gpu matched cpu numpy           | gpu matched cupy                |    gpu time |   cpu sum time |   cpu func time |   cpu numpy time |   cupy time |
|---:|--------------:|:--------------------------------|:--------------------------------|:--------------------------------|:--------------------------------|:--------------------------------|------------:|---------------:|----------------:|-----------------:|------------:|
|  0 |          1000 | {'gridX': 1024, 'blockX': 1024} | [False] [-20.829592] -20.829573 | [False] [-20.829592] -20.829573 | [False] [-20.829592] -20.829597 | [False] [-20.829592] -20.829597 | 3.38e-05    |      0.0129546 |       0.0089493 |      6.69002e-05 | 1.49e-05    |
|  1 |          5000 | {'gridX': 1024, 'blockX': 1024} | [False] [-61.4354] -61.435253   | [False] [-61.4354] -61.435253   | [False] [-61.4354] -61.435204   | [False] [-61.435

Результаты ускорений

In [213]:
perf_rate_1 = pd.DataFrame({
    "size": size_list,
    "cpu sum vs gpu": (df1["cpu sum time"] / df1["gpu time"]).apply('{:.2f}'.format),
    "cpu func vs gpu": (df1["cpu func time"] / df1["gpu time"]).apply('{:.2f}'.format),
    "cpu numpy vs gpu": (df1["cpu numpy time"] / df1["gpu time"]).apply('{:.2f}'.format),
    "cupy vs gpu": (df1["cupy time"] / df1["gpu time"]).apply('{:.2f}'.format),
    })

print(perf_rate_1.to_markdown(index=False))

|    size |   cpu sum vs gpu |   cpu func vs gpu |   cpu numpy vs gpu |   cupy vs gpu |
|--------:|-----------------:|------------------:|-------------------:|--------------:|
|    1000 |           185.65 |            124.41 |               0.95 |          0.21 |
|    5000 |          1772.16 |           2569.71 |               2.18 |          0.53 |
|   10000 |          5226.51 |           4267.28 |               2.94 |          0.77 |
|   50000 |         27646.2  |          21304.7  |               3.35 |          0.9  |
|  100000 |         51042.6  |          40044.5  |               3.28 |          1.17 |
|  500000 |        266257    |         210885    |               4.02 |          1    |
| 1000000 |        480257    |         366171    |               3.01 |          1.06 |


Результаты ускорений

In [214]:
perf_rate_2 = pd.DataFrame({
    "size": size_list,
    "cpu sum vs gpu": (df2["cpu sum time"] / df2["gpu time"]).apply('{:.2f}'.format),
    "cpu func vs gpu": (df2["cpu func time"] / df2["gpu time"]).apply('{:.2f}'.format),
    "cpu numpy vs gpu": (df2["cpu numpy time"] / df2["gpu time"]).apply('{:.2f}'.format),
    "cupy vs gpu": (df2["cupy time"] / df2["gpu time"]).apply('{:.2f}'.format),
    })

print(perf_rate_2.to_markdown(index=False))

|    size |   cpu sum vs gpu |   cpu func vs gpu |   cpu numpy vs gpu |   cupy vs gpu |
|--------:|-----------------:|------------------:|-------------------:|--------------:|
|    1000 |           383.27 |            264.77 |               1.98 |          0.44 |
|    5000 |          2500.64 |           1916.6  |               2.82 |          0.73 |
|   10000 |          5515.95 |           4215.42 |               3.29 |          0.87 |
|   50000 |         26686.6  |          20638.9  |               3.35 |          0.86 |
|  100000 |         53025    |          40517.5  |               3.16 |          0.88 |
|  500000 |        269427    |         207976    |               3.58 |          0.94 |
| 1000000 |        540862    |         420786    |               3.36 |          1.19 |
