In [1]:
# Set up the CUDA environment
# !pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
# %load_ext nvcc4jupyter

In [1]:
%%writefile polySumGPU.cu
#include <cstdio>
#include <iostream>
// Evaluates a CUDA runtime call exactly once. If the call did not return
// cudaSuccess, prints the source location and the CUDA error string to stderr
// and terminates the process with EXIT_FAILURE.
// The argument is parenthesised and the status local has a deliberately
// unusual name so that a checked expression mentioning a variable called `err`
// is not captured by the macro's own local.
#define CUDA_CHECK(call)                                                     \
  do {                                                                        \
    const cudaError_t cuda_check_status_ = (call);                            \
    if (cuda_check_status_ != cudaSuccess) {                                  \
      fprintf(stderr, "CUDA error at %s:%d: %s\n",                            \
              __FILE__, __LINE__, cudaGetErrorString(cuda_check_status_));    \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

// CUDA kernel: in-place element-wise sum.
// For every i in [0, size): input2[i] = input1[i] + input2[i].
//
// Grid-stride loop: each thread starts at its global index and advances by the
// total number of launched threads (gridDim.x * blockDim.x), so the launch does
// not need one thread per element.
//
// The index and stride are held in 64 bits. With 32-bit ints,
// gridDim.x * blockDim.x can reach 2^31 for sizes close to INT_MAX, which would
// wrap to a negative stride and drive the index out of bounds.
__global__ void poly_sum_kernel(int *input1, int *input2, int size) {
    const long long count = size;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    for (; idx < count; idx += stride) {
        input2[idx] = input1[idx] + input2[idx];
    }
}

// Returns the maximum number of threads per block supported by the CUDA device
// that is current for the calling host thread (device 0 unless the caller has
// selected another one), i.e. the same device poly_sum launches on.
//
// Both runtime calls are checked: on failure CUDA_CHECK reports the error and
// terminates the process instead of returning an uninitialised value.
// Only the single attribute is queried rather than the full property struct.
extern "C" int get_max_threads_per_block() {
    int device = 0;
    CUDA_CHECK(cudaGetDevice(&device));

    int maxThreadsPerBlock = 0;
    CUDA_CHECK(cudaDeviceGetAttribute(&maxThreadsPerBlock,
                                      cudaDevAttrMaxThreadsPerBlock,
                                      device));
    return maxThreadsPerBlock;
}

// Host wrapper around poly_sum_kernel, exported with C linkage.
//
// Computes input2[i] = input1[i] + input2[i] for i in [0, size) on the GPU:
// both host arrays are copied to the device, the kernel is run, and the result
// is copied back into the host array `input2`. `input1` is left untouched.
//
//   input1  host pointer to `size` ints (read only)
//   input2  host pointer to `size` ints (read, then overwritten with the sums)
//   size    number of elements; values <= 0 make the call a no-op
//
// Any CUDA runtime failure is reported by CUDA_CHECK, which terminates the
// process.
extern "C" void poly_sum(int *input1, int *input2, int size) {
    // An empty request needs no device work. A negative size would otherwise be
    // converted to an enormous byte count, and a zero size to a zero-block
    // launch, which is an invalid configuration.
    if (size <= 0) {
        return;
    }

    const size_t bytes = static_cast<size_t>(size) * sizeof(int);

    // Allocate device memory
    int *d_input1 = nullptr;
    int *d_input2 = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&d_input1, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_input2, bytes));

    // Copy input data to device
    CUDA_CHECK(cudaMemcpy(d_input1, input1, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_input2, input2, bytes, cudaMemcpyHostToDevice));

    // Launch configuration: one thread per element, rounded up to whole blocks.
    // The rounding is done in 64 bits so size + threadsPerBlock - 1 cannot
    // overflow int for sizes close to INT_MAX.
    const int threadsPerBlock = 256;
    const long long blocksNeeded =
        (static_cast<long long>(size) + threadsPerBlock - 1) / threadsPerBlock;

    // Clamp to the x-dimension grid limit of the current device; the kernel's
    // grid-stride loop covers the remaining elements when the grid is clamped.
    int device = 0;
    CUDA_CHECK(cudaGetDevice(&device));
    int maxGridDimX = 0;
    CUDA_CHECK(cudaDeviceGetAttribute(&maxGridDimX, cudaDevAttrMaxGridDimX, device));

    const int blocksPerGrid =
        blocksNeeded < maxGridDimX ? static_cast<int>(blocksNeeded) : maxGridDimX;

    poly_sum_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_input1, d_input2, size);
    // A kernel launch returns no status itself; launch errors (for example an
    // invalid configuration) have to be fetched explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy result back to host. This copy is issued after the kernel on the
    // same (default) stream, so it also reports errors raised while the kernel
    // was executing.
    CUDA_CHECK(cudaMemcpy(input2, d_input2, bytes, cudaMemcpyDeviceToHost));

    // Free device memory
    CUDA_CHECK(cudaFree(d_input1));
    CUDA_CHECK(cudaFree(d_input2));
}

Writing polySumGPU.cu


In [None]:
# Compile the CUDA source into a shared library (polySumGPU.so, built with -shared and
# position-independent code via -fPIC) so the Python cells can load it through ctypes.
!nvcc  -o polySumGPU.so -shared -Xcompiler -fPIC polySumGPU.cu

# Testing Polynomial Sum

In [None]:
# Drive the compiled C++/CUDA poly_sum routine from Python.

# ctypes converts between Python's dynamic objects and the fixed C types the library expects.
import ctypes

# Open the shared library built from polySumGPU.cu
cuda_lib = ctypes.CDLL('./polySumGPU.so')  # Update with the correct path

# Declare the C signature: void poly_sum(int *, int *, int)
cuda_lib.poly_sum.restype = None
cuda_lib.poly_sum.argtypes = [
    ctypes.POINTER(ctypes.c_int),
    ctypes.POINTER(ctypes.c_int),
    ctypes.c_int,
]

# Two small coefficient lists of equal length
input_data_1 = [1, 2, 3, 4]
input_data_2 = [5, 2, 9, 7]
size = len(input_data_1)

# Copy the Python lists into C int arrays that can be passed by pointer
input_array_1 = (ctypes.c_int * size)(*input_data_1)
input_array_2 = (ctypes.c_int * size)(*input_data_2)

# Run the sum on the GPU; the result overwrites the second array
cuda_lib.poly_sum(input_array_1, input_array_2, size)

# Show the summed coefficients
result = list(input_array_2)
print("Result:", result)

Result: [6, 4, 12, 11]


# Testing Large Input Size

In [None]:
import ctypes

# 1) Open the compiled shared library and declare poly_sum's C signature
cuda_lib = ctypes.CDLL('./polySumGPU.so')  # adjust if your .so has a different name
cuda_lib.poly_sum.restype = None  # the C function returns void
cuda_lib.poly_sum.argtypes = [
    ctypes.POINTER(ctypes.c_int),  # input1
    ctypes.POINTER(ctypes.c_int),  # input2 (overwritten with the result)
    ctypes.c_int,                  # size
]

def test_poly_sum(N: int):
    # 2) Build two big “polynomials” of length N
    input1 = list(range(N))
    input2 = list(range(N, 2*N))
    # 3) Reference sum on CPU
    expected = [a + b for a, b in zip(input1, input2)]

    # 4) Marshall into ctypes arrays
    ArrayType = ctypes.c_int * N
    c_in1 = ArrayType(*input1)
    c_in2 = ArrayType(*input2)

    # 5) Call the GPU kernel once for the entire N
    cuda_lib.poly_sum(c_in1, c_in2, N)

    # 6) Copy back and compare
    gpu_out = list(c_in2)
    if gpu_out != expected:
        # find the first mismatch
        for i, (g, e) in enumerate(zip(gpu_out, expected)):
            if g != e:
                print(f"Mismatch at index {i}: GPU={g}  CPU={e}")
                break
        raise AssertionError("GPU result does not match CPU result!")
    print(f"[PASS] N={N}")

if __name__ == "__main__":
    # Sizes from exactly one block of 256 threads up to many blocks
    for N in (256, 1024, 10_000, 100_000):
        test_poly_sum(N)


# Testing Batching

In [None]:
import ctypes
from typing import List, Tuple

# -----------------------------------------------------------------------------
# 1) Open the compiled CUDA library and declare poly_sum's C signature
# -----------------------------------------------------------------------------
cuda_lib = ctypes.CDLL('./polySumGPU.so')  # or whatever your .so is named
cuda_lib.poly_sum.restype = None  # void return
cuda_lib.poly_sum.argtypes = [
    ctypes.POINTER(ctypes.c_int),  # int *input1
    ctypes.POINTER(ctypes.c_int),  # int *input2  (overwritten with the result)
    ctypes.c_int,                  # int   size
]

# -----------------------------------------------------------------------------
# 2) Batch‐sum helper
# -----------------------------------------------------------------------------
def polynomial_sum_batch(
    polynomial_pairs: List[Tuple[List[int], List[int]]],
    degree: int
) -> List[List[int]]:
    """
    Sum many polynomial pairs with a single `cuda_lib.poly_sum` call.

    Args:
        polynomial_pairs: list of (polyA, polyB) coefficient lists; every
            list must hold exactly `degree + 1` coefficients.
        degree: common degree of all polynomials.

    Returns:
        One list per input pair, holding polyA + polyB element-wise, in the
        same order as the input. An empty input yields an empty list and the
        library is not called.

    Raises:
        ValueError: if either list of any pair is not `degree + 1` long.
    """
    pair_count = len(polynomial_pairs)
    if pair_count == 0:
        return []

    # --- Check every pair and concatenate each side into one flat list ---
    coeffs_per_poly = degree + 1
    lhs_coeffs: List[int] = []
    rhs_coeffs: List[int] = []
    for pair_index, (lhs, rhs) in enumerate(polynomial_pairs):
        if not (len(lhs) == coeffs_per_poly and len(rhs) == coeffs_per_poly):
            raise ValueError(
                f"Pair #{pair_index} has lengths ({len(lhs)}, {len(rhs)}), "
                f"but expected both == {coeffs_per_poly}"
            )
        lhs_coeffs += lhs
        rhs_coeffs += rhs

    flat_length = len(lhs_coeffs)  # pair_count * coeffs_per_poly

    # --- Copy both flat lists into C int arrays ---
    buffer_type = ctypes.c_int * flat_length
    c_lhs = buffer_type(*lhs_coeffs)
    c_rhs = buffer_type(*rhs_coeffs)

    # --- One library call for the whole batch; the sums overwrite c_rhs ---
    cuda_lib.poly_sum(c_lhs, c_rhs, flat_length)

    # --- Cut the flat result back into one list per input pair ---
    summed = list(c_rhs)
    chunks = []
    for pair_index in range(pair_count):
        start = pair_index * coeffs_per_poly
        chunks.append(summed[start:start + coeffs_per_poly])
    return chunks

# -----------------------------------------------------------------------------
# 3) Example usage
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Every polynomial below has degree 3, i.e. four coefficients
    deg = 3

    # Three (polyA, polyB) pairs to be summed in one batch
    polys = [
        ([1, 2, 3, 4], [10, 20, 30, 40]),
        ([5, 6, 7, 8], [50, 60, 70, 80]),
        ([9, 10, 11, 12], [90, 100, 110, 120]),
    ]

    summed = polynomial_sum_batch(polys, deg)

    # Echo the inputs, then the per-pair sums
    print("Input pairs:")
    for a, b in polys:
        print(" ", a, "+", b)
    print("Resulting sums:")
    for r in summed:
        print(" ", r)




# def polynomial_sum_batch(polynomials, degree, coeff_mod):
#     # TODO: Polynomials should be a list of pairs, where each pair holds the 2 polynomials we need to sum
#     # TODO: Concatenate the first elements of all pairs into one list and the second elements into another
#     # TODO: Call the kernel with these 2 lists, and don't forget to pass the length of one of them
#     # TODO: Divide the result into 'x' objects, where each one has a length of degree
#     # TODO: Return this new list as the result

Input pairs:
  [1, 2, 3, 4] + [10, 20, 30, 40]
  [5, 6, 7, 8] + [50, 60, 70, 80]
  [9, 10, 11, 12] + [90, 100, 110, 120]
Resulting sums:
  [11, 22, 33, 44]
  [55, 66, 77, 88]
  [99, 110, 121, 132]


### **Resources**

*   https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/
*   https://developer.nvidia.com/blog/how-overlap-data-transfers-cuda-cc/#overlapping_kernel_execution_and_data_transfers
*   https://developer.nvidia.com/blog/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
*   https://vitalitylearning.medium.com/using-c-c-and-cuda-functions-as-regular-python-functions-716f01f7ca22

