In [1]:
# Setup cuda environment
# !pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
# %load_ext nvcc4jupyter

### **Resources**

*   https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/
*   https://developer.nvidia.com/blog/how-overlap-data-transfers-cuda-cc/#overlapping_kernel_execution_and_data_transfers
*   https://developer.nvidia.com/blog/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
*   https://vitalitylearning.medium.com/using-c-c-and-cuda-functions-as-regular-python-functions-716f01f7ca22



In [1]:
%%writefile polyModGPU.cu
#include <cstdio>
#include <iostream>
#define CUDA_CHECK(call)                                                     \
  do {                                                                        \
    cudaError_t err = call;                                                   \
    if (err != cudaSuccess) {                                                 \
      fprintf(stderr, "CUDA error at %s:%d: %s\n",                            \
              __FILE__, __LINE__, cudaGetErrorString(err));                   \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

// CUDA kernel function
__global__ void polynomial_mod_kernel(int *polynomial, int size, int coeff_mod) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = gridDim.x * blockDim.x;
    for (; tid < size; tid += stride) {
        polynomial[tid] %= coeff_mod;
    }
}

// Wrapper function to call the CUDA kernel
extern "C" void polynomial_mod(int *polynomial, int size, int coeff_mod) {
    // Allocate device memory
    int *d_polynomial = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&d_polynomial, size * sizeof(int)));

    // Copy input data to device
    CUDA_CHECK(cudaMemcpy(d_polynomial, polynomial, size * sizeof(int), cudaMemcpyHostToDevice));

    // Launch CUDA kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    // Query max active blocks per multiprocessor
    int device;
    CUDA_CHECK(cudaGetDevice(&device));

    cudaFuncAttributes attr;
    CUDA_CHECK(cudaFuncGetAttributes(&attr, polynomial_mod_kernel));

    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));

    std::cout << prop.maxGridSize[0];
    // In case number of needed blocks exceeds hardware limit
    blocksPerGrid = std::min(blocksPerGrid, prop.maxGridSize[0]);
    
    polynomial_mod_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_polynomial, size, coeff_mod);

    CUDA_CHECK(cudaDeviceSynchronize());
    //CUDA_CHECK(cudaDeviceSynchronize());
    // Copy result back to host
    CUDA_CHECK(cudaMemcpy(polynomial, d_polynomial, size * sizeof(int), cudaMemcpyDeviceToHost));

    // Free device memory
    CUDA_CHECK(cudaFree(d_polynomial));
}

Writing polyModGPU.cu


In [None]:
# Compile the cuda code and produce a shared library to get linked to the python main program
!nvcc -shared -Xcompiler -fPIC -o polyModGPU.so polyModGPU.cu

# Testing Polynomial Modulo

In [None]:
# Python function calling the compiled C++/CUDA function


# ctypes in python bridges the gap between python dynamic data types and c static ones.
import ctypes

# Load the CUDA library
cuda_lib = ctypes.CDLL('./polyModGPU.so')  # Update with the correct path

# Define the function prototype
cuda_lib.polynomial_mod.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.c_int, ctypes.c_int]
cuda_lib.polynomial_mod.restype = None
# cuda_lib.my_cuda_function.restype = None

# Prepare data
polynomial_coeff = [1, 2, 3, 4, 8 ,9 ,10]
size = len(polynomial_coeff)
coeff_mod = 9

# Convert Python lists to ctypes arrays
polynomial_array = (ctypes.c_int * size)(*polynomial_coeff)

# Call the CUDA function
cuda_lib.polynomial_mod(polynomial_array, size, coeff_mod)

# Print the result
result = list(polynomial_array)
print('Coeff_mod:',coeff_mod )
print("Result:", result)

2147483647Coeff_mod: 9
Result: [1, 2, 3, 4, 8, 0, 1]


# Testing Large Input Size

In [None]:
import ctypes
import math

# -----------------------------------------------------------------------------
# 1) Load your CUDA library and define its prototype
# -----------------------------------------------------------------------------
cuda_lib = ctypes.CDLL('./polyModGPU.so')  
cuda_lib.polynomial_mod.argtypes = [
    ctypes.POINTER(ctypes.c_int),  # int *polynomial
    ctypes.c_int,                  # int   size
    ctypes.c_int,                  # int   coeff_mod
]
cuda_lib.polynomial_mod.restype = None

# -----------------------------------------------------------------------------
# 2) Define your device‐limit constants (tweak these to match your GPU)
#    On most cards:
#      maxThreadsPerBlock  = 1024 (but you chose 256 in your kernel)
#      maxGridSize[0]      ~ 2^31−1
# -----------------------------------------------------------------------------
MAX_THREADS_PER_BLOCK   = 256
MAX_BLOCKS_PER_GRID     = 2**16    # a safe “minimum” for modern GPUs; you can query prop.maxGridSize[0] if you like
MAX_ELEMS_PER_LAUNCH    = MAX_THREADS_PER_BLOCK * MAX_BLOCKS_PER_GRID

# -----------------------------------------------------------------------------
# 3) The “big‐array” wrapper
# -----------------------------------------------------------------------------
def polynomial_mod_large(polynomial: list[int], coeff_mod: int) -> None:
    """
    Applies polynomial_mod() in‑place over an arbitrarily large Python list
    by splitting it into chunks no bigger than MAX_ELEMS_PER_LAUNCH.
    """
    total = len(polynomial)
    polynomial_array = (ctypes.c_int * total)(*polynomial)

    # call the CUDA kernel on just this slice
    cuda_lib.polynomial_mod(polynomial_array, total, coeff_mod)

    # copy results back into our Python list
    polynomial = list(polynomial_array)
    return polynomial

# -----------------------------------------------------------------------------
# 4) Usage example
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # some huge polynomial:
    poly = list(range(1_000_000))  # 1 million coefficients
    # poly = list(range(22_000_000_00))  # GPU limit is 2147483647 or 2^31 − 1
    mod  = 97

    result = polynomial_mod_large(poly, mod)
    print("First 10 results:", result[:10])
    print("Last 10 results: ", result[-10:])


2147483647First 10 results: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Last 10 results:  [17, 18, 19, 20, 21, 22, 23, 24, 25, 26]


# Testing Batching

In [None]:
import ctypes
import math

# -----------------------------------------------------------------------------
# 1) Load your CUDA library and define its prototype
# -----------------------------------------------------------------------------
cuda_lib = ctypes.CDLL('./polyModGPU.so')  
cuda_lib.polynomial_mod.argtypes = [
    ctypes.POINTER(ctypes.c_int),  # int *polynomial
    ctypes.c_int,                  # int   size
    ctypes.c_int,                  # int   coeff_mod
]
cuda_lib.polynomial_mod.restype = None
def polynomial_mod_batch(polynomials, degree, coeff_mod):
    # 1. Validate input
    if not polynomials or degree <= 0:
        return []

    # 2. Flatten list of polynomials
    flat_list = []
    length_polynomial = degree + 1
    for poly in polynomials:
        if len(poly) != length_polynomial:
            raise ValueError(f"Expected each polynomial to have degree {degree}, got {len(poly)}.")
        flat_list.extend(poly)

    total_size = len(flat_list)

    # 3. Convert to ctypes array
    flat_array = (ctypes.c_int * total_size)(*flat_list)

    # 4. Call the CUDA kernel
    cuda_lib.polynomial_mod(flat_array, total_size, coeff_mod)

    # 5. Convert result back to list of polynomials
    result = list(flat_array)
    num_polynomials = len(polynomials)
    result_polynomials = [
        result[i * length_polynomial : (i + 1) * length_polynomial]
        for i in range(num_polynomials)
    ]

    return result_polynomials

# ------------------ Example Usage ------------------
if __name__ == "__main__":
    polys = [
        [10, 11, 12, 13],
        [20, 21, 22, 23],
        [30, 31, 32, 33]
    ]
    coeff_mod = 10
    degree = 3

    result = polynomial_mod_batch(polys, degree, coeff_mod)
    print("Original:", polys)
    print("Modded  :", result)

Original: [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33]]
Modded  : [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]
