In [1]:
# Setup cuda environment
# !pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
# %load_ext nvcc4jupyter

### **Resources**

*   https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/
*   https://developer.nvidia.com/blog/how-overlap-data-transfers-cuda-cc/#overlapping_kernel_execution_and_data_transfers
*   https://developer.nvidia.com/blog/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
*   https://vitalitylearning.medium.com/using-c-c-and-cuda-functions-as-regular-python-functions-716f01f7ca22



In [2]:
%%writefile polyReductionGPU.cu
#include <cstdio>
#include <cuda_runtime.h>
// CUDA_CHECK(call): evaluates `call` (an expression yielding cudaError_t) once.
// On any result other than cudaSuccess it prints the source file, line and the
// CUDA error string to stderr and terminates the process with EXIT_FAILURE.
// Wrapped in do { ... } while (0) so it behaves like a single statement.
#define CUDA_CHECK(call)                                                      \
  do {                                                                        \
    const cudaError_t cuda_check_status_ = (call);                            \
    if (cuda_check_status_ == cudaSuccess) break;                             \
    fprintf(stderr, "CUDA error at %s:%d: %s\n",                              \
            __FILE__, __LINE__, cudaGetErrorString(cuda_check_status_));      \
    exit(EXIT_FAILURE);                                                       \
  } while (0)

// CUDA kernel: for every i in [0, degree) computes
//     out[i] = in[i] - in[i + degree]
//
//   in     : device buffer holding 2*degree ints (first half, then second half)
//   out    : device buffer holding degree ints; every element is overwritten
//   degree : number of output elements; values <= 0 make the kernel a no-op
//
// Uses a grid-stride loop, so the result is correct for any launch
// configuration, including grids with fewer threads than `degree`.
__global__ void polynomial_reduction_kernel(const int *in,
                                            int       *out,
                                            int        degree)
{
    // Guard before converting to an unsigned type: a negative degree would
    // otherwise turn into a huge element count.
    if (degree <= 0) {
        return;
    }

    // Index arithmetic is done in size_t. With 32-bit signed ints both
    // `idx + degree` and `idx += stride` overflow once degree exceeds 2^30.
    const size_t count  = static_cast<size_t>(degree);
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < count;
         idx += stride) {
        out[idx] = in[idx] - in[idx + count];
    }
}

// Host wrapper (C linkage, so it can be called through ctypes).
//
// Reads 2*degree ints from `polynomial`, computes
//     polynomial[i] = polynomial[i] - polynomial[i + degree]   for i in [0, degree)
// on the GPU, and writes the `degree` results back into the first half of
// `polynomial`. The second half of the host buffer is left untouched.
//
//   polynomial : host buffer with at least 2*degree ints
//   degree     : number of result elements
//
// degree == 0 is a no-op. A negative degree is reported on stderr and ignored.
// Any CUDA runtime failure is fatal: CUDA_CHECK prints the error and exits.
extern "C"
void polynomial_reduction(int *polynomial, int degree)
{
    // An empty problem must not reach the launch below: it would compute a
    // grid of zero blocks, which CUDA rejects as an invalid configuration, and
    // CUDA_CHECK would then terminate the whole host process.
    if (degree == 0) {
        return;
    }
    if (degree < 0) {
        fprintf(stderr, "polynomial_reduction: invalid degree %d\n", degree);
        return;
    }

    // Sizes are computed in size_t; `2 * degree` as an int overflows once
    // degree exceeds INT_MAX / 2.
    const size_t outCount = static_cast<size_t>(degree);
    const size_t inBytes  = 2 * outCount * sizeof(int);
    const size_t outBytes = outCount * sizeof(int);

    // Allocate device buffers
    int *d_in = nullptr, *d_out = nullptr;
    CUDA_CHECK(cudaMalloc(&d_in,  inBytes));
    CUDA_CHECK(cudaMalloc(&d_out, outBytes));

    // Copy the 2*degree input elements to the device. d_out is not cleared
    // because the kernel overwrites every one of its elements.
    CUDA_CHECK(cudaMemcpy(d_in,
                          polynomial,
                          inBytes,
                          cudaMemcpyHostToDevice));

    // One thread per output element, rounded up to whole blocks. Written as
    // (degree - 1) / T + 1 so the rounding itself cannot overflow int.
    const int threadsPerBlock = 256;
    const int blocksPerGrid   = (degree - 1) / threadsPerBlock + 1;

    polynomial_reduction_kernel
        <<<blocksPerGrid, threadsPerBlock>>>
        (d_in, d_out, degree);
    CUDA_CHECK(cudaGetLastError());        // launch-time errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // Copy back only the first `degree` results
    CUDA_CHECK(cudaMemcpy(polynomial,
                          d_out,
                          outBytes,
                          cudaMemcpyDeviceToHost));

    // Cleanup
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
}


Overwriting polyReductionGPU.cu


In [3]:
# Compile the cuda code and produce a shared library to get linked to the python main program
!nvcc -shared -Xcompiler -fPIC -o polyReductionGPU.so polyReductionGPU.cu

# Testing Polynomial Reduction

In [7]:
# test_poly_reduction.py

import ctypes

# 1) Load the shared library built from polyReductionGPU.cu.
#    The path is relative to the notebook's current working directory.
lib = ctypes.CDLL('./polyReductionGPU.so')

# 2) Declare the prototype of the exported host wrapper (not the kernel itself):
#    extern "C" void polynomial_reduction(int *polynomial, int degree);
lib.polynomial_reduction.argtypes = [
    ctypes.POINTER(ctypes.c_int),  # polynomial (length 2*degree), updated in place
    ctypes.c_int                   # degree
]
# The wrapper returns void.
lib.polynomial_reduction.restype = None

def test_polynomial_reduction(degree: int):
    # size of the input buffer
    size = 2 * degree

    # 3) Build test input:
    #    first half = [0,10,20,...,10*(degree-1)]
    #    second half = [0,1,2,...,degree-1]
    host = [i * 10 for i in range(degree)] + [i for i in range(degree)]

    print(f"\n=== Test degree={degree} ===")
    if degree < 300: print("Input (2*degree elements):", host)

    # reference: out[i] = host[i] - host[i+degree] = 9*i
    expected = [9 * i for i in range(degree)]

    # 4) Marshal into a ctypes array
    ArrayType = ctypes.c_int * size
    c_poly    = ArrayType(*host)

    # 5) Call the GPU reduction (grid‑stride inside covers all 2*degree)
    lib.polynomial_reduction(c_poly, degree)

    # 6) Read back only the first `degree` elements
    result = list(c_poly[:degree])
    if degree< 300: print("Output (first degree elements):", result)

    # 7) Verify
    if result != expected:
        # find first mismatch for debugging
        for i, (r, e) in enumerate(zip(result, expected)):
            if r != e:
                print(f"Mismatch at index {i}: GPU={r}, expected={e}")
                break
        raise AssertionError(f"Test failed for degree={degree}")
    print(f"[PASS] polynomial_reduction with degree={degree}")

if __name__ == "__main__":
    # Exercise one degree equal to the block size used by the CUDA wrapper
    # (256 threads per block) and several larger ones that need more than
    # one block: 512, 1000 and 10_000.
    for deg in [256, 512, 1_000, 10_000]:
        test_polynomial_reduction(deg)

    # Only reached when no call above raised AssertionError.
    print("All tests passed!")



=== Test degree=256 ===
Input (2*degree elements): [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650, 660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780, 790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910, 920, 930, 940, 950, 960, 970, 980, 990, 1000, 1010, 1020, 1030, 1040, 1050, 1060, 1070, 1080, 1090, 1100, 1110, 1120, 1130, 1140, 1150, 1160, 1170, 1180, 1190, 1200, 1210, 1220, 1230, 1240, 1250, 1260, 1270, 1280, 1290, 1300, 1310, 1320, 1330, 1340, 1350, 1360, 1370, 1380, 1390, 1400, 1410, 1420, 1430, 1440, 1450, 1460, 1470, 1480, 1490, 1500, 1510, 1520, 1530, 1540, 1550, 1560, 1570, 1580, 1590, 1600, 1610, 1620, 1630, 1640, 1650, 1660, 1670, 1680, 1690, 1700, 1710, 1720, 1730, 1740, 1750, 17

# Testing Large Input Size

In [None]:
# import ctypes
# import math

# # -----------------------------------------------------------------------------
# # 1) Load your CUDA library and define its prototype
# # -----------------------------------------------------------------------------
# cuda_lib = ctypes.CDLL('./polyModGPU.so')  
# cuda_lib.polynomial_mod.argtypes = [
#     ctypes.POINTER(ctypes.c_int),  # int *polynomial
#     ctypes.c_int,                  # int   size
#     ctypes.c_int,                  # int   coeff_mod
# ]
# cuda_lib.polynomial_mod.restype = None

# # -----------------------------------------------------------------------------
# # 2) Define your device‐limit constants (tweak these to match your GPU)
# #    On most cards:
# #      maxThreadsPerBlock  = 1024 (but you chose 256 in your kernel)
# #      maxGridSize[0]      ~ 2^31−1
# # -----------------------------------------------------------------------------
# MAX_THREADS_PER_BLOCK   = 256
# MAX_BLOCKS_PER_GRID     = 2**16    # a safe “minimum” for modern GPUs; you can query prop.maxGridSize[0] if you like
# MAX_ELEMS_PER_LAUNCH    = MAX_THREADS_PER_BLOCK * MAX_BLOCKS_PER_GRID

# # -----------------------------------------------------------------------------
# # 3) The “big‐array” wrapper
# # -----------------------------------------------------------------------------
# def polynomial_mod_large(polynomial: list[int], coeff_mod: int) -> None:
#     """
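#     Applies polynomial_mod() to a Python list of any length and returns the
#     result as a new list (the list passed in is not modified).
#     NOTE: splitting into chunks no bigger than MAX_ELEMS_PER_LAUNCH is not
#     implemented yet; the whole array is handed to the library in one call.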
#     """
#     total = len(polynomial)
#     polynomial_array = (ctypes.c_int * total)(*polynomial)

#     # call the CUDA wrapper once on the whole array (no slicing is done yet)
#     cuda_lib.polynomial_mod(polynomial_array, total, coeff_mod)

#     # copy results back into our Python list
#     polynomial = list(polynomial_array)
#     return polynomial

# # -----------------------------------------------------------------------------
# # 4) Usage example
# # -----------------------------------------------------------------------------
# if __name__ == "__main__":
#     # some huge polynomial:
#     poly = list(range(1_000_000))  # 1 million coefficients
#     # poly = list(range(22_000_000_00))  # GPU limit is 2147483647 or 2^31 − 1
#     mod  = 97

#     result = polynomial_mod_large(poly, mod)
#     print("First 10 results:", result[:10])
#     print("Last 10 results: ", result[-10:])


2147483647First 10 results: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Last 10 results:  [17, 18, 19, 20, 21, 22, 23, 24, 25, 26]


# Testing Batching

In [9]:
import ctypes
from typing import List

# 1) Load the compiled CUDA library and declare the prototype of its exported
#    host wrapper: extern "C" void polynomial_reduction(int *polynomial, int degree);
#    The path is relative to the notebook's current working directory.
lib = ctypes.CDLL('./polyReductionGPU.so')
lib.polynomial_reduction.argtypes = [
    ctypes.POINTER(ctypes.c_int),  # int *polynomial (length = 2*kernel_degree), updated in place
    ctypes.c_int                   # int   kernel_degree
]
# The wrapper returns void.
lib.polynomial_reduction.restype = None

def polynomial_reduction_batch(polynomials: List[List[int]],
                               degree: int,
                               coeff_mod: int = None      # unused
                              ) -> List[List[int]]:
    """
    Reduce a batch of polynomials with a single call into the CUDA library.

    Each input polynomial has 2*degree coefficients. The reduction
    ("fold-and-subtract") produces, for every polynomial p,
    ``[p[i] - p[i + degree] for i in range(degree)]``.

    To use one native call for the whole batch, the first halves of all
    polynomials are concatenated, followed by all second halves. The library
    then sees one polynomial of degree ``degree * len(polynomials)``.

    Args:
        polynomials: list of ``n`` coefficient lists, each of length 2*degree.
        degree: number of coefficients in each reduced polynomial (>= 0).
        coeff_mod: accepted for signature compatibility; not used.

    Returns:
        A list of ``n`` reduced polynomials, each of length ``degree``.
        An empty batch yields ``[]``.

    Raises:
        ValueError: if ``degree`` is negative or any polynomial does not have
            exactly 2*degree coefficients.
        OverflowError: if the flattened batch has more elements than a C
            ``int`` can index.
    """
    n = len(polynomials)
    if n == 0:
        return []

    if degree < 0:
        raise ValueError(f"degree must be non-negative, got {degree}")

    # Validate every polynomial before any marshalling work is done.
    for i, poly in enumerate(polynomials):
        if len(poly) != 2 * degree:
            raise ValueError(f"Polynomial #{i} has length {len(poly)}, expected {2*degree}")

    if degree == 0:
        # Nothing to compute; skip the native call entirely (a zero-element
        # launch is not a valid CUDA configuration).
        return [[] for _ in range(n)]

    kernel_degree = degree * n
    total_size = 2 * kernel_degree  # length of the flattened input

    # ctypes does not range-check integer arguments: a value that does not
    # fit in a C int would be silently truncated, so reject it explicitly.
    c_int_max = 2 ** (8 * ctypes.sizeof(ctypes.c_int) - 1) - 1
    if total_size > c_int_max:
        raise OverflowError(
            f"batch of {n} polynomials of degree {degree} needs {total_size} "
            f"elements, which exceeds the C int limit {c_int_max}"
        )

    # Layout: [all first halves ..., all second halves ...]
    flat = [coeff for poly in polynomials for coeff in poly[:degree]]
    flat += [coeff for poly in polynomials for coeff in poly[degree:]]

    # Marshal into a C int array; the library writes the kernel_degree results
    # into its first half.
    c_flat = (ctypes.c_int * total_size)(*flat)
    lib.polynomial_reduction(c_flat, kernel_degree)

    # Split the flat result back into n chunks of `degree` coefficients.
    result_flat = c_flat[:kernel_degree]
    return [
        result_flat[start:start + degree]
        for start in range(0, kernel_degree, degree)
    ]

# ---------------- Example Usage ----------------
if __name__ == "__main__":
    # Number of coefficients in each reduced polynomial.
    degree = 4
    # create 3 test polynomials, each length 2*degree:
    # first half = [1,2,3,4], [5,6,7,8], [9,10,11,12]
    # second half = [10,20,30,40], [50,60,70,80], [90,100,110,120]
    batch = [
        [1,2,3,4,   10,20,30,40],
        [5,6,7,8,   50,60,70,80],
        [9,10,11,12,90,100,110,120],
    ]

    # coeff_mod is accepted by the function but not used by it.
    out = polynomial_reduction_batch(batch, degree, coeff_mod=None)
    print("Input batch:")
    for p in batch:
        print(" ", p)
    print("\nReduced polynomials:")
    for r in out:
        # each r[i] == first_half[i] - second_half[i]
        print(" ", r)


Input batch:
  [1, 2, 3, 4, 10, 20, 30, 40]
  [5, 6, 7, 8, 50, 60, 70, 80]
  [9, 10, 11, 12, 90, 100, 110, 120]

Reduced polynomials:
  [-9, -18, -27, -36]
  [-45, -54, -63, -72]
  [-81, -90, -99, -108]
