In [1]:
import ctypes
import os
import numpy as np
from time import perf_counter

In [2]:
cuda_bin = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin"
os.add_dll_directory(cuda_bin)

dll_path = r"D:\CUFE\Parallel\Project\fourier-ckks\fft_cuda\fft.dll"
lib = ctypes.CDLL(dll_path)

In [3]:
# Prototype helper
def _make_proto(fn, restype, argtypes):
    f = getattr(lib, fn)
    f.restype = restype
    f.argtypes = argtypes
    return f

In [4]:
# 1D
fft1d = _make_proto(
    "fft1d_gpu_c", ctypes.c_int,
    [ctypes.POINTER(ctypes.c_float),  # in_real
     ctypes.POINTER(ctypes.c_float),  # in_imag
     ctypes.POINTER(ctypes.c_float),  # out_real
     ctypes.POINTER(ctypes.c_float),  # out_imag
     ctypes.c_uint]                   # N
)
ifft1d = _make_proto(
    "ifft1d_gpu_c", ctypes.c_int,
    fft1d.argtypes
)

# 2D
fft2d = _make_proto(
    "fft2d_gpu_c", ctypes.c_int,
    [ctypes.POINTER(ctypes.c_float),
     ctypes.POINTER(ctypes.c_float),
     ctypes.POINTER(ctypes.c_float),
     ctypes.POINTER(ctypes.c_float),
     ctypes.c_uint,
     ctypes.c_uint]
)
ifft2d = _make_proto(
    "ifft2d_gpu_c", ctypes.c_int,
    [ctypes.POINTER(ctypes.c_float),
     ctypes.POINTER(ctypes.c_float),
     ctypes.POINTER(ctypes.c_float),
     ctypes.POINTER(ctypes.c_float),
     ctypes.c_uint,
     ctypes.c_uint]
)


In [5]:
def run_1d(N=1024):
    # prepare random complex input
    x = np.random.randn(N) + 1j*np.random.randn(N)
    in_real = np.ascontiguousarray(x.real, dtype=np.float32)
    in_imag = np.ascontiguousarray(x.imag, dtype=np.float32)
    out_real = np.zeros(N, dtype=np.float32)
    out_imag = np.zeros(N, dtype=np.float32)

    # — CPU forward FFT timing —
    t0 = perf_counter()
    y_cpu = np.fft.fft(x)
    t1 = perf_counter()

    # — GPU forward FFT timing —
    t2 = perf_counter()
    err = fft1d(
        in_real.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        in_imag.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        out_real.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        out_imag.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        N
    )
    t3 = perf_counter()
    assert err == 0
    y_gpu = out_real + 1j*out_imag

    # print forward results & timings
    print(f"1D FFT forward:    CPU time = {t1-t0:.6f}s, GPU time = {t3-t2:.6f}s")
    print("1D FFT forward max abs error:", np.max(np.abs(y_gpu - y_cpu)))

    # — CPU inverse FFT timing —
    t4 = perf_counter()
    x_rec_cpu = np.fft.ifft(y_cpu)
    t5 = perf_counter()

    # — GPU inverse FFT timing —
    inv_real = np.zeros(N, dtype=np.float32)
    inv_imag = np.zeros(N, dtype=np.float32)
    t6 = perf_counter()
    err = ifft1d(
        out_real.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        out_imag.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        inv_real.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        inv_imag.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        N
    )
    t7 = perf_counter()
    assert err == 0
    x_rec_gpu = inv_real + 1j*inv_imag

    # print inverse results & timings
    print(f"1D FFT inverse:    CPU time = {t5-t4:.6f}s, GPU time = {t7-t6:.6f}s")
    print("1D reconstruction max abs error:", np.max(np.abs(x_rec_gpu - x_rec_cpu)))
    print()

def run_2d(rows=64, cols=128):
    # prepare random complex input
    x = np.random.randn(rows, cols) + 1j*np.random.randn(rows, cols)
    in_real = np.ascontiguousarray(x.real, dtype=np.float32)
    in_imag = np.ascontiguousarray(x.imag, dtype=np.float32)
    total = rows*cols
    out_real = np.zeros(total, dtype=np.float32)
    out_imag = np.zeros(total, dtype=np.float32)

    # — CPU forward FFT2 timing —
    t0 = perf_counter()
    y_cpu = np.fft.fft2(x)
    t1 = perf_counter()

    # — GPU forward FFT2 timing —
    t2 = perf_counter()
    err = fft2d(
        in_real.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        in_imag.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        out_real.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        out_imag.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        rows, cols
    )
    t3 = perf_counter()
    assert err == 0
    y_gpu = (out_real + 1j*out_imag).reshape(rows, cols)

    # print forward results & timings
    print(f"2D FFT forward:    CPU time = {t1-t0:.6f}s, GPU time = {t3-t2:.6f}s")
    print("2D FFT forward max abs error:", np.max(np.abs(y_gpu - y_cpu)))

    # — CPU inverse FFT2 timing —
    t4 = perf_counter()
    x_rec_cpu = np.fft.ifft2(y_cpu)
    t5 = perf_counter()

    # — GPU inverse FFT2 timing —
    inv_real = np.zeros(total, dtype=np.float32)
    inv_imag = np.zeros(total, dtype=np.float32)
    t6 = perf_counter()
    err = ifft2d(
        out_real.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        out_imag.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        inv_real.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        inv_imag.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        rows, cols
    )
    t7 = perf_counter()
    assert err == 0
    x_rec_gpu = (inv_real + 1j*inv_imag).reshape(rows, cols)

    # print inverse results & timings
    print(f"2D FFT inverse:    CPU time = {t5-t4:.6f}s, GPU time = {t7-t6:.6f}s")
    print("2D reconstruction max abs error:", np.max(np.abs(x_rec_gpu - x_rec_cpu)))
    print()


In [6]:
run_1d(2**25)

1D FFT forward:    CPU time = 1.486952s, GPU time = 1.874848s
1D FFT forward max abs error: 0.031141051865811446
1D FFT inverse:    CPU time = 1.256870s, GPU time = 0.396210s
1D reconstruction max abs error: 3.184663920455218e-06



In [7]:
run_2d(64, 128)

2D FFT forward:    CPU time = 0.000327s, GPU time = 0.001080s
2D FFT forward max abs error: 0.00011913365107303915
2D FFT inverse:    CPU time = 0.000209s, GPU time = 0.000624s
2D reconstruction max abs error: 1.4377007209375144e-06



In [8]:
def benchmark_1d(max_exp):
    """
    Runs 1D FFT for N = 2,4,8,...,2**max_exp.
    Prints CPU vs GPU time and max abs error against np.fft.fft.
    """
    header = f"{'N':>6}  {'CPU(s)':>8}  {'GPU(s)':>8}  {'MaxErr':>10}"
    print("1D FFT Benchmark")
    print(header)
    print("-"*len(header))
    for exp in range(1, max_exp+1):
        N = 1 << exp

        # make test vector
        x = (np.random.randn(N) + 1j*np.random.randn(N)).astype(np.complex64)
        in_r = np.ascontiguousarray(x.real, dtype=np.float32)
        in_i = np.ascontiguousarray(x.imag, dtype=np.float32)
        out_r = np.empty(N, dtype=np.float32)
        out_i = np.empty(N, dtype=np.float32)

        # CPU FFT
        t0 = perf_counter()
        y_cpu = np.fft.fft(x)
        t1 = perf_counter()

        # GPU FFT
        t2 = perf_counter()
        err_code = fft1d(
            in_r.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            in_i.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            out_r.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            out_i.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            ctypes.c_uint(N)
        )
        t3 = perf_counter()
        assert err_code == 0, f"fft1d failed @ N={N}: {err_code}"
        y_gpu = out_r + 1j*out_i

        # error
        max_err = np.max(np.abs(y_gpu - y_cpu))

        print(f"{N:6d}  {t1-t0:8.6f}  {t3-t2:8.6f}  {max_err:10.3e}")
    print()


def benchmark_2d(max_exp):
    """
    Runs 2D FFT for N×N, N = 2,4,8,...,2**max_exp.
    Prints CPU vs GPU time and max abs error against np.fft.fft2.
    """
    header = f"{'N':>6}  {'CPU(s)':>8}  {'GPU(s)':>8}  {'MaxErr':>10}"
    print("2D FFT Benchmark (square matrices)")
    print(header)
    print("-"*len(header))
    for exp in range(1, max_exp+1):
        N = 1 << exp
        total = N*N

        # make test matrix
        x = (np.random.randn(N, N) + 1j*np.random.randn(N, N)).astype(np.complex64)
        in_r = np.ascontiguousarray(x.real, dtype=np.float32)
        in_i = np.ascontiguousarray(x.imag, dtype=np.float32)
        out_r = np.empty(total, dtype=np.float32)
        out_i = np.empty(total, dtype=np.float32)

        # CPU FFT2
        t0 = perf_counter()
        y_cpu = np.fft.fft2(x)
        t1 = perf_counter()

        # GPU FFT2
        t2 = perf_counter()
        err_code = fft2d(
            in_r.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            in_i.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            out_r.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            out_i.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            ctypes.c_uint(N), ctypes.c_uint(N)
        )
        t3 = perf_counter()
        assert err_code == 0, f"fft2d failed @ {N}×{N}: {err_code}"
        y_gpu = (out_r + 1j*out_i).reshape(N, N)

        # error
        max_err = np.max(np.abs(y_gpu - y_cpu))

        print(f"{N:6d}  {t1-t0:8.6f}  {t3-t2:8.6f}  {max_err:10.3e}")
    print()


In [9]:
benchmark_1d(25)

1D FFT Benchmark
     N    CPU(s)    GPU(s)      MaxErr
--------------------------------------
     2  0.000055  0.000693   0.000e+00
     4  0.000033  0.000477   1.192e-07
     8  0.000026  0.000544   5.960e-07
    16  0.000021  0.000423   1.387e-06
    32  0.000020  0.000376   2.623e-06
    64  0.000018  0.000367   4.396e-06
   128  0.000020  0.000411   8.530e-06
   256  0.000024  0.000764   1.375e-05
   512  0.000028  0.000496   2.559e-05
  1024  0.000040  0.000458   3.820e-05
  2048  0.000042  0.000504   6.291e-05
  4096  0.000085  0.000766   9.651e-05
  8192  0.000182  0.000607   1.571e-04
 16384  0.000344  0.000724   2.625e-04
 32768  0.000808  0.000729   4.401e-04
 65536  0.002185  0.001601   6.602e-04
131072  0.004464  0.002282   9.317e-04
262144  0.010524  0.004065   1.490e-03
524288  0.018601  0.006513   2.251e-03
1048576  0.041643  0.013038   3.302e-03
2097152  0.076058  0.025028   6.118e-03
4194304  0.156984  0.049191   8.667e-03
8388608  0.350815  0.095061   1.314e-02
1677

In [12]:
benchmark_2d(13)

2D FFT Benchmark (square matrices)
     N    CPU(s)    GPU(s)      MaxErr
--------------------------------------
     2  0.002067  0.027193   0.000e+00
     4  0.000103  0.001305   1.066e-06
     8  0.000487  0.001350   3.016e-06
    16  0.000094  0.001032   8.959e-06
    32  0.000116  0.001578   3.053e-05
    64  0.000135  0.001134   7.938e-05
   128  0.000558  0.001422   2.053e-04
   256  0.002641  0.003559   5.758e-04
   512  0.009444  0.013302   1.346e-03
  1024  0.045872  0.075573   3.248e-03
  2048  0.199083  0.321283   6.905e-03
  4096  0.935693  0.520907   1.660e-02
  8192  4.100469  1.372155   3.847e-02

