-
Notifications
You must be signed in to change notification settings - Fork 172
Open
Labels
bugA request to fix an issueA request to fix an issuehelp wantedTasks, issues or features that could be implemented and contributed to the projectTasks, issues or features that could be implemented and contributed to the project
Description
Summary
When performing DPF transform
with oneapi::mkl::dft
on NVIDIA GPU for multidimensional arrays >=2D where the last dimension is 1
[(2,1), (5,1), (2,3,1)....]
The computation fails with oneMKL: dft/backends/cufft/commit: Invalid strides
This problem does not occur when running on Intel.
Version
I am using the latest release version v0.6
Environment
- HW you use : NVIDIA A100 80GB PCIe
- Backend library version: Build cuda_12.4.r12.4/compiler.33961263_0
- OS name and version : Linux Ubuntu 24.04 LTS 6.8.0-51-generic
- Compiler version : Intel(R) oneAPI DPC++/C++ Compiler 2025.0.4 (2025.0.4.20241205)
$ sycl-ls
[opencl:cpu][opencl:0] Intel(R) OpenCL, Intel(R) Xeon(R) Platinum 8480+ OpenCL 3.0 (Build 0) [2024.18.12.0.05_160000]
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100 80GB PCIe 8.0 [CUDA 12.7]
Steps to reproduce
Building:
cmake .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DMKL_ROOT=${MKLROOT} -DBUILD_FUNCTIONAL_TESTS=OFF \
-DENABLE_CUBLAS_BACKEND=True -DENABLE_CUSOLVER_BACKEND=True -DENABLE_CURAND_BACKEND=True -DENABLE_CUFFT_BACKEND=True
# build
cmake --build .
# install
export MKL_INTERFACE_ROOT=$(pwd)/../install
cmake --install . --prefix=${MKL_INTERFACE_ROOT}
Compiling:
icpx -fsycl fft_repro.cpp -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown -I${MKL_INTERFACE_ROOT}/include
\
-L${MKL_INTERFACE_ROOT}/lib -lonemkl -Wl,-rpath,${MKL_INTERFACE_ROOT}/lib -o run_repro
Running:
ONEAPI_DEVICE_SELECTOR=cuda:gpu ./run_repro
ONEAPI_DEVICE_SELECTOR=opencl:cpu ./run_repro
Reproducer here
#include <iostream>
#include <vector>
#include <sycl/sycl.hpp>
#include <oneapi/mkl/dft.hpp>
#include <complex>
int main() {
namespace dft = oneapi::mkl::dft;
sycl::queue q(sycl::default_selector{});
auto device = q.get_device();
std::cout << "Device name: " << device.get_info<sycl::info::device::name>() << "\n";
std::vector<std::int64_t> shape = {3, 1};
const std::int64_t size = shape[0] * shape[1];
std::vector<std::complex<float>> h_data(size, {1.0f, 0.0f});
std::vector<std::complex<float>> h_output(size);
std::complex<float>* d_data = sycl::malloc_device<std::complex<float>>(size, q);
std::complex<float>* d_output = sycl::malloc_device<std::complex<float>>(size, q);
q.memcpy(d_data, h_data.data(), size * sizeof(std::complex<float>)).wait();
dft::descriptor<dft::precision::SINGLE, dft::domain::COMPLEX> desc(shape);
std::vector<std::int64_t> fwd_strides = {0, 1, 1};
std::vector<std::int64_t> bwd_strides = {0, 1, 1};
desc.set_value(dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE);
desc.set_value(dft::config_param::NUMBER_OF_TRANSFORMS, 1);
// desc.set_value(dft::config_param::FWD_STRIDES, fwd_strides.data());
// desc.set_value(dft::config_param::BWD_STRIDES, bwd_strides.data());
desc.commit(q);
sycl::event fft_event = dft::compute_forward(desc, d_data, d_output);
fft_event.wait();
q.memcpy(h_output.data(), d_output, size * sizeof(std::complex<float>)).wait();
std::cout << "FFT Result:\n";
for (std::int64_t i = 0; i < size; i++) {
std::cout << h_output[i] << "\n";
}
sycl::free(d_data, q);
sycl::free(d_output, q);
return 0;
}
Observed behavior
When executing the attached code on NVIDIA the program fails with an error
Example output
$ ONEAPI_DEVICE_SELECTOR=cuda:gpu ./run_repro
Device name: NVIDIA A100 80GB PCIe
terminate called after throwing an instance of 'oneapi::mkl::exception'
what(): oneMKL: dft/backends/cufft/commit: Invalid strides.
Aborted (core dumped)
However, running the same code on Intel works as expected
$ ONEAPI_DEVICE_SELECTOR=opencl:cpu ./run_repro
Device name: Intel(R) Xeon(R) Platinum 8480+
FFT Result:
(3,0)
(0,0)
(0,0)
Expected behavior
Similar implementation that uses cuFFT directly works correctly
$ ./fft_cuda
FFT Result:
(3, 0)
(0, 0)
(0, 0)
Reproducer here
nvcc -o fft_cuda fft_cuda.cpp -lcufft
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include <cufft.h>
#include <complex>
int main() {
const int NX = 1;
const int NY = 3;
const int size = NX * NY;
std::vector<std::complex<float>> h_data(size, {1.0f, 0.0f});
cufftComplex* d_data;
cudaMalloc(reinterpret_cast<void**>(&d_data), size * sizeof(cufftComplex));
std::vector<cufftComplex> h_cufft_data(size);
for (int i = 0; i < size; i++) {
h_cufft_data[i].x = std::real(h_data[i]);
h_cufft_data[i].y = std::imag(h_data[i]);
}
cudaMemcpy(d_data, h_cufft_data.data(), size * sizeof(cufftComplex), cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan2d(&plan, NY, NX, CUFFT_C2C);
cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
cudaMemcpy(h_cufft_data.data(), d_data, size * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
std::cout << "FFT Result:\n";
for (int i = 0; i < size; i++) {
std::cout << "(" << h_cufft_data[i].x << ", " << h_cufft_data[i].y << ")\n";
}
cufftDestroy(plan);
cudaFree(d_data);
return 0;
}
Metadata
Metadata
Assignees
Labels
bugA request to fix an issueA request to fix an issuehelp wantedTasks, issues or features that could be implemented and contributed to the projectTasks, issues or features that could be implemented and contributed to the project