clfunctions keep using cpu device

`
// function one
void ImageProcess::GpuPreProcessConfigure(const uint32_t width_src, const uint32_t height_src, const uint32_t width_dst, const uint32_t height_dst) {
    std::cout << "TESTING GPU PREPROCESS CONFIGURE" << std::endl;
    std::cout << "CONFIGURE INFO width_src: " << width_src << " height_src: " << height_src << " width_dst: " << width_dst << " height_dst: " << height_dst << std::endl;
    // 0. cl shedule && init input tensor
    // CLScheduler::get().default_init();
    CLBackendType backend_type = CLBackendType::Native;
    auto ctx_dev_err           = create_opencl_context_and_device(backend_type);
    // print device info form tuple
    std::cout << "OpenCL device info: " << std::endl;
    std::cout << "-------------------" << std::endl;
    std::cout << "Device name: " << std::get<1>(ctx_dev_err).getInfo<CL_DEVICE_NAME>() << std::endl;
    CLScheduler::get().default_init_with_context(std::get<1>(ctx_dev_err), std::get<0>(ctx_dev_err), nullptr);
    input_tensor_.allocator()->init(TensorInfo(TensorShape(3U, width_src, height_src), 1, DataType::U8).set_data_layout(DataLayout::NHWC));
    input_tensor_.info()->set_format(Format::U8);

    // 1. configure image padding, padding rectangle to square [width_src, width_src]
    pad_output_tensor_.allocator()->init(TensorInfo(TensorShape(3U, width_src, width_src), 1, DataType::U8).set_data_layout(DataLayout::NHWC));
    pad_output_tensor_.info()->set_format(Format::U8);
    PaddingList padding_list = {{0, 0}, {0, 0}, {(width_src - height_src) / 2, (width_src - height_src) / 2}};  // nhwc 第一个维度是width，第二个维度是height，第三个维度是channel, 在这里padding的是height
    pad_layer_.configure(&input_tensor_, &pad_output_tensor_, padding_list, PixelValue(127), PaddingMode::CONSTANT);

    // 2. configure resize
    rescale_output_tensor_.allocator()->init(TensorInfo(TensorShape(3U, width_dst, height_dst), 1, DataType::U8).set_data_layout(DataLayout::NHWC));
    rescale_output_tensor_.info()->set_format(Format::U8);
    rescale_.configure(&pad_output_tensor_, &rescale_output_tensor_, ScaleKernelInfo(InterpolationPolicy::BILINEAR, BorderMode::CONSTANT));

    // 3. configure nhwc to nchw
    nchw_cl_tensor_.allocator()->init(TensorInfo(TensorShape(width_dst, width_dst, 3U), 1, DataType::U8).set_data_layout(DataLayout::NCHW));
    nchw_cl_tensor_.info()->set_format(Format::U8);
    const arm_compute::PermutationVector nhwc_to_nchw_permutation_vector = {1U, 2U, 0U};
    permute_nhwc_to_nchw_.configure(&rescale_output_tensor_, &nchw_cl_tensor_, nhwc_to_nchw_permutation_vector);

    // 4. allocate memory
    input_tensor_.allocator()->allocate();
    rescale_output_tensor_.allocator()->allocate();
    nchw_cl_tensor_.allocator()->allocate();
    pad_output_tensor_.allocator()->allocate();

    std::cout << "GpuPreProcessConfigure end" << std::endl;
}

/**
 * Runs the preconfigured GPU pipeline (pad -> resize -> NHWC-to-NCHW) on one XBGR frame.
 *
 * The 4-byte-per-pixel source is first repacked on the host into a tightly packed 3-byte-per-pixel
 * buffer (source bytes 2, 1, 0 of each pixel become output bytes 0, 1, 2; byte 3 is dropped), then
 * copied into input_tensor_ in a single memcpy. After the three layers have run, the contents of
 * nchw_cl_tensor_ are copied into dest.
 *
 * @param src         Source pixels, 4 bytes per pixel, width_src * height_src pixels, row-major.
 * @param width_src   Source width in pixels.
 * @param height_src  Source height in pixels.
 * @param dest        Destination buffer; receives width_dst * height_dst * 3 bytes.
 * @param width_dst   Output width in pixels.
 * @param height_dst  Output height in pixels.
 *
 * If src or dest is null, a message is written to std::cerr and the function returns without
 * running the pipeline.
 *
 * NOTE(review): both memcpy calls assume the tensor buffers are tightly packed (no row padding or
 * offset); confirm against the tensors' strides / total_size for the layers in use.
 * NOTE(review): presumably the dimensions must match those passed to GpuPreProcessConfigure;
 * verify against callers.
 */
void ImageProcess::GpuPreProcessXBGR(const char* src, const uint32_t width_src, const uint32_t height_src, char* dest, const uint32_t width_dst, const uint32_t height_dst) {
    // 0. CL scheduler (initialisation is left to the configure step; see commented call below).
    std::cout << "GpuPreProcessXBGR start" << std::endl;
    std::cout << "PROCESS INFO: width_src: " << width_src << ", height_src: " << height_src << ", width_dst: " << width_dst << ", height_dst: " << height_dst << std::endl;
    std::cout << "GpuPreProcessXBGR default init cl shedule" << std::endl;
    // CLScheduler::get().default_init();

    if (src == nullptr || dest == nullptr) {
        std::cerr << "GpuPreProcessXBGR: null src or dest buffer" << std::endl;
        return;
    }

    // 1. Allocate a temporary host buffer and convert XBGR to RGB.
    //    Copying pixels piecemeal into the tensor's device-visible memory is not recommended;
    //    preparing the whole frame in host memory and copying it once was measured to lower CPU
    //    usage by roughly 5%.
    std::cout << "GpuPreProcessXBGR convert xbgr to rgb" << std::endl;
    // Do the size arithmetic in size_t so it cannot overflow 32 bits or go through a signed int.
    const size_t pixel_count = static_cast<size_t>(width_src) * height_src;
    const size_t size        = pixel_count * 3;
    std::unique_ptr<uint8_t[]> rgb_image(new uint8_t[size]);

    const char* in  = src;
    uint8_t*    out = rgb_image.get();
    for (size_t pixel = 0; pixel < pixel_count; ++pixel, in += 4, out += 3) {
        out[0] = static_cast<uint8_t>(in[2]);
        out[1] = static_cast<uint8_t>(in[1]);
        out[2] = static_cast<uint8_t>(in[0]);
    }

    // 2. Copy the packed RGB frame into input_tensor_.
    std::cout << "GpuPreProcessXBGR memcpy to input_tensor_" << std::endl;
    input_tensor_.map();
    memcpy(input_tensor_.buffer(), rgb_image.get(), size);
    input_tensor_.unmap();

    // 3. Run the pipeline: pad, resize, then permute NHWC -> NCHW.
    pad_layer_.run();
    // DrawTensorImage(&pad_output_tensor_, width_src, width_src, "pad_output_tensor_.jpg");
    rescale_.run();
    // DrawTensorImage(&rescale_output_tensor_, width_dst, height_dst, "rescale_output_tensor_.jpg");
    permute_nhwc_to_nchw_.run();

    // 4. Copy the result of permute_nhwc_to_nchw_ out to the caller's buffer.
    std::cout << "GpuPreProcessXBGR get result" << std::endl;
    const size_t out_size = static_cast<size_t>(width_dst) * height_dst * 3;
    nchw_cl_tensor_.map();
    memcpy(dest, nchw_cl_tensor_.buffer(), out_size);
    nchw_cl_tensor_.unmap();
    return;
}
`
Platform: RK3399, GPU: Mali-T860
Branch: v22.08

On some RK3399 boards this program runs as expected and uses the GPU.
![image](https://user-images.githubusercontent.com/25675592/209965958-6d28df2b-1268-4186-97c7-a90ea96eea98.png)
You can see that GPU usage ranges between 20 and 100.
![image](https://user-images.githubusercontent.com/25675592/209966306-015feb6f-a016-4be8-9878-a9a3c21f9ecc.png)
The recognition thread uses about 10% CPU.

However, when the same program runs on some other RK3399 boards, the thread that calls the function shows high CPU usage but low GPU usage.
![image](https://user-images.githubusercontent.com/25675592/209966623-a709bddc-5240-4c23-b7ec-c8472c2f5dd3.png)

![image](https://user-images.githubusercontent.com/25675592/209966539-825294aa-c203-46ff-944f-f3e61f159303.png)
GPU usage remains between 0 and 1.

Strangely, every board still prints the device name Mali-T860 from this line of code:
    std::cout << "Device name: " << std::get<1>(ctx_dev_err).getInfo<CL_DEVICE_NAME>() << std::endl;
![image](https://user-images.githubusercontent.com/25675592/209967101-90d26562-c1d8-4541-82ad-421d1045fd18.png)



Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

clfunctions keep using cpu device #1018

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

clfunctions keep using cpu device #1018

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions