Permalink
Cannot retrieve contributors at this time
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
| #include "buffer.h" | |
| #include "common.h" | |
| #include "hwcontext.h" | |
| #include "hwcontext_internal.h" | |
| #include "hwcontext_cuda_internal.h" | |
| #if CONFIG_VULKAN | |
| #include "hwcontext_vulkan.h" | |
| #endif | |
| #include "cuda_check.h" | |
| #include "mem.h" | |
| #include "pixdesc.h" | |
| #include "pixfmt.h" | |
| #include "imgutils.h" | |
/**
 * Private per-frames-context state of the CUDA backend
 * (stored in AVHWFramesContext.internal->priv).
 */
typedef struct CUDAFramesContext {
    /* Chroma subsampling shifts of sw_format, filled in by
     * av_pix_fmt_get_chroma_sub_sample() during cuda_frames_init(). */
    int shift_width;
    int shift_height;
    /* Value queried from the device in cuda_frames_init() (doubled there for
     * YUV420P) and passed as the alignment to the av_image_* helpers. */
    int tex_alignment;
} CUDAFramesContext;
| static const enum AVPixelFormat supported_formats[] = { | |
| AV_PIX_FMT_NV12, | |
| AV_PIX_FMT_YUV420P, | |
| AV_PIX_FMT_YUVA420P, | |
| AV_PIX_FMT_YUV444P, | |
| AV_PIX_FMT_P010, | |
| AV_PIX_FMT_P016, | |
| AV_PIX_FMT_YUV444P16, | |
| AV_PIX_FMT_0RGB32, | |
| AV_PIX_FMT_0BGR32, | |
| #if CONFIG_VULKAN | |
| AV_PIX_FMT_VULKAN, | |
| #endif | |
| }; | |
/*
 * Shorthand for checking the result of a CUDA driver call.
 *
 * Expands to FF_CUDA_CHECK_DL() using the local variables `device_ctx`
 * and `cu`, so both must be in scope (with exactly those names) wherever
 * this macro is used.
 * NOTE(review): FF_CUDA_CHECK_DL presumably stringifies the call for its log
 * output, so renaming variables inside the argument may change log text -
 * confirm in cuda_check.h.
 */
#define CHECK_CU(call) FF_CUDA_CHECK_DL(device_ctx, cu, call)
| static int cuda_frames_get_constraints(AVHWDeviceContext *ctx, | |
| const void *hwconfig, | |
| AVHWFramesConstraints *constraints) | |
| { | |
| int i; | |
| constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1, | |
| sizeof(*constraints->valid_sw_formats)); | |
| if (!constraints->valid_sw_formats) | |
| return AVERROR(ENOMEM); | |
| for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) | |
| constraints->valid_sw_formats[i] = supported_formats[i]; | |
| constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE; | |
| constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats)); | |
| if (!constraints->valid_hw_formats) | |
| return AVERROR(ENOMEM); | |
| constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA; | |
| constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE; | |
| return 0; | |
| } | |
| static void cuda_buffer_free(void *opaque, uint8_t *data) | |
| { | |
| AVHWFramesContext *ctx = opaque; | |
| AVHWDeviceContext *device_ctx = ctx->device_ctx; | |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; | |
| CudaFunctions *cu = hwctx->internal->cuda_dl; | |
| CUcontext dummy; | |
| CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)); | |
| CHECK_CU(cu->cuMemFree((CUdeviceptr)data)); | |
| CHECK_CU(cu->cuCtxPopCurrent(&dummy)); | |
| } | |
| static AVBufferRef *cuda_pool_alloc(void *opaque, buffer_size_t size) | |
| { | |
| AVHWFramesContext *ctx = opaque; | |
| AVHWDeviceContext *device_ctx = ctx->device_ctx; | |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; | |
| CudaFunctions *cu = hwctx->internal->cuda_dl; | |
| AVBufferRef *ret = NULL; | |
| CUcontext dummy = NULL; | |
| CUdeviceptr data; | |
| int err; | |
| err = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)); | |
| if (err < 0) | |
| return NULL; | |
| err = CHECK_CU(cu->cuMemAlloc(&data, size)); | |
| if (err < 0) | |
| goto fail; | |
| ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0); | |
| if (!ret) { | |
| CHECK_CU(cu->cuMemFree(data)); | |
| goto fail; | |
| } | |
| fail: | |
| CHECK_CU(cu->cuCtxPopCurrent(&dummy)); | |
| return ret; | |
| } | |
| static int cuda_frames_init(AVHWFramesContext *ctx) | |
| { | |
| AVHWDeviceContext *device_ctx = ctx->device_ctx; | |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; | |
| CUDAFramesContext *priv = ctx->internal->priv; | |
| CudaFunctions *cu = hwctx->internal->cuda_dl; | |
| int err, i; | |
| for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) { | |
| if (ctx->sw_format == supported_formats[i]) | |
| break; | |
| } | |
| if (i == FF_ARRAY_ELEMS(supported_formats)) { | |
| av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n", | |
| av_get_pix_fmt_name(ctx->sw_format)); | |
| return AVERROR(ENOSYS); | |
| } | |
| err = CHECK_CU(cu->cuDeviceGetAttribute(&priv->tex_alignment, | |
| 14 /* CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT */, | |
| hwctx->internal->cuda_device)); | |
| if (err < 0) | |
| return err; | |
| av_log(ctx, AV_LOG_DEBUG, "CUDA texture alignment: %d\n", priv->tex_alignment); | |
| // YUV420P is a special case. | |
| // Since nvenc expects the U/V planes to have half the linesize of the Y plane | |
| // alignment has to be doubled to ensure the U/V planes still end up aligned. | |
| if (ctx->sw_format == AV_PIX_FMT_YUV420P) | |
| priv->tex_alignment *= 2; | |
| av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height); | |
| if (!ctx->pool) { | |
| int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, priv->tex_alignment); | |
| if (size < 0) | |
| return size; | |
| ctx->internal->pool_internal = av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL); | |
| if (!ctx->internal->pool_internal) | |
| return AVERROR(ENOMEM); | |
| } | |
| return 0; | |
| } | |
| static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame) | |
| { | |
| CUDAFramesContext *priv = ctx->internal->priv; | |
| int res; | |
| frame->buf[0] = av_buffer_pool_get(ctx->pool); | |
| if (!frame->buf[0]) | |
| return AVERROR(ENOMEM); | |
| res = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data, | |
| ctx->sw_format, ctx->width, ctx->height, priv->tex_alignment); | |
| if (res < 0) | |
| return res; | |
| // YUV420P is a special case. | |
| // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them, also chroma is half-aligned | |
| if (ctx->sw_format == AV_PIX_FMT_YUV420P) { | |
| frame->linesize[1] = frame->linesize[2] = frame->linesize[0] / 2; | |
| frame->data[2] = frame->data[1]; | |
| frame->data[1] = frame->data[2] + frame->linesize[2] * (ctx->height / 2); | |
| } | |
| frame->format = AV_PIX_FMT_CUDA; | |
| frame->width = ctx->width; | |
| frame->height = ctx->height; | |
| return 0; | |
| } | |
| static int cuda_transfer_get_formats(AVHWFramesContext *ctx, | |
| enum AVHWFrameTransferDirection dir, | |
| enum AVPixelFormat **formats) | |
| { | |
| enum AVPixelFormat *fmts; | |
| fmts = av_malloc_array(2, sizeof(*fmts)); | |
| if (!fmts) | |
| return AVERROR(ENOMEM); | |
| fmts[0] = ctx->sw_format; | |
| fmts[1] = AV_PIX_FMT_NONE; | |
| *formats = fmts; | |
| return 0; | |
| } | |
| static int cuda_transfer_data(AVHWFramesContext *ctx, AVFrame *dst, | |
| const AVFrame *src) | |
| { | |
| CUDAFramesContext *priv = ctx->internal->priv; | |
| AVHWDeviceContext *device_ctx = ctx->device_ctx; | |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; | |
| CudaFunctions *cu = hwctx->internal->cuda_dl; | |
| CUcontext dummy; | |
| int i, ret; | |
| if ((src->hw_frames_ctx && ((AVHWFramesContext*)src->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA) || | |
| (dst->hw_frames_ctx && ((AVHWFramesContext*)dst->hw_frames_ctx->data)->format != AV_PIX_FMT_CUDA)) | |
| return AVERROR(ENOSYS); | |
| ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx)); | |
| if (ret < 0) | |
| return ret; | |
| for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) { | |
| CUDA_MEMCPY2D cpy = { | |
| .srcPitch = src->linesize[i], | |
| .dstPitch = dst->linesize[i], | |
| .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]), | |
| .Height = src->height >> ((i == 0 || i == 3) ? 0 : priv->shift_height), | |
| }; | |
| if (src->hw_frames_ctx) { | |
| cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE; | |
| cpy.srcDevice = (CUdeviceptr)src->data[i]; | |
| } else { | |
| cpy.srcMemoryType = CU_MEMORYTYPE_HOST; | |
| cpy.srcHost = src->data[i]; | |
| } | |
| if (dst->hw_frames_ctx) { | |
| cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; | |
| cpy.dstDevice = (CUdeviceptr)dst->data[i]; | |
| } else { | |
| cpy.dstMemoryType = CU_MEMORYTYPE_HOST; | |
| cpy.dstHost = dst->data[i]; | |
| } | |
| ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream)); | |
| if (ret < 0) | |
| goto exit; | |
| } | |
| if (!dst->hw_frames_ctx) { | |
| ret = CHECK_CU(cu->cuStreamSynchronize(hwctx->stream)); | |
| if (ret < 0) | |
| goto exit; | |
| } | |
| exit: | |
| CHECK_CU(cu->cuCtxPopCurrent(&dummy)); | |
| return 0; | |
| } | |
| static void cuda_device_uninit(AVHWDeviceContext *device_ctx) | |
| { | |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; | |
| if (hwctx->internal) { | |
| CudaFunctions *cu = hwctx->internal->cuda_dl; | |
| if (hwctx->internal->is_allocated && hwctx->cuda_ctx) { | |
| if (hwctx->internal->flags & AV_CUDA_USE_PRIMARY_CONTEXT) | |
| CHECK_CU(cu->cuDevicePrimaryCtxRelease(hwctx->internal->cuda_device)); | |
| else | |
| CHECK_CU(cu->cuCtxDestroy(hwctx->cuda_ctx)); | |
| hwctx->cuda_ctx = NULL; | |
| } | |
| cuda_free_functions(&hwctx->internal->cuda_dl); | |
| } | |
| av_freep(&hwctx->internal); | |
| } | |
| static int cuda_device_init(AVHWDeviceContext *ctx) | |
| { | |
| AVCUDADeviceContext *hwctx = ctx->hwctx; | |
| int ret; | |
| if (!hwctx->internal) { | |
| hwctx->internal = av_mallocz(sizeof(*hwctx->internal)); | |
| if (!hwctx->internal) | |
| return AVERROR(ENOMEM); | |
| } | |
| if (!hwctx->internal->cuda_dl) { | |
| ret = cuda_load_functions(&hwctx->internal->cuda_dl, ctx); | |
| if (ret < 0) { | |
| av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n"); | |
| goto error; | |
| } | |
| } | |
| return 0; | |
| error: | |
| cuda_device_uninit(ctx); | |
| return ret; | |
| } | |
| static int cuda_context_init(AVHWDeviceContext *device_ctx, int flags) { | |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; | |
| CudaFunctions *cu; | |
| CUcontext dummy; | |
| int ret, dev_active = 0; | |
| unsigned int dev_flags = 0; | |
| const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC; | |
| cu = hwctx->internal->cuda_dl; | |
| hwctx->internal->flags = flags; | |
| if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) { | |
| ret = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device, | |
| &dev_flags, &dev_active)); | |
| if (ret < 0) | |
| return ret; | |
| if (dev_active && dev_flags != desired_flags) { | |
| av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n"); | |
| return AVERROR(ENOTSUP); | |
| } else if (dev_flags != desired_flags) { | |
| ret = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device, | |
| desired_flags)); | |
| if (ret < 0) | |
| return ret; | |
| } | |
| ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx, | |
| hwctx->internal->cuda_device)); | |
| if (ret < 0) | |
| return ret; | |
| } else { | |
| ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags, | |
| hwctx->internal->cuda_device)); | |
| if (ret < 0) | |
| return ret; | |
| CHECK_CU(cu->cuCtxPopCurrent(&dummy)); | |
| } | |
| hwctx->internal->is_allocated = 1; | |
| // Setting stream to NULL will make functions automatically use the default CUstream | |
| hwctx->stream = NULL; | |
| return 0; | |
| } | |
| static int cuda_device_create(AVHWDeviceContext *device_ctx, | |
| const char *device, | |
| AVDictionary *opts, int flags) | |
| { | |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; | |
| CudaFunctions *cu; | |
| int ret, device_idx = 0; | |
| if (device) | |
| device_idx = strtol(device, NULL, 0); | |
| if (cuda_device_init(device_ctx) < 0) | |
| goto error; | |
| cu = hwctx->internal->cuda_dl; | |
| ret = CHECK_CU(cu->cuInit(0)); | |
| if (ret < 0) | |
| goto error; | |
| ret = CHECK_CU(cu->cuDeviceGet(&hwctx->internal->cuda_device, device_idx)); | |
| if (ret < 0) | |
| goto error; | |
| ret = cuda_context_init(device_ctx, flags); | |
| if (ret < 0) | |
| goto error; | |
| return 0; | |
| error: | |
| cuda_device_uninit(device_ctx); | |
| return AVERROR_UNKNOWN; | |
| } | |
| static int cuda_device_derive(AVHWDeviceContext *device_ctx, | |
| AVHWDeviceContext *src_ctx, AVDictionary *opts, | |
| int flags) { | |
| AVCUDADeviceContext *hwctx = device_ctx->hwctx; | |
| CudaFunctions *cu; | |
| const char *src_uuid = NULL; | |
| int ret, i, device_count; | |
| #if CONFIG_VULKAN | |
| VkPhysicalDeviceIDProperties vk_idp = { | |
| .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES, | |
| }; | |
| #endif | |
| switch (src_ctx->type) { | |
| #if CONFIG_VULKAN | |
| case AV_HWDEVICE_TYPE_VULKAN: { | |
| AVVulkanDeviceContext *vkctx = src_ctx->hwctx; | |
| VkPhysicalDeviceProperties2 vk_dev_props = { | |
| .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, | |
| .pNext = &vk_idp, | |
| }; | |
| vkGetPhysicalDeviceProperties2(vkctx->phys_dev, &vk_dev_props); | |
| src_uuid = vk_idp.deviceUUID; | |
| break; | |
| } | |
| #endif | |
| default: | |
| return AVERROR(ENOSYS); | |
| } | |
| if (!src_uuid) { | |
| av_log(device_ctx, AV_LOG_ERROR, | |
| "Failed to get UUID of source device.\n"); | |
| goto error; | |
| } | |
| if (cuda_device_init(device_ctx) < 0) | |
| goto error; | |
| cu = hwctx->internal->cuda_dl; | |
| ret = CHECK_CU(cu->cuInit(0)); | |
| if (ret < 0) | |
| goto error; | |
| ret = CHECK_CU(cu->cuDeviceGetCount(&device_count)); | |
| if (ret < 0) | |
| goto error; | |
| hwctx->internal->cuda_device = -1; | |
| for (i = 0; i < device_count; i++) { | |
| CUdevice dev; | |
| CUuuid uuid; | |
| ret = CHECK_CU(cu->cuDeviceGet(&dev, i)); | |
| if (ret < 0) | |
| goto error; | |
| ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev)); | |
| if (ret < 0) | |
| goto error; | |
| if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) { | |
| hwctx->internal->cuda_device = dev; | |
| break; | |
| } | |
| } | |
| if (hwctx->internal->cuda_device == -1) { | |
| av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n"); | |
| goto error; | |
| } | |
| ret = cuda_context_init(device_ctx, flags); | |
| if (ret < 0) | |
| goto error; | |
| return 0; | |
| error: | |
| cuda_device_uninit(device_ctx); | |
| return AVERROR_UNKNOWN; | |
| } | |
| const HWContextType ff_hwcontext_type_cuda = { | |
| .type = AV_HWDEVICE_TYPE_CUDA, | |
| .name = "CUDA", | |
| .device_hwctx_size = sizeof(AVCUDADeviceContext), | |
| .frames_priv_size = sizeof(CUDAFramesContext), | |
| .device_create = cuda_device_create, | |
| .device_derive = cuda_device_derive, | |
| .device_init = cuda_device_init, | |
| .device_uninit = cuda_device_uninit, | |
| .frames_get_constraints = cuda_frames_get_constraints, | |
| .frames_init = cuda_frames_init, | |
| .frames_get_buffer = cuda_get_buffer, | |
| .transfer_get_formats = cuda_transfer_get_formats, | |
| .transfer_data_to = cuda_transfer_data, | |
| .transfer_data_from = cuda_transfer_data, | |
| .pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE }, | |
| }; |