From 3e2d125e523ad0b748f52907165054b63ad21425 Mon Sep 17 00:00:00 2001
From: Jacob He
Date: Mon, 22 Jan 2018 16:40:24 +0800
Subject: [PATCH] Update XGL from commit: 2072cab

1. Implement the VK_AMD_buffer_marker extension
2. Implement the VK_EXT_debug_report extension
3. Pass the layout to InitImmutableDescriptors(); this removes 80% of the time spent in DescriptorSet::Reassign()
4. Calculate the location of bindings for the descriptor set layout to avoid a memory lookup
5. Disable depth clamping when enableDepthClamp is set to false
6. Fix the CTS dEQP-VK.tessellation.shader_input_output.barrier failure and simplify the TessFactorToBuffer offset calculation
7. Fix failures in the CTS dEQP-VK.glsl.440.linkage.varying.component test group
---
 CMakeLists.txt                                 |   6 +
 icd/CMakeLists.txt                             |   1 +
 icd/api/include/internal_mem_mgr.h             |   9 +-
 .../khronos/devext/vk_amd_buffer_marker.h      |  54 ++
 icd/api/include/khronos/vulkan.h               |   1 +
 icd/api/include/vk_buffer.h                    |   1 +
 icd/api/include/vk_cmdbuffer.h                 |  19 +-
 icd/api/include/vk_conv.h                      |  31 +
 icd/api/include/vk_debug_report.h              | 102 ++
 icd/api/include/vk_descriptor_set.h            |   4 +-
 icd/api/include/vk_descriptor_set_layout.h     |  11 +-
 icd/api/include/vk_extensions.h                |   3 +
 icd/api/include/vk_instance.h                  |  39 +
 icd/api/internal_mem_mgr.cpp                   |  22 +-
 icd/api/llpc/CMakeLists.txt                    |   4 +
 icd/api/llpc/context/llpcCopyShader.cpp        | 119 ++-
 icd/api/llpc/context/llpcCopyShader.h          |  13 +-
 icd/api/llpc/context/llpcGraphicsContext.cpp   |  41 +-
 icd/api/llpc/context/llpcPipelineContext.h     |  10 +-
 icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp    | 136 ++-
 .../lower/llpcSpirvLowerResourceCollect.cpp    |  15 -
 .../patch/generate/gfx6/glslArithOpEmu.ll      |   0
 .../patch/generate/gfx9/glslArithOpEmu.ll      |   0
 icd/api/llpc/patch/generate/glslNullFsEmu.ll   |  14 +-
 .../llpc/patch/llpcPatchEntryPointMutate.cpp   |  17 +-
 .../llpc/patch/llpcPatchEntryPointMutate.h     |   5 +-
 .../llpc/patch/llpcPatchInOutImportExport.cpp  | 877 ++++++++++++------
 .../llpc/patch/llpcPatchInOutImportExport.h    |  80 +-
 .../llpc/patch/llpcPatchResourceCollect.cpp    | 117 +--
 icd/api/llpc/patch/llpcVertexFetch.cpp         | 143 +--
 icd/api/llpc/patch/llpcVertexFetch.h           |   2 +-
 icd/api/llpc/translator/SPIRVInternal.h        |   5 +-
 icd/api/llpc/translator/SPIRVReader.cpp        |  17 +-
 icd/api/open_strings/entry_points.txt          |   6 +
 icd/api/open_strings/extensions.txt            |   2 +
 icd/api/open_strings/g_entry_points_decl.h     |  16 +
 icd/api/open_strings/g_entry_points_impl.h     |   4 +
 icd/api/open_strings/g_extensions_decl.h       |   4 +
 icd/api/open_strings/g_extensions_impl.h       |   2 +
 icd/api/open_strings/g_func_table.cpp          |  20 +
 icd/api/open_strings/g_func_table.h            |   8 +
 icd/api/vert_buf_binding_mgr.cpp               |   4 +-
 icd/api/vk_buffer.cpp                          |  16 +-
 icd/api/vk_cmdbuffer.cpp                       |  81 +-
 icd/api/vk_conv.cpp                            | 515 +++++++++-
 icd/api/vk_debug_report.cpp                    | 175 ++++
 icd/api/vk_descriptor_set.cpp                  |  18 +-
 icd/api/vk_descriptor_set_layout.cpp           |  14 +-
 icd/api/vk_dispatch.cpp                        |   6 +
 icd/api/vk_instance.cpp                        | 179 +++-
 icd/api/vk_memory.cpp                          |  19 -
 icd/api/vk_physical_device.cpp                 |   2 +
 icd/api/vk_pipeline.cpp                        |   6 -
 icd/api/vk_query.cpp                           |  11 +-
 icd/make/importdefs                            |   2 +-
 icd/res/ver.h                                  |   2 +-
 icd/settings/settings.cpp                      |  10 +-
 57 files changed, 2392 insertions(+), 648 deletions(-)
 create mode 100644 icd/api/include/khronos/devext/vk_amd_buffer_marker.h
 create mode 100644 icd/api/include/vk_debug_report.h
 mode change 100644 => 100755 icd/api/llpc/patch/generate/gfx6/glslArithOpEmu.ll
 mode change 100644 => 100755 icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll
 create mode 100644 icd/api/vk_debug_report.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index acfc0f05..e5659abe
100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,12 @@ option(USE_NEXT_SDK "Use next SDK?" OFF) option(ICD_BUILD_VIDEO "Build Video Support?" OFF) +option(ICD_UPSTREAM_LLVM "Build with upstream LLVM?" OFF) + +if(NOT ICD_BUILD_LLPC) + set(ICD_UPSTREAM_LLVM OFF CACHE BOOL "ICD_UPSTREAM_LLVM is overrided to false." FORCE) +endif() + option(ICD_GPUOPEN_DEVMODE_BUILD "Build ${PROJECT_NAME} with GPU Open Developer Mode driver support?" ON) option(ICD_MEMTRACK "Turn on memory tracking?" ${CMAKE_BUILD_TYPE_DEBUG}) diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index 554eaf02..1bd513af 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -169,6 +169,7 @@ target_sources(xgl PRIVATE api/vk_cmd_pool.cpp api/vk_compute_pipeline.cpp api/vk_conv.cpp + api/vk_debug_report.cpp api/vk_descriptor_set.cpp api/vk_descriptor_set_layout.cpp api/vk_descriptor_pool.cpp diff --git a/icd/api/include/internal_mem_mgr.h b/icd/api/include/internal_mem_mgr.h index 6a30880a..c745c706 100644 --- a/icd/api/include/internal_mem_mgr.h +++ b/icd/api/include/internal_mem_mgr.h @@ -146,6 +146,12 @@ class InternalMemory return m_gpuVA[idx]; } + void* CpuAddr(int32_t idx = DefaultDeviceIndex) const + { + VK_ASSERT((idx >= 0) && (idx < static_cast(MaxPalDevices))); + return m_memoryPool.groupMemory.CpuAddr(idx); + } + Pal::gpusize Offset() const { return m_offset; } @@ -210,7 +216,8 @@ class InternalMemMgr Pal::IGpuMemoryBindable* pBindable, bool readOnly, InternalMemory* pInternalMemory, - bool removeInvisibleHeap = false); + bool removeInvisibleHeap = false, + bool persistentMapped = false); void FreeGpuMem( const InternalMemory* pInternalMemory); diff --git a/icd/api/include/khronos/devext/vk_amd_buffer_marker.h b/icd/api/include/khronos/devext/vk_amd_buffer_marker.h new file mode 100644 index 00000000..97373e85 --- /dev/null +++ b/icd/api/include/khronos/devext/vk_amd_buffer_marker.h @@ -0,0 +1,54 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
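Returning to the internal_mem_mgr.h change above: a minimal driver-internal sketch of how the new persistentMapped flag and the CpuAddr() accessor could be used together. This assumes only the signatures shown in this patch; the function name and parameters are illustrative, not part of the change.

    // Sketch only: allocate GPU memory that stays CPU-mapped for its whole
    // lifetime, then read it back through the cached pointer with no
    // Map()/Unmap() pair per access (the pattern marker readback wants).
    VkResult AllocPersistentlyMapped(
        vk::InternalMemMgr*      pMemMgr,    // illustrative parameter
        Pal::IGpuMemoryBindable* pBindable,  // illustrative parameter
        vk::InternalMemory*      pMemory)
    {
        VkResult result = pMemMgr->AllocAndBindGpuMem(
            pBindable,
            false,      // readOnly
            pMemory,
            false,      // removeInvisibleHeap
            true);      // persistentMapped

        if (result == VK_SUCCESS)
        {
            // Valid for the lifetime of the allocation; no explicit map call.
            volatile const uint32_t* pData =
                static_cast<volatile const uint32_t*>(pMemory->CpuAddr());
            VK_ASSERT(pData != nullptr);
        }

        return result;
    }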
+ * + **********************************************************************************************************************/ +/** + ********************************************************************************************************************** + * @file vk_amd_buffer_marker.h + * @brief Header for VK_AMD_buffer marker extension. + ********************************************************************************************************************** + */ +#ifndef VK_AMD_BUFFER_MARKER_H_ +#define VK_AMD_BUFFER_MARKER_H_ + +#define VK_AMD_buffer_marker 1 +#define VK_AMD_BUFFER_MARKER_SPEC_VERSION 1 +#define VK_AMD_BUFFER_MARKER_EXTENSION_NUMBER 180 + +#define VK_AMD_BUFFER_MARKER_EXTENSION_NAME "VK_AMD_buffer_marker" + +typedef void (VKAPI_PTR *PFN_vkCmdWriteBufferMarkerAMD)( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker); + +VKAPI_ATTR void VKAPI_CALL vkCmdWriteBufferMarkerAMD( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker); + +#endif /* VK_AMD_BUFFER_MARKER_H_ */ diff --git a/icd/api/include/khronos/vulkan.h b/icd/api/include/khronos/vulkan.h index c730e525..708937ae 100644 --- a/icd/api/include/khronos/vulkan.h +++ b/icd/api/include/khronos/vulkan.h @@ -55,6 +55,7 @@ // Internal (under development) extension definitions #include "devext/vk_amd_gpa_interface.h" +#include "devext/vk_amd_buffer_marker.h" enum class DynamicStatesInternal : uint32_t { VIEWPORT = 0, diff --git a/icd/api/include/vk_buffer.h b/icd/api/include/vk_buffer.h index 0d4fbef8..62523c7e 100644 --- a/icd/api/include/vk_buffer.h +++ b/icd/api/include/vk_buffer.h @@ -126,6 +126,7 @@ class Buffer : public NonDispatchable BufferFlags internalFlags); void CalcBarrierUsage( + const Device* pDevice, VkBufferUsageFlags usage); Pal::IGpuMemory* m_pGpuMemory[MaxPalDevices]; diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 8a06f86a..93983e2c 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -499,6 +499,12 @@ class CmdBuffer uint32_t length, const void* values); + void WriteBufferMarker( + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker); + VK_INLINE void SetDeviceMask(uint32_t deviceMask) { // Ensure we are enabling valid devices within the group @@ -541,9 +547,10 @@ class CmdBuffer return m_pPalCmdBuffers[idx]; } - static Pal::uint32 ConvertBarrierSrcAccessFlags(VkAccessFlags accessMask); - static Pal::uint32 ConvertBarrierDstAccessFlags(VkAccessFlags accessMask); + static Pal::uint32 ConvertBarrierSrcAccessFlags(const Device* pDevice, VkAccessFlags accessMask); + static Pal::uint32 ConvertBarrierDstAccessFlags(const Device* pDevice, VkAccessFlags accessMask); static void ConvertBarrierCacheFlags( + const Device* pDevice, VkAccessFlags srcAccess, VkAccessFlags dstAccess, uint32_t supportInputCacheMask, @@ -1337,6 +1344,14 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerInsertEXT( VKAPI_ATTR void VKAPI_CALL vkCmdSetSampleLocationsEXT( VkCommandBuffer commandBuffer, const VkSampleLocationsInfoEXT* pSampleLocationsInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdWriteBufferMarkerAMD( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker); + } // namespace entry } // namespace vk diff --git a/icd/api/include/vk_conv.h b/icd/api/include/vk_conv.h 
index c11eee4e..a2326e95 100644 --- a/icd/api/include/vk_conv.h +++ b/icd/api/include/vk_conv.h @@ -1510,6 +1510,37 @@ VK_INLINE Pal::HwPipePoint VkToPalSrcPipePointForTimestampWrite(VkPipelineStageF return srcPipePoint; } +// ===================================================================================================================== +// Converts Vulkan source pipeline stage flags to PAL buffer marker writes (top/bottom only) +VK_INLINE Pal::HwPipePoint VkToPalSrcPipePointForMarkers( + VkPipelineStageFlags flags, + Pal::EngineType engineType) +{ + // This function is written against the following three engine types. If you hit this assert then check if this + // new engine supports top of pipe writes at all (e.g. SDMA doesn't). + VK_ASSERT(engineType == Pal::EngineTypeDma || + engineType == Pal::EngineTypeUniversal || + engineType == Pal::EngineTypeCompute); + + // Flags that allow signaling at top-of-pipe (anything else maps to bottom) + constexpr VkPipelineStageFlags SrcTopOfPipeFlags = + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + Pal::HwPipePoint srcPipePoint; + + if (((flags & ~SrcTopOfPipeFlags) == 0) && + (engineType != Pal::EngineTypeDma)) // SDMA engines only support bottom of pipe writes + { + srcPipePoint = Pal::HwPipeTop; + } + else + { + srcPipePoint = Pal::HwPipeBottom; + } + + return srcPipePoint; +} + // Helper structure for mapping stage flag sets to PAL pipe points struct HwPipePointMappingEntry { diff --git a/icd/api/include/vk_debug_report.h b/icd/api/include/vk_debug_report.h new file mode 100644 index 00000000..9f09e68c --- /dev/null +++ b/icd/api/include/vk_debug_report.h @@ -0,0 +1,102 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
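For reference, a minimal sketch of how an application could drive the new vkCmdWriteBufferMarkerAMD entry point declared earlier in this patch. The function name, command buffer, and buffer are illustrative; markerBuf is assumed to be a host-visible buffer created with VK_BUFFER_USAGE_TRANSFER_DST_BIT.

    #include <vulkan/vulkan.h>

    // Sketch: bracket a workload with crash-breadcrumb markers. Per
    // VkToPalSrcPipePointForMarkers above, TOP_OF_PIPE is the only stage that
    // maps to a top-of-pipe write, and SDMA engines always write bottom-of-pipe.
    void RecordBreadcrumbs(VkCommandBuffer cmdBuf, VkBuffer markerBuf)
    {
        // Written as soon as the GPU front end reaches this point.
        vkCmdWriteBufferMarkerAMD(cmdBuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                                  markerBuf, 0 * sizeof(uint32_t), 0xA001);

        vkCmdDispatch(cmdBuf, 64, 1, 1);

        // Written only after all prior work has drained (bottom of pipe).
        vkCmdWriteBufferMarkerAMD(cmdBuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                                  markerBuf, 1 * sizeof(uint32_t), 0xA002);
    }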
+ * + **********************************************************************************************************************/ + +#ifndef __VK_DEBUG_REPORT_H__ +#define __VK_DEBUG_REPORT_H__ + +#pragma once + +#include "include/vk_dispatch.h" + +namespace vk +{ + +// ===================================================================================================================== +// Vulkan implementation of VK_EXT_debug_report extension +class DebugReportCallback : public NonDispatchable +{ +public: + static VkResult Create( + Instance* pInstance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback); + + void Destroy( + Instance* pInstance, + const VkAllocationCallbacks* pAllocator); + + void Message( + Instance* pInstance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage); + + VkDebugReportFlagsEXT GetFlags(); + + PFN_vkDebugReportCallbackEXT GetCallbackFunc(); + + void* GetUserData(); + +protected: + DebugReportCallback() + { + }; + +private: + VkDebugReportCallbackCreateInfoEXT m_createInfo; +}; + +namespace entry +{ +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDebugReportCallbackEXT( + VkInstance instance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback); + +VKAPI_ATTR void VKAPI_CALL vkDestroyDebugReportCallbackEXT( + VkInstance instance, + VkDebugReportCallbackEXT callback, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkDebugReportMessageEXT( + VkInstance instance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage); +} // namespace entry + +} // namespace vk + +#endif /* __VK_DEBUG_REPORT_H__ */ diff --git a/icd/api/include/vk_descriptor_set.h b/icd/api/include/vk_descriptor_set.h index 9ffc761f..73a04864 100644 --- a/icd/api/include/vk_descriptor_set.h +++ b/icd/api/include/vk_descriptor_set.h @@ -195,7 +195,9 @@ class DescriptorSet : public NonDispatchable void* pAllocHandle, VkDescriptorSet* pHandle); - void InitImmutableDescriptors(uint32_t numPalDevices); + void InitImmutableDescriptors( + const DescriptorSetLayout* pLayout, + uint32_t numPalDevices); void* AllocHandle() const { return m_pAllocHandle; } diff --git a/icd/api/include/vk_descriptor_set_layout.h b/icd/api/include/vk_descriptor_set_layout.h index 3be67b11..fb436351 100644 --- a/icd/api/include/vk_descriptor_set_layout.h +++ b/icd/api/include/vk_descriptor_set_layout.h @@ -89,7 +89,6 @@ class DescriptorSetLayout : public NonDispatchable(Util::VoidPtrInc(this, sizeof(*this))); + return pBindings[bindingIndex]; + } const CreateInfo& Info() const { return m_info; } @@ -133,7 +137,8 @@ class DescriptorSetLayout : public NonDispatchable KHX_DEVICE_GROUP_CREATION, KHR_EXTERNAL_SEMAPHORE_CAPABILITIES, KHR_EXTERNAL_FENCE_CAPABILITIES, + EXT_DEBUG_REPORT, Count }; }; @@ -237,6 +238,8 @@ class DeviceExtensions : public Extensions KHR_EXTERNAL_FENCE_WIN32, KHR_WIN32_KEYED_MUTEX, EXT_GLOBAL_PRIORITY, + AMD_BUFFER_MARKER, + Count }; }; diff --git a/icd/api/include/vk_instance.h b/icd/api/include/vk_instance.h index 897d9e4f..991ba268 100644 --- a/icd/api/include/vk_instance.h +++ b/icd/api/include/vk_instance.h @@ -42,11 +42,14 @@ #include "include/vk_dispatch.h" 
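As background for the new DebugReportCallback class, a minimal sketch of the client side of the entry points declared above (standard VK_EXT_debug_report usage; assumes the extension was enabled at instance creation, and the helper names are illustrative). The registered callback is reached both from PAL's log callback path and from explicit vkDebugReportMessageEXT calls.

    #include <vulkan/vulkan.h>
    #include <cstdio>

    static VKAPI_ATTR VkBool32 VKAPI_CALL DebugCb(
        VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objType,
        uint64_t object, size_t location, int32_t messageCode,
        const char* pLayerPrefix, const char* pMessage, void* pUserData)
    {
        std::fprintf(stderr, "[%s] %s\n", pLayerPrefix, pMessage);
        return VK_FALSE; // Do not abort the call that triggered the report
    }

    VkResult RegisterDebugReport(VkInstance instance, VkDebugReportCallbackEXT* pCb)
    {
        VkDebugReportCallbackCreateInfoEXT info = {};
        info.sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT;
        info.flags       = VK_DEBUG_REPORT_ERROR_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT;
        info.pfnCallback = DebugCb;

        auto pfn = reinterpret_cast<PFN_vkCreateDebugReportCallbackEXT>(
            vkGetInstanceProcAddr(instance, "vkCreateDebugReportCallbackEXT"));

        return (pfn != nullptr) ? pfn(instance, &info, nullptr, pCb)
                                : VK_ERROR_EXTENSION_NOT_PRESENT;
    }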
#include "include/vk_utils.h" #include "include/vk_extensions.h" +#include "include/vk_debug_report.h" #include "palDeveloperHooks.h" #include "palLib.h" #include "palScreen.h" #include "palSysMemory.h" +#include "palList.h" +#include "palMutex.h" namespace Pal { @@ -194,6 +197,26 @@ class Instance VkResult QueryApplicationProfile(RuntimeSettings* pRuntimeSettings = nullptr); + VkResult RegisterDebugCallback( + DebugReportCallback* pCallback); + + void UnregisterDebugCallback( + DebugReportCallback* pCallback); + + void LogMessage(uint32_t level, + uint64_t categoryMask, + const char* pFormat, + va_list args); + + void CallExternalCallbacks( + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage); + private: Instance( const VkAllocationCallbacks* pAllocCb, @@ -218,6 +241,13 @@ class Instance Pal::Developer::CallbackType type, void* pCbData); + static void PAL_STDCALL LogCallback( + void* pClientData, + Pal::uint32 level, + Pal::uint64 categoryMask, + const char* pFormat, + va_list args); + Pal::IPlatform* m_pPalPlatform; // Pal Platform object. VkAllocationCallbacks m_allocCallbacks; @@ -258,6 +288,15 @@ class Instance ChillSettings m_chillSettings; // Dynamic chill settings structure #endif + Util::List m_debugReportCallbacks; // List of registered Debug + // Report Callbacks + Util::Mutex m_logCallbackInternalOnlyMutex; // Serialize internal log + // message translation prior + // to calling external callbacks + Util::Mutex m_logCallbackInternalExternalMutex; // Serialize all calls to + // external callbacks from + // internal and external sources + #ifdef PAL_ENABLE_PRINTS_ASSERTS mutable uint32_t m_dispatchTableQueryCount; #endif diff --git a/icd/api/internal_mem_mgr.cpp b/icd/api/internal_mem_mgr.cpp index 996263cc..207178ce 100644 --- a/icd/api/internal_mem_mgr.cpp +++ b/icd/api/internal_mem_mgr.cpp @@ -595,7 +595,8 @@ VkResult InternalMemMgr::AllocAndBindGpuMem( Pal::IGpuMemoryBindable* pBindable, bool readOnly, InternalMemory* pInternalMemory, - bool removeInvisibleHeap) + bool removeInvisibleHeap, + bool persistentMapped) { VK_ASSERT(pBindable != nullptr); VK_ASSERT(pInternalMemory != nullptr); @@ -610,20 +611,21 @@ VkResult InternalMemMgr::AllocAndBindGpuMem( return VK_SUCCESS; } + // Fill in the GPU memory object creation info based on the memory requirements + InternalMemCreateInfo createInfo = {}; + if (removeInvisibleHeap) { FilterInvisibleHeap(&memReqs); } - // Fill in the GPU memory object creation info based on the memory requirements - InternalMemCreateInfo createInfo = {}; - - createInfo.pal.size = memReqs.size; - createInfo.pal.alignment = memReqs.alignment; - createInfo.pal.vaRange = Pal::VaRange::Default; - createInfo.pal.priority = Pal::GpuMemPriority::Normal; - createInfo.pal.heapCount = memReqs.heapCount; - createInfo.flags.readOnly = readOnly; + createInfo.pal.size = memReqs.size; + createInfo.pal.alignment = memReqs.alignment; + createInfo.pal.vaRange = Pal::VaRange::Default; + createInfo.pal.priority = Pal::GpuMemPriority::Normal; + createInfo.pal.heapCount = memReqs.heapCount; + createInfo.flags.readOnly = readOnly; + createInfo.flags.persistentMapped = persistentMapped ? 
1 : 0; for (uint32_t h = 0; h < memReqs.heapCount; ++h) { diff --git a/icd/api/llpc/CMakeLists.txt b/icd/api/llpc/CMakeLists.txt index 5a382c4f..f7546781 100644 --- a/icd/api/llpc/CMakeLists.txt +++ b/icd/api/llpc/CMakeLists.txt @@ -89,6 +89,10 @@ target_compile_definitions(llpc PRIVATE ${TARGET_ARCHITECTURE_ENDIANESS}ENDIAN_C target_compile_definitions(llpc PRIVATE _SPIRV_LLVM_API) target_compile_definitions(llpc PRIVATE LLPC_BUILD_GFX9) +if(ICD_UPSTREAM_LLVM) + target_compile_definitions(llpc PRIVATE LLVM_SOURCE_PROMOTION=1) +endif() + target_include_directories(llpc PUBLIC ${PROJECT_SOURCE_DIR}/include diff --git a/icd/api/llpc/context/llpcCopyShader.cpp b/icd/api/llpc/context/llpcCopyShader.cpp index fb9da3ed..43eb986c 100644 --- a/icd/api/llpc/context/llpcCopyShader.cpp +++ b/icd/api/llpc/context/llpcCopyShader.cpp @@ -95,6 +95,25 @@ Result CopyShader::Run( auto& inOutUsage = m_pContext->GetShaderResourceUsage(ShaderStageCopyShader)->inOutUsage; inOutUsage.gs.pGsVsRingBufDesc = pGsVsRingBufDesc; + if (m_pContext->IsGsOnChip()) + { + // Construct LDS type: [ldsSize * i32], address space 3 + auto ldsSize = m_pContext->GetGpuProperty()->ldsSizePerCu; + auto pLdsTy = ArrayType::get(m_pContext->Int32Ty(), ldsSize / sizeof(uint32_t)); + + m_pLds = new GlobalVariable(*m_pModule, + pLdsTy, + false, + GlobalValue::ExternalLinkage, + nullptr, + "lds", + nullptr, + GlobalValue::NotThreadLocal, + ADDR_SPACE_LOCAL); + LLPC_ASSERT(m_pLds != nullptr); + m_pLds->setAlignment(sizeof(uint32_t)); + } + // Export GS outputs to FS if (result == Result::Success) { @@ -175,9 +194,15 @@ void CopyShader::ExportOutput() for (auto& byteSizeMap : genericOutByteSizes) { + // > uint32_t loc = byteSizeMap.first; - uint32_t byteSize = byteSizeMap.second; + uint32_t byteSize = 0; + for (uint32_t i = 0; i < 4; ++i) + { + byteSize += byteSizeMap.second[i]; + } + LLPC_ASSERT(byteSize % 4 == 0); uint32_t dwordSize = byteSize / 4; auto pOutputTy = VectorType::get(m_pContext->FloatTy(), dwordSize); @@ -359,8 +384,8 @@ Result CopyShader::DoPatch() } // ===================================================================================================================== -// Calculates GS to VS buffer offset from input/output location -Value* CopyShader::CalcGsVsRingBufferOffsetForOutput( +// Calculates GS to VS ring offset from input location +Value* CopyShader::CalcGsVsRingOffsetForInput( uint32_t location, // Output location uint32_t compIdx, // Output component Instruction* pInsertPos) // [in] Where to insert the instruction @@ -369,22 +394,38 @@ Value* CopyShader::CalcGsVsRingBufferOffsetForOutput( auto pResUsage = m_pContext->GetShaderResourceUsage(ShaderStageGeometry); - uint32_t outputVertices = pResUsage->builtInUsage.gs.outputVertices; + Value* pRingOffset = nullptr; + if (m_pContext->IsGsOnChip()) + { + // ringOffset = esGsLdsSize + vertexOffset + location * 4 + compIdx + pRingOffset = ConstantInt::get(m_pContext->Int32Ty(), pResUsage->inOutUsage.gs.esGsLdsSize); - // byteOffset = vertexOffset * 4 + (location * 4 + compIdx) * 64 * maxVertices - Value* pRingBufOffset = BinaryOperator::CreateMul(pVertexOffset, - ConstantInt::get(m_pContext->Int32Ty(), 4), - "", - pInsertPos); + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, pVertexOffset, "", pInsertPos); - pRingBufOffset = BinaryOperator::CreateAdd(pRingBufOffset, - ConstantInt::get(m_pContext->Int32Ty(), - (location * 4 + compIdx) * 64 * - outputVertices), - "", - pInsertPos); + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, + 
ConstantInt::get(m_pContext->Int32Ty(), (location * 4) + compIdx), + "", + pInsertPos); + } + else + { + uint32_t outputVertices = pResUsage->builtInUsage.gs.outputVertices; + + // ringOffset = vertexOffset * 4 + (location * 4 + compIdx) * 64 * maxVertices + pRingOffset = BinaryOperator::CreateMul(pVertexOffset, + ConstantInt::get(m_pContext->Int32Ty(), 4), + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), + (location * 4 + compIdx) * 64 * + outputVertices), + "", + pInsertPos); + } - return pRingBufOffset; + return pRingOffset; } // ===================================================================================================================== @@ -394,21 +435,39 @@ Value* CopyShader::LoadValueFromGsVsRingBuffer( uint32_t compIdx, // Output component Instruction* pInsertPos) // [in] Where to insert the load instruction { - Value* pRingBufOffset = CalcGsVsRingBufferOffsetForOutput(location, compIdx, pInsertPos); - auto& inOutUsage = m_pContext->GetShaderResourceUsage(ShaderStageCopyShader)->inOutUsage; + Value* pLoadValue = nullptr; + Value* pRingOffset = CalcGsVsRingOffsetForInput(location, compIdx, pInsertPos); - std::vector args; - args.push_back(inOutUsage.gs.pGsVsRingBufDesc); - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); - args.push_back(pRingBufOffset); - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - return EmitCall(m_pModule, - "llvm.amdgcn.buffer.load.f32", - m_pContext->FloatTy(), - args, - NoAttrib, - pInsertPos); + if (m_pContext->IsGsOnChip()) + { + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + idxs.push_back(pRingOffset); + + Value* pLoadPtr = GetElementPtrInst::Create(nullptr, m_pLds, idxs, "", pInsertPos); + pLoadValue = new LoadInst(pLoadPtr, "", false, m_pLds->getAlignment(), pInsertPos); + + pLoadValue = BitCastInst::Create(Instruction::BitCast, pLoadValue, m_pContext->FloatTy(), "", pInsertPos); + } + else + { + auto& inOutUsage = m_pContext->GetShaderResourceUsage(ShaderStageCopyShader)->inOutUsage; + + std::vector args; + args.push_back(inOutUsage.gs.pGsVsRingBufDesc); + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + args.push_back(pRingOffset); + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc + pLoadValue = EmitCall(m_pModule, + "llvm.amdgcn.buffer.load.f32", + m_pContext->FloatTy(), + args, + NoAttrib, + pInsertPos); + } + + return pLoadValue; } // ===================================================================================================================== diff --git a/icd/api/llpc/context/llpcCopyShader.h b/icd/api/llpc/context/llpcCopyShader.h index 45992a45..b85c9b92 100644 --- a/icd/api/llpc/context/llpcCopyShader.h +++ b/icd/api/llpc/context/llpcCopyShader.h @@ -55,9 +55,9 @@ class CopyShader void ExportOutput(); Result DoPatch(); - llvm::Value* CalcGsVsRingBufferOffsetForOutput(uint32_t location, - uint32_t compIdx, - llvm::Instruction* pInsertPos); + llvm::Value* CalcGsVsRingOffsetForInput(uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); llvm::Value* LoadValueFromGsVsRingBuffer(uint32_t location, uint32_t compIdx, @@ -74,9 +74,10 @@ class CopyShader // Start offset of currently-processed vertex in GS-VS ring buffer static const uint32_t EntryArgIdxVertexOffset = 2; - llvm::Module* m_pModule; // LLVM 
module for copy shader - Context* m_pContext; // LLPC context - llvm::Function* m_pEntryPoint; // Entry point of copy shader module + llvm::Module* m_pModule; // LLVM module for copy shader + Context* m_pContext; // LLPC context + llvm::Function* m_pEntryPoint; // Entry point of copy shader module + llvm::GlobalVariable* m_pLds; // Global variable to model LDS }; } // Llpc diff --git a/icd/api/llpc/context/llpcGraphicsContext.cpp b/icd/api/llpc/context/llpcGraphicsContext.cpp index b38fdba8..df506384 100644 --- a/icd/api/llpc/context/llpcGraphicsContext.cpp +++ b/icd/api/llpc/context/llpcGraphicsContext.cpp @@ -72,7 +72,8 @@ GraphicsContext::GraphicsContext( m_pPipelineInfo(pPipelineInfo), m_stageMask(0), m_activeStageCount(0), - m_tessOffchip(cl::EnableTessOffChip) + m_tessOffchip(cl::EnableTessOffChip), + m_gsOnChip(false) { #ifdef LLPC_BUILD_GFX9 if (gfxIp.major >= 9) @@ -332,7 +333,7 @@ uint64_t GraphicsContext::GetShaderHashCode( // Determines whether or not GS on-chip mode is valid for this pipeline. bool GraphicsContext::CanGsOnChip() { - bool gsOnChip = false; + bool gsOnChip = true; uint32_t stageMask = GetShaderStageMask(); const bool hasTs = ((stageMask & (ShaderStageToMask(ShaderStageTessControl) | @@ -345,12 +346,12 @@ bool GraphicsContext::CanGsOnChip() { uint32_t gsPrimsPerSubgroup = m_pGpuProperty->gsOnChipDefaultPrimsPerSubgroup; - const uint32_t esGsItemSize = 4 * pEsResUsage->inOutUsage.outputMapLocCount; - const uint32_t gsInstanceCount = pGsResUsage->builtInUsage.gs.invocations; - const uint32_t gsVsItemSize = 4 * - pGsResUsage->inOutUsage.outputMapLocCount * - pGsResUsage->builtInUsage.gs.outputVertices * - gsInstanceCount; + const uint32_t esGsRingItemSize = 4 * pEsResUsage->inOutUsage.outputMapLocCount; + const uint32_t gsInstanceCount = pGsResUsage->builtInUsage.gs.invocations; + const uint32_t gsVsRingItemSize = 4 * + pGsResUsage->inOutUsage.outputMapLocCount * + pGsResUsage->builtInUsage.gs.outputVertices * + gsInstanceCount; uint32_t vertsPerPrim = 1; bool useAdjacency = false; @@ -393,11 +394,11 @@ bool GraphicsContext::CanGsOnChip() } // Compute GS-VS LDS size based on target GS primitives per subgroup - uint32_t gsVsLdsSize = (gsVsItemSize * gsPrimsPerSubgroup); + uint32_t gsVsLdsSize = (gsVsRingItemSize * gsPrimsPerSubgroup); // Compute ES-GS LDS size based on the worst case number of ES vertices needed to create the target number of // GS primitives per subgroup. - uint32_t esGsLdsSize = esGsItemSize * esMinVertsPerSubgroup * gsPrimsPerSubgroup; + uint32_t esGsLdsSize = esGsRingItemSize * esMinVertsPerSubgroup * gsPrimsPerSubgroup; // Total LDS use per subgroup aligned to the register granularity uint32_t gsOnChipLdsSize = Pow2Align((esGsLdsSize + gsVsLdsSize), m_pGpuProperty->ldsSizeDwordGranularity); @@ -412,18 +413,18 @@ bool GraphicsContext::CanGsOnChip() // If total LDS usage is too big, refactor partitions based on ratio of ES-GS and GS-VS item sizes. 
     if (gsOnChipLdsSize > maxLdsSize)
     {
-        const uint32_t esGsItemSizePerPrim = esGsItemSize * esMinVertsPerSubgroup;
-        const uint32_t itemSizeTotal       = esGsItemSizePerPrim + gsVsItemSize;
+        const uint32_t esGsItemSizePerPrim = esGsRingItemSize * esMinVertsPerSubgroup;
+        const uint32_t itemSizeTotal       = esGsItemSizePerPrim + gsVsRingItemSize;

         esGsLdsSize = RoundUpToMultiple((esGsItemSizePerPrim * maxLdsSize) / itemSizeTotal, esGsItemSizePerPrim);
-        gsVsLdsSize = RoundDownToMultiple(maxLdsSize - esGsLdsSize, gsVsItemSize);
+        gsVsLdsSize = RoundDownToMultiple(maxLdsSize - esGsLdsSize, gsVsRingItemSize);

         gsOnChipLdsSize = maxLdsSize;
     }

     // Based on the LDS space, calculate how many GS prims per subgroup and ES vertices per subgroup can be dispatched.
-    gsPrimsPerSubgroup          = (gsVsLdsSize / gsVsItemSize);
-    uint32_t esVertsPerSubgroup = (esGsLdsSize / esGsItemSize);
+    gsPrimsPerSubgroup          = (gsVsLdsSize / gsVsRingItemSize);
+    uint32_t esVertsPerSubgroup = (esGsLdsSize / esGsRingItemSize);

     LLPC_ASSERT(esVertsPerSubgroup >= esMinVertsPerSubgroup);

@@ -441,7 +442,15 @@ bool GraphicsContext::CanGsOnChip()
     esVertsPerSubgroup -= (esMinVertsPerSubgroup - 1);

     // TODO: Accept GsOffChipDefaultThreshold from panel option
-    constexpr uint32_t GsOffChipDefaultThreshold = 64;
+    // TODO: The value of GsOffChipDefaultThreshold should be 64. Due to a known issue, it is temporarily set to 32
+    // so that on-chip GS code generation can be tested before that issue is fixed. The issue is that unused
+    // built-in outputs are only removed when the final GS output stores are generated; when the on-chip/off-chip
+    // decision is made, unused built-in outputs such as PointSize and Clip/CullDistance are still counted in the
+    // LDS usage, which deactivates on-chip GS when GsOffChipDefaultThreshold is 64. To fix this, we will probably
+    // need to clear unused built-in outputs before determining the on-chip/off-chip GS mode.
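To make the sizing logic above concrete, the same arithmetic with purely illustrative numbers (a sketch; real values come from the ES/GS ResourceUsage structures, not from any actual pipeline):

    // Illustrative only -- none of these constants come from a real pipeline.
    constexpr uint32_t EsOutputLocs          = 8;   // ES outputMapLocCount
    constexpr uint32_t GsOutputLocs          = 4;   // GS outputMapLocCount
    constexpr uint32_t GsOutputVertices      = 16;
    constexpr uint32_t GsInstanceCount       = 1;
    constexpr uint32_t EsMinVertsPerSubgroup = 3;   // triangles, no adjacency
    constexpr uint32_t GsPrimsPerSubgroup    = 64;

    constexpr uint32_t esGsRingItemSize = 4 * EsOutputLocs;           // 32 dwords per ES vertex
    constexpr uint32_t gsVsRingItemSize = 4 * GsOutputLocs *
                                          GsOutputVertices *
                                          GsInstanceCount;            // 256 dwords per GS prim

    constexpr uint32_t esGsLdsSize = esGsRingItemSize *
                                     EsMinVertsPerSubgroup *
                                     GsPrimsPerSubgroup;              // 6144 dwords
    constexpr uint32_t gsVsLdsSize = gsVsRingItemSize *
                                     GsPrimsPerSubgroup;              // 16384 dwords

    // 6144 + 16384 = 22528 dwords. If that exceeds maxLdsSize, the partitions
    // are rebalanced in the ratio (esGsRingItemSize * EsMinVertsPerSubgroup) :
    // gsVsRingItemSize = 96 : 256, then prims/verts per subgroup are recomputed
    // as gsVsLdsSize / gsVsRingItemSize and esGsLdsSize / esGsRingItemSize.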
+ constexpr uint32_t GsOffChipDefaultThreshold = 32; + + pGsResUsage->inOutUsage.gs.esGsLdsSize = esGsLdsSize; if (((gsPrimsPerSubgroup * gsInstanceCount) < GsOffChipDefaultThreshold) || (esVertsPerSubgroup == 0)) { diff --git a/icd/api/llpc/context/llpcPipelineContext.h b/icd/api/llpc/context/llpcPipelineContext.h index 8187a3a1..76b64ba6 100644 --- a/icd/api/llpc/context/llpcPipelineContext.h +++ b/icd/api/llpc/context/llpcPipelineContext.h @@ -371,13 +371,17 @@ struct ResourceUsage // outputs to fragment shader, always from vertex stream 0) std::unordered_map builtInOutLocs; - // Map from tightly packed locations to byte sizes of generic outputs (used by copy shader to export - // generic outputs to fragment shader, always from vertex stream 0) - std::unordered_map genericOutByteSizes; + // Map from tightly packed locations to byte sizes of generic outputs (used by copy shader to + // export generic outputs to fragment shader, always from vertex stream 0): + // > + std::unordered_map genericOutByteSizes; llvm::Value* pEsGsOffsets; // ES -> GS offsets (GS in) llvm::Value* pGsVsRingBufDesc; // GS -> VS ring buffer descriptor (GS out); llvm::Value* pEmitCounterPtr; + + uint32_t esGsLdsSize; // ES -> GS ring LDS size (GS in) + uint32_t gsVsRingItemSize; // Size of each primitive written to the GSVS Ring ( in dwords) } gs; struct diff --git a/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp b/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp index e62e3154..fd338ec8 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp @@ -1400,23 +1400,64 @@ Value* SpirvLowerGlobal::AddCallInstForInOutImport( } } - if ((m_shaderStage == ShaderStageTessControl) || - (m_shaderStage == ShaderStageTessEval) || + if ((m_shaderStage == ShaderStageTessControl) || (m_shaderStage == ShaderStageTessEval) || (interpLoc != InterpLocUnknown)) { - // NOTE: For tessellation shader and fragment shader with interpolation functions, we add element indexing - // as an addition parameter to do addressing for the input/output. 
- if (pElemIdx == nullptr) + if (inOutMeta.IsBuiltIn) { - // When element indexing is not specified, we set it to don't-care value - pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), InvalidValue); + if (pElemIdx == nullptr) + { + // When element indexing is not specified, we set it to don't-care value + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), InvalidValue); + } } + else + { + LLPC_ASSERT(pInOutTy->isSingleValueType()); + + uint32_t elemIdx = inOutMeta.Component; + LLPC_ASSERT(inOutMeta.Component <= 3); + if (pInOutTy->getScalarSizeInBits() == 64) + { + LLPC_ASSERT(inOutMeta.Component % 2 == 0); // Must be even for 64-bit type + elemIdx = inOutMeta.Component / 2; + } + + if (pElemIdx != nullptr) + { + pElemIdx = BinaryOperator::CreateAdd(pElemIdx, + ConstantInt::get(m_pContext->Int32Ty(), elemIdx), + "", + pInsertPos); + } + else + { + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), elemIdx); + } + } + args.push_back(pElemIdx); } else { // Element indexing is not valid for other shader stages LLPC_ASSERT(pElemIdx == nullptr); + + if ((inOutMeta.IsBuiltIn == false) && (m_shaderStage != ShaderStageCompute)) + { + LLPC_ASSERT(pInOutTy->isSingleValueType()); + + uint32_t elemIdx = inOutMeta.Component; + LLPC_ASSERT(inOutMeta.Component <= 3); + if (pInOutTy->getScalarSizeInBits() == 64) + { + LLPC_ASSERT(inOutMeta.Component % 2 == 0); // Must be even for 64-bit type + elemIdx = inOutMeta.Component / 2; + } + + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), elemIdx); + args.push_back(pElemIdx); + } } if ((m_shaderStage == ShaderStageTessControl) || @@ -1452,7 +1493,7 @@ Value* SpirvLowerGlobal::AddCallInstForInOutImport( } // - // VS: @llpc.input.import.generic.%Type%(i32 location) + // VS: @llpc.input.import.generic.%Type%(i32 location, i32 elemIdx) // @llpc.input.import.builtin.%BuiltIn%(i32 builtInId) // // TCS: @llpc.input.import.generic.%Type%(i32 location, i32 locOffset, i32 elemIdx, i32 vertexIdx) @@ -1465,12 +1506,13 @@ Value* SpirvLowerGlobal::AddCallInstForInOutImport( // TES: @llpc.input.import.generic.%Type%(i32 location, i32 locOffset, i32 elemIdx, i32 vertexIdx) // @llpc.input.import.builtin.%BuiltIn%.%Type%(i32 builtInId, i32 elemIdx, i32 vertexIdx) - // GS: @llpc.input.import.generic.%Type%(i32 location, i32 vertexIdx) + // GS: @llpc.input.import.generic.%Type%(i32 location, i32 elemIdx, i32 vertexIdx) // @llpc.input.import.builtin.%BuiltIn%(i32 builtInId, i32 vertexIdx) // - // FS: @llpc.input.import.generic.%Type%(i32 location, i32 interpMode, i32 interpLoc) + // FS: @llpc.input.import.generic.%Type%(i32 location, i32 elemIdx, i32 interpMode, i32 interpLoc) // @llpc.input.import.builtin.%BuiltIn%(i32 builtInId) - // @llpc.input.import.interpolant.%Type%(i32 location, i32 locOffset, i32 elemIdx, i32 interpMode, <2 x float> ij) + // @llpc.input.import.interpolant.%Type%(i32 location, i32 locOffset, i32 elemIdx, + // i32 interpMode, <2 x float> ij) // // CS: @llpc.input.import.builtin.%BuiltIn%(i32 builtInId) // @@ -1681,15 +1723,65 @@ void SpirvLowerGlobal::AddCallInstForOutputExport( if (m_shaderStage == ShaderStageTessControl) { - // NOTE: For tessellation control shader, we add element indexing as an addition parameter to do addressing - // for the output. 
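The component-to-element mapping used in both branches above can be summarized by a small sketch (GetElemIdx is a hypothetical helper written for illustration, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Sketch: the SPIR-V "Component" decoration selects the starting channel
    // within a location; 64-bit values occupy two 32-bit channels, so the
    // element index is halved for them.
    uint32_t GetElemIdx(uint32_t component, uint32_t scalarSizeInBits)
    {
        assert(component <= 3);
        if (scalarSizeInBits == 64)
        {
            assert((component % 2) == 0); // Must be even for 64-bit types
            return component / 2;
        }
        return component;
    }

    // e.g. "layout(location = 1, component = 2) in float f"  -> elemIdx = 2
    //      "layout(location = 1, component = 2) in double d" -> elemIdx = 1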
- if (pElemIdx == nullptr) + if (outputMeta.IsBuiltIn) { - // When element indexing is not specified, we set it to don't-care value - pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), InvalidValue); + if (pElemIdx == nullptr) + { + // When element indexing is not specified, we set it to don't-care value + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), InvalidValue); + } + } + else + { + LLPC_ASSERT(pOutputTy->isSingleValueType()); + + uint32_t elemIdx = outputMeta.Component; + LLPC_ASSERT(outputMeta.Component <= 3); + if (pOutputTy->getScalarSizeInBits() == 64) + { + LLPC_ASSERT(outputMeta.Component % 2 == 0); // Must be even for 64-bit type + elemIdx = outputMeta.Component / 2; + } + + if (pElemIdx != nullptr) + { + pElemIdx = BinaryOperator::CreateAdd(pElemIdx, + ConstantInt::get(m_pContext->Int32Ty(), elemIdx), + "", + pInsertPos); + } + else + { + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), elemIdx); + } } + args.push_back(pElemIdx); + } + else + { + // Element indexing is not valid for other shader stages + LLPC_ASSERT(pElemIdx == nullptr); + + if ((outputMeta.IsBuiltIn == false) && (m_shaderStage != ShaderStageCompute)) + { + LLPC_ASSERT(pOutputTy->isSingleValueType()); + + uint32_t elemIdx = outputMeta.Component; + LLPC_ASSERT(outputMeta.Component <= 3); + if (pOutputTy->getScalarSizeInBits() == 64) + { + LLPC_ASSERT(outputMeta.Component % 2 == 0); // Must be even for 64-bit type + elemIdx = outputMeta.Component / 2; + } + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), elemIdx); + args.push_back(pElemIdx); + } + } + + if (m_shaderStage == ShaderStageTessControl) + { // NOTE: For tessellation control shader, we add vertex indexing as an addition parameter to do addressing // for the output. if (pVertexIdx == nullptr) @@ -1701,8 +1793,8 @@ void SpirvLowerGlobal::AddCallInstForOutputExport( } else { - // Element and vertex indexing is not valid for other shader stages - LLPC_ASSERT((pElemIdx == nullptr) && (pVertexIdx == nullptr)); + // Vertex indexing is not valid for other shader stages + LLPC_ASSERT(pVertexIdx == nullptr); } if (m_shaderStage == ShaderStageGeometry) @@ -1720,7 +1812,7 @@ void SpirvLowerGlobal::AddCallInstForOutputExport( args.push_back(pOutputValue); // - // VS: @llpc.output.export.generic.%Type%(i32 location, %Type% outputValue) + // VS: @llpc.output.export.generic.%Type%(i32 location, i32 elemIdx, %Type% outputValue) // @llpc.output.export.builtin.%BuiltIn%(i32 builtInId, %Type% outputValue) // // TCS: @llpc.output.export.generic.%Type%(i32 location, i32 locOffset, i32 elemIdx, i32 vertexIdx, @@ -1728,13 +1820,13 @@ void SpirvLowerGlobal::AddCallInstForOutputExport( // @llpc.output.export.builtin.%BuiltIn%.%Type%(i32 builtInId, i32 elemIdx, i32 vertexIdx, // %Type% outputValue) // - // TES: @llpc.output.export.generic.%Type%(i32 location, %Type% outputValue) + // TES: @llpc.output.export.generic.%Type%(i32 location, i32 elemIdx, %Type% outputValue) // @llpc.output.export.builtin.%BuiltIn%.%Type%(i32 builtInId, %Type% outputValue) - // GS: @llpc.output.export.generic.%Type%(i32 location, i32 streamId, %Type% outputValue) + // GS: @llpc.output.export.generic.%Type%(i32 location, i32 elemIdx, i32 streamId, %Type% outputValue) // @llpc.output.export.builtin.%BuiltIn%(i32 builtInId, i32 streamId, %Type% outputValue) // - // FS: @llpc.output.export.generic.%Type%(i32 location, %Type% outputValue) + // FS: @llpc.output.export.generic.%Type%(i32 location, i32 elemIdx, %Type% outputValue) // @llpc.output.export.builtin.%BuiltIn%(i32 builtInId, %Type% 
outputValue) // EmitCall(m_pModule, instName, m_pContext->VoidTy(), args, NoAttrib, pInsertPos); diff --git a/icd/api/llpc/lower/llpcSpirvLowerResourceCollect.cpp b/icd/api/llpc/lower/llpcSpirvLowerResourceCollect.cpp index 22d2ff13..c1c32da0 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerResourceCollect.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerResourceCollect.cpp @@ -945,21 +945,6 @@ void SpirvLowerResourceCollect::CollectInOutUsage( m_pResUsage->builtInUsage.fs.runAtSampleRate = true; } } - else - { - LLPC_ASSERT(addrSpace == SPIRAS_Output); - - // Collect CB shader mask - LLPC_ASSERT(pBaseTy->isSingleValueType()); - const uint32_t compCount = pBaseTy->isVectorTy() ? pBaseTy->getVectorNumElements() : 1; - const uint32_t channelMask = ((1 << compCount) - 1); - - LLPC_ASSERT(startLoc + locCount <= MaxColorTargets); - for (uint32_t i = 0; i < locCount; ++i) - { - m_pResUsage->inOutUsage.fs.cbShaderMask |= (channelMask << 4 * (startLoc + i)); - } - } } } } diff --git a/icd/api/llpc/patch/generate/gfx6/glslArithOpEmu.ll b/icd/api/llpc/patch/generate/gfx6/glslArithOpEmu.ll old mode 100644 new mode 100755 diff --git a/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll b/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll old mode 100644 new mode 100755 diff --git a/icd/api/llpc/patch/generate/glslNullFsEmu.ll b/icd/api/llpc/patch/generate/glslNullFsEmu.ll index dbd71739..46a902ef 100644 --- a/icd/api/llpc/patch/generate/glslNullFsEmu.ll +++ b/icd/api/llpc/patch/generate/glslNullFsEmu.ll @@ -22,8 +22,8 @@ target triple = "spir64-unknown-unknown" ; ; #version 450 ; -; layout (location = 0) in vec4 fragIn; -; layout (location = 0) out vec4 fragOut; +; layout (location = 0) in float fragIn; +; layout (location = 0) out float fragOut; ; ; void main() ; { @@ -31,16 +31,16 @@ target triple = "spir64-unknown-unknown" ; } ; -define dllexport void @main() #0 !spirv.ExecutionModel !5 +define dllexport amdgpu_ps void @main() #0 !spirv.ExecutionModel !5 { .entry: - %0 = tail call float @llpc.input.import.generic.f32(i32 0, i32 0, i32 1) #0 - tail call void @llpc.output.export.generic.f32(i32 0, float %0) #0 + %0 = tail call float @llpc.input.import.generic.f32(i32 0, i32 0, i32 0, i32 1) #0 + tail call void @llpc.output.export.generic.f32(i32 0, i32 0, float %0) #0 ret void } -declare float @llpc.input.import.generic.f32(i32, i32, i32) #0 -declare void @llpc.output.export.generic.f32(i32, float) #0 +declare float @llpc.input.import.generic.f32(i32, i32, i32, i32) #0 +declare void @llpc.output.export.generic.f32(i32, i32, float) #0 attributes #0 = { nounwind } diff --git a/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp b/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp index 3a4d8412..5b002f88 100644 --- a/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp +++ b/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp @@ -80,7 +80,7 @@ char PatchEntryPointMutate::ID = 0; PatchEntryPointMutate::PatchEntryPointMutate() : Patch(ID), - m_hasTes(false), + m_hasTs(false), m_hasGs(false) { initializePatchEntryPointMutatePass(*PassRegistry::getPassRegistry()); @@ -96,8 +96,9 @@ bool PatchEntryPointMutate::runOnModule( Patch::Init(&module); const uint32_t stageMask = m_pContext->GetShaderStageMask(); - m_hasTes = ((stageMask & ShaderStageToMask(ShaderStageTessEval)) != 0); - m_hasGs = ((stageMask & ShaderStageToMask(ShaderStageGeometry)) != 0); + m_hasTs = ((stageMask & (ShaderStageToMask(ShaderStageTessControl) | + ShaderStageToMask(ShaderStageTessEval))) != 0); + m_hasGs = ((stageMask & 
ShaderStageToMask(ShaderStageGeometry)) != 0); const auto& dataLayout = m_pModule->getDataLayout(); @@ -594,7 +595,7 @@ bool PatchEntryPointMutate::runOnModule( } // Setup ES-GS ring buffer descriptor - if (((m_shaderStage == ShaderStageVertex) && m_hasGs && (m_hasTes == false)) || + if (((m_shaderStage == ShaderStageVertex) && m_hasGs && (m_hasTs == false)) || ((m_shaderStage == ShaderStageTessEval) && m_hasGs)) { // Setup ES-GS ring buffer descriptor for VS or TES output @@ -630,9 +631,8 @@ bool PatchEntryPointMutate::runOnModule( switch (m_shaderStage) { case ShaderStageVertex: - callingConv = m_hasTes ? - CallingConv::AMDGPU_LS : - (m_hasGs ? CallingConv::AMDGPU_ES : CallingConv::AMDGPU_VS); + callingConv = m_hasTs ? CallingConv::AMDGPU_LS : + (m_hasGs ? CallingConv::AMDGPU_ES : CallingConv::AMDGPU_VS); break; case ShaderStageTessControl: callingConv = CallingConv::AMDGPU_HS; @@ -651,6 +651,7 @@ bool PatchEntryPointMutate::runOnModule( } pEntryPoint->setCallingConv(callingConv); pEntryPoint->setDLLStorageClass(GlobalValue::DefaultStorageClass); + // Set the entry name required by PAL ABI auto entryStage = Util::Abi::PipelineSymbolType::CsMainEntry; switch (callingConv) @@ -1165,7 +1166,7 @@ FunctionType* PatchEntryPointMutate::GenerateEntryPointType( { case ShaderStageVertex: { - if (m_hasGs && (m_hasTes == false)) + if (m_hasGs && (m_hasTs == false)) { argTys.push_back(m_pContext->Int32Ty()); // ES to GS offset entryArgIdxs.vs.esGsOffset = argIdx; diff --git a/icd/api/llpc/patch/llpcPatchEntryPointMutate.h b/icd/api/llpc/patch/llpcPatchEntryPointMutate.h index a5c2253d..4f45b095 100644 --- a/icd/api/llpc/patch/llpcPatchEntryPointMutate.h +++ b/icd/api/llpc/patch/llpcPatchEntryPointMutate.h @@ -76,8 +76,9 @@ class PatchEntryPointMutate: // Reserved argument count for single DWORD descriptor table pointer static const uint32_t TablePtrReservedArgCount = 2; - bool m_hasTes; // Whether the pipeline has tessllation evaluation shader - bool m_hasGs; // Whether the pipeline has geometry shader + + bool m_hasTs; // Whether the pipeline has tessllation shader + bool m_hasGs; // Whether the pipeline has geometry shader }; } // Llpc diff --git a/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp b/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp index 121158c2..90b7dd0b 100644 --- a/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp +++ b/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp @@ -71,7 +71,8 @@ PatchInOutImportExport::PatchInOutImportExport() #endif m_hasTs(false), m_hasGs(false), - m_pLds(nullptr) + m_pLds(nullptr), + m_pThreadId(nullptr) { memset(&m_gfxIp, 0, sizeof(m_gfxIp)); @@ -264,8 +265,24 @@ bool PatchInOutImportExport::runOnModule( ShaderStageToMask(ShaderStageTessEval))) != 0); m_hasGs = ((stageMask & ShaderStageToMask(ShaderStageGeometry)) != 0); + // Calculate and store thread ID, it will be used in on-chip GS offset calculation + if (m_hasGs && m_pContext->IsGsOnChip()) + { + auto pInsertPos = m_pEntryPoint->begin()->getFirstInsertionPt(); + + std::vector args; + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), -1)); + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + m_pThreadId = EmitCall(m_pModule, "llvm.amdgcn.mbcnt.lo", m_pContext->Int32Ty(), args, NoAttrib, &*pInsertPos); + + args.clear(); + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), -1)); + args.push_back(m_pThreadId); + m_pThreadId = EmitCall(m_pModule, "llvm.amdgcn.mbcnt.hi", m_pContext->Int32Ty(), args, NoAttrib, &*pInsertPos); + } + // Create the global variable that is to model LDS 
- if (m_hasTs) + if (m_hasTs || (m_hasGs && m_pContext->IsGsOnChip())) { // Construct LDS type: [ldsSize * i32], address space 3 auto ldsSize = m_pContext->GetGpuProperty()->ldsSizePerCu; @@ -491,13 +508,18 @@ void PatchInOutImportExport::visitCallInst( { case ShaderStageVertex: { - pInput = PatchVsGenericInputImport(pInputTy, loc, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 2); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + pInput = PatchVsGenericInputImport(pInputTy, loc, compIdx, &callInst); break; } case ShaderStageTessControl: { LLPC_ASSERT(callInst.getNumArgOperands() == 4); - auto pElemIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + + auto pElemIdx = callInst.getOperand(2); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); + auto pVertexIdx = callInst.getOperand(3); LLPC_ASSERT(IsDontCareValue(pVertexIdx) == false); @@ -507,7 +529,10 @@ void PatchInOutImportExport::visitCallInst( case ShaderStageTessEval: { LLPC_ASSERT(callInst.getNumArgOperands() == 4); - auto pElemIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + + auto pElemIdx = callInst.getOperand(2); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); + auto pVertexIdx = IsDontCareValue(callInst.getOperand(3)) ? nullptr : callInst.getOperand(3); pInput = PatchTesGenericInputImport(pInputTy, loc, pLocOffset, pElemIdx, pVertexIdx, &callInst); @@ -515,40 +540,48 @@ void PatchInOutImportExport::visitCallInst( } case ShaderStageGeometry: { - LLPC_ASSERT(callInst.getNumArgOperands() == 2); - Value* pVertexIdx = callInst.getOperand(1); + LLPC_ASSERT(callInst.getNumArgOperands() == 3); + + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + + Value* pVertexIdx = callInst.getOperand(2); LLPC_ASSERT(IsDontCareValue(pVertexIdx) == false); - pInput = PatchGsGenericInputImport(pInputTy, loc, pVertexIdx, &callInst); + + pInput = PatchGsGenericInputImport(pInputTy, loc, compIdx, pVertexIdx, &callInst); break; } case ShaderStageFragment: { uint32_t interpMode = InterpModeSmooth; uint32_t interpLoc = InterpLocCenter; - Value* pLocOffset = nullptr; - Value* pCompIdx = nullptr; + + Value* pElemIdx = callInst.getOperand(1); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); + Value* pIJ = nullptr; if (isGenericInputImport) { - LLPC_ASSERT(callInst.getNumArgOperands() == 3); - interpMode = cast(callInst.getOperand(1))->getZExtValue(); - interpLoc = cast(callInst.getOperand(2))->getZExtValue(); + LLPC_ASSERT(callInst.getNumArgOperands() == 4); + + interpMode = cast(callInst.getOperand(2))->getZExtValue(); + interpLoc = cast(callInst.getOperand(3))->getZExtValue(); } else { LLPC_ASSERT(isInterpolantInputImport); LLPC_ASSERT(callInst.getNumArgOperands() == 5); + interpMode = cast(callInst.getOperand(3))->getZExtValue(); interpLoc = InterpLocUnknown; - pCompIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + pIJ = callInst.getOperand(4); } pInput = PatchFsGenericInputImport(pInputTy, loc, pLocOffset, - pCompIdx, + pElemIdx, pIJ, interpMode, interpLoc, @@ -630,7 +663,8 @@ void PatchInOutImportExport::visitCallInst( LLPC_ASSERT(loc != InvalidValue); LLPC_ASSERT(callInst.getNumArgOperands() == 4); - auto pElemIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + auto pElemIdx = callInst.getOperand(2); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); auto pVertexIdx = IsDontCareValue(callInst.getOperand(3)) ? 
nullptr : callInst.getOperand(3); pOutput = PatchTcsGenericOutputImport(pOutputTy, loc, pLocOffset, pElemIdx, pVertexIdx, &callInst); @@ -771,13 +805,18 @@ void PatchInOutImportExport::visitCallInst( { case ShaderStageVertex: { - PatchVsGenericOutputExport(pOutput, loc, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 3); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + PatchVsGenericOutputExport(pOutput, loc, compIdx, &callInst); break; } case ShaderStageTessControl: { LLPC_ASSERT(callInst.getNumArgOperands() == 5); - auto pElemIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + + auto pElemIdx = callInst.getOperand(2); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); + auto pVertexIdx = IsDontCareValue(callInst.getOperand(3)) ? nullptr : callInst.getOperand(3); PatchTcsGenericOutputExport(pOutput, loc, pLocOffset, pElemIdx, pVertexIdx, &callInst); @@ -785,19 +824,24 @@ void PatchInOutImportExport::visitCallInst( } case ShaderStageTessEval: { - PatchTesGenericOutputExport(pOutput, loc, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 3); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + PatchTesGenericOutputExport(pOutput, loc, compIdx, &callInst); break; } case ShaderStageGeometry: { - LLPC_ASSERT(callInst.getNumArgOperands() == 3); - uint32_t streamId = cast(callInst.getOperand(1))->getZExtValue(); - PatchGsGenericOutputExport(pOutput, loc, streamId, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 4); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + const uint32_t streamId = cast(callInst.getOperand(2))->getZExtValue(); + PatchGsGenericOutputExport(pOutput, loc, compIdx, streamId, &callInst); break; } case ShaderStageFragment: { - PatchFsGenericOutputExport(pOutput, loc, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 3); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + PatchFsGenericOutputExport(pOutput, loc, compIdx, &callInst); break; } case ShaderStageCopyShader: @@ -1412,6 +1456,48 @@ void PatchInOutImportExport::visitReturnInst( EmitCall(m_pModule, "llvm.amdgcn.exp.f32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos)); } + // Export fragment colors + for (uint32_t location = 0; location < MaxColorTargets; ++location) + { + auto& expFragColor = m_expFragColors[location]; + if (expFragColor.size() > 0) + { + Value* pOutput = nullptr; + uint32_t compCount = expFragColor.size(); + LLPC_ASSERT(compCount <= 4); + + // Set CB shader mask + auto pResUsage = m_pContext->GetShaderResourceUsage(ShaderStageFragment); + const uint32_t channelMask = ((1 << compCount) - 1); + pResUsage->inOutUsage.fs.cbShaderMask |= (channelMask << (4 * location)); + + // Construct exported fragment colors + if (compCount == 1) + { + pOutput = expFragColor[0]; + } + else + { + pOutput = UndefValue::get(VectorType::get(m_pContext->Int32Ty(), compCount)); + for (uint32_t i = 0; i < compCount; ++i) + { + pOutput = InsertElementInst::Create(pOutput, + expFragColor[i], + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + } + + // Do fragment color exporting + auto pExport = m_pFragColorExport->Run(pOutput, location, pInsertPos); + if (pExport != nullptr) + { + m_pLastExport = cast(pExport); + } + } + } + // NOTE: If outputs are present in fragment shader, we have to export a dummy one if (m_pLastExport == nullptr) { @@ -1451,13 +1537,14 @@ void PatchInOutImportExport::visitReturnInst( Value* 
PatchInOutImportExport::PatchVsGenericInputImport( Type* pInputTy, // [in] Type of input value uint32_t location, // Location of the input + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { Value* pInput = UndefValue::get(pInputTy); // Do vertex fetch operations (returns ) LLPC_ASSERT(m_pVertexFetch != nullptr); - auto pVertex = m_pVertexFetch->Run(pInputTy, location, pInsertPos); + auto pVertex = m_pVertexFetch->Run(pInputTy, location, compIdx, pInsertPos); // Cast vertex fetch results if necessary const Type* pVertexTy = pVertex->getType(); @@ -1480,14 +1567,14 @@ Value* PatchInOutImportExport::PatchTcsGenericInputImport( Type* pInputTy, // [in] Type of input value uint32_t location, // Base location of the input Value* pLocOffset, // [in] Relative location offset - Value* pCompIdx, // [in] Index used for vector element indexing (could be null) + Value* pCompIdx, // [in] Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { - LLPC_ASSERT(pVertexIdx != nullptr); + LLPC_ASSERT((pCompIdx != nullptr) && (pVertexIdx != nullptr)); auto pLdsOffset = CalcLdsOffsetForTcsInput(pInputTy, location, pLocOffset, pCompIdx, pVertexIdx, pInsertPos); - return ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + return ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } // ===================================================================================================================== @@ -1496,12 +1583,14 @@ Value* PatchInOutImportExport::PatchTesGenericInputImport( Type* pInputTy, // [in] Type of input value uint32_t location, // Base location of the input Value* pLocOffset, // [in] Relative location offset - Value* pCompIdx, // [in] Index used for vector element indexing (could be null) + Value* pCompIdx, // [in] Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing (could be null) Instruction* pInsertPos) // [in] Where to insert the patch instruction { + LLPC_ASSERT(pCompIdx != nullptr); + auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, location, pLocOffset, pCompIdx, pVertexIdx, pInsertPos); - return ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + return ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } // ===================================================================================================================== @@ -1509,6 +1598,7 @@ Value* PatchInOutImportExport::PatchTesGenericInputImport( Value* PatchInOutImportExport::PatchGsGenericInputImport( Type* pInputTy, // [in] Type of input value uint32_t location, // Location of the input + uint32_t compIdx, // Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { @@ -1520,6 +1610,9 @@ Value* PatchInOutImportExport::PatchGsGenericInputImport( const uint32_t bitWidth = pInputTy->getScalarSizeInBits(); if (bitWidth == 64) { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + if (pInputTy->isVectorTy()) { pInputTy = VectorType::get(m_pContext->FloatTy(), pInputTy->getVectorNumElements() * 2); @@ -1534,7 +1627,7 @@ Value* PatchInOutImportExport::PatchGsGenericInputImport( LLPC_ASSERT(bitWidth == 32); } - Value* pInput = LoadValueFromEsGsRingBuffer(pInputTy, 
location, 0, pVertexIdx, pInsertPos); + Value* pInput = LoadValueFromEsGsRing(pInputTy, location, compIdx, pVertexIdx, pInsertPos); if (pInputTy != pOrigInputTy) { @@ -1641,7 +1734,7 @@ Value* PatchInOutImportExport::PatchFsGenericInputImport( const uint32_t numChannels = (bitWidth * compCout) / 32; Type* pInterpTy = (numChannels > 1) ? VectorType::get(m_pContext->FloatTy(), numChannels) : m_pContext->FloatTy(); - Value* pInterp = nullptr; + Value* pInterp = UndefValue::get(pInterpTy); uint32_t startChannel = 0; if (pCompIdx != nullptr) @@ -1665,10 +1758,10 @@ Value* PatchInOutImportExport::PatchFsGenericInputImport( LLPC_ASSERT((pBasicTy->isFloatTy()) && (numChannels <= 4)); args.clear(); - args.push_back(pI); // i - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); // attr_chan - args.push_back(pLoc); // attr - args.push_back(pPrimMask); // m0 + args.push_back(pI); // i + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); // attr_chan + args.push_back(pLoc); // attr + args.push_back(pPrimMask); // m0 pCompValue = EmitCall(m_pModule, "llvm.amdgcn.interp.p1", @@ -1678,11 +1771,11 @@ Value* PatchInOutImportExport::PatchFsGenericInputImport( pInsertPos); args.clear(); - args.push_back(pCompValue); // p1 - args.push_back(pJ); // j - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); // attr_chan - args.push_back(pLoc); // attr - args.push_back(pPrimMask); // m0 + args.push_back(pCompValue); // p1 + args.push_back(pJ); // j + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); // attr_chan + args.push_back(pLoc); // attr + args.push_back(pPrimMask); // m0 pCompValue = EmitCall(m_pModule, "llvm.amdgcn.interp.p2", @@ -1716,8 +1809,7 @@ Value* PatchInOutImportExport::PatchFsGenericInputImport( } else { - auto pVec = (i == 0) ? 
UndefValue::get(pInterpTy) : pInterp; - pInterp = InsertElementInst::Create(pVec, + pInterp = InsertElementInst::Create(pInterp, pCompValue, ConstantInt::get(m_pContext->Int32Ty(), i - startChannel), "", @@ -1745,12 +1837,14 @@ Value* PatchInOutImportExport::PatchTcsGenericOutputImport( Type* pOutputTy, // [in] Type of output value uint32_t location, // Base location of the output Value* pLocOffset, // [in] Relative location offset - Value* pCompIdx, // [in] Index used for vector element indexing (could be null) + Value* pCompIdx, // [in] Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing (could be null) Instruction* pInsertPos) // [in] Where to insert the patch instruction { + LLPC_ASSERT(pCompIdx != nullptr); + auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, location, pLocOffset, pCompIdx, pVertexIdx, pInsertPos); - return ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + return ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); } // ===================================================================================================================== @@ -1758,23 +1852,28 @@ Value* PatchInOutImportExport::PatchTcsGenericOutputImport( void PatchInOutImportExport::PatchVsGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { + auto pOutputTy = pOutput->getType(); + if (m_hasTs) { - auto pLdsOffset = CalcLdsOffsetForVsOutput(location, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy, location, compIdx, pInsertPos); WriteValueToLds(pOutput, pLdsOffset, pInsertPos); } else { if (m_hasGs) { - auto pOutputTy = pOutput->getType(); LLPC_ASSERT(pOutputTy->isIntOrIntVectorTy() || pOutputTy->isFPOrFPVectorTy()); const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); if (bitWidth == 64) { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + uint32_t compCount = pOutputTy->isVectorTy() ? 
pOutputTy->getVectorNumElements() * 2 : 2; pOutputTy = VectorType::get(m_pContext->FloatTy(), compCount); pOutput = BitCastInst::Create(Instruction::BitCast, pOutput, pOutputTy, "", pInsertPos); @@ -1784,11 +1883,11 @@ void PatchInOutImportExport::PatchVsGenericOutputExport( LLPC_ASSERT(bitWidth == 32); } - StoreValueToEsGsRingBuffer(pOutput, location, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, location, compIdx, pInsertPos); } else { - AddExportInstForGenericOutput(pOutput, location, pInsertPos); + AddExportInstForGenericOutput(pOutput, location, compIdx, pInsertPos); } } } @@ -1799,10 +1898,12 @@ void PatchInOutImportExport::PatchTcsGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Base location of the output Value* pLocOffset, // [in] Relative location offset - Value* pCompIdx, // [in] Index used for vector element indexing (could be null) + Value* pCompIdx, // [in] Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing (could be null) Instruction* pInsertPos) // [in] Where to insert the patch instruction { + LLPC_ASSERT(pCompIdx != nullptr); + Type* pOutputTy = pOutput->getType(); auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, location, pLocOffset, pCompIdx, pVertexIdx, pInsertPos); WriteValueToLds(pOutput, pLdsOffset, pInsertPos); @@ -1813,6 +1914,7 @@ void PatchInOutImportExport::PatchTcsGenericOutputExport( void PatchInOutImportExport::PatchTesGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { if (m_hasGs) @@ -1823,6 +1925,9 @@ void PatchInOutImportExport::PatchTesGenericOutputExport( const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); if (bitWidth == 64) { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + uint32_t compCount = pOutputTy->isVectorTy() ? pOutputTy->getVectorNumElements() * 2 : 2; pOutputTy = VectorType::get(m_pContext->FloatTy(), compCount); pOutput = BitCastInst::Create(Instruction::BitCast, pOutput, pOutputTy, "", pInsertPos); @@ -1832,11 +1937,11 @@ void PatchInOutImportExport::PatchTesGenericOutputExport( LLPC_ASSERT(bitWidth == 32); } - StoreValueToEsGsRingBuffer(pOutput, location, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, location, compIdx, pInsertPos); } else { - AddExportInstForGenericOutput(pOutput, location, pInsertPos); + AddExportInstForGenericOutput(pOutput, location, compIdx, pInsertPos); } } @@ -1845,6 +1950,7 @@ void PatchInOutImportExport::PatchTesGenericOutputExport( void PatchInOutImportExport::PatchGsGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing uint32_t streamId, // ID of output vertex stream Instruction* pInsertPos) // [in] Where to insert the patch instruction { @@ -1856,6 +1962,9 @@ void PatchInOutImportExport::PatchGsGenericOutputExport( const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); if (bitWidth == 64) { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + if (pOutputTy->isVectorTy()) { pOutputTy = VectorType::get(m_pContext->FloatTy(), pOutputTy->getVectorNumElements() * 2); @@ -1871,13 +1980,14 @@ void PatchInOutImportExport::PatchGsGenericOutputExport( const uint32_t compCount = pOutputTy->isVectorTy() ? 
pOutputTy->getVectorNumElements() : 1; const uint32_t byteSize = pCompTy->getScalarSizeInBits() / 8 * compCount; + LLPC_ASSERT(compIdx <= 4); auto& genericOutByteSizes = m_pContext->GetShaderResourceUsage(ShaderStageGeometry)->inOutUsage.gs.genericOutByteSizes; - genericOutByteSizes[location] = byteSize; + genericOutByteSizes[location][compIdx] = byteSize; if (compCount == 1) { - StoreValueToGsVsRingBuffer(pOutput, location, 0, pInsertPos); + StoreValueToGsVsRingBuffer(pOutput, location, compIdx, pInsertPos); } else { @@ -1887,7 +1997,7 @@ void PatchInOutImportExport::PatchGsGenericOutputExport( ConstantInt::get(m_pContext->Int32Ty(), i), "", pInsertPos); - StoreValueToGsVsRingBuffer(pComp, location + (i / 4), i % 4, pInsertPos); + StoreValueToGsVsRingBuffer(pComp, location + ((compIdx + i) / 4), (compIdx + i) % 4, pInsertPos); } } @@ -1898,13 +2008,73 @@ void PatchInOutImportExport::PatchGsGenericOutputExport( void PatchInOutImportExport::PatchFsGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { - // "Done" flag is valid for exporting MRT - auto pExport = m_pFragColorExport->Run(pOutput, location, pInsertPos); - if (pExport != nullptr) + Type* pOutputTy = pOutput->getType(); + + const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 16) || (bitWidth == 32)); + + auto pCompTy = pOutputTy->isVectorTy() ? pOutputTy->getVectorElementType() : pOutputTy; + uint32_t compCount = pOutputTy->isVectorTy() ? pOutputTy->getVectorNumElements() : 1; + + std::vector outputComps; + for (uint32_t i = 0; i < compCount; ++i) + { + Value* pOutputComp = nullptr; + if (compCount == 1) + { + pOutputComp = pOutput; + } + else + { + pOutputComp = ExtractElementInst::Create(pOutput, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + + // Translate components of exported output values to i32 values + if (pCompTy->isFloatingPointTy()) + { + if (bitWidth == 16) + { + pOutputComp = new BitCastInst(pOutputComp, m_pContext->Int16Ty(), "", pInsertPos); + pOutputComp = new ZExtInst(pOutputComp, m_pContext->Int32Ty(), "", pInsertPos); + } + else + { + LLPC_ASSERT(bitWidth == 32); + pOutputComp = new BitCastInst(pOutputComp, m_pContext->Int32Ty(), "", pInsertPos); + } + } + else if (pCompTy->isIntegerTy()) + { + if (bitWidth == 16) + { + pOutputComp = new ZExtInst(pOutputComp, m_pContext->Int32Ty(), "", pInsertPos); + } + else + { + LLPC_ASSERT(bitWidth == 32); + } + } + + outputComps.push_back(pOutputComp); + } + + LLPC_ASSERT(location < MaxColorTargets); + auto& expFragColor = m_expFragColors[location]; + + while (compIdx + compCount > expFragColor.size()) + { + expFragColor.push_back(UndefValue::get(m_pContext->Int32Ty())); + } + + for (uint32_t i = 0; i < compCount; ++i) { - m_pLastExport = cast(pExport); + expFragColor[compIdx + i] = outputComps[i]; } } @@ -1980,7 +2150,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( const uint32_t loc = builtInInLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTcsInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); break; } @@ -1991,7 +2161,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( const uint32_t loc = builtInInLocMap[builtInId]; auto pLdsOffset = 
CalcLdsOffsetForTcsInput(pInputTy, loc, nullptr, nullptr, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); break; } @@ -2012,7 +2182,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTcsInput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(false, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2022,7 +2192,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( else { auto pLdsOffset = CalcLdsOffsetForTcsInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } break; @@ -2079,7 +2249,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( const uint32_t loc = builtInInLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); break; } @@ -2090,7 +2260,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( const uint32_t loc = builtInInLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, loc, nullptr, nullptr, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); break; } @@ -2111,7 +2281,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTesInput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(false, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2121,7 +2291,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( else { auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } break; @@ -2179,7 +2349,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTesInput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(false, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); pInput = InsertValueInst::Create(pInput, pElem, idxs, "", pInsertPos); @@ -2188,7 +2358,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( else { auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } break; @@ -2225,11 +2395,11 @@ Value* PatchInOutImportExport::PatchGsBuiltInInputImport( case BuiltInPosition: case BuiltInPointSize: { - pInput = LoadValueFromEsGsRingBuffer(pInputTy, - loc, - 0, - pVertexIdx, - pInsertPos); + pInput = LoadValueFromEsGsRing(pInputTy, 
+ loc, + 0, + pVertexIdx, + pInsertPos); break; } case BuiltInClipDistance: @@ -2237,11 +2407,11 @@ Value* PatchInOutImportExport::PatchGsBuiltInInputImport( pInput = UndefValue::get(pInputTy); for (uint32_t i = 0; i < builtInUsage.clipDistanceIn; ++i) { - auto pComp = LoadValueFromEsGsRingBuffer(pInputTy->getArrayElementType(), - loc + i / 4, - i % 4, - pVertexIdx, - pInsertPos); + auto pComp = LoadValueFromEsGsRing(pInputTy->getArrayElementType(), + loc + i / 4, + i % 4, + pVertexIdx, + pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2254,11 +2424,11 @@ Value* PatchInOutImportExport::PatchGsBuiltInInputImport( pInput = UndefValue::get(pInputTy); for (uint32_t i = 0; i < builtInUsage.cullDistanceIn; ++i) { - auto pComp = LoadValueFromEsGsRingBuffer(pInputTy->getArrayElementType(), - loc + i / 4, - i % 4, - pVertexIdx, - pInsertPos); + auto pComp = LoadValueFromEsGsRing(pInputTy->getArrayElementType(), + loc + i / 4, + i % 4, + pVertexIdx, + pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2704,7 +2874,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( uint32_t loc = builtInOutLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pOutput = ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + pOutput = ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); break; } @@ -2717,7 +2887,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( uint32_t loc = builtInOutLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, loc, nullptr, nullptr, pVertexIdx, pInsertPos); - pOutput = ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + pOutput = ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); break; } @@ -2748,7 +2918,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTcsOutput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(true, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2758,7 +2928,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( else { auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pOutput = ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + pOutput = ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); } break; @@ -2791,7 +2961,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTcsOutput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(true, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2801,7 +2971,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( else { auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pOutput = ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + pOutput = ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); } break; @@ -2829,8 +2999,6 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( auto& builtInUsage = pResUsage->builtInUsage.vs; auto& builtInOutLocMap = pResUsage->inOutUsage.builtInOutputLocMap; - const auto pUndef = UndefValue::get(m_pContext->FloatTy()); - std::vector args; 
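Note on the ring-addressing pattern used throughout these hunks: a location holds four 32-bit channels, element i of a multi-component value lands at location + (compIdx + i) / 4, channel (compIdx + i) % 4, and a 64-bit component doubles compIdx first. A sketch of that mapping (helper name is illustrative):

#include <cassert>
#include <cstdint>
#include <utility>

// Maps (location, compIdx, element i) to a (location, channel) slot -- the
// arithmetic behind "loc + i / 4, i % 4" and "compIdx *= 2" in the hunks above.
static std::pair<uint32_t, uint32_t> ChannelSlot(uint32_t location,
                                                 uint32_t compIdx,
                                                 uint32_t elemIdx,
                                                 uint32_t bitWidth)
{
    assert((bitWidth == 32) || (bitWidth == 64));
    if (bitWidth == 64)
    {
        compIdx *= 2; // A 64-bit component occupies two consecutive 32-bit channels
    }
    const uint32_t flat = compIdx + elemIdx;
    return { location + flat / 4, flat % 4 }; // Four channels per location
}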
switch (builtInId) @@ -2846,7 +3014,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( if (m_hasTs) { uint32_t loc = builtInOutLocMap[builtInId]; - auto pLdsOffset = CalcLdsOffsetForVsOutput(loc, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy, loc, 0, pInsertPos); WriteValueToLds(pOutput, pLdsOffset, pInsertPos); } else @@ -2856,7 +3024,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( LLPC_ASSERT(builtInOutLocMap.find(builtInId) != builtInOutLocMap.end()); uint32_t loc = builtInOutLocMap[builtInId]; - StoreValueToEsGsRingBuffer(pOutput, loc, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, loc, 0, pInsertPos); } else { @@ -2884,7 +3052,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( if (m_hasTs) { uint32_t loc = builtInOutLocMap[builtInId]; - auto pLdsOffset = CalcLdsOffsetForVsOutput(loc, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy, loc, 0, pInsertPos); WriteValueToLds(pOutput, pLdsOffset, pInsertPos); } else @@ -2894,7 +3062,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( LLPC_ASSERT(builtInOutLocMap.find(builtInId) != builtInOutLocMap.end()); uint32_t loc = builtInOutLocMap[builtInId]; - StoreValueToEsGsRingBuffer(pOutput, loc, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, loc, 0, pInsertPos); } else { @@ -2924,7 +3092,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( LLPC_ASSERT(pOutputTy->isArrayTy()); uint32_t loc = builtInOutLocMap[builtInId]; - auto pLdsOffset = CalcLdsOffsetForVsOutput(loc, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy->getArrayElementType(), loc, 0, pInsertPos); for (int i = 0; i < pOutputTy->getArrayNumElements(); ++i) { @@ -2952,7 +3120,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( std::vector idxs; idxs.push_back(i); auto pElem = ExtractValueInst::Create(pOutput, idxs, "", pInsertPos); - StoreValueToEsGsRingBuffer(pElem, loc + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pElem, loc + i / 4, i % 4, pInsertPos); } } else @@ -2984,7 +3152,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( LLPC_ASSERT(pOutputTy->isArrayTy()); uint32_t loc = builtInOutLocMap[builtInId]; - auto pLdsOffset = CalcLdsOffsetForVsOutput(loc, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy->getArrayElementType(), loc, 0, pInsertPos); for (int i = 0; i < pOutputTy->getArrayNumElements(); ++i) { @@ -3012,7 +3180,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( std::vector idxs; idxs.push_back(i); auto pElem = ExtractValueInst::Create(pOutput, idxs, "", pInsertPos); - StoreValueToEsGsRingBuffer(pElem, loc + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pElem, loc + i / 4, i % 4, pInsertPos); } } else @@ -3367,7 +3535,7 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( LLPC_ASSERT(builtInOutLocMap.find(builtInId) != builtInOutLocMap.end()); uint32_t loc = builtInOutLocMap[builtInId]; - StoreValueToEsGsRingBuffer(pOutput, loc, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, loc, 0, pInsertPos); } else { @@ -3396,7 +3564,7 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( LLPC_ASSERT(builtInOutLocMap.find(builtInId) != builtInOutLocMap.end()); uint32_t loc = builtInOutLocMap[builtInId]; - StoreValueToEsGsRingBuffer(pOutput, loc, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, loc, 0, pInsertPos); } else { @@ -3430,7 +3598,7 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( std::vector idxs; idxs.push_back(i); auto pElem = ExtractValueInst::Create(pOutput, idxs, "", 
pInsertPos); - StoreValueToEsGsRingBuffer(pElem, loc + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pElem, loc + i / 4, i % 4, pInsertPos); } } else @@ -3466,7 +3634,7 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( std::vector idxs; idxs.push_back(i); auto pElem = ExtractValueInst::Create(pOutput, idxs, "", pInsertPos); - StoreValueToEsGsRingBuffer(pElem, loc + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pElem, loc + i / 4, i % 4, pInsertPos); } } else @@ -3773,7 +3941,7 @@ void PatchInOutImportExport::PatchCopyShaderGenericOutputExport( uint32_t location, // Location of the output Instruction* pInsertPos) // [in] Where to insert the patch instruction { - AddExportInstForGenericOutput(pOutput, location, pInsertPos); + AddExportInstForGenericOutput(pOutput, location, 0, pInsertPos); } // ===================================================================================================================== @@ -3850,8 +4018,8 @@ void PatchInOutImportExport::PatchCopyShaderBuiltInOutputExport( } // ===================================================================================================================== -// Stores value to ES-GS ring buffer. -void PatchInOutImportExport::StoreValueToEsGsRingBuffer( +// Stores value to ES-GS ring (buffer or LDS). +void PatchInOutImportExport::StoreValueToEsGsRing( Value* pStoreValue, // [in] Value to store uint32_t location, // Output location uint32_t compIdx, // Output component index @@ -3871,7 +4039,7 @@ void PatchInOutImportExport::StoreValueToEsGsRingBuffer( "", pInsertPos); - StoreValueToEsGsRingBuffer(pStoreComp, location + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pStoreComp, location + i / 4, i % 4, pInsertPos); } } else @@ -3886,7 +4054,7 @@ void PatchInOutImportExport::StoreValueToEsGsRingBuffer( LLPC_ASSERT(pStoreTy->isIntegerTy()); } - // Call buffer store intrinsic + // Call buffer store intrinsic or LDS store const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage; LLPC_ASSERT(inOutUsage.pEsGsRingBufDesc != nullptr); @@ -3905,28 +4073,40 @@ void PatchInOutImportExport::StoreValueToEsGsRingBuffer( pRingBufDesc = inOutUsage.pEsGsRingBufDesc; } - auto pRingBufOffset = CalcEsGsRingBufferOffsetForOutput(location, compIdx, pInsertPos); + auto pRingOffset = CalcEsGsRingOffsetForOutput(location, compIdx, pEsGsOffset, pInsertPos); - // NOTE: Here we use tbuffer_store instruction instead of buffer_store because we have to do explicit control - // of soffset. This is required by swizzle enabled mode when address range checking should be complied with. 
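Note: the hunk continuing below splits StoreValueToEsGsRing into a GS-on-chip path (a plain LDS store through a GEP into m_pLds) and the existing off-chip tbuffer_store path, which keeps explicit soffset control for swizzled addressing. A compact model of that branch (LDS modeled as a plain array; this is not the LLVM-emitting code):

#include <cstdint>

// Models the store-side branch: on-chip rings live in LDS and take a direct
// dword store; off-chip rings go through the tbuffer_store intrinsic.
static void StoreToEsGsRingModel(uint32_t value, uint32_t ringOffset,
                                 bool gsOnChip, uint32_t* pLdsModel)
{
    if (gsOnChip)
    {
        pLdsModel[ringOffset] = value; // GEP into the LDS global + StoreInst
    }
    else
    {
        // llvm.amdgcn.tbuffer.store.i32 with glc/slc set and the ES-GS offset
        // passed in soffset -- see the operand list in the hunk below.
        (void)value;
        (void)ringOffset;
    }
}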
- std::vector args; - args.push_back(pStoreValue); // vdata - args.push_back(pRingBufDesc); // rsrc - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // voffset - args.push_back(pEsGsOffset); // soffset - args.push_back(pRingBufOffset); // offset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_UINT)); // nfmt - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.i32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + if (m_pContext->IsGsOnChip()) + { + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + idxs.push_back(pRingOffset); + + Value* pStorePtr = GetElementPtrInst::Create(nullptr, m_pLds, idxs, "", pInsertPos); + new StoreInst(pStoreValue, pStorePtr, false, m_pLds->getAlignment(), pInsertPos); + } + else + { + // NOTE: Here we use tbuffer_store instruction instead of buffer_store because we have to do explicit control + // of soffset. This is required by swizzle enabled mode when address range checking should be complied with. + std::vector args; + args.push_back(pStoreValue); // vdata + args.push_back(pRingBufDesc); // rsrc + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // voffset + args.push_back(pEsGsOffset); // soffset + args.push_back(pRingOffset); // offset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_UINT)); // nfmt + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc + EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.i32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + } } } // ===================================================================================================================== -// Loads value from ES-GS ring buffer. -Value* PatchInOutImportExport::LoadValueFromEsGsRingBuffer( +// Loads value from ES-GS ring (buffer or LDS). 
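Note on the load side (next hunk): the on-chip path reads one dword from LDS and bitcasts it back to the requested type; vectors recurse per component. A plain model of the scalar case:

#include <cstdint>
#include <cstring>

// Models the on-chip scalar load: LDS holds raw 32-bit lanes, and a float
// result is recovered with a bitcast (memcpy stands in for BitCastInst).
static float LoadF32FromLdsModel(const uint32_t* pLdsModel, uint32_t ringOffset)
{
    const uint32_t bits = pLdsModel[ringOffset];
    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}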
+Value* PatchInOutImportExport::LoadValueFromEsGsRing( Type* pLoadTy, // [in] Load value type uint32_t location, // Input location uint32_t compIdx, // Input component index @@ -3951,56 +4131,55 @@ Value* PatchInOutImportExport::LoadValueFromEsGsRingBuffer( for (uint32_t i = compIdx; i < compCount; ++i) { - auto pRingBufOffset = CalcEsGsRingBufferOffsetForInput(location + i / 4, - i % 4, - pVertexIdx, - pInsertPos); + auto pLoadCompValue = LoadValueFromEsGsRing(pCompTy, + location + i / 4, + i % 4, + pVertexIdx, + pInsertPos); + pLoadValue = InsertElementInst::Create(pLoadValue, + pLoadCompValue, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + } + else + { + Value* pRingOffset = CalcEsGsRingOffsetForInput(location, compIdx, pVertexIdx, pInsertPos); + if (m_pContext->IsGsOnChip()) + { + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + idxs.push_back(pRingOffset); + + Value* pLoadPtr = GetElementPtrInst::Create(nullptr, m_pLds, idxs, "", pInsertPos); + pLoadValue = new LoadInst(pLoadPtr, "", false, m_pLds->getAlignment(), pInsertPos); + + if (pLoadTy->isFloatTy()) + { + pLoadValue = BitCastInst::Create(Instruction::BitCast, pLoadValue, pLoadTy, "", pInsertPos); + } + } + else + { std::vector args; args.push_back(inOutUsage.pEsGsRingBufDesc); args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); - args.push_back(pRingBufOffset); - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(pRingOffset); + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - auto pComp = EmitCall(m_pModule, + pLoadValue = EmitCall(m_pModule, "llvm.amdgcn.buffer.load.f32", m_pContext->FloatTy(), args, NoAttrib, pInsertPos); - if (pCompTy->isIntegerTy()) + if (pLoadTy->isIntegerTy()) { - pComp = BitCastInst::Create(Instruction::BitCast, pComp, pCompTy, "", pInsertPos); + pLoadValue = BitCastInst::Create(Instruction::BitCast, pLoadValue, pLoadTy, "", pInsertPos); } - - pLoadValue = InsertElementInst::Create(pLoadValue, - pComp, - ConstantInt::get(m_pContext->Int32Ty(), i), - "", - pInsertPos); - } - } - else - { - auto pRingBufOffset = CalcEsGsRingBufferOffsetForInput(location, compIdx, pVertexIdx, pInsertPos); - - std::vector args; - args.push_back(inOutUsage.pEsGsRingBufDesc); - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); - args.push_back(pRingBufOffset); - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - pLoadValue = EmitCall(m_pModule, - "llvm.amdgcn.buffer.load.f32", - m_pContext->FloatTy(), - args, - NoAttrib, - pInsertPos); - - if (pLoadTy->isIntegerTy()) - { - pLoadValue = BitCastInst::Create(Instruction::BitCast, pLoadValue, pLoadTy, "", pInsertPos); } } @@ -4039,96 +4218,209 @@ void PatchInOutImportExport::StoreValueToGsVsRingBuffer( auto pEmitCounter = new LoadInst(inOutUsage.gs.pEmitCounterPtr, "", pInsertPos); - auto pRingBufOffset = CalcGsVsRingBufferOffsetForOutput(location, compIdx, pEmitCounter, pInsertPos); + auto pRingOffset = CalcGsVsRingOffsetForOutput(location, compIdx, pEmitCounter, pGsVsOffset, pInsertPos); - // NOTE: Here we use tbuffer_store instruction instead of buffer_store because we have to do explicit - // control of soffset. This is required by swizzle enabled mode when address range checking should be - // complied with. 
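Note: StoreValueToGsVsRingBuffer above addresses the ring with the current EMIT counter as the vertex index, so successive EmitVertex() calls (which presumably bump the counter elsewhere) land in successive per-vertex slots. A minimal model of that assumed relationship (names illustrative):

#include <cstdint>

// Models the emit-counter/vertex-slot relationship assumed by the store above.
struct GsEmitModel
{
    uint32_t emitCounter = 0; // Stands in for the value loaded from pEmitCounterPtr

    // Vertex index used when storing an output of the vertex being built
    uint32_t VertexIdxForStore() const { return emitCounter; }

    // Each emitted vertex advances to the next per-vertex slot
    void EmitVertex() { ++emitCounter; }
};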
- std::vector args; - args.push_back(pStoreValue); // vdata - args.push_back(inOutUsage.gs.pGsVsRingBufDesc); // rsrc - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex - args.push_back(pRingBufOffset); // voffset - args.push_back(pGsVsOffset); // soffset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // offset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_UINT)); // nfmt - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.i32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + if (m_pContext->IsGsOnChip()) + { + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + idxs.push_back(pRingOffset); + + Value* pStorePtr = GetElementPtrInst::Create(nullptr, m_pLds, idxs, "", pInsertPos); + new StoreInst(pStoreValue, pStorePtr, false, m_pLds->getAlignment(), pInsertPos); + } + else + { + // NOTE: Here we use tbuffer_store instruction instead of buffer_store because we have to do explicit + // control of soffset. This is required by swizzle enabled mode when address range checking should be + // complied with. + std::vector args; + args.push_back(pStoreValue); // vdata + args.push_back(inOutUsage.gs.pGsVsRingBufDesc); // rsrc + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex + args.push_back(pRingOffset); // voffset + args.push_back(pGsVsOffset); // soffset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // offset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_UINT)); // nfmt + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc + EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.i32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + } } // ===================================================================================================================== -// Calculates the byte offset to store the output value to ES-GS ring buffer based on the specified output info. -Value* PatchInOutImportExport::CalcEsGsRingBufferOffsetForOutput( +// Calculates the byte offset to store the output value to ES-GS ring based on the specified output info. 
+Value* PatchInOutImportExport::CalcEsGsRingOffsetForOutput( uint32_t location, // Output location uint32_t compIdx, // Output component index + Value* pEsGsOffset, // [in] ES-GS ring offset in bytes Instruction* pInsertPos) // [in] Where to insert the instruction { - return ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx) * 4); + Value* pRingOffset = nullptr; + if (m_pContext->IsGsOnChip()) + { + // ringOffset = esGsOffset + threadId * esGsRingItemSize + location * 4 + compIdx + const auto pResUsage = m_pContext->GetShaderResourceUsage(m_shaderStage); + const auto& inOutUsage = pResUsage->inOutUsage; + uint32_t esGsRingItemSize = inOutUsage.outputMapLocCount * 4; + pEsGsOffset = BinaryOperator::CreateExact(Instruction::LShr, + pEsGsOffset, + ConstantInt::get(m_pContext->Int32Ty(), 2), + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateMul(m_pThreadId, + ConstantInt::get(m_pContext->Int32Ty(), esGsRingItemSize), + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, pEsGsOffset, "", pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx)), + "", + pInsertPos); + } + else + { + // ringOffset = (location * 4 + compIdx) * 4 + pRingOffset = ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx) * 4); + } + return pRingOffset; } // ===================================================================================================================== -// Calculates the byte offset to load the input value from ES-GS ring buffer based on the specified input info. -Value* PatchInOutImportExport::CalcEsGsRingBufferOffsetForInput( +// Calculates the byte offset to load the input value from ES-GS ring based on the specified input info. 
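Note the unit change in CalcEsGsRingOffsetForOutput above: the on-chip result is a dword index into LDS (esGsOffset arrives in bytes, hence the exact shift right by 2), while the off-chip result stays a byte offset for the buffer intrinsic. The two formulas in plain arithmetic:

#include <cstdint>

// On-chip: dword offset = esGsOffset / 4 + threadId * itemSize + location * 4 + compIdx
static uint32_t EsGsOutOffsetOnChip(uint32_t esGsOffsetBytes, uint32_t threadId,
                                    uint32_t esGsRingItemSize, // outputMapLocCount * 4 dwords
                                    uint32_t location, uint32_t compIdx)
{
    return (esGsOffsetBytes >> 2) + (threadId * esGsRingItemSize) + (location * 4 + compIdx);
}

// Off-chip: byte offset = (location * 4 + compIdx) * 4
static uint32_t EsGsOutOffsetOffChip(uint32_t location, uint32_t compIdx)
{
    return (location * 4 + compIdx) * 4;
}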
+Value* PatchInOutImportExport::CalcEsGsRingOffsetForInput( uint32_t location, // Input location uint32_t compIdx, // Input Component index Value* pVertexIdx, // [in] Vertex index Instruction* pInsertPos) // [in] Where to insert the instruction { - const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage; - LLPC_ASSERT(inOutUsage.gs.pEsGsOffsets != nullptr); + Value* pRingOffset = nullptr; + if (m_pContext->IsGsOnChip()) + { + const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage; + LLPC_ASSERT(inOutUsage.gs.pEsGsOffsets != nullptr); - Value* pVertexOffset = ExtractElementInst::Create(inOutUsage.gs.pEsGsOffsets, - pVertexIdx, - "", - pInsertPos); + Value* pVertexOffset = ExtractElementInst::Create(inOutUsage.gs.pEsGsOffsets, + pVertexIdx, + "", + pInsertPos); - // byteOffset = vertexOffset[N] * 4 + (location * 4 + compIdx) * 64 * 4; - auto pRingBufOffset = BinaryOperator::CreateMul(pVertexOffset, - ConstantInt::get(m_pContext->Int32Ty(), 4), - "", - pInsertPos); + // ringOffset = vertexOffset[N] + (location * 4 + compIdx); + pRingOffset = + BinaryOperator::CreateAdd(pVertexOffset, + ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx)), + "", + pInsertPos); + } + else + { + const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage; + LLPC_ASSERT(inOutUsage.gs.pEsGsOffsets != nullptr); - pRingBufOffset = - BinaryOperator::CreateAdd(pRingBufOffset, - ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx) * 64 * 4), - "", - pInsertPos); + Value* pVertexOffset = ExtractElementInst::Create(inOutUsage.gs.pEsGsOffsets, + pVertexIdx, + "", + pInsertPos); + + // ringOffset = vertexOffset[N] * 4 + (location * 4 + compIdx) * 64 * 4; + pRingOffset = BinaryOperator::CreateMul(pVertexOffset, + ConstantInt::get(m_pContext->Int32Ty(), 4), + "", + pInsertPos); - return pRingBufOffset; + pRingOffset = + BinaryOperator::CreateAdd(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx) * 64 * 4), + "", + pInsertPos); + } + + return pRingOffset; } // ===================================================================================================================== -// Calculates the byte offset to store the output value to GS-VS ring buffer based on the specified output info. -Value* PatchInOutImportExport::CalcGsVsRingBufferOffsetForOutput( +// Calculates the offset to store the output value to GS-VS ring based on the specified output info. 
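Note on CalcEsGsRingOffsetForInput above: on-chip, vertexOffset[N] is already a dword base and the attribute sits directly behind it; off-chip, each attribute channel appears to be interleaved across the 64 lanes of a wave, which is where the * 64 * 4 scaling comes from. In plain arithmetic:

#include <cstdint>

// On-chip: dword offset = vertexOffset + location * 4 + compIdx
static uint32_t EsGsInOffsetOnChip(uint32_t vertexOffset, uint32_t location, uint32_t compIdx)
{
    return vertexOffset + (location * 4 + compIdx);
}

// Off-chip: byte offset = vertexOffset * 4 + (location * 4 + compIdx) * 64 * 4
// (the 64 interleaves one channel across what appears to be a 64-lane wave)
static uint32_t EsGsInOffsetOffChip(uint32_t vertexOffset, uint32_t location, uint32_t compIdx)
{
    return (vertexOffset * 4) + ((location * 4 + compIdx) * 64 * 4);
}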
+Value* PatchInOutImportExport::CalcGsVsRingOffsetForOutput( uint32_t location, // Output location uint32_t compIdx, // Output component Value* pVertexIdx, // [in] Vertex index + Value* pGsVsOffset, // [in] ES-GS ring offset in bytes Instruction* pInsertPos) // [in] Where to insert the instruction { auto pResUsage = m_pContext->GetShaderResourceUsage(ShaderStageGeometry); - uint32_t outputVertices = pResUsage->builtInUsage.gs.outputVertices; + Value* pRingOffset = nullptr; + if (m_pContext->IsGsOnChip()) + { + // ringOffset = esGsLdsSize + + // gsVsOffset + + // threadId * gsVsRingItemSize + + // (vertexIdx * vertexSize) + location * 4 + compIdx - // byteOffset = ((location * 4 + compIdx) * maxVertices + vertexIdx) * 4; - auto pRingBufOffset = BinaryOperator::CreateAdd(ConstantInt::get(m_pContext->Int32Ty(), - (location * 4 + compIdx) * outputVertices), - pVertexIdx, - "", - pInsertPos); + uint32_t gsVsRingItemSize = 4 * + pResUsage->inOutUsage.outputMapLocCount * + pResUsage->builtInUsage.gs.outputVertices; - pRingBufOffset = BinaryOperator::CreateMul(pRingBufOffset, - ConstantInt::get(m_pContext->Int32Ty(), 4), - "", - pInsertPos); + auto pEsGsLdsSize = ConstantInt::get(m_pContext->Int32Ty(), pResUsage->inOutUsage.gs.esGsLdsSize); + + pGsVsOffset = BinaryOperator::CreateExact(Instruction::LShr, + pGsVsOffset, + ConstantInt::get(m_pContext->Int32Ty(), 2), + "", + pInsertPos); + + auto pRingItemOffset = BinaryOperator::CreateMul(m_pThreadId, + ConstantInt::get(m_pContext->Int32Ty(), gsVsRingItemSize), + "", + pInsertPos); + + uint32_t vertexSize = pResUsage->inOutUsage.outputMapLocCount * 4; + auto pVertexItemOffset = BinaryOperator::CreateMul(pVertexIdx, + ConstantInt::get(m_pContext->Int32Ty(), vertexSize), + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pEsGsLdsSize, pGsVsOffset, "", pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, pRingItemOffset, "", pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, pVertexItemOffset, "", pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), (location * 4) + compIdx), + "", + pInsertPos); + + } + else + { + // ringOffset = ((location * 4 + compIdx) * maxVertices + vertexIdx) * 4; + + uint32_t outputVertices = pResUsage->builtInUsage.gs.outputVertices; + + pRingOffset = BinaryOperator::CreateAdd(ConstantInt::get(m_pContext->Int32Ty(), + (location * 4 + compIdx) * outputVertices), + pVertexIdx, + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateMul(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), 4), + "", + pInsertPos); + } - return pRingBufOffset; + return pRingOffset; } // ===================================================================================================================== // Reads value from LDS. 
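Note on CalcGsVsRingOffsetForOutput above: on-chip, the GS-VS region sits in LDS behind the ES-GS region (esGsLdsSize), addressed in dwords per thread and per emitted vertex; off-chip, the layout is attribute-major across the maximum vertex count, in bytes. Both formulas as plain arithmetic:

#include <cstdint>

// On-chip dword offset, matching the comment in the hunk above:
// esGsLdsSize + gsVsOffset / 4 + threadId * itemSize + vertexIdx * vertexSize
//             + location * 4 + compIdx
static uint32_t GsVsOutOffsetOnChip(uint32_t esGsLdsSize, uint32_t gsVsOffsetBytes,
                                    uint32_t threadId, uint32_t gsVsRingItemSize,
                                    uint32_t vertexIdx, uint32_t vertexSize,
                                    uint32_t location, uint32_t compIdx)
{
    return esGsLdsSize + (gsVsOffsetBytes >> 2) + (threadId * gsVsRingItemSize) +
           (vertexIdx * vertexSize) + (location * 4 + compIdx);
}

// Off-chip byte offset: ((location * 4 + compIdx) * maxVertices + vertexIdx) * 4
static uint32_t GsVsOutOffsetOffChip(uint32_t location, uint32_t compIdx,
                                     uint32_t maxVertices, uint32_t vertexIdx)
{
    return ((location * 4 + compIdx) * maxVertices + vertexIdx) * 4;
}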
Value* PatchInOutImportExport::ReadValueFromLds( + bool isOutput, // is the value from output variable Type* pReadTy, // [in] Type of value read from LDS Value* pLdsOffset, // [in] Start offset to do LDS read operations Instruction* pInsertPos) // [in] Where to insert read instructions @@ -4143,12 +4435,21 @@ Value* PatchInOutImportExport::ReadValueFromLds( std::vector loadValues(numChannels); - if (m_pContext->IsTessOffChip() && m_shaderStage == ShaderStageTessEval) // Read from off-chip LDS buffer + const bool isTcsOutput = (isOutput && (m_shaderStage == ShaderStageTessControl)); + const bool isTesInput = ((isOutput == false) && (m_shaderStage == ShaderStageTessEval)); + + if (m_pContext->IsTessOffChip() && (isTcsOutput || isTesInput)) // Read from off-chip LDS buffer { - auto& entryArgIdxs = m_pContext->GetShaderInterfaceData(m_shaderStage)->entryArgIdxs.tes; - const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage.tes; + const auto& offChipLdsBase = (m_shaderStage == ShaderStageTessEval) ? + m_pContext->GetShaderInterfaceData(m_shaderStage)->entryArgIdxs.tes.offChipLdsBase : + m_pContext->GetShaderInterfaceData(m_shaderStage)->entryArgIdxs.tcs.offChipLdsBase; + + const auto& pOffChipLdsDesc = (m_shaderStage == ShaderStageTessEval) ? + m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage.tes.pOffChipLdsDesc : + m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage.tcs.pOffChipLdsDesc; + + auto pOffChipLdsBase = GetFunctionArgument(m_pEntryPoint, offChipLdsBase); - auto pOcldsBufferBase = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.offChipLdsBase); // Convert DWORD off-chip LDS offset to byte offset pLdsOffset = BinaryOperator::CreateMul(pLdsOffset, ConstantInt::get(m_pContext->Int32Ty(), 4), @@ -4158,10 +4459,10 @@ Value* PatchInOutImportExport::ReadValueFromLds( for (uint32_t i = 0; i < numChannels; ++i) { std::vector args; - args.push_back(inOutUsage.pOffChipLdsDesc); // rsrc + args.push_back(pOffChipLdsDesc); // rsrc args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex args.push_back(pLdsOffset); // voffset - args.push_back(pOcldsBufferBase); // soffset + args.push_back(pOffChipLdsBase); // soffset args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i * 4)); // offset args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_FLOAT)); // nfmt @@ -4455,39 +4756,29 @@ void PatchInOutImportExport::StoreTessFactorToBuffer( ConstantInt::get(m_pContext->Int32Ty(), 4), "", pInsertPos); - pTfBufferOffset = BinaryOperator::CreateAdd(pTfBufferOffset, - ConstantInt::get(m_pContext->Int32Ty(), tessFactorOffset * 4), - "", - pInsertPos); - if (m_pContext->IsTessOffChip()) + for (uint32_t i = 0; i < tessFactors.size(); ++i) { - // NOTE: GFX9 does not support dynamic tessellation control, so additional 4-byte offset is not required for - // tessellation off-chip mode. - const auto gfxIp = m_pContext->GetGfxIpVersion(); - if (gfxIp.major != 9) + uint32_t tessFactorByteOffset = i * 4 + tessFactorOffset * 4; + if (m_pContext->GetGfxIpVersion().major != 9) { - pTfBufferOffset = BinaryOperator::CreateAdd(pTfBufferOffset, - ConstantInt::get(m_pContext->Int32Ty(), 4), - "", - pInsertPos); + // NOTE: GFX9 does not support dynamic tessellation control, so additional 4-byte offset is not required for + // tessellation off-chip mode. + tessFactorByteOffset += (m_pContext->IsTessOffChip() ? 
4 : 0); } - } - for (uint32_t i = 0; i < tessFactors.size(); ++i) - { std::vector args; - args.push_back(tessFactors[i]); // vdata - args.push_back(inOutUsage.pTessFactorBufDesc); // rsrc - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex - args.push_back(pTfBufferOffset); // voffset - args.push_back(pTfBufferBase); // soffset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i * 4)); // offset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_FLOAT)); // nfmt - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // slc + args.push_back(tessFactors[i]); // vdata + args.push_back(inOutUsage.pTessFactorBufDesc); // rsrc + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex + args.push_back(pTfBufferOffset); // voffset + args.push_back(pTfBufferBase); // soffset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), tessFactorByteOffset)); // offset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_FLOAT)); // nfmt + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // slc EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.f32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); } @@ -4627,23 +4918,40 @@ void PatchInOutImportExport::CreateTessBufferStoreFunction() // ===================================================================================================================== // Calculates the DWORD offset to write value to LDS based on the specified VS output info. 
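Note on the simplified StoreTessFactorToBuffer above: the per-factor byte offset is now folded into the intrinsic's immediate offset, and the extra leading dword of the off-chip buffer is skipped only on pre-GFX9 parts (per the NOTE, GFX9 drops dynamic tessellation control). The offset rule in isolation:

#include <cstdint>

// Byte offset for tess factor element i, per the loop in the hunk above.
static uint32_t TessFactorByteOffset(uint32_t i, uint32_t tessFactorOffset,
                                     bool isTessOffChip, uint32_t gfxIpMajor)
{
    uint32_t byteOffset = (i * 4) + (tessFactorOffset * 4);
    if ((gfxIpMajor != 9) && isTessOffChip)
    {
        byteOffset += 4; // Skip the leading dword tied to dynamic tessellation control
    }
    return byteOffset;
}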
Value* PatchInOutImportExport::CalcLdsOffsetForVsOutput( + Type* pOutputTy, // [in] Type of the output uint32_t location, // Base location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert calculation instructions { LLPC_ASSERT(m_shaderStage == ShaderStageVertex); + // attribOffset = location * 4 + compIdx + Value* pAttribOffset = ConstantInt::get(m_pContext->Int32Ty(), location * 4); + + const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); + + if (bitWidth == 64) + { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + } + + pAttribOffset = BinaryOperator::CreateAdd(pAttribOffset, + ConstantInt::get(m_pContext->Int32Ty(), compIdx), + "", + pInsertPos); + const auto& entryArgIdxs = m_pContext->GetShaderInterfaceData(ShaderStageVertex)->entryArgIdxs.vs; auto pRelVertexId = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.relVertexId); const auto& calcFactor = m_pContext->GetShaderResourceUsage(ShaderStageTessControl)->inOutUsage.tcs.calcFactor; auto pVertexStride = ConstantInt::get(m_pContext->Int32Ty(), calcFactor.inVertexStride); - // dwordOffset = relVertexId * vertexStride + location * 4 + // dwordOffset = relVertexId * vertexStride + attribOffset auto pLdsOffset = BinaryOperator::CreateMul(pRelVertexId, pVertexStride, "", pInsertPos); - pLdsOffset = BinaryOperator::CreateAdd(pLdsOffset, - ConstantInt::get(m_pContext->Int32Ty(), location * 4), - "", - pInsertPos); + pLdsOffset = BinaryOperator::CreateAdd(pLdsOffset, pAttribOffset, "", pInsertPos); + return pLdsOffset; } @@ -4930,6 +5238,7 @@ uint32_t PatchInOutImportExport::CalcPatchCountPerThreadGroup( void PatchInOutImportExport::AddExportInstForGenericOutput( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the "exp" instruction { // Check if the shader stage is valid to use "exp" instruction to export output @@ -4945,10 +5254,12 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( const uint32_t compCount = pOutputTy->isVectorTy() ? pOutputTy->getVectorNumElements() : 1; const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); // Convert the output value to floating-point export value Value* pExport = nullptr; - const uint32_t numChannels = (bitWidth * compCount) / 32; + const uint32_t numChannels = (bitWidth == 64) ? compCount * 2 : compCount; + uint32_t startChannel = (bitWidth == 64) ? compIdx * 2 : compIdx; Type* pExportTy = (numChannels > 1) ? 
VectorType::get(m_pContext->FloatTy(), numChannels) : m_pContext->FloatTy(); if (pOutputTy != pExportTy) @@ -4961,32 +5272,48 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( pExport = pOutput; } + LLPC_ASSERT(numChannels <= 8); + Value* exportValues[8] = { nullptr }; + + if (numChannels == 1) + { + exportValues[0] = pExport; + } + else + { + for (uint32_t i = 0; i < numChannels; ++i) + { + exportValues[i] = ExtractElementInst::Create(pExport, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + } + std::vector args; if (numChannels <= 4) { + LLPC_ASSERT(startChannel + numChannels <= 4); + const uint32_t channelMask = ((1 << (startChannel + numChannels)) - 1) - ((1 << startChannel) - 1); + args.clear(); args.push_back(ConstantInt::get(m_pContext->Int32Ty(), EXP_TARGET_PARAM_0 + location)); // tgt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0xF)); // en + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), channelMask)); // en // src0 ~ src3 - if (numChannels == 1) + for (uint32_t i = 0; i < startChannel; ++i) { - args.push_back(pExport); + // Inactive components (dummy) + args.push_back(UndefValue::get(m_pContext->FloatTy())); } - else + + for (uint32_t i = startChannel; i < startChannel + numChannels; ++i) { - for (uint32_t i = 0; i < numChannels; ++i) - { - auto pCompValue = ExtractElementInst::Create(pExport, - ConstantInt::get(m_pContext->Int32Ty(), i), - "", - pInsertPos); - args.push_back(pCompValue); - } + args.push_back(exportValues[i - startChannel]); } - for (uint32_t i = numChannels; i < 4; ++i) + for (uint32_t i = startChannel + numChannels; i < 4; ++i) { // Inactive components (dummy) args.push_back(UndefValue::get(m_pContext->FloatTy())); @@ -5001,6 +5328,7 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( else { // We have to do exporting twice for this output + LLPC_ASSERT(startChannel == 0); // Other values are disallowed according to GLSL spec LLPC_ASSERT((numChannels == 6) || (numChannels == 8)); // Do the first exporting @@ -5011,31 +5339,26 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( // src0 ~ src3 for (uint32_t i = 0; i < 4; ++i) { - auto pCompValue = ExtractElementInst::Create(pExport, - ConstantInt::get(m_pContext->Int32Ty(), i), - "", - pInsertPos); - args.push_back(pCompValue); + args.push_back(exportValues[i]); } args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // done args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // vm EmitCall(m_pModule, "llvm.amdgcn.exp.f32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + ++inOutUsage.expCount; // Do the second exporting + const uint32_t channelMask = ((1 << (numChannels - 4)) - 1); + args.clear(); args.push_back(ConstantInt::get(m_pContext->Int32Ty(), EXP_TARGET_PARAM_0 + location + 1)); // tgt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0xF)); // en + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), channelMask)); // en // src0 ~ src3 for (uint32_t i = 4; i < numChannels; ++i) { - auto pCompValue = ExtractElementInst::Create(pExport, - ConstantInt::get(m_pContext->Int32Ty(), i), - "", - pInsertPos); - args.push_back(pCompValue); + args.push_back(exportValues[i]); } for (uint32_t i = numChannels; i < 8; ++i) @@ -5048,7 +5371,7 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // vm EmitCall(m_pModule, "llvm.amdgcn.exp.f32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); - inOutUsage.expCount += 2; + 
++inOutUsage.expCount; } } diff --git a/icd/api/llpc/patch/llpcPatchInOutImportExport.h b/icd/api/llpc/patch/llpcPatchInOutImportExport.h index 386f49bd..6e688637 100644 --- a/icd/api/llpc/patch/llpcPatchInOutImportExport.h +++ b/icd/api/llpc/patch/llpcPatchInOutImportExport.h @@ -50,6 +50,7 @@ class PatchInOutImportExport: public: PatchInOutImportExport(); virtual ~PatchInOutImportExport(); + virtual bool runOnModule(llvm::Module& module); virtual void visitCallInst(llvm::CallInst& callInst); virtual void visitReturnInst(llvm::ReturnInst& retInst); @@ -64,7 +65,10 @@ class PatchInOutImportExport: private: LLPC_DISALLOW_COPY_AND_ASSIGN(PatchInOutImportExport); - llvm::Value* PatchVsGenericInputImport(llvm::Type* pInputTy, uint32_t location, llvm::Instruction* pInsertPos); + llvm::Value* PatchVsGenericInputImport(llvm::Type* pInputTy, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); llvm::Value* PatchTcsGenericInputImport(llvm::Type* pInputTy, uint32_t location, llvm::Value* pLocOffset, @@ -79,6 +83,7 @@ class PatchInOutImportExport: llvm::Instruction* pInsertPos); llvm::Value* PatchGsGenericInputImport(llvm::Type* pInputTy, uint32_t location, + uint32_t compIdx, llvm::Value* pVertexIdx, llvm::Instruction* pInsertPos); llvm::Value* PatchFsGenericInputImport(llvm::Type* pInputTy, @@ -97,19 +102,29 @@ class PatchInOutImportExport: llvm::Value* pVertexIdx, llvm::Instruction* pInsertPos); - void PatchVsGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); + void PatchVsGenericOutputExport(llvm::Value* pOutput, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); void PatchTcsGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Value* pLocOffset, llvm::Value* pCompIdx, llvm::Value* pVertexIdx, llvm::Instruction* pInsertPos); - void PatchTesGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); + void PatchTesGenericOutputExport(llvm::Value* pOutput, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); void PatchGsGenericOutputExport(llvm::Value* pOutput, uint32_t location, + uint32_t compIdx, uint32_t streamId, llvm::Instruction* pInsertPos); - void PatchFsGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); + void PatchFsGenericOutputExport(llvm::Value* pOutput, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); llvm::Value* PatchVsBuiltInInputImport(llvm::Type* pInputTy, uint32_t builtInId, llvm::Instruction* pInsertPos); llvm::Value* PatchTcsBuiltInInputImport(llvm::Type* pInputTy, @@ -151,37 +166,39 @@ class PatchInOutImportExport: void PatchCopyShaderGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); void PatchCopyShaderBuiltInOutputExport(llvm::Value* pOutput, uint32_t builtInId, llvm::Instruction* pInsertPos); - void StoreValueToEsGsRingBuffer(llvm::Value* pStoreValue, - uint32_t location, - uint32_t compIdx, - llvm::Instruction* pInsertPos); + void StoreValueToEsGsRing(llvm::Value* pStoreValue, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); - llvm::Value* LoadValueFromEsGsRingBuffer(llvm::Type* pLoadType, - uint32_t location, - uint32_t compIdx, - llvm::Value* pVertexIdx, - llvm::Instruction* pInsertPos); + llvm::Value* LoadValueFromEsGsRing(llvm::Type* pLoadType, + uint32_t location, + uint32_t compIdx, + llvm::Value* pVertexIdx, + llvm::Instruction* pInsertPos); void 
StoreValueToGsVsRingBuffer(llvm::Value* pStoreValue, uint32_t location, uint32_t compIdx, llvm::Instruction* pInsertPos); - llvm::Value* CalcEsGsRingBufferOffsetForOutput(uint32_t location, - uint32_t compIdx, - llvm::Instruction* pInsertPos); + llvm::Value* CalcEsGsRingOffsetForOutput(uint32_t location, + uint32_t compIdx, + llvm::Value* pEsGsOffset, + llvm::Instruction* pInsertPos); - llvm::Value* CalcEsGsRingBufferOffsetForInput(uint32_t location, - uint32_t compIdx, - llvm::Value* pVertexIdx, - llvm::Instruction* pInsertPos); + llvm::Value* CalcEsGsRingOffsetForInput(uint32_t location, + uint32_t compIdx, + llvm::Value* pVertexIdx, + llvm::Instruction* pInsertPos); - llvm::Value* CalcGsVsRingBufferOffsetForOutput(uint32_t location, - uint32_t compIdx, - llvm::Value* pVertexIdx, - llvm::Instruction* pInsertPos); + llvm::Value* CalcGsVsRingOffsetForOutput(uint32_t location, + uint32_t compIdx, + llvm::Value* pVertexIdx, + llvm::Value* pGsVsOffset, + llvm::Instruction* pInsertPos); - llvm::Value* ReadValueFromLds(llvm::Type* pReadTy, llvm::Value* pLdsOffset, llvm::Instruction* pInsertPos); + llvm::Value* ReadValueFromLds(bool isOutput, llvm::Type* pReadTy, llvm::Value* pLdsOffset, llvm::Instruction* pInsertPos); void WriteValueToLds(llvm::Value* pWriteValue, llvm::Value* pLdsOffset, llvm::Instruction* pInsertPos); llvm::Value* CalcTessFactorOffset(bool isOuter, llvm::Value* pElemIdx, llvm::Instruction* pInsertPos); @@ -198,7 +215,10 @@ class PatchInOutImportExport: uint32_t outVertexStride, uint32_t patchConstCount) const; - llvm::Value* CalcLdsOffsetForVsOutput(uint32_t location, llvm::Instruction* pInsertPos); + llvm::Value* CalcLdsOffsetForVsOutput(Type* pOutputTy, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); llvm::Value* CalcLdsOffsetForTcsInput(Type* pInputTy, uint32_t location, @@ -221,7 +241,10 @@ class PatchInOutImportExport: llvm::Value* pVertexIdx, llvm::Instruction* pInsertPos); - void AddExportInstForGenericOutput(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); + void AddExportInstForGenericOutput(llvm::Value* pOutput, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); void AddExportInstForBuiltInOutput(llvm::Value* pOutput, uint32_t builtInId, llvm::Instruction* pInsertPos); // ----------------------------------------------------------------------------------------------------------------- @@ -253,6 +276,9 @@ class PatchInOutImportExport: bool m_hasGs; // Whether the pipeline has geometry shader GlobalVariable* m_pLds; // Global variable to model LDS + llvm::Value* m_pThreadId; // Thread ID + + std::vector m_expFragColors[MaxColorTargets]; // Exported fragment colors std::vector m_importCalls; // List of "call" instructions to import inputs std::vector m_exportCalls; // List of "call" instructions to export outputs diff --git a/icd/api/llpc/patch/llpcPatchResourceCollect.cpp b/icd/api/llpc/patch/llpcPatchResourceCollect.cpp index e74ed39b..d1c18b16 100644 --- a/icd/api/llpc/patch/llpcPatchResourceCollect.cpp +++ b/icd/api/llpc/patch/llpcPatchResourceCollect.cpp @@ -229,56 +229,41 @@ void PatchResourceCollect::visitCallInst( if ((m_shaderStage == ShaderStageTessControl) || (m_shaderStage == ShaderStageTessEval)) { auto pLocOffset = callInst.getOperand(1); - auto pCompIdx = IsDontCareValue(callInst.getOperand(2)) ? 
nullptr : callInst.getOperand(2); + auto pCompIdx = callInst.getOperand(2); + if (isa<ConstantInt>(pLocOffset)) { // Location offset is constant auto locOffset = cast<ConstantInt>(pLocOffset)->getZExtValue(); loc += locOffset; - if (pCompIdx != nullptr) + auto bitWidth = pInputTy->getScalarSizeInBits(); + if (bitWidth == 64) { - // Vector component addressing - LLPC_ASSERT(pInputTy->isVectorTy() == false); // Must be scalar type - - auto bitWidth = pInputTy->getScalarSizeInBits(); - if (bitWidth == 64) + if (isa<ConstantInt>(pCompIdx)) { - if (isa<ConstantInt>(pCompIdx)) - { - auto compIdx = cast<ConstantInt>(pCompIdx)->getZExtValue(); - - m_activeInputLocs.insert(loc); - if (compIdx >= 2) - { - // NOTE: For the addressing of .z/.w component of 64-bit vector, the count of - // occupied locations are two. - m_activeInputLocs.insert(loc + 1); - } - } - else + auto compIdx = cast<ConstantInt>(pCompIdx)->getZExtValue(); + + m_activeInputLocs.insert(loc); + if (compIdx >= 2) { - // NOTE: If vector component index is not constant, we treat this as dynamic indexing. - m_hasDynIndexedInput = true; + // NOTE: For the addressing of .z/.w component of 64-bit vector/scalar, the count of + // occupied locations is two. + m_activeInputLocs.insert(loc + 1); } } else { - // NOTE: For 32-bit scalar, one location is sufficient regardless of vector component - // addressing. - LLPC_ASSERT(bitWidth == 32); - m_activeInputLocs.insert(loc); + // NOTE: If vector component index is not constant, we treat this as dynamic indexing. + m_hasDynIndexedInput = true; } } else { - // Not vector component addressing + // NOTE: For 32-bit vector/scalar, one location is sufficient regardless of vector component + // addressing. + LLPC_ASSERT(bitWidth == 32); m_activeInputLocs.insert(loc); - if (pInputTy->getPrimitiveSizeInBits() > (8 * SizeOfVec4)) - { - LLPC_ASSERT(pInputTy->getPrimitiveSizeInBits() <= (8 * 2 * SizeOfVec4)); - m_activeInputLocs.insert(loc + 1); - } } } else @@ -353,7 +338,7 @@ void PatchResourceCollect::visitCallInst( auto loc = cast<ConstantInt>(callInst.getOperand(0))->getZExtValue(); auto pLocOffset = callInst.getOperand(1); - auto pCompIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + auto pCompIdx = callInst.getOperand(2); if (isa<ConstantInt>(pLocOffset)) { @@ -361,49 +346,33 @@ auto locOffset = cast<ConstantInt>(pLocOffset)->getZExtValue(); loc += locOffset; - if (pCompIdx != nullptr) + auto bitWidth = pOutputTy->getScalarSizeInBits(); + if (bitWidth == 64) { - // Vector component addressing - LLPC_ASSERT(pOutputTy->isVectorTy() == false); // Must be scalar type - - auto bitWidth = pOutputTy->getScalarSizeInBits(); - if (bitWidth == 64) + if (isa<ConstantInt>(pCompIdx)) { - if (isa<ConstantInt>(pCompIdx)) - { - auto compIdx = cast<ConstantInt>(pCompIdx)->getZExtValue(); + auto compIdx = cast<ConstantInt>(pCompIdx)->getZExtValue(); - m_importedOutputLocs.insert(loc); - if (compIdx >= 2) - { - // NOTE: For the addressing of .z/.w component of 64-bit vector, the count of - // occupied locations are two. - m_importedOutputLocs.insert(loc + 1); - } - } - else + m_importedOutputLocs.insert(loc); + if (compIdx >= 2) { - // NOTE: If vector component index is not constant, we treat this as dynamic indexing. - m_hasDynIndexedOutput = true; + // NOTE: For the addressing of .z/.w component of 64-bit vector/scalar, the count of + // occupied locations is two. + m_importedOutputLocs.insert(loc + 1); } } else { - // NOTE: For 32-bit scalar, one location is sufficient regardless of vector component - // addressing. 
- LLPC_ASSERT(bitWidth == 32); - m_importedOutputLocs.insert(loc); + // NOTE: If vector component index is not constant, we treat this as dynamic indexing. + m_hasDynIndexedOutput = true; } } else { - // Not vector component addressing + // NOTE: For 32-bit vector/scalar, one location is sufficient regardless of vector component + // addressing. + LLPC_ASSERT(bitWidth == 32); m_importedOutputLocs.insert(loc); - if (pOutputTy->getPrimitiveSizeInBits() > (8 * SizeOfVec4)) - { - LLPC_ASSERT(pOutputTy->getPrimitiveSizeInBits() <= (8 * 2 * SizeOfVec4)); - m_importedOutputLocs.insert(loc + 1); - } } } else @@ -431,25 +400,19 @@ void PatchResourceCollect::visitCallInst( auto loc = cast<ConstantInt>(callInst.getOperand(0))->getZExtValue(); auto pLocOffset = callInst.getOperand(1); - auto pCompIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + auto pCompIdx = callInst.getOperand(2); if (isa<ConstantInt>(pLocOffset)) { // Location offset is constant - if (pCompIdx != nullptr) - { - // Vector component addressing - LLPC_ASSERT(pOutputTy->isVectorTy() == false); // Must be scalar type - - auto bitWidth = pOutputTy->getScalarSizeInBits(); - LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); + auto bitWidth = pOutputTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); - if ((bitWidth == 64) && (isa<ConstantInt>(pCompIdx) == false)) - { - // NOTE: If vector component index is not constant and it is vector component addressing for - // 64-bit vector, we treat this as dynamic indexing. - m_hasDynIndexedOutput = true; - } + if ((bitWidth == 64) && (isa<ConstantInt>(pCompIdx) == false)) + { + // NOTE: If vector component index is not constant and it is vector component addressing for + // 64-bit vector, we treat this as dynamic indexing. + m_hasDynIndexedOutput = true; } } else diff --git a/icd/api/llpc/patch/llpcVertexFetch.cpp b/icd/api/llpc/patch/llpcVertexFetch.cpp index b6825f96..cafa6903 100644 --- a/icd/api/llpc/patch/llpcVertexFetch.cpp +++ b/icd/api/llpc/patch/llpcVertexFetch.cpp @@ -983,6 +983,7 @@ VertexFetch::VertexFetch( Value* VertexFetch::Run( Type* pInputTy, // [in] Type of vertex input uint32_t location, // Location of vertex input + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert vertex fetch instructions { Value* pVertex = nullptr; @@ -1221,92 +1222,114 @@ Value* VertexFetch::Run( // Finalize vertex fetch Type* pBasicTy = pInputTy->isVectorTy() ? pInputTy->getVectorElementType() : pInputTy; const uint32_t bitWidth = pBasicTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); - const uint32_t inputCompCount = pInputTy->isVectorTy() ? pInputTy->getVectorNumElements() : 1; - const uint32_t vertexCompCount = inputCompCount * bitWidth / 32; - const uint32_t fetchCompCount = pVertexFetch->getType()->isVectorTy() ? 
- pVertexFetch->getType()->getVectorNumElements() : 1; - if (vertexCompCount == fetchCompCount) + // Get default fetch values + Constant* pDefaults = nullptr; + + if (pBasicTy->isIntegerTy()) { - // Exact match, vertex input takes values from vertex fetch results - pVertex = pVertexFetch; + if (bitWidth == 32) + { + pDefaults = m_fetchDefaults.pInt; + } + else + { + LLPC_ASSERT(bitWidth == 64); + pDefaults = m_fetchDefaults.pInt64; + } } - else if (vertexCompCount < fetchCompCount) + else if (pBasicTy->isFloatingPointTy()) { - // Vertex input takes part of values from vertex fetch results - if (vertexCompCount == 1) + if (bitWidth == 32) { - Constant* pIndex = ConstantInt::get(m_pContext->Int32Ty(), 0); - pVertex = ExtractElementInst::Create(pVertexFetch, pIndex, "", pInsertPos); + pDefaults = m_fetchDefaults.pFloat; } else { - shuffleMask.clear(); - for (uint32_t i = 0; i < vertexCompCount; ++i) - { - shuffleMask.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); - } - pVertex = new ShuffleVectorInst(pVertexFetch, pVertexFetch, ConstantVector::get(shuffleMask), "", pInsertPos); + LLPC_ASSERT(bitWidth == 64); + pDefaults = m_fetchDefaults.pDouble; } } else { - // Vertex input takes values from both vertex fetch results and the default fetch values - Constant* pDefaults = nullptr; + LLPC_NEVER_CALLED(); + } + + const uint32_t defaultCompCount = pDefaults->getType()->getVectorNumElements(); + std::vector<Value*> defaultValues(defaultCompCount); - // Get default fetch values - if (pBasicTy->isIntegerTy()) + for (uint32_t i = 0; i < defaultValues.size(); ++i) + { + defaultValues[i] = ExtractElementInst::Create(pDefaults, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + + // Get vertex fetch values + const uint32_t fetchCompCount = pVertexFetch->getType()->isVectorTy() ? + pVertexFetch->getType()->getVectorNumElements() : 1; + std::vector<Value*> fetchValues(fetchCompCount); + + if (fetchCompCount == 1) + { + fetchValues[0] = pVertexFetch; + } + else + { + for (uint32_t i = 0; i < fetchCompCount; ++i) { - if (bitWidth == 32) - { - pDefaults = m_fetchDefaults.pInt; - } - else - { - LLPC_ASSERT(bitWidth == 64); - pDefaults = m_fetchDefaults.pInt64; - } + fetchValues[i] = ExtractElementInst::Create(pVertexFetch, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); } - else if (pBasicTy->isFloatingPointTy()) + } + + // Construct vertex fetch results + const uint32_t inputCompCount = pInputTy->isVectorTy() ? pInputTy->getVectorNumElements() : 1; + const uint32_t vertexCompCount = inputCompCount * bitWidth / 32; + + std::vector<Value*> vertexValues(vertexCompCount); + + // NOTE: Original component index is based on the basic scalar type. + compIdx *= ((bitWidth == 64) ? 
2 : 1); + + // Vertex input might take values from vertex fetch values or default fetch values + for (uint32_t i = 0; i < vertexCompCount; i++) + { + if (compIdx + i < fetchCompCount) { - if (bitWidth == 32) - { - pDefaults = m_fetchDefaults.pFloat; - } - else - { - LLPC_ASSERT(bitWidth == 64); - pDefaults = m_fetchDefaults.pDouble; - } + vertexValues[i] = fetchValues[compIdx + i]; + } + else if (compIdx + i < defaultCompCount) + { + vertexValues[i] = defaultValues[compIdx + i]; } else { LLPC_NEVER_CALLED(); + vertexValues[i] = UndefValue::get(m_pContext->Int32Ty()); } + } + if (vertexCompCount == 1) + { + pVertex = vertexValues[0]; + } + else + { Type* pVertexTy = VectorType::get(m_pContext->Int32Ty(), vertexCompCount); pVertex = UndefValue::get(pVertexTy); - if (fetchCompCount == 1) - { - Constant* pIndex = ConstantInt::get(m_pContext->Int32Ty(), 0); - pVertex = InsertElementInst::Create(pVertex, pVertexFetch, pIndex, "", pInsertPos); - } - else - { - for (uint32_t i = 0; i < fetchCompCount; ++i) - { - Constant* pIndex = ConstantInt::get(m_pContext->Int32Ty(), i); - Value* pVertexComp = ExtractElementInst::Create(pVertexFetch, pIndex, "", pInsertPos); - pVertex = InsertElementInst::Create(pVertex, pVertexComp, pIndex, "", pInsertPos); - } - } - - for (uint32_t i = fetchCompCount; i < vertexCompCount; ++i) + for (uint32_t i = 0; i < vertexCompCount; ++i) { - Constant* pIndex = ConstantInt::get(m_pContext->Int32Ty(), i); - Value* pVertexComp = ExtractElementInst::Create(pDefaults, pIndex, "", pInsertPos); - pVertex = InsertElementInst::Create(pVertex, pVertexComp, pIndex, "", pInsertPos); + pVertex = InsertElementInst::Create(pVertex, + vertexValues[i], + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); } } diff --git a/icd/api/llpc/patch/llpcVertexFetch.h b/icd/api/llpc/patch/llpcVertexFetch.h index 2fd8f9b9..f9481690 100644 --- a/icd/api/llpc/patch/llpcVertexFetch.h +++ b/icd/api/llpc/patch/llpcVertexFetch.h @@ -69,7 +69,7 @@ class VertexFetch static const VertexFormatInfo* GetVertexFormatInfo(VkFormat format); - llvm::Value* Run(llvm::Type* pInputTy, uint32_t location, llvm::Instruction* pInsertPos); + llvm::Value* Run(llvm::Type* pInputTy, uint32_t location, uint32_t compIdx, llvm::Instruction* pInsertPos); // Gets variable corresponding to vertex index llvm::Value* GetVertexIndex() { return m_pVertexIndex; } diff --git a/icd/api/llpc/translator/SPIRVInternal.h b/icd/api/llpc/translator/SPIRVInternal.h index fde8ae57..056c5565 100644 --- a/icd/api/llpc/translator/SPIRVInternal.h +++ b/icd/api/llpc/translator/SPIRVInternal.h @@ -1210,6 +1210,7 @@ union ShaderInOutMetadata { uint32_t Value : 16; // Generic location or SPIR-V built-in ID uint32_t IsLoc : 1; // Whether value is a location uint32_t IsBuiltIn : 1; // Whether value is a SPIR-V built-in ID + uint32_t Component : 2; // Component offset of inputs and outputs uint32_t Signedness : 1; // Signedness of the input/output, valid // for integer (0 - unsigned, 1 - signed) uint32_t InterpMode : 2; // Interpolation mode (fragment shader) @@ -1219,7 +1220,7 @@ union ShaderInOutMetadata { // output (tessellation shader) uint32_t StreamId : 2; // ID of output stream (geometry shader) - uint32_t Unused : 8; + uint32_t Unused : 6; }; uint32_t U32All; }; @@ -1236,6 +1237,8 @@ struct ShaderInOutDecorate { bool IsBuiltIn; // Whether this is a SPIR-V built-in + uint32_t Component; // Component offset of inputs and outputs + bool PerPatch; // Whether this is a per-patch input/output // (tessellation shader) struct diff --git 
a/icd/api/llpc/translator/SPIRVReader.cpp b/icd/api/llpc/translator/SPIRVReader.cpp index fc52c3fc..45c26e41 100644 --- a/icd/api/llpc/translator/SPIRVReader.cpp +++ b/icd/api/llpc/translator/SPIRVReader.cpp @@ -3465,6 +3465,10 @@ SPIRVToLLVM::transShaderDecoration(SPIRVValue *BV, Value *V) { InOutDec.Value.BuiltIn = BuiltInPerVertex; } + SPIRVWord Component = SPIRVID_INVALID; + if (BV->hasDecorate(DecorationComponent, 0, &Component)) + InOutDec.Component = Component; + if (BV->hasDecorate(DecorationFlat)) InOutDec.Interp.Mode = InterpModeFlat; @@ -3482,7 +3486,7 @@ SPIRVToLLVM::transShaderDecoration(SPIRVValue *BV, Value *V) { SPIRVWord StreamId = SPIRVID_INVALID; if (BV->hasDecorate(DecorationStream, 0, &StreamId)) - InOutDec.StreamId = StreamId; + InOutDec.StreamId = StreamId; Type* MDTy = nullptr; SPIRVType* BT = BV->getType()->getPointerElementType(); @@ -3724,6 +3728,10 @@ SPIRVToLLVM::buildShaderInOutMetadata(SPIRVType *BT, InOutDec.IsBuiltIn = true; } + SPIRVWord Component = SPIRVID_INVALID; + if (BT->hasDecorate(DecorationComponent, 0, &Component)) + InOutDec.Component = Component; + if (BT->hasDecorate(DecorationFlat)) InOutDec.Interp.Mode = InterpModeFlat; @@ -3759,6 +3767,7 @@ InOutMD.Value = InOutDec.Value.Loc; } + InOutMD.Component = InOutDec.Component; InOutMD.InterpMode = InOutDec.Interp.Mode; InOutMD.InterpLoc = InOutDec.Interp.Loc; InOutMD.PerPatch = InOutDec.PerPatch; @@ -3826,6 +3835,7 @@ SPIRVToLLVM::buildShaderInOutMetadata(SPIRVType *BT, InOutMD.Value = StartLoc; } + InOutMD.Component = InOutDec.Component; InOutMD.InterpMode = InOutDec.Interp.Mode; InOutMD.InterpLoc = InOutDec.Interp.Loc; InOutMD.PerPatch = InOutDec.PerPatch; @@ -3861,6 +3871,11 @@ SPIRVToLLVM::buildShaderInOutMetadata(SPIRVType *BT, MemberDec.Value.BuiltIn = MemberBuiltIn; } + SPIRVWord MemberComponent = SPIRVID_INVALID; + if (BT->hasMemberDecorate( + MemberIdx, DecorationComponent, 0, &MemberComponent)) + MemberDec.Component = MemberComponent; + if (BT->hasMemberDecorate(MemberIdx, DecorationFlat)) MemberDec.Interp.Mode = InterpModeFlat; diff --git a/icd/api/open_strings/entry_points.txt b/icd/api/open_strings/entry_points.txt index 7ac0eaaf..83c86ff6 100644 --- a/icd/api/open_strings/entry_points.txt +++ b/icd/api/open_strings/entry_points.txt @@ -245,3 +245,9 @@ vkGetFenceFdKHR @dext KHR_external_fence_fd vkImportFenceWin32HandleKHR @dext KHR_external_fence_win32 vkGetFenceWin32HandleKHR @dext KHR_external_fence_win32 + +vkCmdWriteBufferMarkerAMD @dext AMD_buffer_marker + +vkCreateDebugReportCallbackEXT @iext EXT_debug_report +vkDestroyDebugReportCallbackEXT @iext EXT_debug_report +vkDebugReportMessageEXT @iext EXT_debug_report diff --git a/icd/api/open_strings/extensions.txt b/icd/api/open_strings/extensions.txt index d39d91b9..64c7c977 100644 --- a/icd/api/open_strings/extensions.txt +++ b/icd/api/open_strings/extensions.txt @@ -10,6 +10,7 @@ VK_KHR_external_memory_capabilities VK_KHR_external_semaphore_capabilities VK_KHR_external_fence_capabilities VK_KHX_device_group_creation +VK_EXT_debug_report ############################################################################### # DEVICE EXTENSIONS @@ -57,3 +58,4 @@ VK_AMD_shader_fragment_mask VK_EXT_sample_locations VK_KHR_win32_keyed_mutex VK_EXT_global_priority +VK_AMD_buffer_marker diff --git a/icd/api/open_strings/g_entry_points_decl.h b/icd/api/open_strings/g_entry_points_decl.h index 76507632..ae5d3bff 100644 --- a/icd/api/open_strings/g_entry_points_decl.h +++ 
b/icd/api/open_strings/g_entry_points_decl.h @@ -892,3 +892,19 @@ extern const char vkGetFenceWin32HandleKHR_name[]; static const char* VKGETFENCEWIN32HANDLEKHR_name = vkGetFenceWin32HandleKHR_name; #define vkGetFenceWin32HandleKHR_condition_type vk::secure::entry::ENTRY_POINT_DEVICE_EXTENSION #define vkGetFenceWin32HandleKHR_condition_value vk::DeviceExtensions::KHR_EXTERNAL_FENCE_WIN32 +extern const char vkCmdWriteBufferMarkerAMD_name[]; +static const char* VKCMDWRITEBUFFERMARKERAMD_name = vkCmdWriteBufferMarkerAMD_name; +#define vkCmdWriteBufferMarkerAMD_condition_type vk::secure::entry::ENTRY_POINT_DEVICE_EXTENSION +#define vkCmdWriteBufferMarkerAMD_condition_value vk::DeviceExtensions::AMD_BUFFER_MARKER +extern const char vkCreateDebugReportCallbackEXT_name[]; +static const char* VKCREATEDEBUGREPORTCALLBACKEXT_name = vkCreateDebugReportCallbackEXT_name; +#define vkCreateDebugReportCallbackEXT_condition_type vk::secure::entry::ENTRY_POINT_INSTANCE_EXTENSION +#define vkCreateDebugReportCallbackEXT_condition_value vk::InstanceExtensions::EXT_DEBUG_REPORT +extern const char vkDestroyDebugReportCallbackEXT_name[]; +static const char* VKDESTROYDEBUGREPORTCALLBACKEXT_name = vkDestroyDebugReportCallbackEXT_name; +#define vkDestroyDebugReportCallbackEXT_condition_type vk::secure::entry::ENTRY_POINT_INSTANCE_EXTENSION +#define vkDestroyDebugReportCallbackEXT_condition_value vk::InstanceExtensions::EXT_DEBUG_REPORT +extern const char vkDebugReportMessageEXT_name[]; +static const char* VKDEBUGREPORTMESSAGEEXT_name = vkDebugReportMessageEXT_name; +#define vkDebugReportMessageEXT_condition_type vk::secure::entry::ENTRY_POINT_INSTANCE_EXTENSION +#define vkDebugReportMessageEXT_condition_value vk::InstanceExtensions::EXT_DEBUG_REPORT diff --git a/icd/api/open_strings/g_entry_points_impl.h b/icd/api/open_strings/g_entry_points_impl.h index d0ac8e5c..549c5cb3 100644 --- a/icd/api/open_strings/g_entry_points_impl.h +++ b/icd/api/open_strings/g_entry_points_impl.h @@ -241,3 +241,7 @@ const char vkImportFenceFdKHR_name[] = "vkImportFenceFdKHR"; const char vkGetFenceFdKHR_name[] = "vkGetFenceFdKHR"; const char vkImportFenceWin32HandleKHR_name[] = "vkImportFenceWin32HandleKHR"; const char vkGetFenceWin32HandleKHR_name[] = "vkGetFenceWin32HandleKHR"; +const char vkCmdWriteBufferMarkerAMD_name[] = "vkCmdWriteBufferMarkerAMD"; +const char vkCreateDebugReportCallbackEXT_name[] = "vkCreateDebugReportCallbackEXT"; +const char vkDestroyDebugReportCallbackEXT_name[] = "vkDestroyDebugReportCallbackEXT"; +const char vkDebugReportMessageEXT_name[] = "vkDebugReportMessageEXT"; diff --git a/icd/api/open_strings/g_extensions_decl.h b/icd/api/open_strings/g_extensions_decl.h index 5ac07169..ed7fd7fa 100644 --- a/icd/api/open_strings/g_extensions_decl.h +++ b/icd/api/open_strings/g_extensions_decl.h @@ -42,6 +42,8 @@ extern const char VK_KHR_external_fence_capabilities_name[]; static const char* VK_KHR_EXTERNAL_FENCE_CAPABILITIES_name = VK_KHR_external_fence_capabilities_name; extern const char VK_KHX_device_group_creation_name[]; static const char* VK_KHX_DEVICE_GROUP_CREATION_name = VK_KHX_device_group_creation_name; +extern const char VK_EXT_debug_report_name[]; +static const char* VK_EXT_DEBUG_REPORT_name = VK_EXT_debug_report_name; extern const char VK_KHR_bind_memory2_name[]; static const char* VK_KHR_BIND_MEMORY2_name = VK_KHR_bind_memory2_name; extern const char VK_KHR_dedicated_allocation_name[]; @@ -130,3 +132,5 @@ extern const char VK_KHR_win32_keyed_mutex_name[]; static const char* VK_KHR_WIN32_KEYED_MUTEX_name 
= VK_KHR_win32_keyed_mutex_name; extern const char VK_EXT_global_priority_name[]; static const char* VK_EXT_GLOBAL_PRIORITY_name = VK_EXT_global_priority_name; +extern const char VK_AMD_buffer_marker_name[]; +static const char* VK_AMD_BUFFER_MARKER_name = VK_AMD_buffer_marker_name; diff --git a/icd/api/open_strings/g_extensions_impl.h b/icd/api/open_strings/g_extensions_impl.h index c3df04f9..1077dad6 100644 --- a/icd/api/open_strings/g_extensions_impl.h +++ b/icd/api/open_strings/g_extensions_impl.h @@ -33,6 +33,7 @@ const char VK_KHR_external_memory_capabilities_name[] = "VK_KHR_external_memory_ const char VK_KHR_external_semaphore_capabilities_name[] = "VK_KHR_external_semaphore_capabilities"; const char VK_KHR_external_fence_capabilities_name[] = "VK_KHR_external_fence_capabilities"; const char VK_KHX_device_group_creation_name[] = "VK_KHX_device_group_creation"; +const char VK_EXT_debug_report_name[] = "VK_EXT_debug_report"; const char VK_KHR_bind_memory2_name[] = "VK_KHR_bind_memory2"; const char VK_KHR_dedicated_allocation_name[] = "VK_KHR_dedicated_allocation"; const char VK_KHR_descriptor_update_template_name[] = "VK_KHR_descriptor_update_template"; @@ -77,3 +78,4 @@ const char VK_AMD_shader_fragment_mask_name[] = "VK_AMD_shader_fragment_mask"; const char VK_EXT_sample_locations_name[] = "VK_EXT_sample_locations"; const char VK_KHR_win32_keyed_mutex_name[] = "VK_KHR_win32_keyed_mutex"; const char VK_EXT_global_priority_name[] = "VK_EXT_global_priority"; +const char VK_AMD_buffer_marker_name[] = "VK_AMD_buffer_marker"; diff --git a/icd/api/open_strings/g_func_table.cpp b/icd/api/open_strings/g_func_table.cpp index 6b992d90..022ddced 100644 --- a/icd/api/open_strings/g_func_table.cpp +++ b/icd/api/open_strings/g_func_table.cpp @@ -999,6 +999,26 @@ extern void GetNextDeviceLayerTable( pInstance, pDevice, remainingCount, pRemainingTables, VK_SECURE_ENTRY(vkGetFenceWin32HandleKHR))); #endif +#if VK_AMD_buffer_marker + pNextLayerFuncs->vkCmdWriteBufferMarkerAMD = + reinterpret_cast<PFN_vkCmdWriteBufferMarkerAMD>(vk::GetIcdProcAddr( + pInstance, pDevice, remainingCount, pRemainingTables, + VK_SECURE_ENTRY(vkCmdWriteBufferMarkerAMD))); +#endif +#if VK_EXT_debug_report + pNextLayerFuncs->vkCreateDebugReportCallbackEXT = + reinterpret_cast<PFN_vkCreateDebugReportCallbackEXT>(vk::GetIcdProcAddr( + pInstance, pDevice, remainingCount, pRemainingTables, + VK_SECURE_ENTRY(vkCreateDebugReportCallbackEXT))); + pNextLayerFuncs->vkDestroyDebugReportCallbackEXT = + reinterpret_cast<PFN_vkDestroyDebugReportCallbackEXT>(vk::GetIcdProcAddr( + pInstance, pDevice, remainingCount, pRemainingTables, + VK_SECURE_ENTRY(vkDestroyDebugReportCallbackEXT))); + pNextLayerFuncs->vkDebugReportMessageEXT = + reinterpret_cast<PFN_vkDebugReportMessageEXT>(vk::GetIcdProcAddr( + pInstance, pDevice, remainingCount, pRemainingTables, + VK_SECURE_ENTRY(vkDebugReportMessageEXT))); +#endif } diff --git a/icd/api/open_strings/g_func_table.h b/icd/api/open_strings/g_func_table.h index 188f0c36..d2fc6b6a 100644 --- a/icd/api/open_strings/g_func_table.h +++ b/icd/api/open_strings/g_func_table.h @@ -304,6 +304,14 @@ struct EntryPointTable PFN_vkImportFenceWin32HandleKHR vkImportFenceWin32HandleKHR; PFN_vkGetFenceWin32HandleKHR vkGetFenceWin32HandleKHR; #endif +#if VK_AMD_buffer_marker + PFN_vkCmdWriteBufferMarkerAMD vkCmdWriteBufferMarkerAMD; +#endif +#if VK_EXT_debug_report + PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT; + PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT; + PFN_vkDebugReportMessageEXT vkDebugReportMessageEXT; +#endif }; diff --git a/icd/api/vert_buf_binding_mgr.cpp b/icd/api/vert_buf_binding_mgr.cpp 
index c13e6681..7190f931 100644 --- a/icd/api/vert_buf_binding_mgr.cpp +++ b/icd/api/vert_buf_binding_mgr.cpp @@ -155,7 +155,7 @@ void VertBufBindingMgr::BindVertexBuffers( // a final partial element. Rounding down matches our current behavior for buffer views. if (pBinding->view.stride > 1) { - pBinding->view.range = Util::RoundUpToMultiple(pBinding->size, pBinding->view.stride); + pBinding->view.range = Util::RoundDownToMultiple(pBinding->size, pBinding->view.stride); } else { @@ -221,7 +221,7 @@ void VertBufBindingMgr::GraphicsPipelineChanged( // for a final partial element. Rounding down matches our current behavior for buffer views. if (pBinding->view.stride > 1) { - pBinding->view.range = Util::RoundUpToMultiple(pBinding->size, pBinding->view.stride); + pBinding->view.range = Util::RoundDownToMultiple(pBinding->size, pBinding->view.stride); } else { diff --git a/icd/api/vk_buffer.cpp b/icd/api/vk_buffer.cpp index b3f40101..52a425dc 100644 --- a/icd/api/vk_buffer.cpp +++ b/icd/api/vk_buffer.cpp @@ -46,7 +46,8 @@ namespace vk // based on its declared usage bits at create time. These masks come in handy when trying to decide optimal PAL // caches coherency flags during a pipeline barrier. void Buffer::CalcBarrierUsage( - VkBufferUsageFlags usage) + const Device* pDevice, + VkBufferUsageFlags usage) { m_inputCacheMask = 0; m_outputCacheMask = Pal::CoherCpu | Pal::CoherMemory; // Always allow CPU writes and memory writes @@ -61,6 +62,13 @@ void Buffer::CalcBarrierUsage( // Also need Pal::CoherShader here as vkCmdCopyQueryPoolResults uses a compute shader defined in the Vulkan // API layer when used with timestamp queries. m_outputCacheMask |= Pal::CoherCopy | Pal::CoherShader; + + // Buffer markers fall under the same PAL coherency rules as timestamp writes + if (pDevice->IsExtensionEnabled(DeviceExtensions::AMD_BUFFER_MARKER)) + { + m_inputCacheMask |= Pal::CoherTimestamp; + m_outputCacheMask |= Pal::CoherTimestamp; + } } if (usage & (VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) @@ -116,12 +124,12 @@ Buffer::Buffer( { if (pGpuMemory[deviceIdx] != nullptr) { - m_pGpuMemory[deviceIdx] = pGpuMemory[deviceIdx]; + m_pGpuMemory[deviceIdx] = pGpuMemory[deviceIdx]; m_gpuVirtAddr[deviceIdx] = pGpuMemory[deviceIdx]->Desc().gpuVirtAddr; } } - CalcBarrierUsage(usage); + CalcBarrierUsage(pDevice, usage); } // ===================================================================================================================== @@ -378,7 +386,7 @@ VkResult Buffer::GetMemoryRequirements( // however, we'll specify such an alignment requirement which should fit formatted buffer use // with any kind of format pMemoryRequirements->alignment = (ubUsageEnabled) ? ubRequiredAlignment : 4; - pMemoryRequirements->size = (!ubUsageEnabled) ? m_size : Util::RoundUpToMultiple(m_size, ubRequiredAlignment); + pMemoryRequirements->size = Util::RoundUpToMultiple(m_size, pMemoryRequirements->alignment); } // Allow all available memory types for buffers diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index f2ec4222..bd4805ad 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -2657,7 +2657,9 @@ void CmdBuffer::ResetEvent( // ===================================================================================================================== // Given a bitmask of VkAccessFlags, computes the representative PAL CacheCoherencyUsageFlags that will be written // in the srcCacheMask field of a pipeline BarrierTransition. 
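An aside on the two VK_AMD_buffer_marker touchpoints here (the Pal::CoherTimestamp plumbing in CalcBarrierUsage above and in the barrier conversion below): the extension lets an application drop small immediate writes into a buffer so that, after a device loss, it can see how far the GPU actually got. A minimal application-side sketch, not part of this patch; the device, cmdBuf, and markerBuffer handles are assumed to already exist and the extension to be enabled on the device:

    // Resolve the extension entry point added to the dispatch table by this patch.
    PFN_vkCmdWriteBufferMarkerAMD pfnWriteMarker = reinterpret_cast<PFN_vkCmdWriteBufferMarkerAMD>(
        vkGetDeviceProcAddr(device, "vkCmdWriteBufferMarkerAMD"));

    // Write marker 0x1 once all prior commands have drained past the bottom of the pipe;
    // after a hang, the last marker visible in markerBuffer brackets the failing command.
    pfnWriteMarker(cmdBuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, markerBuffer, 0, 0x1);

Because these writes land via the timestamp path, the CoherTimestamp bits added above keep barriers against marker buffers correct.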
-Pal::uint32 CmdBuffer::ConvertBarrierSrcAccessFlags(VkAccessFlags accessMask) +Pal::uint32 CmdBuffer::ConvertBarrierSrcAccessFlags( + const Device* pDevice, + VkAccessFlags accessMask) { Pal::uint32 coher = 0; @@ -2686,6 +2688,11 @@ Pal::uint32 CmdBuffer::ConvertBarrierSrcAccessFlags(VkAccessFlags accessMask) // Also need Pal::CoherShader here as vkCmdCopyQueryPoolResults uses a compute shader defined in the Vulkan // API layer when used with timestamp queries. coher |= Pal::CoherCopy | Pal::CoherResolve | Pal::CoherClear | Pal::CoherShader; + + if (pDevice->IsExtensionEnabled(DeviceExtensions::AMD_BUFFER_MARKER)) + { + coher |= Pal::CoherTimestamp; + } } if (accessMask & VK_ACCESS_MEMORY_WRITE_BIT) @@ -2705,7 +2712,9 @@ Pal::uint32 CmdBuffer::ConvertBarrierSrcAccessFlags(VkAccessFlags accessMask) // ===================================================================================================================== // Given a bitmask of VkAccessFlags, computes the representative PAL CacheCoherencyUsageFlags that will be written // in the dstCacheMask field of a pipeline BarrierTransition. -Pal::uint32 CmdBuffer::ConvertBarrierDstAccessFlags(VkAccessFlags accessMask) +Pal::uint32 CmdBuffer::ConvertBarrierDstAccessFlags( + const Device* pDevice, + VkAccessFlags accessMask) { // With the more loose memory barrier semantics introduced we practically have to always invalidate all relevant // caches. The complete set is limited based on the usage allowed by the resource at the caller side. @@ -2742,6 +2751,7 @@ Pal::uint32 CmdBuffer::ConvertBarrierDstAccessFlags(VkAccessFlags accessMask) // Convert src access and dst access mask to the PAL CacheCoherencyUsageFlags that will be written // in the srcCacheMask and dstCacheMask field of a pipeline BarrierTransition. void CmdBuffer::ConvertBarrierCacheFlags( + const Device* pDevice, VkAccessFlags srcAccess, VkAccessFlags dstAccess, uint32_t supportInputCacheMask, @@ -2749,7 +2759,7 @@ void CmdBuffer::ConvertBarrierCacheFlags( uint32_t barrierOptions, Pal::BarrierTransition* pResult) { - pResult->srcCacheMask = (supportOutputCacheMask != 0xFFFFFFFF) ? supportOutputCacheMask & ConvertBarrierSrcAccessFlags(srcAccess) : ConvertBarrierSrcAccessFlags(srcAccess); + pResult->srcCacheMask = supportOutputCacheMask & ConvertBarrierSrcAccessFlags(pDevice, srcAccess); // srccachemask is 0 for all read only source access like VK_ACCESS_*_READ_BIT // etc. hence, only validate against all input caches if we are going from write to any other access flag. @@ -2761,7 +2771,7 @@ void CmdBuffer::ConvertBarrierCacheFlags( } else { - pResult->dstCacheMask = (supportInputCacheMask != 0xFFFFFFFF) ? 
supportInputCacheMask & ConvertBarrierDstAccessFlags(dstAccess) : ConvertBarrierDstAccessFlags(dstAccess); + pResult->dstCacheMask = supportInputCacheMask & ConvertBarrierDstAccessFlags(pDevice, dstAccess); } } @@ -2837,7 +2847,13 @@ void CmdBuffer::ExecuteBarriers( for (uint32_t i = 0; i < memBarrierCount; ++i) { - ConvertBarrierCacheFlags(pMemoryBarriers[i].srcAccessMask, pMemoryBarriers[i].dstAccessMask, 0xFFFFFFFF, 0xFFFFFFFF, barrierOptions, pNextMain); + ConvertBarrierCacheFlags( + m_pDevice, + pMemoryBarriers[i].srcAccessMask, + pMemoryBarriers[i].dstAccessMask, + 0xFFFFFFFF, 0xFFFFFFFF, + barrierOptions, + pNextMain); pNextMain->imageInfo.pImage = nullptr; VK_ASSERT(pMemoryBarriers[i].pNext == nullptr); @@ -2872,7 +2888,14 @@ void CmdBuffer::ExecuteBarriers( uint32_t supportInputCoherMask = pBuffer->GetSupportedInputCoherMask(); uint32_t supportOutputCoherMask = pBuffer->GetSupportedOutputCoherMask(); - ConvertBarrierCacheFlags(pBufferMemoryBarriers[i].srcAccessMask, pBufferMemoryBarriers[i].dstAccessMask, supportInputCoherMask, supportOutputCoherMask, barrierOptions, pNextMain); + ConvertBarrierCacheFlags( + m_pDevice, + pBufferMemoryBarriers[i].srcAccessMask, + pBufferMemoryBarriers[i].dstAccessMask, + supportInputCoherMask, + supportOutputCoherMask, + barrierOptions, + pNextMain); pNextMain->imageInfo.pImage = nullptr; @@ -2909,7 +2932,14 @@ void CmdBuffer::ExecuteBarriers( uint32_t supportOutputCoherMask = pImage->GetSupportedOutputCoherMask(); Pal::BarrierTransition barrierTransition = { 0 }; - ConvertBarrierCacheFlags(pImageMemoryBarriers[i].srcAccessMask, pImageMemoryBarriers[i].dstAccessMask, supportInputCoherMask, supportOutputCoherMask, barrierOptions, &barrierTransition); + ConvertBarrierCacheFlags( + m_pDevice, + pImageMemoryBarriers[i].srcAccessMask, + pImageMemoryBarriers[i].dstAccessMask, + supportInputCoherMask, + supportOutputCoherMask, + barrierOptions, + &barrierTransition); pNextMain->imageInfo.pImage = nullptr; @@ -4109,6 +4139,7 @@ void CmdBuffer::RPSyncPoint( pGlobalTransition->imageInfo.pImage = nullptr; ConvertBarrierCacheFlags( + m_pDevice, syncPoint.barrier.srcAccessMask, syncPoint.barrier.dstAccessMask, 0xffffffff, @@ -5016,6 +5047,30 @@ void CmdBuffer::DbgCmdBarrier(bool preCmd) } #endif +// ===================================================================================================================== +void CmdBuffer::WriteBufferMarker( + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker) +{ + const Buffer* pDestBuffer = Buffer::ObjectFromHandle(dstBuffer); + const Pal::HwPipePoint pipePoint = VkToPalSrcPipePointForMarkers(pipelineStage, m_palEngineType); + + utils::IterateMask deviceGroup(m_palDeviceMask); + + while (deviceGroup.Iterate()) + { + const uint32_t deviceIdx = deviceGroup.Index(); + + PalCmdBuffer(deviceIdx)->CmdWriteImmediate( + pipePoint, + marker, + Pal::ImmediateDataWidth::ImmediateData32Bit, + pDestBuffer->GpuVirtAddr(deviceIdx) + dstOffset); + } +} + // ===================================================================================================================== RenderPassInstanceState::RenderPassInstanceState( PalAllocator* pAllocator) @@ -5780,6 +5835,18 @@ VKAPI_ATTR void VKAPI_CALL vkCmdSetSampleLocationsEXT( { ApiCmdBuffer::ObjectFromHandle(commandBuffer)->SetSampleLocations(pSampleLocationsInfo); } + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL 
vkCmdWriteBufferMarkerAMD( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker) +{ + ApiCmdBuffer::ObjectFromHandle(commandBuffer)->WriteBufferMarker(pipelineStage, dstBuffer, dstOffset, marker); +} + } // namespace entry } // namespace vk diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index 2251b34a..515781e8 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp @@ -393,6 +393,466 @@ VK_TO_PAL_DECL_LOOKUP_TABLE(IMAGE_TILING, ImageTiling VK_TO_PAL_DECL_LOOKUP_TABLE(COMPONENT_SWIZZLE, ChannelSwizzle ) VK_TO_PAL_DECL_LOOKUP_TABLE(PIPELINE_BIND_POINT, PipelineBindPoint ) +// ===================================================================================================================== +// Converts a PAL::Result value to an equivalent string name +const char* PalResultName( + Pal::Result result) +{ + const char* resultName = nullptr; + + switch (result) + { + case Pal::Result::TooManyFlippableAllocations: + resultName = "TooManyFlippableAllocations"; + break; + + case Pal::Result::PresentOccluded: + resultName = "PresentOccluded"; + break; + + case Pal::Result::Unsupported: + resultName = "Unsupported"; + break; + + case Pal::Result::NotReady: + resultName = "NotReady"; + break; + + case Pal::Result::Timeout: + resultName = "Timeout"; + break; + + case Pal::Result::ErrorFenceNeverSubmitted: + resultName = "ErrorFenceNeverSubmitted"; + break; + + case Pal::Result::EventSet: + resultName = "EventSet"; + break; + + case Pal::Result::EventReset: + resultName = "EventReset"; + break; + + case Pal::Result::ErrorInitializationFailed: + resultName = "ErrorInitializationFailed"; + break; + + case Pal::Result::ErrorOutOfMemory: + resultName = "ErrorOutOfMemory"; + break; + + case Pal::Result::ErrorOutOfGpuMemory: + resultName = "ErrorOutOfGpuMemory"; + break; + + case Pal::Result::ErrorDeviceLost: + resultName = "ErrorDeviceLost"; + break; + + case Pal::Result::ErrorIncompatibleLibrary: + resultName = "ErrorIncompatibleLibrary"; + break; + + case Pal::Result::ErrorGpuMemoryMapFailed: + resultName = "ErrorGpuMemoryMapFailed"; + break; + + case Pal::Result::ErrorNotMappable: + resultName = "ErrorNotMappable"; + break; + + case Pal::Result::ErrorUnknown: + resultName = "ErrorUnknown"; + break; + + case Pal::Result::ErrorUnavailable: + resultName = "ErrorUnavailable"; + break; + + case Pal::Result::ErrorInvalidPointer: + resultName = "ErrorInvalidPointer"; + break; + + case Pal::Result::ErrorInvalidValue: + resultName = "ErrorInvalidValue"; + break; + + case Pal::Result::ErrorInvalidOrdinal: + resultName = "ErrorInvalidOrdinal"; + break; + + case Pal::Result::ErrorInvalidMemorySize: + resultName = "ErrorInvalidMemorySize"; + break; + + case Pal::Result::ErrorInvalidFlags: + resultName = "ErrorInvalidFlags"; + break; + + case Pal::Result::ErrorInvalidAlignment: + resultName = "ErrorInvalidAlignment"; + break; + + case Pal::Result::ErrorInvalidFormat: + resultName = "ErrorInvalidFormat"; + break; + + case Pal::Result::ErrorInvalidImage: + resultName = "ErrorInvalidImage"; + break; + + case Pal::Result::ErrorInvalidDescriptorSetData: + resultName = "ErrorInvalidDescriptorSetData"; + break; + + case Pal::Result::ErrorInvalidQueueType: + resultName = "ErrorInvalidQueueType"; + break; + + case Pal::Result::ErrorUnsupportedShaderIlVersion: + resultName = "ErrorUnsupportedShaderIlVersion"; + break; + + case Pal::Result::ErrorBadShaderCode: + resultName = "ErrorBadShaderCode"; + break; + + case 
Pal::Result::ErrorBadPipelineData: + resultName = "ErrorBadPipelineData"; + break; + + case Pal::Result::ErrorGpuMemoryUnmapFailed: + resultName = "ErrorGpuMemoryUnmapFailed"; + break; + + case Pal::Result::ErrorIncompatibleDevice: + resultName = "ErrorIncompatibleDevice"; + break; + + case Pal::Result::ErrorBuildingCommandBuffer: + resultName = "ErrorBuildingCommandBuffer"; + break; + + case Pal::Result::ErrorGpuMemoryNotBound: + resultName = "ErrorGpuMemoryNotBound"; + break; + + case Pal::Result::ErrorImageNotShaderAccessible: + resultName = "ErrorImageNotShaderAccessible"; + break; + + case Pal::Result::ErrorInvalidUsageForFormat: + resultName = "ErrorInvalidUsageForFormat"; + break; + + case Pal::Result::ErrorFormatIncompatibleWithImageUsage: + resultName = "ErrorFormatIncompatibleWithImageUsage"; + break; + + case Pal::Result::ErrorThreadGroupTooBig: + resultName = "ErrorThreadGroupTooBig"; + break; + + case Pal::Result::ErrorInvalidMsaaMipLevels: + resultName = "ErrorInvalidMsaaMipLevels"; + break; + + case Pal::Result::ErrorInvalidSampleCount: + resultName = "ErrorInvalidSampleCount"; + break; + + case Pal::Result::ErrorInvalidImageArraySize: + resultName = "ErrorInvalidImageArraySize"; + break; + + case Pal::Result::ErrorInvalid3dImageArraySize: + resultName = "ErrorInvalid3dImageArraySize"; + break; + + case Pal::Result::ErrorInvalidImageWidth: + resultName = "ErrorInvalidImageWidth"; + break; + + case Pal::Result::ErrorInvalidImageHeight: + resultName = "ErrorInvalidImageHeight"; + break; + + case Pal::Result::ErrorInvalidImageDepth: + resultName = "ErrorInvalidImageDepth"; + break; + + case Pal::Result::ErrorInvalidMipCount: + resultName = "ErrorInvalidMipCount"; + break; + + case Pal::Result::ErrorInvalidBaseMipLevel: + resultName = "ErrorInvalidBaseMipLevel"; + break; + + case Pal::Result::ErrorInvalidViewArraySize: + resultName = "ErrorInvalidViewArraySize"; + break; + + case Pal::Result::ErrorInvalidViewBaseSlice: + resultName = "ErrorInvalidViewBaseSlice"; + break; + + case Pal::Result::ErrorInsufficientImageArraySize: + resultName = "ErrorInsufficientImageArraySize"; + break; + + case Pal::Result::ErrorCubemapNonSquareFaceSize: + resultName = "ErrorCubemapNonSquareFaceSize"; + break; + + case Pal::Result::ErrorInvalidImageTargetUsage: + resultName = "ErrorInvalidImageTargetUsage"; + break; + + case Pal::Result::ErrorMissingDepthStencilUsage: + resultName = "ErrorMissingDepthStencilUsage"; + break; + + case Pal::Result::ErrorInvalidColorTargetType: + resultName = "ErrorInvalidColorTargetType"; + break; + + case Pal::Result::ErrorInvalidDepthTargetType: + resultName = "ErrorInvalidDepthTargetType"; + break; + + case Pal::Result::ErrorInvalidMsaaType: + resultName = "ErrorInvalidMsaaType"; + break; + + case Pal::Result::ErrorInvalidCompressedImageType: + resultName = "ErrorInvalidCompressedImageType"; + break; + + case Pal::Result::ErrorImageAspectUnavailable: + resultName = "ErrorImageAspectUnavailable"; + break; + + case Pal::Result::ErrorInvalidFormatSwizzle: + resultName = "ErrorInvalidFormatSwizzle"; + break; + + case Pal::Result::ErrorViewTypeIncompatibleWithImageType: + resultName = "ErrorViewTypeIncompatibleWithImageType"; + break; + + case Pal::Result::ErrorCubemapIncompatibleWithMsaa: + resultName = "ErrorCubemapIncompatibleWithMsaa"; + break; + + case Pal::Result::ErrorInvalidMsaaFormat: + resultName = "ErrorInvalidMsaaFormat"; + break; + + case Pal::Result::ErrorFormatIncompatibleWithImageFormat: + resultName = "ErrorFormatIncompatibleWithImageFormat"; + break; + 
+ case Pal::Result::ErrorFormatIncompatibleWithImageAspect: + resultName = "ErrorFormatIncompatibleWithImageAspect"; + break; + + case Pal::Result::ErrorFullscreenUnavailable: + resultName = "ErrorFullscreenUnavailable"; + break; + + case Pal::Result::ErrorScreenRemoved: + resultName = "ErrorScreenRemoved"; + break; + + case Pal::Result::ErrorIncompatibleScreenMode: + resultName = "ErrorIncompatibleScreenMode"; + break; + + case Pal::Result::ErrorMultiDevicePresentFailed: + resultName = "ErrorMultiDevicePresentFailed"; + break; + + case Pal::Result::ErrorWindowedPresentUnavailable: + resultName = "ErrorWindowedPresentUnavailable"; + break; + + case Pal::Result::ErrorInvalidResolution: + resultName = "ErrorInvalidResolution"; + break; + + case Pal::Result::ErrorInvalidObjectType: + resultName = "ErrorInvalidObjectType"; + break; + + case Pal::Result::ErrorTooManyMemoryReferences: + resultName = "ErrorTooManyMemoryReferences"; + break; + + case Pal::Result::ErrorNotShareable: + resultName = "ErrorNotShareable"; + break; + + case Pal::Result::ErrorImageFmaskUnavailable: + resultName = "ErrorImageFmaskUnavailable"; + break; + + case Pal::Result::ErrorPrivateScreenRemoved: + resultName = "ErrorPrivateScreenRemoved"; + break; + + case Pal::Result::ErrorPrivateScreenUsed: + resultName = "ErrorPrivateScreenUsed"; + break; + + case Pal::Result::ErrorTooManyPrivateDisplayImages: + resultName = "ErrorTooManyPrivateDisplayImages"; + break; + + case Pal::Result::ErrorPrivateScreenNotEnabled: + resultName = "ErrorPrivateScreenNotEnabled"; + break; + + default: + VK_NOT_IMPLEMENTED; + resultName = "??"; + break; + } + + return resultName; +} + +// ===================================================================================================================== +// Converts a VkResult value to an equivalent string name +const char* VkResultName( + VkResult result) +{ + const char* errName = nullptr; + + switch (result) + { + case VkResult::VK_SUCCESS: + errName = "VK_SUCCESS"; + break; + + case VkResult::VK_NOT_READY: + errName = "VK_NOT_READY"; + break; + + case VkResult::VK_TIMEOUT: + errName = "VK_TIMEOUT"; + break; + + case VkResult::VK_EVENT_SET: + errName = "VK_EVENT_SET"; + break; + + case VkResult::VK_EVENT_RESET: + errName = "VK_EVENT_RESET"; + break; + + case VkResult::VK_INCOMPLETE: + errName = "VK_INCOMPLETE"; + break; + + case VkResult::VK_ERROR_OUT_OF_HOST_MEMORY: + errName = "VK_ERROR_OUT_OF_HOST_MEMORY"; + break; + + case VkResult::VK_ERROR_OUT_OF_DEVICE_MEMORY: + errName = "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + break; + + case VkResult::VK_ERROR_INITIALIZATION_FAILED: + errName = "VK_ERROR_INITIALIZATION_FAILED"; + break; + + case VkResult::VK_ERROR_DEVICE_LOST: + errName = "VK_ERROR_DEVICE_LOST"; + break; + + case VkResult::VK_ERROR_MEMORY_MAP_FAILED: + errName = "VK_ERROR_MEMORY_MAP_FAILED"; + break; + + case VkResult::VK_ERROR_LAYER_NOT_PRESENT: + errName = "VK_ERROR_LAYER_NOT_PRESENT"; + break; + + case VkResult::VK_ERROR_EXTENSION_NOT_PRESENT: + errName = "VK_ERROR_EXTENSION_NOT_PRESENT"; + break; + + case VkResult::VK_ERROR_FEATURE_NOT_PRESENT: + errName = "VK_ERROR_FEATURE_NOT_PRESENT"; + break; + + case VkResult::VK_ERROR_INCOMPATIBLE_DRIVER: + errName = "VK_ERROR_INCOMPATIBLE_DRIVER"; + break; + + case VkResult::VK_ERROR_TOO_MANY_OBJECTS: + errName = "VK_ERROR_TOO_MANY_OBJECTS"; + break; + + case VkResult::VK_ERROR_FORMAT_NOT_SUPPORTED: + errName = "VK_ERROR_FORMAT_NOT_SUPPORTED"; + break; + + case VkResult::VK_ERROR_FRAGMENTED_POOL: + errName = "VK_ERROR_FRAGMENTED_POOL"; + 
break; + + case VkResult::VK_ERROR_OUT_OF_POOL_MEMORY_KHR: + errName = "VK_ERROR_OUT_OF_POOL_MEMORY_KHR"; + break; + + case VkResult::VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR: + errName = "VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR"; + break; + + case VkResult::VK_ERROR_SURFACE_LOST_KHR: + errName = "VK_ERROR_SURFACE_LOST_KHR"; + break; + + case VkResult::VK_ERROR_NATIVE_WINDOW_IN_USE_KHR: + errName = "VK_ERROR_NATIVE_WINDOW_IN_USE_KHR"; + break; + + case VkResult::VK_SUBOPTIMAL_KHR: + errName = "VK_SUBOPTIMAL_KHR"; + break; + + case VkResult::VK_ERROR_OUT_OF_DATE_KHR: + errName = "VK_ERROR_OUT_OF_DATE_KHR"; + break; + + case VkResult::VK_ERROR_INCOMPATIBLE_DISPLAY_KHR: + errName = "VK_ERROR_INCOMPATIBLE_DISPLAY_KHR"; + break; + + case VkResult::VK_ERROR_VALIDATION_FAILED_EXT: + errName = "VK_ERROR_VALIDATION_FAILED_EXT"; + break; + + case VkResult::VK_ERROR_INVALID_SHADER_NV: + errName = "VK_ERROR_INVALID_SHADER_NV"; + break; + + case VkResult::VK_ERROR_NOT_PERMITTED_EXT: + errName = "VK_ERROR_NOT_PERMITTED_EXT"; + break; + + default: + VK_NOT_IMPLEMENTED; + errName = "??"; + break; + }; + + return errName; +} + // ===================================================================================================================== // Converts a non-Success PAL result to an equivalent VK error VkResult PalToVkError( @@ -400,49 +860,64 @@ VkResult PalToVkError( { VK_ASSERT(result != Pal::Result::Success); + VkResult vkResult = VK_SUCCESS; + switch (result) { // These PAL error codes currently aren't handled specially and they indicate success otherwise case Pal::Result::TooManyFlippableAllocations: case Pal::Result::PresentOccluded: - return VK_SUCCESS; + vkResult = VK_SUCCESS; + break; case Pal::Result::Unsupported: - return VK_ERROR_FORMAT_NOT_SUPPORTED; + vkResult = VK_ERROR_FORMAT_NOT_SUPPORTED; + break; case Pal::Result::NotReady: - return VK_NOT_READY; + vkResult = VK_NOT_READY; + break; case Pal::Result::Timeout: case Pal::Result::ErrorFenceNeverSubmitted: - return VK_TIMEOUT; + vkResult = VK_TIMEOUT; + break; case Pal::Result::EventSet: - return VK_EVENT_SET; + vkResult = VK_EVENT_SET; + break; case Pal::Result::EventReset: - return VK_EVENT_RESET; + vkResult = VK_EVENT_RESET; + break; case Pal::Result::ErrorInitializationFailed: - return VK_ERROR_INITIALIZATION_FAILED; + vkResult = VK_ERROR_INITIALIZATION_FAILED; + break; case Pal::Result::ErrorOutOfMemory: - return VK_ERROR_OUT_OF_HOST_MEMORY; + vkResult = VK_ERROR_OUT_OF_HOST_MEMORY; + break; case Pal::Result::ErrorOutOfGpuMemory: - return VK_ERROR_OUT_OF_DEVICE_MEMORY; + vkResult = VK_ERROR_OUT_OF_DEVICE_MEMORY; + break; case Pal::Result::ErrorDeviceLost: - return VK_ERROR_DEVICE_LOST; + vkResult = VK_ERROR_DEVICE_LOST; + break; case Pal::Result::ErrorIncompatibleLibrary: - return VK_ERROR_INCOMPATIBLE_DRIVER; + vkResult = VK_ERROR_INCOMPATIBLE_DRIVER; + break; case Pal::Result::ErrorGpuMemoryMapFailed: - return VK_ERROR_MEMORY_MAP_FAILED; + vkResult = VK_ERROR_MEMORY_MAP_FAILED; + break; case Pal::Result::ErrorNotMappable: - return VK_ERROR_MEMORY_MAP_FAILED; + vkResult = VK_ERROR_MEMORY_MAP_FAILED; + break; case Pal::Result::ErrorUnknown: case Pal::Result::ErrorUnavailable: @@ -499,7 +974,8 @@ VkResult PalToVkError( case Pal::Result::ErrorMultiDevicePresentFailed: case Pal::Result::ErrorWindowedPresentUnavailable: case Pal::Result::ErrorInvalidResolution: - return VK_ERROR_INITIALIZATION_FAILED; + vkResult = VK_ERROR_INITIALIZATION_FAILED; + break; case Pal::Result::ErrorInvalidObjectType: // This is only generated by 
RemapVirtualMemoryPages currently which is only used @@ -520,8 +996,17 @@ VkResult PalToVkError( // There's no private screen support yet. Fall through to the default path. default: VK_NOT_IMPLEMENTED; - return VK_ERROR_INITIALIZATION_FAILED; + vkResult = VK_ERROR_INITIALIZATION_FAILED; + break; } + +#if PAL_ENABLE_PRINTS_ASSERTS + const char* palErrorName = PalResultName(result); + const char* vkErrorName = VkResultName(vkResult); + PAL_DPINFO("Vulkan error: %s(%d), from Pal error: Pal::Result::%s(%d)", vkErrorName, vkResult, palErrorName, result); +#endif + + return vkResult; } // ===================================================================================================================== diff --git a/icd/api/vk_debug_report.cpp b/icd/api/vk_debug_report.cpp new file mode 100644 index 00000000..2ea76d5a --- /dev/null +++ b/icd/api/vk_debug_report.cpp @@ -0,0 +1,175 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#include "include/vk_debug_report.h" +#include "include/vk_instance.h" +#include "palDbgPrint.h" + +namespace vk +{ +// ===================================================================================================================== +// Create a DebugReportCallback object. 
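From the application's point of view, the object constructed by Create() below comes from standard VK_EXT_debug_report usage. A self-contained sketch (the instance handle is assumed to exist, MyReportCallback is an illustrative name, and printf needs <cstdio>):

    static VKAPI_ATTR VkBool32 VKAPI_CALL MyReportCallback(
        VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, uint64_t object,
        size_t location, int32_t messageCode, const char* pLayerPrefix, const char* pMessage, void* pUserData)
    {
        printf("[%s] %s\n", pLayerPrefix, pMessage);
        return VK_FALSE; // VK_FALSE: do not abort the call that triggered the report
    }

    VkDebugReportCallbackCreateInfoEXT info = {};
    info.sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT;
    info.flags       = VK_DEBUG_REPORT_WARNING_BIT_EXT | VK_DEBUG_REPORT_ERROR_BIT_EXT;
    info.pfnCallback = MyReportCallback;

    // The entry point is resolved dynamically because it belongs to an instance extension.
    PFN_vkCreateDebugReportCallbackEXT pfnCreate = reinterpret_cast<PFN_vkCreateDebugReportCallbackEXT>(
        vkGetInstanceProcAddr(instance, "vkCreateDebugReportCallbackEXT"));

    VkDebugReportCallbackEXT callback = VK_NULL_HANDLE;
    VkResult result = pfnCreate(instance, &info, nullptr, &callback);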
+VkResult DebugReportCallback::Create( + Instance* pInstance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback) +{ + VkResult result = VK_SUCCESS; + + void* pSystemMem = pAllocator->pfnAllocation( + pAllocator->pUserData, + sizeof(DebugReportCallback), + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (pSystemMem == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + if (result == VK_SUCCESS) + { + VK_PLACEMENT_NEW(pSystemMem) DebugReportCallback(); + + *pCallback = DebugReportCallback::HandleFromVoidPointer(pSystemMem); + + result = pInstance->RegisterDebugCallback(DebugReportCallback::ObjectFromHandle(*pCallback)); + + if (result == VK_SUCCESS) + { + DebugReportCallback::ObjectFromHandle(*pCallback)->m_createInfo = *pCreateInfo; + } + else + { + Util::Destructor(DebugReportCallback::ObjectFromHandle(*pCallback)); + pAllocator->pfnFree(pAllocator->pUserData, DebugReportCallback::ObjectFromHandle(*pCallback)); + } + } + + return result; +} + +// ===================================================================================================================== +// Destroy a DebugReportCallback object. +void DebugReportCallback::Destroy( + Instance* pInstance, + const VkAllocationCallbacks* pAllocator) +{ + pInstance->UnregisterDebugCallback(this); + + Util::Destructor(this); + + // Free memory + pAllocator->pfnFree(pAllocator->pUserData, this); +} + +// ===================================================================================================================== +// Inject a message into the debug stream from the Debug Report Callback. +void DebugReportCallback::Message( + Instance* pInstance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage) +{ + pInstance->CallExternalCallbacks(flags, + objectType, + object, + location, + messageCode, + pLayerPrefix, + pMessage); +} + +// ===================================================================================================================== +// Get the flags for this callback +VkDebugReportFlagsEXT DebugReportCallback::GetFlags() +{ + return m_createInfo.flags; +} + +// ===================================================================================================================== +// Get the external callback function pointer for this callback +PFN_vkDebugReportCallbackEXT DebugReportCallback::GetCallbackFunc() +{ + return m_createInfo.pfnCallback; +} + +// ===================================================================================================================== +// Get the client-provided user data pointer for this callback +void* DebugReportCallback::GetUserData() +{ + return m_createInfo.pUserData; +} + +namespace entry +{ +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDebugReportCallbackEXT( + VkInstance instance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback) +{ + Instance* pInstance = Instance::ObjectFromHandle(instance); + + const VkAllocationCallbacks* pAllocCB = pAllocator ? 
pAllocator : pInstance->GetAllocCallbacks(); + + return DebugReportCallback::Create(pInstance, pCreateInfo, pAllocCB, pCallback); +} + +VKAPI_ATTR void VKAPI_CALL vkDestroyDebugReportCallbackEXT( + VkInstance instance, + VkDebugReportCallbackEXT callback, + const VkAllocationCallbacks* pAllocator) +{ + Instance* pInstance = Instance::ObjectFromHandle(instance); + + const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pInstance->GetAllocCallbacks(); + + DebugReportCallback::ObjectFromHandle(callback)->Destroy(pInstance, pAllocCB); +} + +VKAPI_ATTR void VKAPI_CALL vkDebugReportMessageEXT( + VkInstance instance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage) +{ + Instance* pInstance = Instance::ObjectFromHandle(instance); + + pInstance->CallExternalCallbacks(flags, objectType, object, location, messageCode, pLayerPrefix, pMessage); +} + +} // namespace entry + +} // namespace vk diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index 0d5c20c9..d00dc160 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp @@ -106,7 +106,7 @@ void DescriptorSet::Reassign( // In this case we also have to copy the immutable sampler data from the descriptor set layout to the // descriptor set's appropriate memory locations. - InitImmutableDescriptors(numPalDevices); + InitImmutableDescriptors(pLayout, numPalDevices); } else { @@ -126,19 +126,23 @@ void DescriptorSet::Reassign( // ===================================================================================================================== // Initialize immutable descriptor data in the descriptor set. -void DescriptorSet::InitImmutableDescriptors(uint32_t numPalDevices) +void DescriptorSet::InitImmutableDescriptors( + const DescriptorSetLayout* pLayout, + uint32_t numPalDevices) { - const size_t imageDescDwSize = m_pLayout->VkDevice()->GetProperties().descriptorSizes.imageView / sizeof(uint32_t); - const size_t samplerDescSize = m_pLayout->VkDevice()->GetProperties().descriptorSizes.sampler; + VK_ASSERT(m_pLayout == pLayout); + + const size_t imageDescDwSize = pLayout->VkDevice()->GetProperties().descriptorSizes.imageView / sizeof(uint32_t); + const size_t samplerDescSize = pLayout->VkDevice()->GetProperties().descriptorSizes.sampler; - uint32_t immutableSamplersLeft = m_pLayout->Info().imm.numImmutableSamplers; + uint32_t immutableSamplersLeft = pLayout->Info().imm.numImmutableSamplers; uint32_t binding = 0; - uint32_t* pSrcData = m_pLayout->Info().imm.pImmutableSamplerData; + uint32_t* pSrcData = pLayout->Info().imm.pImmutableSamplerData; while (immutableSamplersLeft > 0) { - const DescriptorSetLayout::BindingInfo& bindingInfo = m_pLayout->Info().bindings[binding]; + const DescriptorSetLayout::BindingInfo& bindingInfo = pLayout->Binding(binding); uint32_t desCount = bindingInfo.info.descriptorCount; if (bindingInfo.imm.dwSize > 0) diff --git a/icd/api/vk_descriptor_set_layout.cpp b/icd/api/vk_descriptor_set_layout.cpp index 2f448cdf..d46a2985 100644 --- a/icd/api/vk_descriptor_set_layout.cpp +++ b/icd/api/vk_descriptor_set_layout.cpp @@ -282,7 +282,8 @@ void DescriptorSetLayout::ConvertImmutableInfo( VkResult DescriptorSetLayout::ConvertCreateInfo( const Device* pDevice, const VkDescriptorSetLayoutCreateInfo* pIn, - CreateInfo* pOut) + CreateInfo* pOut, + BindingInfo* pOutBindings) { if (pIn == nullptr) { @@ -324,7 +325,7 @@ VkResult 
DescriptorSetLayout::ConvertCreateInfo( for (uint32_t inIndex = 0; inIndex < pInfo->bindingCount; ++inIndex) { const VkDescriptorSetLayoutBinding & currentBinding = pInfo->pBindings[inIndex]; - pOut->bindings[currentBinding.binding].info = currentBinding; + pOutBindings[currentBinding.binding].info = currentBinding; } // Now iterate over our output array to convert the binding info. Any gaps in @@ -332,7 +333,7 @@ VkResult DescriptorSetLayout::ConvertCreateInfo( // should be safe to call ConvertBindingInfo on those as well. for (uint32_t bindingNumber = 0; bindingNumber < pOut->count; ++bindingNumber) { - BindingInfo* pBinding = &pOut->bindings[bindingNumber]; + BindingInfo* pBinding = &pOutBindings[bindingNumber]; // Determine the alignment requirement of descriptors in dwords. uint32_t descAlignmentInDw = pDevice->GetProperties().descriptorSizes.alignment / sizeof(uint32_t); @@ -441,10 +442,10 @@ VkResult DescriptorSetLayout::Create( info.count = bindingCount; // Set the bindings array to the appropriate location within the allocated memory - info.bindings = reinterpret_cast(reinterpret_cast(pSysMem) + apiSize); + BindingInfo* pBindings = reinterpret_cast(reinterpret_cast(pSysMem) + apiSize); // Also memset it as not all bindings may be actually used - memset(info.bindings, 0, bindingInfoAuxSize); + memset(pBindings, 0, bindingInfoAuxSize); // Set the base pointer of the immutable sampler data to the appropriate location within the allocated memory info.imm.pImmutableSamplerData = reinterpret_cast(Util::VoidPtrInc(pSysMem, apiSize + bindingInfoAuxSize)); @@ -453,7 +454,8 @@ VkResult DescriptorSetLayout::Create( VkResult result = ConvertCreateInfo( pDevice, pCreateInfo, - &info); + &info, + pBindings); if (result != VK_SUCCESS) { diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index 7742de73..c58ca16e 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -61,6 +61,7 @@ #include "include/vk_shader.h" #include "include/vk_surface.h" #include "include/vk_swapchain.h" +#include "include/vk_debug_report.h" #include @@ -447,6 +448,11 @@ const DispatchTableEntry g_StandardDispatchTable[] = PRIMARY_DISPATCH_ENTRY( vkGetPhysicalDeviceMultisamplePropertiesEXT ), PRIMARY_DISPATCH_ENTRY( vkGetPhysicalDeviceExternalFencePropertiesKHR ), + PRIMARY_DISPATCH_ENTRY( vkCreateDebugReportCallbackEXT ), + PRIMARY_DISPATCH_ENTRY( vkDestroyDebugReportCallbackEXT ), + PRIMARY_DISPATCH_ENTRY( vkDebugReportMessageEXT ), + + PRIMARY_DISPATCH_ENTRY( vkCmdWriteBufferMarkerAMD ), VK_DISPATCH_TABLE_END() }; diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp index 53d1278f..26d597c0 100644 --- a/icd/api/vk_instance.cpp +++ b/icd/api/vk_instance.cpp @@ -53,6 +53,7 @@ #include "palDevice.h" #include "palPlatform.h" #include "palOglPresent.h" +#include "palListImpl.h" #include @@ -85,7 +86,8 @@ Instance::Instance( #endif m_screenCount(0), m_pScreenStorage(nullptr), - m_pDevModeMgr(nullptr) + m_pDevModeMgr(nullptr), + m_debugReportCallbacks(&m_palAllocator) #if PAL_ENABLE_PRINTS_ASSERTS , m_dispatchTableQueryCount(0) #endif @@ -266,6 +268,13 @@ VkResult Instance::Init( return VK_ERROR_OUT_OF_HOST_MEMORY; } + // Initialize mutexes used for debug report extension before registering the callback with the Platform. 
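One note before the mutex initialization and log-callback registration below: PAL's logging hook is a C-style callback, so the driver uses the usual static-thunk pattern, passing this as the opaque client pointer and casting it back inside a static member (see LogCallback further down). A generic sketch of the pattern, with illustrative names only:

    #include <cstdarg>

    class Owner
    {
    public:
        // Static function with a C-compatible signature; pClientData smuggles 'this' through the C API.
        static void Thunk(void* pClientData, unsigned level, const char* pFormat, va_list args)
        {
            static_cast<Owner*>(pClientData)->Handle(level, pFormat, args);
        }

    private:
        void Handle(unsigned level, const char* pFormat, va_list args);
    };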
+ if ((m_logCallbackInternalOnlyMutex.Init() != Pal::Result::Success) || + (m_logCallbackInternalExternalMutex.Init() != Pal::Result::Success)) + { + return VK_ERROR_INITIALIZATION_FAILED; + } + // Thunk PAL's memory allocator callbacks to our own const Util::AllocCallbacks allocCb = { @@ -276,6 +285,15 @@ VkResult Instance::Init( Pal::PlatformCreateInfo createInfo = { 0 }; createInfo.pAllocCb = &allocCb; + + const Util::LogCallbackInfo callbackInfo = + { + this, + &LogCallback + }; + + createInfo.pLogInfo = &callbackInfo; + createInfo.pSettingsPath = "/etc/amd"; // Switch to "null" GPU mode if requested @@ -584,6 +602,7 @@ const InstanceExtensions::Supported& Instance::GetSupportedExtensions() supportedExtensions.AddExtension(VK_INSTANCE_EXTENSION(KHR_EXTERNAL_SEMAPHORE_CAPABILITIES)); supportedExtensions.AddExtension(VK_INSTANCE_EXTENSION(KHR_EXTERNAL_FENCE_CAPABILITIES)); + supportedExtensions.AddExtension(VK_INSTANCE_EXTENSION(EXT_DEBUG_REPORT)); supportedExtensionsPopulated = true; } @@ -821,6 +840,164 @@ VkResult Instance::QueryApplicationProfile(RuntimeSettings* pRuntimeSettings) } #endif +// ===================================================================================================================== +// Callback function used to route debug prints to the VK_EXT_debug_report extension +void PAL_STDCALL Instance::LogCallback( + void* pClientData, + Pal::uint32 level, + Pal::uint64 categoryMask, + const char* pFormat, + va_list args) +{ + Instance* pInstance = reinterpret_cast<Instance*>(pClientData); + pInstance->LogMessage(level, categoryMask, pFormat, args); +} + +// ===================================================================================================================== +// Add the given Debug Report Callback to the instance. +VkResult Instance::RegisterDebugCallback( + DebugReportCallback* pCallback) +{ + VkResult result = VK_SUCCESS; + + Pal::Result palResult = m_debugReportCallbacks.PushBack(pCallback); + + if (palResult == Pal::Result::Success) + { + result = VK_SUCCESS; + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + +// ===================================================================================================================== +// Remove the given Debug Report Callback from the instance.
+void Instance::UnregisterDebugCallback( + DebugReportCallback* pCallback) +{ + auto it = m_debugReportCallbacks.Begin(); + + DebugReportCallback* element = *it.Get(); + + while (element != nullptr) + { + if (pCallback == element) + { + m_debugReportCallbacks.Erase(&it); + + // Each element should only be in the list once; break out of loop once found + element = nullptr; + } + else + { + it.Next(); + element = *it.Get(); + } + } +} + +// ===================================================================================================================== +// Convert log message data to match the format of the external callback, then call required external callbacks +void Instance::LogMessage(uint32_t level, + uint64_t categoryMask, + const char* pFormat, + va_list args) +{ + // Guarantee serialization of this function to keep internal log messages from getting intermixed + m_logCallbackInternalOnlyMutex.Lock(); + + uint32_t flags = 0; + + if (categoryMask == Pal::LogCategoryMaskInternal) + { + if ((level == static_cast<uint32_t>(Pal::LogLevel::Info)) || + (level == static_cast<uint32_t>(Pal::LogLevel::Verbose))) + { + flags = VK_DEBUG_REPORT_INFORMATION_BIT_EXT; + } + else if (level == static_cast<uint32_t>(Pal::LogLevel::Alert)) + { + flags = VK_DEBUG_REPORT_WARNING_BIT_EXT; + } + else if (level == static_cast<uint32_t>(Pal::LogLevel::Error)) + { + flags = VK_DEBUG_REPORT_ERROR_BIT_EXT; + } + else if (level == static_cast<uint32_t>(Pal::LogLevel::Debug)) + { + flags = VK_DEBUG_REPORT_DEBUG_BIT_EXT; + } + else if (level == static_cast<uint32_t>(Pal::LogLevel::Always)) + { + flags = VK_DEBUG_REPORT_DEBUG_BIT_EXT | + VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT; + } + } + else if (categoryMask == Pal::LogCategoryMaskPerformance) + { + flags = VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT; + } + + constexpr uint64_t object = 0; + constexpr size_t location = 0; + constexpr int32_t messageCode = 0; + constexpr char layerPrefix[] = "AMDVLK\0"; + + constexpr uint32_t messageSize = 256; + char message[messageSize]; + + Util::Vsnprintf(message, + messageSize, + pFormat, + args); + + CallExternalCallbacks(flags, + VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT, + object, + location, + messageCode, + layerPrefix, + message); + + m_logCallbackInternalOnlyMutex.Unlock(); +} + +// ===================================================================================================================== +// Call all registered callbacks with the given VkDebugReportFlagsEXT.
+void Instance::CallExternalCallbacks( + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage) +{ + // Guarantee serialization of this function to keep internal and external log messages from getting intermixed + m_logCallbackInternalExternalMutex.Lock(); + + for (auto it = m_debugReportCallbacks.Begin(); it.Get() != nullptr; it.Next()) + { + DebugReportCallback* element = *it.Get(); + + if (flags & element->GetFlags()) + { + PFN_vkDebugReportCallbackEXT pfnCallback = element->GetCallbackFunc(); + void* pUserData = element->GetUserData(); + + (*pfnCallback)(flags, objectType, object, location, messageCode, pLayerPrefix, pMessage, pUserData); + } + } + + m_logCallbackInternalExternalMutex.Unlock(); +} + namespace entry { diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp index af98b7fd..e707dfcd 100644 --- a/icd/api/vk_memory.cpp +++ b/icd/api/vk_memory.cpp @@ -349,25 +349,6 @@ VkResult Memory::Create( } else if (vkResult == VK_SUCCESS) { - // Initialize tiny host visible allocations to zero - const uint32_t NumBytesToZero = 32; - - if ((pAllocInfo->allocationSize < NumBytesToZero) && - (createInfo.heaps[0] != Pal::GpuHeapInvisible)) - { - void* pData = nullptr; - VkResult result = pMemory->Map(0, 0, NumBytesToZero, &pData); - - VK_ASSERT(createInfo.size >= NumBytesToZero); - - if (result == VK_SUCCESS) - { - memset(pData, 0, NumBytesToZero); - - pMemory->Unmap(); - } - } - // notify the memory object that it is counted so that the destructor can decrease the counter accordingly pMemory->SetAllocationCounted(); diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index a2315675..3d545de9 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -2444,6 +2444,8 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( // TODO: Add this extension if the related implementation of Linux is done. 
// availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_EXTERNAL_FENCE_FD)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(AMD_BUFFER_MARKER)); + return availableExtensions; } diff --git a/icd/api/vk_pipeline.cpp b/icd/api/vk_pipeline.cpp index ba3be9ca..9efc6312 100644 --- a/icd/api/vk_pipeline.cpp +++ b/icd/api/vk_pipeline.cpp @@ -484,12 +484,6 @@ void Pipeline::CreateLegacyPathElfBinary( gpuVersionNote.gfxipMajorVer = 9; gpuVersionNote.gfxipMinorVer = 0; break; -#ifdef VKI_CLOSED_SOURCE - case Pal::GfxIpLevel::GfxIp10: - gpuVersionNote.gfxipMajorVer = 10; - gpuVersionNote.gfxipMinorVer = 0; - break; -#endif default: VK_NEVER_CALLED(); break; diff --git a/icd/api/vk_query.cpp b/icd/api/vk_query.cpp index 7e0463be..169d8720 100644 --- a/icd/api/vk_query.cpp +++ b/icd/api/vk_query.cpp @@ -158,7 +158,10 @@ VkResult PalQueryPool::Create( { // Allocate and bind GPU memory for the object const bool removeInvisibleHeap = true; - result = pDevice->MemMgr()->AllocAndBindGpuMem(pPalQueryPool, false, &internalMem, removeInvisibleHeap); + const bool persistentMapped = true; + + result = pDevice->MemMgr()->AllocAndBindGpuMem( + pPalQueryPool, false, &internalMem, removeInvisibleHeap, persistentMapped); } if (result == VK_SUCCESS) @@ -227,6 +230,7 @@ VkResult PalQueryPool::GetResults( m_palQueryType, startQuery, queryCount, + m_internalMem.CpuAddr(), &dataSize, pData, static_cast<size_t>(stride)); @@ -406,8 +410,9 @@ VkResult TimestampQueryPool::GetResults( // Although the spec says that dataSize has to be large enough to contain the result of each query, which sort // of sounds like it makes it redundant, clamp the maximum number of queries written to the given dataSize - // just in case, since it's harmless to do. - queryCount = static_cast<uint32_t>(Util::Min(static_cast<size_t>(queryCount), dataSize / querySlotSize)); + // and take account of the supplied stride, since it's harmless to do. + queryCount = Util::Min(queryCount, + static_cast<uint32_t>(dataSize / Util::Max(querySlotSize, static_cast<size_t>(stride)))); // Write results of each query slot for (uint32_t dstSlot = 0; dstSlot < queryCount; ++dstSlot) diff --git a/icd/make/importdefs b/icd/make/importdefs index a0b4c8cf..cda93b6d 100644 --- a/icd/make/importdefs +++ b/icd/make/importdefs @@ -1,7 +1,7 @@ # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. It must # be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -ICD_PAL_CLIENT_MAJOR_VERSION = 366 +ICD_PAL_CLIENT_MAJOR_VERSION = 377 # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. It describes # the interface version of the gpuopen shared module (part of PAL) that the ICD supports.
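Note (illustration only, not part of the patch): the vk_query.cpp hunk above tightens the queryCount clamp in TimestampQueryPool::GetResults() to account for the caller-supplied stride, not just the 8-byte timestamp slot size. A standalone sketch of the same arithmetic, with illustrative names rather than the driver's, shows why the stride matters:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Each written result occupies max(querySlotSize, stride) bytes of the
    // destination buffer, so that is the divisor that must bound queryCount.
    static uint32_t ClampQueryCount(uint32_t queryCount, size_t dataSize, size_t querySlotSize, size_t stride)
    {
        return static_cast<uint32_t>(
            std::min<size_t>(queryCount, dataSize / std::max(querySlotSize, stride)));
    }

    // Example: 64-byte buffer, 8-byte slots, 16-byte stride, 8 queries requested.
    // Old bound: 64 / 8 = 8 results, but the write loop advances 16 bytes per
    // result and would run past the buffer. New bound: 64 / max(8, 16) = 4.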
diff --git a/icd/res/ver.h b/icd/res/ver.h index b450f0d3..6501d652 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -40,7 +40,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 9 +#define VULKAN_ICD_BUILD_VERSION 10 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index f080caa1..d9d07c66 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -105,6 +105,9 @@ void ProcessSettings( // setup default values for the settings. SetupDefaults(pSettings); + // Update PAL settings based on runtime settings and desired driver defaults if needed + UpdatePalSettings(pPalDevice, pSettings); + #ifdef ICD_BUILD_APPPROFILE const AppProfile origProfile = *pAppProfile; // Override defaults based on application profile @@ -136,11 +139,6 @@ void ProcessSettings( { ProcessSettings(pPalDevice, pAppProfile, pSettings); } - else - { - // update PAL settings based on runtime settings if needed - UpdatePalSettings(pPalDevice, pSettings); - } #endif } @@ -180,7 +178,7 @@ void UpdatePalSettings( { Pal::PalPublicSettings* pPalSettings = pPalDevice->GetPublicSettings(); - /* Nothing to do here at the moment */ + pPalSettings->hintDisableSmallSurfColorCompressionSize = 0; } };
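Note (illustration only, not part of the patch): the VK_EXT_debug_report plumbing added in vk_instance.cpp and vk_debug_report.cpp is driven entirely by the application. A minimal usage sketch, assuming an instance created with VK_EXT_DEBUG_REPORT_EXTENSION_NAME enabled, would exercise the new entry points like this:

    #include <vulkan/vulkan.h>
    #include <cstdio>

    static VKAPI_ATTR VkBool32 VKAPI_CALL DebugCallback(
        VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType,
        uint64_t object, size_t location, int32_t messageCode,
        const char* pLayerPrefix, const char* pMessage, void* pUserData)
    {
        printf("[%s] %s\n", pLayerPrefix, pMessage);
        return VK_FALSE; // do not abort the call that triggered the report
    }

    void InstallCallback(VkInstance instance, VkDebugReportCallbackEXT* pHandle)
    {
        auto pfnCreate = reinterpret_cast<PFN_vkCreateDebugReportCallbackEXT>(
            vkGetInstanceProcAddr(instance, "vkCreateDebugReportCallbackEXT"));

        VkDebugReportCallbackCreateInfoEXT info = {};
        info.sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT;
        info.flags       = VK_DEBUG_REPORT_WARNING_BIT_EXT | VK_DEBUG_REPORT_ERROR_BIT_EXT;
        info.pfnCallback = DebugCallback;

        // Lands in vk::entry::vkCreateDebugReportCallbackEXT above, which adds the
        // callback to the Instance's m_debugReportCallbacks list; PAL log messages
        // are then fanned out to it via LogCallback -> LogMessage ->
        // CallExternalCallbacks with the flag mapping shown in vk_instance.cpp.
        pfnCreate(instance, &info, nullptr, pHandle);
    }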