diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index 8ecb6d0f..db718ea5 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -159,6 +159,7 @@ target_sources(xgl PRIVATE api/color_space_helper.cpp api/gpu_event_mgr.cpp api/internal_mem_mgr.cpp + api/pipeline_compiler.cpp api/stencil_ops_combiner.cpp api/vert_buf_binding_mgr.cpp api/virtual_stack_mgr.cpp diff --git a/icd/api/include/khronos/sdk-1.1/vk_layer_dispatch_table.h b/icd/api/include/khronos/sdk-1.1/vk_layer_dispatch_table.h index 2cc12a6b..572982df 100644 --- a/icd/api/include/khronos/sdk-1.1/vk_layer_dispatch_table.h +++ b/icd/api/include/khronos/sdk-1.1/vk_layer_dispatch_table.h @@ -499,6 +499,9 @@ typedef struct VkLayerDispatchTable_ { // ---- VK_EXT_external_memory_host extension commands PFN_vkGetMemoryHostPointerPropertiesEXT GetMemoryHostPointerPropertiesEXT; + + // ---- VK_AMD_buffer_marker extension commands + PFN_vkCmdWriteBufferMarkerAMD CmdWriteBufferMarkerAMD; } VkLayerDispatchTable; diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h new file mode 100644 index 00000000..3d50c41f --- /dev/null +++ b/icd/api/include/pipeline_compiler.h @@ -0,0 +1,149 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file pipeline_compiler.h + * @brief Contains declaration of Vulkan pipeline compiler + *********************************************************************************************************************** + */ + +#pragma once + +#include "include/khronos/vulkan.h" +#include "include/vk_utils.h" +#include "include/vk_defines.h" +#include "include/vk_shader_code.h" + +#include "llpc.h" + +namespace Bil +{ + +struct BilConvertOptions; +struct BilShaderPatchOutput; +enum BilDescriptorType : uint32_t; + +} + +namespace vk +{ + +class PhysicalDevice; +class PipelineLayout; +class PipelineCache; +struct VbBindingInfo; + +// ===================================================================================================================== +// Represents Vulkan pipeline compiler, it wraps LLPC and SCPC, and hides the differences. +class PipelineCompiler +{ +public: + // Creation info parameters for all the necessary LLPC/SCPC state objects encapsulated + // by the Vulkan graphics pipeline. + struct GraphicsPipelineCreateInfo + { + Llpc::GraphicsPipelineBuildInfo pipelineInfo; + const PipelineLayout* pLayout; + const VkPipelineShaderStageCreateInfo* pStages[ShaderGfxStageCount]; + VkPipelineCreateFlags flags; + void* pMappingBuffer; + VkFormat dbFormat; + }; + + // Creation info parameters for all the necessary LLPC/SCPC state objects encapsulated + // by the Vulkan compute pipeline. + struct ComputePipelineCreateInfo + { + Llpc::ComputePipelineBuildInfo pipelineInfo; + const PipelineLayout* pLayout; + const VkPipelineShaderStageCreateInfo* pStage; + VkPipelineCreateFlags flags; + void* pMappingBuffer; + }; + + PipelineCompiler(PhysicalDevice* pPhysicalDevice); + ~PipelineCompiler(); + VkResult Initialize(); + void Destroy(); + + VkResult CreateGraphicsPipelineBinary( + Device* pDevice, + uint32_t deviceIndex, + PipelineCache* pPipelineCache, + GraphicsPipelineCreateInfo* pCreateInfo, + size_t* pPipelineBinarySize, + const void** ppPipelineBinary); + + VkResult CreateComputePipelineBinary( + Device* pDevice, + uint32_t deviceIndex, + PipelineCache* pPipelineCache, + ComputePipelineCreateInfo* pInfo, + size_t* pPipelineBinarySize, + const void** ppPipelineBinary); + + VkResult ConvertGraphicsPipelineInfo( + Device* pDevice, + const VkGraphicsPipelineCreateInfo* pIn, + GraphicsPipelineCreateInfo* pInfo, + VbBindingInfo* pVbInfo); + + VkResult ConvertComputePipelineInfo( + const VkComputePipelineCreateInfo* pIn, + ComputePipelineCreateInfo* pInfo); + + void FreeComputePipelineBinary( + ComputePipelineCreateInfo* pCreateInfo, + const void* pPipelineBinary, + size_t binarySize); + + void FreeGraphicsPipelineBinary( + GraphicsPipelineCreateInfo* pCreateInfo, + const void* pPipelineBinary, + size_t binarySize); + + void FreeComputePipelineCreateInfo(ComputePipelineCreateInfo* pCreateInfo); + + void FreeGraphicsPipelineCreateInfo(GraphicsPipelineCreateInfo* pCreateInfo); + // Get LLPC compiler explicitly. 
+ // TODO: Should be removed in the future + Llpc::ICompiler* GetLlpcCompiler() { return m_pLlpc; } + +private: + VkResult CreateLlpcCompiler(); + + static bool IsDualSourceBlend(VkBlendFactor blend); + + // ----------------------------------------------------------------------------------------------------------------- + + PhysicalDevice* m_pPhysicalDevice; // Vulkan physical device object + Llpc::GfxIpVersion m_gfxIp; // Graphics IP version info, used by LLPC + Pal::GfxIpLevel m_gfxIpLevel; // Graphics IP Level, used by SCPC + + Llpc::ICompiler* m_pLlpc; // LLPC compiler object + +}; // class PipelineCompiler + +} // namespce vk diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 29f8729b..23308d19 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -853,6 +853,8 @@ class CmdBuffer const uint32_t queryCount, const uint32_t timestampChunk); + VK_INLINE uint32_t EstimateMaxObjectsOnVirtualStack(size_t objectSize) const; + #if VK_ENABLE_DEBUG_BARRIERS void DbgCmdBarrier(bool preCmd); #endif diff --git a/icd/api/include/vk_compute_pipeline.h b/icd/api/include/vk_compute_pipeline.h index d4e45fc6..62412eff 100644 --- a/icd/api/include/vk_compute_pipeline.h +++ b/icd/api/include/vk_compute_pipeline.h @@ -110,8 +110,6 @@ class ComputePipeline : public Pipeline, public NonDispatchable { public: - VK_INLINE void WriteSamplerDescriptors( - const Device::Properties& deviceProperties, + template + static void WriteSamplerDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t* pDestAddr, uint32_t count, uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteImageSamplerDescriptors( - const Device::Properties& deviceProperties, + template + static void WriteImageSamplerDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -85,9 +85,8 @@ class DescriptorSet : public NonDispatchable uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteImageDescriptors( - VkDescriptorType descType, - const Device::Properties& deviceProperties, + template + static void WriteImageDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -95,8 +94,8 @@ class DescriptorSet : public NonDispatchable uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteFmaskDescriptors( - const Device* pDevice, + template + static void WriteFmaskDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -104,9 +103,9 @@ class DescriptorSet : public NonDispatchable uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteBufferInfoDescriptors( + template + static void WriteBufferInfoDescriptors( const Device* pDevice, - VkDescriptorType type, const VkDescriptorBufferInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -114,9 +113,8 @@ class DescriptorSet : public NonDispatchable uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteBufferDescriptors( - const Device::Properties& deviceProperties, - VkDescriptorType type, + template + static void WriteBufferDescriptors( const VkBufferView* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -161,39 +159,52 @@ class DescriptorSet : public NonDispatchable const uint32_t* pDynamicOffsets, uint32_t numDynamicDescriptors); + static PFN_vkUpdateDescriptorSets GetUpdateDescriptorSetsFunc(const Device* pDevice); + +protected: + DescriptorSet( + DescriptorPool* pPool, + uint32_t heapIndex, + 
DescriptorSetFlags flags); + + ~DescriptorSet() + { PAL_NEVER_CALLED(); } + + template + static PFN_vkUpdateDescriptorSets GetUpdateDescriptorSetsFunc(const Device* pDevice); + + template + static VKAPI_ATTR void VKAPI_CALL UpdateDescriptorSets( + VkDevice device, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet* pDescriptorCopies); + + template static void WriteDescriptorSets( const Device* pDevice, uint32_t deviceIdx, - const Device::Properties& deviceProperties, uint32_t descriptorWriteCount, const VkWriteDescriptorSet* pDescriptorWrites, - size_t descriptorStrideInBytes = 0); + size_t descriptorStrideInBytes = 0); + template static void CopyDescriptorSets( const Device* pDevice, uint32_t deviceIdx, - const Device::Properties& deviceProperties, uint32_t descriptorCopyCount, const VkCopyDescriptorSet* pDescriptorCopies); -protected: - DescriptorSet( - DescriptorPool* pPool, - uint32_t heapIndex, - DescriptorSetFlags flags); - - ~DescriptorSet() - { PAL_NEVER_CALLED(); } - void Reassign( const DescriptorSetLayout* pLayout, Pal::gpusize gpuMemOffset, Pal::gpusize* gpuBaseAddress, uint32_t** cpuBaseAddress, uint32_t numPalDevices, - const InternalMemory* const pInternalMem, - void* pAllocHandle, - VkDescriptorSet* pHandle); + void* pAllocHandle); + + void Reset(); void InitImmutableDescriptors( const DescriptorSetLayout* pLayout, diff --git a/icd/api/include/vk_descriptor_set_layout.h b/icd/api/include/vk_descriptor_set_layout.h index e4a3948a..4422cd70 100644 --- a/icd/api/include/vk_descriptor_set_layout.h +++ b/icd/api/include/vk_descriptor_set_layout.h @@ -126,6 +126,29 @@ class DescriptorSetLayout : public NonDispatchable(Util::VoidPtrInc(this, sizeof(*this))); + } + + template + static PfnUpdateEntry GetUpdateEntryFunc( + const Device* pDevice, + VkDescriptorType descriptorType, + const DescriptorSetLayout::BindingInfo& dstBinding); + + static PfnUpdateEntry GetUpdateEntryFunc( + const Device* pDevice, + VkDescriptorType descriptorType, + const DescriptorSetLayout::BindingInfo& dstBinding); + + template + static void UpdateEntrySampledImage( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + template + static void UpdateEntrySampler( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + template + static void UpdateEntryBuffer( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + template + static void UpdateEntryTexelBuffer( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + template + static void UpdateEntryCombinedImageSampler( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + uint32_t m_numEntries; }; namespace entry diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h index e9ad6f1f..882fc906 100644 --- a/icd/api/include/vk_device.h +++ b/icd/api/include/vk_device.h @@ -465,8 +465,8 @@ class Device VK_INLINE Util::Mutex* GetTimerQueueMutex() { return &m_timerQueueMutex; } - VK_INLINE Llpc::ICompiler* GetLlpcCompiler(uint32_t idx = DefaultDeviceIndex) const - { return 
m_pLlpcCompiler[idx]; } + VK_INLINE PipelineCompiler* GetCompiler(uint32_t idx = DefaultDeviceIndex) const + { return m_pPhysicalDevices[idx]->GetCompiler(); } static const Pal::MsaaQuadSamplePattern* GetDefaultQuadSamplePattern(uint32_t sampleCount); static uint32_t GetDefaultSamplePatternIndex(uint32_t sampleCount); @@ -485,6 +485,9 @@ class Device VkExternalMemoryHandleTypeFlagBitsKHR handleType, const void* pExternalPtr) const; + PFN_vkUpdateDescriptorSets GetUpdateDescriptorSetsFunc() const + { return m_pfnUpdateDescriptorSets; } + protected: Device( uint32_t deviceCount, @@ -507,7 +510,7 @@ class Device void InitSamplePatternPalette(Pal::SamplePatternPalette* pPalette) const; - VkResult CreateLlpcCompiler(int32_t idx = DefaultDeviceIndex); + void InitEntryPointFuncs(); Instance* const m_pInstance; const RuntimeSettings& m_settings; @@ -547,11 +550,11 @@ class Device // The maximum allocations that can be created from the logical device uint32_t m_maxAllocations; - Llpc::ICompiler* m_pLlpcCompiler[MaxPalDevices]; - // Record pipeline cache count created on this device. Note this may be dropped once there isn't any test creating // excessive pipeline caches. volatile uint32_t m_pipelineCacheCount; + + PFN_vkUpdateDescriptorSets m_pfnUpdateDescriptorSets; }; // ===================================================================================================================== diff --git a/icd/api/include/vk_graphics_pipeline.h b/icd/api/include/vk_graphics_pipeline.h index d91609dc..ee66afa6 100644 --- a/icd/api/include/vk_graphics_pipeline.h +++ b/icd/api/include/vk_graphics_pipeline.h @@ -268,18 +268,13 @@ class GraphicsPipeline : public Pipeline, public NonDispatchableGetAPIVersion()); +#else + return VkInstance()->GetAPIVersion(); +#endif + } + #ifdef ICD_BUILD_APPPROFILE VK_INLINE AppProfile GetAppProfile() const { return m_appProfile; } @@ -391,6 +401,10 @@ class PhysicalDevice void LateInitialize(); + VK_FORCEINLINE PipelineCompiler* GetCompiler() + { + return &m_compiler; + } protected: PhysicalDevice(PhysicalDeviceManager* pPhysicalDeviceManager, Pal::IDevice* pPalDevice, @@ -406,6 +420,11 @@ class PhysicalDevice void PopulateExtensions(); void PopulateGpaProperties(); + VK_FORCEINLINE bool IsPerChannelMinMaxFilteringSupported() const + { + return m_properties.gfxipProperties.flags.supportPerChannelMinMaxFilter; + } + PhysicalDeviceManager* m_pPhysicalDeviceManager; Pal::IDevice* m_pPalDevice; Pal::DeviceProperties m_properties; @@ -436,6 +455,8 @@ class PhysicalDevice // Device properties related to the VK_AMD_gpu_perf_api_interface extension PhysicalDeviceGpaProperties m_gpaProps; + + PipelineCompiler m_compiler; }; VK_DEFINE_DISPATCHABLE(PhysicalDevice); diff --git a/icd/api/include/vk_render_pass.h b/icd/api/include/vk_render_pass.h index d69e8096..10df096c 100644 --- a/icd/api/include/vk_render_pass.h +++ b/icd/api/include/vk_render_pass.h @@ -128,6 +128,20 @@ class RenderPass : public NonDispatchable VK_INLINE uint32_t GetViewMask(uint32_t subpass) const { return m_createInfo.pSubpasses[subpass].viewMask; } + VK_INLINE uint32_t GetActiveViewsBitMask() const + { + uint32_t activeViewsBitMask = 0; + + // View is considered active when it is used in any subpass defined by RenderPass. + for (uint32_t subpass = 0; subpass < GetSubpassCount(); ++subpass) + { + activeViewsBitMask |= GetViewMask(subpass); + } + + // ActiveViewsBitMask can be understood as RenderPass ViewMask. 
+ return activeViewsBitMask; + } + VK_INLINE bool IsMultiviewEnabled() const { // When a subpass uses a non-zero view mask, diff --git a/icd/api/llpc/CMakeLists.txt b/icd/api/llpc/CMakeLists.txt index 10f1da50..2f94e123 100644 --- a/icd/api/llpc/CMakeLists.txt +++ b/icd/api/llpc/CMakeLists.txt @@ -87,7 +87,6 @@ message(STATUS "LLVM link options:" ${LLVM_LINK_FLAGS}) target_compile_definitions(llpc PRIVATE ${TARGET_ARCHITECTURE_ENDIANESS}ENDIAN_CPU) target_compile_definitions(llpc PRIVATE _SPIRV_LLVM_API) -target_compile_definitions(llpc PRIVATE LLPC_BUILD_GFX9) if(XGL_LLVM_UPSTREAM) target_compile_definitions(llpc PRIVATE XGL_LLVM_UPSTREAM=1) @@ -292,7 +291,6 @@ add_dependencies(amdllpc llpc) target_compile_definitions(amdllpc PRIVATE ${TARGET_ARCHITECTURE_ENDIANESS}ENDIAN_CPU) target_compile_definitions(amdllpc PRIVATE _SPIRV_LLVM_API) -target_compile_definitions(amdllpc PRIVATE LLPC_BUILD_GFX9) target_include_directories(amdllpc PUBLIC @@ -337,10 +335,5 @@ if(UNIX) endif() target_link_libraries(amdllpc PRIVATE llpc dl stdc++) - if(XGL_LLVM_UPSTREAM) - llvm_map_components_to_libnames(llvm_libs amdgpucodegen amdgpuinfo amdgpuasmparser amdgpudisassembler LTO ipo analysis bitreader bitwriter codegen irreader linker mc passes support target transformutils coroutines aggressiveinstcombine) - else() - llvm_map_components_to_libnames(llvm_libs amdgpucodegen amdgpuinfo amdgpuasmparser amdgpudisassembler LTO ipo analysis bitreader bitwriter codegen irreader linker mc passes support target transformutils coroutines) - endif() - + llvm_map_components_to_libnames(llvm_libs amdgpucodegen amdgpuinfo amdgpuasmparser amdgpudisassembler LTO ipo analysis bitreader bitwriter codegen irreader linker mc passes support target transformutils coroutines aggressiveinstcombine) target_link_libraries(amdllpc PRIVATE ${llvm_libs}) diff --git a/icd/api/llpc/context/llpcCompiler.cpp b/icd/api/llpc/context/llpcCompiler.cpp index 494c67e4..3d100335 100644 --- a/icd/api/llpc/context/llpcCompiler.cpp +++ b/icd/api/llpc/context/llpcCompiler.cpp @@ -51,16 +51,12 @@ #include "llpcContext.h" #include "llpcCopyShader.h" #include "llpcGfx6Chip.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcGfx9Chip.h" -#endif #include "llpcGraphicsContext.h" #include "llpcElf.h" #include "llpcFile.h" #include "llpcPatch.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcShaderMerger.h" -#endif #include "llpcPipelineDumper.h" #include "llpcSpirvLower.h" #include "llpcVertexFetch.h" @@ -278,9 +274,7 @@ Compiler::Compiler( } else { -#ifdef LLPC_BUILD_GFX9 Gfx9::InitRegisterNameMap(gfxIp); -#endif } } @@ -749,13 +743,11 @@ Result Compiler::BuildGraphicsPipeline( pContext->SetGsOnChip(gsOnChip); } -#ifdef LLPC_BUILD_GFX9 // Do user data node merge for merged shader if ((result == Result::Success) && (m_gfxIp.major >= 9)) { pContext->DoUserDataNodeMerge(); } -#endif // Do LLVM module patching (main patch work) for (int32_t stage = ShaderStageGfxCount - 1; (stage >= 0) && (result == Result::Success); --stage) @@ -783,7 +775,6 @@ Result Compiler::BuildGraphicsPipeline( } } -#ifdef LLPC_BUILD_GFX9 // Do shader merge operations if ((result == Result::Success) && (m_gfxIp.major >= 9)) { @@ -854,7 +845,6 @@ Result Compiler::BuildGraphicsPipeline( modules[ShaderStageGeometry] = pEsGsModule; } } -#endif // Build copy shader if necessary (has geometry shader) if ((result == Result::Success) && (modules[ShaderStageGeometry] != nullptr)) @@ -1757,14 +1747,10 @@ void Compiler::InitGpuProperty() } else if (m_gfxIp.major == 9) { -#ifdef LLPC_BUILD_GFX9 if (m_gfxIp.stepping == 0) 
{ m_gpuProperty.numShaderEngines = 4; } -#else - LLPC_NOT_IMPLEMENTED(); -#endif } else { diff --git a/icd/api/llpc/context/llpcComputeContext.h b/icd/api/llpc/context/llpcComputeContext.h index 49ddb6d5..5cae77e4 100644 --- a/icd/api/llpc/context/llpcComputeContext.h +++ b/icd/api/llpc/context/llpcComputeContext.h @@ -75,10 +75,8 @@ class ComputeContext: public PipelineContext // Enables GS on-chip mode virtual void SetGsOnChip(bool gsOnChip) { LLPC_NEVER_CALLED(); } -#ifdef LLPC_BUILD_GFX9 // Does user data node merge for merged shader virtual void DoUserDataNodeMerge() { LLPC_NEVER_CALLED(); } -#endif protected: virtual std::vector* GetDummyResourceMapNodes(ShaderStage shaderStage); diff --git a/icd/api/llpc/context/llpcContext.cpp b/icd/api/llpc/context/llpcContext.cpp index c0dce682..da89b9b9 100644 --- a/icd/api/llpc/context/llpcContext.cpp +++ b/icd/api/llpc/context/llpcContext.cpp @@ -66,12 +66,10 @@ const uint8_t Context::GlslEmuLibGfx8[] = #include "./generate/gfx8/g_llpcGlslEmuLibGfx8.h" }; -#ifdef LLPC_BUILD_GFX9 const uint8_t Context::GlslEmuLibGfx9[]= { #include "./generate/gfx9/g_llpcGlslEmuLibGfx9.h" }; -#endif // ===================================================================================================================== Context::Context( @@ -134,7 +132,6 @@ Context::Context( if (gfxIp.major >= 9) { -#ifdef LLPC_BUILD_GFX9 libBin.codeSize = sizeof(GlslEmuLibGfx9); libBin.pCode = GlslEmuLibGfx9; pGlslEmuLibGfx = LoadLibary(&libBin); @@ -143,9 +140,6 @@ Context::Context( { LLPC_ERRS("Fails to link LLVM libraries together\n"); } -#else - LLPC_NOT_IMPLEMENTED(); -#endif } // Do function inlining @@ -162,7 +156,7 @@ Context::Context( // Remove non-native function for native lib { - m_pNativeGlslEmuLib = CloneModule(m_pGlslEmuLib.get()); + m_pNativeGlslEmuLib = CloneModule(*m_pGlslEmuLib.get()); legacy::PassManager passMgr; passMgr.add(PassNonNativeFuncRemove::Create()); diff --git a/icd/api/llpc/context/llpcContext.h b/icd/api/llpc/context/llpcContext.h index 7624e65c..5f0241de 100644 --- a/icd/api/llpc/context/llpcContext.h +++ b/icd/api/llpc/context/llpcContext.h @@ -223,12 +223,10 @@ class Context : public llvm::LLVMContext m_pPipelineContext->SetGsOnChip(gsOnChip); } -#ifdef LLPC_BUILD_GFX9 void DoUserDataNodeMerge() { m_pPipelineContext->DoUserDataNodeMerge(); } -#endif uint64_t GetPiplineHashCode() const { @@ -294,10 +292,7 @@ class Context : public llvm::LLVMContext // GLSL emulation libraries static const uint8_t GlslEmuLib[]; static const uint8_t GlslEmuLibGfx8[]; -#ifdef LLPC_BUILD_GFX9 static const uint8_t GlslEmuLibGfx9[]; -#endif - }; } // Llpc diff --git a/icd/api/llpc/context/llpcGraphicsContext.cpp b/icd/api/llpc/context/llpcGraphicsContext.cpp index da70c9ad..2fa150f1 100644 --- a/icd/api/llpc/context/llpcGraphicsContext.cpp +++ b/icd/api/llpc/context/llpcGraphicsContext.cpp @@ -33,9 +33,7 @@ #include "SPIRVInternal.h" #include "llpcCompiler.h" #include "llpcGfx6Chip.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcGfx9Chip.h" -#endif #include "llpcGraphicsContext.h" #include "llpcInternal.h" @@ -80,13 +78,11 @@ GraphicsContext::GraphicsContext( m_tessOffchip(cl::EnableTessOffChip), m_gsOnChip(false) { -#ifdef LLPC_BUILD_GFX9 if (gfxIp.major >= 9) { // For GFX9+, always enable tessellation off-chip mode m_tessOffchip = true; } -#endif const PipelineShaderInfo* shaderInfo[ShaderStageGfxCount] = { @@ -124,12 +120,10 @@ GraphicsContext::GraphicsContext( // 
===================================================================================================================== GraphicsContext::~GraphicsContext() { -#ifdef LLPC_BUILD_GFX9 for (auto pAllocNodes : m_allocUserDataNodes) { delete pAllocNodes; } -#endif } // ===================================================================================================================== @@ -500,7 +494,6 @@ bool GraphicsContext::CheckGsOnChipValidity() } else { -#ifdef LLPC_BUILD_GFX9 uint32_t gsPrimsPerSubgroup = m_pGpuProperty->gsOnChipDefaultPrimsPerSubgroup; // NOTE: Make esGsItemSize odd by "| 1", to optimize ES -> GS ring layout for LDS bank conflicts @@ -597,9 +590,6 @@ bool GraphicsContext::CheckGsOnChipValidity() // TODO: GFX9 GS -> VS ring on chip is not supported yet gsOnChip = false; -#else - LLPC_NOT_IMPLEMENTED(); -#endif } LLPC_OUTS("===============================================================================\n"); @@ -626,7 +616,6 @@ bool GraphicsContext::CheckGsOnChipValidity() return gsOnChip; } -#ifdef LLPC_BUILD_GFX9 // ===================================================================================================================== // Does user data node merging for merged shader void GraphicsContext::DoUserDataNodeMerge() @@ -842,6 +831,5 @@ void GraphicsContext::MergeUserDataNode( *pMergedNodeCount = mergedNodeCount; *ppMergedNodes = pMergedNodes; } -#endif } // Llpc diff --git a/icd/api/llpc/context/llpcGraphicsContext.h b/icd/api/llpc/context/llpcGraphicsContext.h index 43f0c733..dbabade5 100644 --- a/icd/api/llpc/context/llpcGraphicsContext.h +++ b/icd/api/llpc/context/llpcGraphicsContext.h @@ -81,9 +81,7 @@ class GraphicsContext: public PipelineContext // Enables GS on-chip mode virtual void SetGsOnChip(bool gsOnChip) { m_gsOnChip = gsOnChip; } -#ifdef LLPC_BUILD_GFX9 virtual void DoUserDataNodeMerge(); -#endif void InitShaderInfoForNullFs(); @@ -104,14 +102,12 @@ class GraphicsContext: public PipelineContext LLPC_DISALLOW_DEFAULT_CTOR(GraphicsContext); LLPC_DISALLOW_COPY_AND_ASSIGN(GraphicsContext); -#ifdef LLPC_BUILD_GFX9 void MergeUserDataNode(uint32_t nodeCount1, const ResourceMappingNode* pNodes1, uint32_t nodeCount2, const ResourceMappingNode* pNodes2, uint32_t* pMergedNodeCount, const ResourceMappingNode** ppMergedNodes); -#endif const GraphicsPipelineBuildInfo* m_pPipelineInfo; // Info to build a graphics pipeline @@ -131,9 +127,7 @@ class GraphicsContext: public PipelineContext bool m_tessOffchip; // Whether to enable tessellation off-chip mode bool m_gsOnChip; // Whether to enable GS on-chip mode -#ifdef LLPC_BUILD_GFX9 std::vector m_allocUserDataNodes; // Allocated user data nodes for merged shader -#endif }; } // Llpc diff --git a/icd/api/llpc/context/llpcPipelineContext.h b/icd/api/llpc/context/llpcPipelineContext.h index f876a11e..11966918 100644 --- a/icd/api/llpc/context/llpcPipelineContext.h +++ b/icd/api/llpc/context/llpcPipelineContext.h @@ -670,10 +670,8 @@ class PipelineContext // Enables GS on-chip mode virtual void SetGsOnChip(bool gsOnChip) = 0; -#ifdef LLPC_BUILD_GFX9 // Does user data node merge for merged shader virtual void DoUserDataNodeMerge() = 0; -#endif const char* GetGpuNameString() const; const char* GetGpuNameAbbreviation() const; diff --git a/icd/api/llpc/include/llpc.h b/icd/api/llpc/include/llpc.h index 1b460b92..b75fd14c 100644 --- a/icd/api/llpc/include/llpc.h +++ b/icd/api/llpc/include/llpc.h @@ -269,6 +269,7 @@ struct GraphicsPipelineBuildInfo bool blendEnable; ///< Blend will be enabled for this target at draw time bool 
blendSrcAlphaToColor; ///< Whether source alpha is blended to color channels for this target /// at draw time + uint8_t channelWriteMask; ///< Write mask to specify destination channels VkFormat format; ///< Color attachment format } target[MaxColorTargets]; ///< Per-MRT color target info } cbState; ///< Color target state @@ -402,14 +403,14 @@ class IPipelineDumper /// @param [in] pPipelineInfo Info to build this graphics pipeline /// /// @returns Hash code associated this graphics pipeline. - static uint64_t VKAPI_CALL GetGraphicsPipelineHash(const GraphicsPipelineBuildInfo* pPipelineInfo); + static uint64_t VKAPI_CALL GetPipelineHash(const GraphicsPipelineBuildInfo* pPipelineInfo); /// Calculates compute pipeline hash code. /// /// @param [in] pPipelineInfo Info to build this compute pipeline /// /// @returns Hash code associated this compute pipeline. - static uint64_t VKAPI_CALL GetComputePipelineHash(const ComputePipelineBuildInfo* pPipelineInfo); + static uint64_t VKAPI_CALL GetPipelineHash(const ComputePipelineBuildInfo* pPipelineInfo); }; // ===================================================================================================================== diff --git a/icd/api/llpc/lower/llpcSpirvLowerAggregateLoadStore.cpp b/icd/api/llpc/lower/llpcSpirvLowerAggregateLoadStore.cpp index b3e24f12..76de5ef5 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerAggregateLoadStore.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerAggregateLoadStore.cpp @@ -152,7 +152,6 @@ void SpirvLowerAggregateLoadStore::visitCallInst( if (pStoreDest->getType()->getPointerAddressSpace() == SPIRAS_Private) { auto pStoreTy = pStoreDest->getType()->getPointerElementType(); - LLPC_ASSERT (pStoreTy->isArrayTy() || pStoreTy->isStructTy()); std::vector idxs; ExpandStoreInst(pStoreValue, pStoreDest, pStoreTy, idxs, &callInst); @@ -193,15 +192,27 @@ void SpirvLowerAggregateLoadStore::ExpandStoreInst( } else { - Value* pElemValue = ExtractValueInst::Create(pStoreValue, idxStack, "", pInsertPos); - std::vector idxs; - idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); - for (uint32_t i = 0, idxCount = idxStack.size(); i < idxCount; ++i) + Value* pElemValue = nullptr; + Value* pElemPtr = nullptr; + + if (idxStack.empty()) { - idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), idxStack[i])); + pElemValue = pStoreValue; + pElemPtr = pStorePtr; + } + else + { + pElemValue = ExtractValueInst::Create(pStoreValue, idxStack, "", pInsertPos); + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + for (uint32_t i = 0, idxCount = idxStack.size(); i < idxCount; ++i) + { + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), idxStack[i])); + } + + pElemPtr = GetElementPtrInst::CreateInBounds(pStorePtr, idxs,"", pInsertPos); } - auto pElemPtr = GetElementPtrInst::CreateInBounds(pStorePtr, idxs,"", pInsertPos); if (pElemPtr->getType()->getPointerElementType() != pElemValue->getType()) { // Type mismatch (only occurs for the store of uint32 <-> bool) diff --git a/icd/api/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp b/icd/api/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp index d4eeab6f..27417cdc 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp @@ -188,7 +188,7 @@ void SpirvLowerConstImmediateStore::ConvertAllocaToReadOnlyGlobal( "", nullptr, GlobalValue::NotThreadLocal, - ADDR_SPACE_CONST); + SPIRAS_Constant); pGlobal->takeName(pAlloca); // Change all uses of pAlloca to use pGlobal. 
We need to do it manually, as there is a change // of address space, and we also need to recreate "getelementptr"s. diff --git a/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp b/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp index b4f12510..25adc98c 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp @@ -370,16 +370,72 @@ void SpirvLowerGlobal::visitLoadInst( LLPC_ASSERT(pMetaNode != nullptr); auto pInOutMeta = mdconst::dyn_extract(pMetaNode->getOperand(0)); - auto pLoadValue = AddCallInstForInOutImport(pInOutTy, - addrSpace, - pInOutMeta, - nullptr, - nullptr, - nullptr, - InterpLocUnknown, - nullptr, - nullptr, - &loadInst); + Value* pLoadValue = UndefValue::get(pInOutTy); + bool hasVertexIdx = false; + + if (pInOutTy->isArrayTy()) + { + // Arrayed input/output + LLPC_ASSERT(pInOutMeta->getNumOperands() == 3); + ShaderInOutMetadata inOutMeta = {}; + inOutMeta.U32All = cast(pInOutMeta->getOperand(1))->getZExtValue(); + + // If the input/output is arrayed, the outermost dimension might for vertex indexing + if (inOutMeta.IsBuiltIn) + { + BuiltIn builtInId = static_cast(inOutMeta.Value); + hasVertexIdx = ((builtInId == BuiltInPerVertex) || // GLSL style per-vertex data + (builtInId == BuiltInPosition) || // HLSL style per-vertex data + (builtInId == BuiltInPointSize) || + (builtInId == BuiltInClipDistance) || + (builtInId == BuiltInCullDistance)); + } + else + { + hasVertexIdx = (inOutMeta.PerPatch == false); + } + } + + if (hasVertexIdx) + { + LLPC_ASSERT(pInOutTy->isArrayTy()); + + auto pElemTy = pInOutTy->getArrayElementType(); + auto pElemMeta = cast(pInOutMeta->getOperand(2)); + + const uint32_t elemCount = pInOutTy->getArrayNumElements(); + for (uint32_t i = 0; i < elemCount; ++i) + { + Value* pVertexIdx = ConstantInt::get(m_pContext->Int32Ty(), i); + auto pElemValue = AddCallInstForInOutImport(pElemTy, + addrSpace, + pElemMeta, + nullptr, + nullptr, + pVertexIdx, + InterpLocUnknown, + nullptr, + nullptr, + &loadInst); + + std::vector idxs; + idxs.push_back(i); + pLoadValue = InsertValueInst::Create(pLoadValue, pElemValue, idxs, "", &loadInst); + } + } + else + { + pLoadValue = AddCallInstForInOutImport(pInOutTy, + addrSpace, + pInOutMeta, + nullptr, + nullptr, + nullptr, + InterpLocUnknown, + nullptr, + nullptr, + &loadInst); + } m_loadInsts.insert(&loadInst); loadInst.replaceAllUsesWith(pLoadValue); @@ -478,7 +534,56 @@ void SpirvLowerGlobal::visitStoreInst( LLPC_ASSERT(pMetaNode != nullptr); auto pOutputMeta = mdconst::dyn_extract(pMetaNode->getOperand(0)); - AddCallInstForOutputExport(pStoreValue, pOutputMeta, nullptr, nullptr, nullptr, InvalidValue, &storeInst); + bool hasVertexIdx = false; + + // If the input/output is arrayed, the outermost dimension might for vertex indexing + if (pOutputy->isArrayTy()) + { + LLPC_ASSERT(pOutputMeta->getNumOperands() == 3); + ShaderInOutMetadata outputMeta = {}; + outputMeta.U32All = cast(pOutputMeta->getOperand(1))->getZExtValue(); + + if (outputMeta.IsBuiltIn) + { + BuiltIn builtInId = static_cast(outputMeta.Value); + hasVertexIdx = ((builtInId == BuiltInPerVertex) || // GLSL style per-vertex data + (builtInId == BuiltInPosition) || // HLSL style per-vertex data + (builtInId == BuiltInPointSize) || + (builtInId == BuiltInClipDistance) || + (builtInId == BuiltInCullDistance)); + } + else + { + hasVertexIdx = (outputMeta.PerPatch == false); + } + } + + if (hasVertexIdx) + { + LLPC_ASSERT(pOutputy->isArrayTy()); + auto pElemMeta = cast(pOutputMeta->getOperand(2)); + + const uint32_t elemCount = 
pOutputy->getArrayNumElements(); + for (uint32_t i = 0; i < elemCount; ++i) + { + std::vector idxs; + idxs.push_back(i); + auto pElemValue = ExtractValueInst::Create(pStoreValue, idxs, "", &storeInst); + + Value* pVertexIdx = ConstantInt::get(m_pContext->Int32Ty(), i); + AddCallInstForOutputExport(pElemValue, + pElemMeta, + nullptr, + nullptr, + pVertexIdx, + InvalidValue, + &storeInst); + } + } + else + { + AddCallInstForOutputExport(pStoreValue, pOutputMeta, nullptr, nullptr, nullptr, InvalidValue, &storeInst); + } m_storeInsts.insert(&storeInst); } @@ -861,10 +966,6 @@ void SpirvLowerGlobal::LowerInput() // Does lowering opertions for SPIR-V outputs, replaces outputs with proxy variables. void SpirvLowerGlobal::LowerOutput() { - // NOTE: For tessellation control shader, we invoke handling of "load"/"store" instructions and replace all those - // instructions with import/export calls in-place. - LLPC_ASSERT(m_shaderStage != ShaderStageTessControl); - m_pRetBlock = BasicBlock::Create(*m_pContext, "", m_pEntryPoint); // Invoke handling of "return" instructions or "emit" calls @@ -887,6 +988,16 @@ void SpirvLowerGlobal::LowerOutput() retInst->eraseFromParent(); } + if (m_outputProxyMap.empty()) + { + // Skip lowering if there is no output + return; + } + + // NOTE: For tessellation control shader, we invoke handling of "load"/"store" instructions and replace all those + // instructions with import/export calls in-place. + LLPC_ASSERT(m_shaderStage != ShaderStageTessControl); + // Export output from the proxy variable prior to "return" instruction or "emit" calls for (auto outputMap : m_outputProxyMap) { @@ -1523,6 +1634,7 @@ Value* SpirvLowerGlobal::AddCallInstForInOutImport( { BuiltIn builtInId = static_cast(inOutMeta.Value); if ((builtInId == BuiltInSubgroupLocalInvocationId) || + (builtInId == BuiltInSubgroupSize) || (builtInId == BuiltInSubgroupEqMaskKHR) || (builtInId == BuiltInSubgroupGeMaskKHR) || (builtInId == BuiltInSubgroupGtMaskKHR) || diff --git a/icd/api/llpc/patch/generate/gfx8/glslSpecialOpEmu.ll b/icd/api/llpc/patch/generate/gfx8/glslSpecialOpEmu.ll index 10cb2d35..61cafa3f 100644 --- a/icd/api/llpc/patch/generate/gfx8/glslSpecialOpEmu.ll +++ b/icd/api/llpc/patch/generate/gfx8/glslSpecialOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2018, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslSpecialOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL special graphics-specific operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. 
+ ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll b/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll index d1a4f8de..b0f60c1e 100755 --- a/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll +++ b/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (std32). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. 
+ ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/gfx9/glslImageOpEmu.ll b/icd/api/llpc/patch/generate/gfx9/glslImageOpEmu.ll index e4468210..eb06ecce 100644 --- a/icd/api/llpc/patch/generate/gfx9/glslImageOpEmu.ll +++ b/icd/api/llpc/patch/generate/gfx9/glslImageOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslImageOpEmu.ll -;* @brief LLVM IR file: contains emulation codes for GLSL image operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. 
+ ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmu.ll b/icd/api/llpc/patch/generate/glslArithOpEmu.ll index f4608693..af84e1b2 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (std32). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmuF16.ll b/icd/api/llpc/patch/generate/glslArithOpEmuF16.ll index 9817f1d9..62ccb888 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmuF16.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmuF16.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmuf16.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (float16). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmuF64.ll b/icd/api/llpc/patch/generate/glslArithOpEmuF64.ll index 58ba8338..5ed92521 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmuF64.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmuF64.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmuF64.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (float64). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmuI16.ll b/icd/api/llpc/patch/generate/glslArithOpEmuI16.ll index 0f12e613..5f10e5e5 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmuI16.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmuI16.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmuI16.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (int16). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmuI64.ll b/icd/api/llpc/patch/generate/glslArithOpEmuI64.ll index 311d4593..b71c1117 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmuI64.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmuI64.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmuI64.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (int64). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslBufferOpEmu.ll b/icd/api/llpc/patch/generate/glslBufferOpEmu.ll index 288f5da0..84f54fd4 100644 --- a/icd/api/llpc/patch/generate/glslBufferOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslBufferOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslBufferOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL buffer operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslBuiltInVarEmu.ll b/icd/api/llpc/patch/generate/glslBuiltInVarEmu.ll index 7922c61e..1ab65ddc 100644 --- a/icd/api/llpc/patch/generate/glslBuiltInVarEmu.ll +++ b/icd/api/llpc/patch/generate/glslBuiltInVarEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslBuiltInVarEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL built-in variables. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -21,6 +30,12 @@ target triple = "spir64-unknown-unknown" ; >>> Common Built-in Variables ; ===================================================================================================================== +; GLSL: in uint gl_SubGroupSize +define i32 @llpc.input.import.builtin.SubgroupSize(i32 %builtInId) #0 +{ + ret i32 64 +} + ; GLSL: in uint gl_SubGroupInvocation define i32 @llpc.input.import.builtin.SubgroupLocalInvocationId(i32 %builtInId) #0 { @@ -204,7 +219,6 @@ define <2 x float> @llpc.input.import.builtin.SamplePosition(i32 %builtInId) #0 declare <3 x i32> @llpc.input.import.builtin.WorkgroupSize(i32) #0 declare <3 x i32> @llpc.input.import.builtin.WorkgroupId(i32) #0 declare <3 x i32> @llpc.input.import.builtin.LocalInvocationId(i32) #0 -declare i32 @llpc.input.import.builtin.SubgroupSize(i32) #0 declare i32 @llpc.input.import.builtin.NumSamples(i32) #0 declare i32 @llpc.input.import.builtin.SamplePatternIdx(i32) #0 declare i32 @llpc.input.import.builtin.SampleId(i32) #0 diff --git a/icd/api/llpc/patch/generate/glslCopyShaderEmu.ll b/icd/api/llpc/patch/generate/glslCopyShaderEmu.ll index 0c26a52e..93278ac0 100644 --- a/icd/api/llpc/patch/generate/glslCopyShaderEmu.ll +++ b/icd/api/llpc/patch/generate/glslCopyShaderEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslNullFsEmul.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for copy shader. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. 
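Aside: in the glslBuiltInVarEmu.ll hunk above, gl_SubGroupSize stops being an external declaration that later passes had to resolve; the emulation library now defines it directly, so any shader importing the built-in simply receives the wave size (64). A minimal caller sketch, assuming the library above is linked into the shader module; the function name and the builtInId value 36 (intended as the SPIR-V SubgroupSize built-in id) are illustrative only:

declare i32 @llpc.input.import.builtin.SubgroupSize(i32) #0

; Hypothetical shader snippet: after the emulation library is linked and inlined,
; %size folds to the constant 64 returned by the definition in the hunk above.
define i32 @illustrative.use.subgroupsize() #0 {
  %size = call i32 @llpc.input.import.builtin.SubgroupSize(i32 36)
  ret i32 %size
}

attributes #0 = { nounwind }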
+ ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -52,6 +61,8 @@ define <4 x i32> @llpc.descriptor.load.gsvsringbuffer(i32 %internalTablePtrLow, %6 = bitcast <2 x i32> %5 to i64 %7 = shl i64 %ringOutOffset, 4 %8 = add i64 %6, %7 + ; This uses addrspace(2), which is SPIRAS::Constant. The PatchAddrSpaceMutate pass then changes + ; it to addrspace(4), which is AMDGPUAS::Constant. %9 = inttoptr i64 %8 to <4 x i32> addrspace(2)*, !amdgpu.uniform !1 %10 = load <4 x i32>, <4 x i32> addrspace(2)* %9 diff --git a/icd/api/llpc/patch/generate/glslImageOpEmu.ll b/icd/api/llpc/patch/generate/glslImageOpEmu.ll index 298e5cb5..f0374f9d 100644 --- a/icd/api/llpc/patch/generate/glslImageOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslImageOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslImageOpEmu.ll -;* @brief LLVM IR file: contains emulation codes for GLSL image operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. 
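Aside: the new comment in the copy-shader emulation above spells out the address-space convention that several later hunks rely on: the IR library is written against SPIR address spaces, and the PatchAddrSpaceMutate pass rewrites constant pointers to the AMDGPU numbering before code generation. A before/after sketch of the effect on a typical descriptor load (values are illustrative; only the two address-space numbers come from the comment above):

; Before patching: addrspace(2) is SPIRAS::Constant, as emitted by the library.
;   %ptr  = inttoptr i64 %addr to <4 x i32> addrspace(2)*, !amdgpu.uniform !1
;   %desc = load <4 x i32>, <4 x i32> addrspace(2)* %ptr
;
; After PatchAddrSpaceMutate: addrspace(4) is the AMDGPU constant address space.
;   %ptr  = inttoptr i64 %addr to <4 x i32> addrspace(4)*, !amdgpu.uniform !1
;   %desc = load <4 x i32>, <4 x i32> addrspace(4)* %ptr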
+ ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -317,6 +326,13 @@ define <4 x float> @llpc.patch.image.gather.texel.i32( ret <4 x float> %5 } +define i1 @llpc.imagesparse.texel.resident( + i32 %residentCode) #0 +{ + %1 = icmp eq i32 %residentCode, 0 + ret i1 %1 +} + declare <8 x i32> @llpc.descriptor.load.resource(i32 , i32 , i32) #0 declare <4 x i32> @llpc.descriptor.load.texelbuffer(i32 , i32 , i32) #0 diff --git a/icd/api/llpc/patch/generate/glslInlineConstOpEmu.ll b/icd/api/llpc/patch/generate/glslInlineConstOpEmu.ll index ecda2abe..f6e88d7c 100644 --- a/icd/api/llpc/patch/generate/glslInlineConstOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslInlineConstOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslInlineConstOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL inline constant buffer operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. 
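Aside: glslImageOpEmu.ll gains a small helper for sparse image operations in the hunk above; the hardware residency code is folded to the boolean that GLSL's sparseTexelsResident() expects, with a code of 0 meaning the texel is resident. A hedged usage sketch; only the helper's name and behaviour come from the hunk, the wrapper below is hypothetical:

define i1 @illustrative.sparse.resident(i32 %residentCode) {
  ; Forward the residency code to the new helper; true means the fetched texel is resident.
  %1 = call i1 @llpc.imagesparse.texel.resident(i32 %residentCode)
  ret i1 %1
}

declare i1 @llpc.imagesparse.texel.resident(i32)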
+ ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslMatrixOpEmu.ll b/icd/api/llpc/patch/generate/glslMatrixOpEmu.ll index a8bf0477..3c5c8a6e 100644 --- a/icd/api/llpc/patch/generate/glslMatrixOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslMatrixOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslMatrixOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL matrix operations (float). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslMatrixOpEmuF16.ll b/icd/api/llpc/patch/generate/glslMatrixOpEmuF16.ll index 122c2a3b..12b545ec 100644 --- a/icd/api/llpc/patch/generate/glslMatrixOpEmuF16.ll +++ b/icd/api/llpc/patch/generate/glslMatrixOpEmuF16.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslMatrixOpEmuF16.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL matrix operations (float16). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslMatrixOpEmuF64.ll b/icd/api/llpc/patch/generate/glslMatrixOpEmuF64.ll index 00034a8f..88f62910 100644 --- a/icd/api/llpc/patch/generate/glslMatrixOpEmuF64.ll +++ b/icd/api/llpc/patch/generate/glslMatrixOpEmuF64.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file g_glslMatrixOpEmuF64.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL matrix operations (double). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslNullFsEmu.ll b/icd/api/llpc/patch/generate/glslNullFsEmu.ll index 4977c811..445dded1 100644 --- a/icd/api/llpc/patch/generate/glslNullFsEmu.ll +++ b/icd/api/llpc/patch/generate/glslNullFsEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslNullFsEmul.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for null fragment shader. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslPushConstOpEmu.ll b/icd/api/llpc/patch/generate/glslPushConstOpEmu.ll index ef9f792e..c1062cbd 100644 --- a/icd/api/llpc/patch/generate/glslPushConstOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslPushConstOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslPushConstOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL push constant (spilled) operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
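Aside: the push-constant hunk that follows is mechanically the same change repeated for every load width. Spilled push constants live in a 512-byte table, the member offset indexes into it byte-wise, and the result is bitcast to the requested vector of i8; the only functional change is that the table pointer now carries the AMDGPU constant address space (4) instead of the SPIR one (2). A reduced sketch of the shared pattern (illustrative function name; width and alignment follow the v4i8 case below, with the cache-control parameters omitted):

define <4 x i8> @illustrative.pushconst.load.dword(i32 %memberOffset) #0 {
  ; Fetch the spill-table pointer, step to the member, and load one dword (4 x i8).
  %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable()
  %bytePtr = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset
  %dwordPtr = bitcast i8 addrspace(4)* %bytePtr to <4 x i8> addrspace(4)*
  %value = load <4 x i8>, <4 x i8> addrspace(4)* %dwordPtr, align 4
  ret <4 x i8> %value
}

declare [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() #0

attributes #0 = { nounwind }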
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -20,84 +29,84 @@ target triple = "spir64-unknown-unknown" ; GLSL: load float16/int16/uint16 (word) define <2 x i8> @llpc.pushconst.load.v2i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <2 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <2 x i8>, <2 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <2 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <2 x i8>, <2 x i8> addrspace(4)* %2, align 4 ret <2 x i8> %3 } ; GLSL: load f16vec2/i16vec2/u16vec2/float/int/uint (dword) define <4 x i8> @llpc.pushconst.load.v4i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <4 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <4 x i8>, <4 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <4 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <4 x i8>, <4 x i8> addrspace(4)* %2, align 4 ret <4 x i8> %3 } ; GLSL: load f16vec3/i16vec3/u16vec3 (wordx3) define <6 x i8> @llpc.pushconst.load.v6i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <6 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <6 x i8>, <6 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <6 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <6 x i8>, <6 x i8> addrspace(4)* %2, align 4 ret <6 x i8> %3 } ; GLSL: load f16vec4/i16vec4/u16vec4/vec2/ivec2/uvec2/double/int64/uint64 (dwordx2) define <8 x i8> @llpc.pushconst.load.v8i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <8 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <8 x i8>, <8 x i8> addrspace(2)* %2, align 8 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <8 x i8> 
addrspace(4)*, !amdgpu.uniform !0 + %3 = load <8 x i8>, <8 x i8> addrspace(4)* %2, align 8 ret <8 x i8> %3 } ; GLSL: load vec3/ivec3/uvec3 (dwordx3) define <12 x i8> @llpc.pushconst.load.v12i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <12 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <12 x i8>, <12 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <12 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <12 x i8>, <12 x i8> addrspace(4)* %2, align 4 ret <12 x i8> %3 } ; GLSL: load vec4/ivec4/uvec4/dvec2/i64vec2/u64vec2 (dwordx4) define <16 x i8> @llpc.pushconst.load.v16i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <16 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <16 x i8>, <16 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <16 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <16 x i8>, <16 x i8> addrspace(4)* %2, align 4 ret <16 x i8> %3 } ; GLSL: load dvec3/i64vec3/u64vec3 (dwordx6) define <24 x i8> @llpc.pushconst.load.v24i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <24 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <24 x i8>, <24 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <24 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <24 x i8>, <24 x i8> addrspace(4)* %2, align 4 ret <24 x i8> %3 } ; GLSL: load dvec4/i64vec4/u64vec4 (dwordx8) define <32 x i8> @llpc.pushconst.load.v32i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <32 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <32 x i8>, <32 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <32 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <32 x i8>, <32 x i8> addrspace(4)* %2, align 4 ret <32 x i8> %3 } -declare [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() #0 +declare [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() #0 attributes #0 = { nounwind } diff --git a/icd/api/llpc/patch/generate/glslSharedVarOpEmu.ll 
b/icd/api/llpc/patch/generate/glslSharedVarOpEmu.ll index 32fabe94..b60f7973 100644 --- a/icd/api/llpc/patch/generate/glslSharedVarOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslSharedVarOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslSharedVarOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL shared variable operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslSpecialOpEmu.ll b/icd/api/llpc/patch/generate/glslSpecialOpEmu.ll index e1ca8526..c1fb6a31 100644 --- a/icd/api/llpc/patch/generate/glslSpecialOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslSpecialOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. 
This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslSpecialOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL special graphics-specific operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
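Aside: the glslSpecialOpEmu.ll changes below reorganize the subgroup built-ins so that only the i32 forms talk to the lane intrinsics (llvm.amdgcn.readlane, readfirstlane, writelane); the float, double, vector and bool variants bitcast or split their operands, delegate to the i32 form, and reassemble the result, and the new GroupNonUniform* entry points reuse the existing SubgroupKHR helpers in the same way. A minimal sketch of the delegation pattern with hypothetical names:

; The i32 form is the only place that uses the lane intrinsic directly.
define i32 @illustrative.read.invocation.i32(i32 %value, i32 %invocationIndex) #0 {
  %1 = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %invocationIndex)
  ret i32 %1
}

; The float form bitcasts to i32, delegates, and bitcasts the result back.
define float @illustrative.read.invocation.f32(float %value, i32 %invocationIndex) #0 {
  %1 = bitcast float %value to i32
  %2 = call i32 @illustrative.read.invocation.i32(i32 %1, i32 %invocationIndex)
  %3 = bitcast i32 %2 to float
  ret float %3
}

declare i32 @llvm.amdgcn.readlane(i32, i32) #0

attributes #0 = { nounwind }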
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -271,26 +280,32 @@ define spir_func <4 x i32> @_Z17SubgroupBallotKHRb(i1 %value) #0 { %1 = call i64 @llpc.ballot(i1 %value) %2 = bitcast i64 %1 to <2 x i32> - %3 = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> + %3 = shufflevector <2 x i32> %2, <2 x i32> , <4 x i32> ret <4 x i32> %3 } +; GLSL: int/uint readInvocation(int/uint, uint) +define spir_func i32 @_Z25SubgroupReadInvocationKHRii(i32 %value, i32 %invocationIndex) +{ + %1 = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %invocationIndex) + ret i32 %1 +} + ; GLSL: float readInvocation(float, uint) define spir_func float @_Z25SubgroupReadInvocationKHRfi(float %value, i32 %invocationIndex) { %1 = bitcast float %value to i32 - %2 = call i32 @llvm.amdgcn.readlane(i32 %1, i32 %invocationIndex) + %2 = call i32 @_Z25SubgroupReadInvocationKHRii(i32 %1, i32 %invocationIndex) %3 = bitcast i32 %2 to float ret float %3 } -; GLSL: int/uint readInvocation(int/uint, uint) -define spir_func i32 @_Z25SubgroupReadInvocationKHRii(i32 %value, i32 %invocationIndex) +; GLSL: int/uint readFirstInvocation(int/uint) +define spir_func i32 @_Z26SubgroupFirstInvocationKHRi(i32 %value) { - %1 = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %invocationIndex) - + %1 = call i32 @llvm.amdgcn.readfirstlane(i32 %value) ret i32 %1 } @@ -298,20 +313,12 @@ define spir_func i32 @_Z25SubgroupReadInvocationKHRii(i32 %value, i32 %invocatio define spir_func float @_Z26SubgroupFirstInvocationKHRf(float %value) { %1 = bitcast float %value to i32 - %2 = call i32 @llvm.amdgcn.readfirstlane(i32 %1) + %2 = call i32 @_Z26SubgroupFirstInvocationKHRi(i32 %1) %3 = bitcast i32 %2 to float ret float %3 } -; GLSL: int/uint readFirstInvocation(int/uint) -define spir_func i32 @_Z26SubgroupFirstInvocationKHRi(i32 %value) -{ - %1 = call i32 @llvm.amdgcn.readfirstlane(i32 %value) - - ret i32 %1 -} - ; GLSL: bool anyInvocation(bool) define spir_func i1 @_Z14SubgroupAnyKHRb(i1 %value) { @@ -343,21 +350,22 @@ define spir_func i1 @_Z19SubgroupAllEqualKHRb(i1 %value) ret i1 %5 } +; GLSL: int/uint writeInvocation(int/uint, int/uint, int/uint) +define spir_func i32 @_Z18WriteInvocationAMDiii(i32 %inputValue, i32 %writeValue, i32 %invocationIndex) +{ + %1 = call i32 @llvm.amdgcn.writelane(i32 %writeValue, i32 %invocationIndex, i32 %inputValue) + ret i32 %1 +} + ; GLSL: float writeInvocation(float, float, uint) define spir_func float @_Z18WriteInvocationAMDffi(float %inputValue, float %writeValue, i32 %invocationIndex) { %1 = bitcast float %writeValue to i32 %2 = bitcast float %inputValue to i32 - %3 = call i32 @llvm.amdgcn.writelane(i32 %1, i32 %invocationIndex, i32 %2) + %3 = call i32 @_Z18WriteInvocationAMDiii(i32 %1, i32 %invocationIndex, i32 %2) %4 = bitcast i32 %3 to float - ret float %4 -} -; GLSL: int/uint writeInvocation(int/uint, int/uint, int/uint) -define spir_func i32 @_Z18WriteInvocationAMDiii(i32 %inputValue, i32 %writeValue, i32 %invocationIndex) -{ - %1 = call i32 @llvm.amdgcn.writelane(i32 %writeValue, i32 %invocationIndex, i32 %inputValue) - ret i32 %1 + ret float %4 } ; GLSL: bool subgroupElect() @@ -374,6 +382,1575 @@ define spir_func i1 @_Z20GroupNonUniformElecti(i32 %scope) ret 
i1 %6 } +; GLSL: bool subgroupAll(bool) +define spir_func i1 @_Z18GroupNonUniformAllib(i32 %scope, i1 %value) +{ + %1 = call i1 @_Z14SubgroupAllKHRb(i1 %value) + ret i1 %1 +} + +; GLSL: bool subgroupAny(bool) +define spir_func i1 @_Z18GroupNonUniformAnyib(i32 %scope, i1 %value) +{ + %1 = call i1 @_Z14SubgroupAnyKHRb(i1 %value) + ret i1 %1 +} + +; GLSL: bool subgroupAllEqual(int/uint) +define spir_func i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %value) +{ + %1 = icmp ne i32 %value, 0 + %2 = call i1 @_Z19SubgroupAllEqualKHRb(i1 %1) + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(ivec2/uvec2) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %value) +{ + %1 = extractelement <2 x i32> %value, i32 0 + %2 = extractelement <2 x i32> %value, i32 1 + + %3 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + %4 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %2) + + %5 = and i1 %3, %4 + ret i1 %5 +} + +; GLSL: bool subgroupAllEqual(ivec3/uvec3) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %value) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 1 + + %4 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + %5 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %2) + %6 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %3) + + %7 = and i1 %4, %5 + %8 = and i1 %7, %6 + ret i1 %8 +} + +; GLSL: bool subgroupAllEqual(ivec4/uvec4) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %value) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + %6 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %2) + %7 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %3) + %8 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %4) + + %9 = and i1 %5, %6 + %10 = and i1 %9, %7 + %11 = and i1 %10, %8 + ret i1 %11 +} + +; GLSL: bool subgroupAllEqual(float) +define spir_func i1 @_Z23GroupNonUniformAllEqualif(i32 %scope, float %value) +{ + %1 = bitcast float %value to i32 + %2 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(vec2) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv2_f(i32 %scope, <2 x float> %value) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(vec3) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv3_f(i32 %scope, <3 x float> %value) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(vec4) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv4_f(i32 %scope, <4 x float> %value) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(double) +define spir_func i1 @_Z23GroupNonUniformAllEqualid(i32 %scope, double %value) +{ + %1 = bitcast double %value to <2 x i32> + %2 = extractelement <2 x i32> %1, i32 0 + %3 = extractelement <2 x i32> %1, i32 1 + + %4 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %2) + %5 = call i1 
@_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %3) + %6 = and i1 %4, %5 + + ret i1 %6 +} + +; GLSL: bool subgroupAllEqual(dvec2) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv2_d(i32 %scope, <2 x double> %value) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <2 x i32> + %3 = shufflevector <4 x i32> %1, <4 x i32> %1, <2 x i32> + + %4 = call i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %2) + %5 = call i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %3) + %6 = and i1 %4, %5 + + ret i1 %6 +} + +; GLSL: bool subgroupAllEqual(dvec3) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv3_d(i32 %scope, <3 x double> %value) +{ + %1 = bitcast <3 x double> %value to <6 x i32> + %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <3 x i32> + %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <3 x i32> + + %4 = call i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %2) + %5 = call i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %3) + %6 = and i1 %4, %5 + + ret i1 %6 +} + +; GLSL: bool subgroupAllEqual(dvec4) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv4_d(i32 %scope, <4 x double> %value) +{ + %1 = bitcast <4 x double> %value to <8 x i32> + %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + + %4 = call i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %2) + %5 = call i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %3) + %6 = and i1 %4, %5 + + ret i1 %6 +} + +; GLSL: bool subgroupAllEqual(bool) +define spir_func i1 @_Z23GroupNonUniformAllEqualib(i32 %scope, i1 %value) +{ + %1 = zext i1 %value to i32 + %2 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(bvec2) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv2_b(i32 %scope, <2 x i1> %value) +{ + %1 = zext <2 x i1> %value to <2 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(bvec3) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv3_b(i32 %scope, <3 x i1> %value) +{ + %1 = zext <3 x i1> %value to <3 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(bvec4) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv4_b(i32 %scope, <4 x i1> %value) +{ + %1 = zext <4 x i1> %value to <4 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %1) + + ret i1 %2 +} + +; GLSL: int/uint subgroupBroadcast(int/uint, uint) +define spir_func i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %value, i32 %id) +{ + %1 = call i32 @_Z25SubgroupReadInvocationKHRii(i32 %value, i32 %id) + ret i32 %1 +} + +; GLSL: ivec2/uvec2 subgroupBroadcast(ivec2/uvec2, uint) +define spir_func <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %value, i32 %id) +{ + %1 = extractelement <2 x i32> %value, i32 0 + %2 = extractelement <2 x i32> %value, i32 1 + + %3 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id) + %4 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %2, i32 %id) + + %5 = insertelement <2 x i32> undef, i32 %3, i32 0 + %6 = insertelement <2 x i32> %5, i32 %4, i32 1 + + ret <2 x i32> %6 +} + +; GLSL: ivec3/uvec3 subgroupBroadcast(ivec3/uvec3, uint) +define spir_func <3 x i32> @_Z24GroupNonUniformBroadcastiDv3_ii(i32 %scope, <3 x i32> %value, i32 %id) +{ + %1 = extractelement <3 x i32> 
%value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id) + %5 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %2, i32 %id) + %6 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %3, i32 %id) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupBroadcast(ivec4/uvec4, uint) +define spir_func <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupBroadcast(float, uint) +define spir_func float @_Z24GroupNonUniformBroadcastifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupBroadcast(vec2, uint) +define spir_func <2 x float> @_Z24GroupNonUniformBroadcastiDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupBroadcast(vec3, uint) +define spir_func <3 x float> @_Z24GroupNonUniformBroadcastiDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z24GroupNonUniformBroadcastiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupBroadcast(vec4, uint) +define spir_func <4 x float> @_Z24GroupNonUniformBroadcastiDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupBroadcast(double, uint) +define spir_func double @_Z24GroupNonUniformBroadcastidi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupBroadcast(dvec2, uint) +define spir_func <2 x double> @_Z24GroupNonUniformBroadcastiDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 
subgroupBroadcast(dvec3, uint)
+define spir_func <3 x double> @_Z24GroupNonUniformBroadcastiDv3_di(i32 %scope, <3 x double> %value, i32 %id)
+{
+    %1 = bitcast <3 x double> %value to <6 x i32>
+    %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> <i32 4, i32 5>
+
+    %4 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %3, i32 %id)
+    %6 = shufflevector <2 x i32> %5, <2 x i32> <i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+
+    %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+    %8 = bitcast <6 x i32> %7 to <3 x double>
+
+    ret <3 x double> %8
+}
+
+; GLSL: dvec4 subgroupBroadcast(dvec4, uint)
+define spir_func <4 x double> @_Z24GroupNonUniformBroadcastiDv4_di(i32 %scope, <4 x double> %value, i32 %id)
+{
+    %1 = bitcast <4 x double> %value to <8 x i32>
+    %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+    %4 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %3, i32 %id)
+
+    %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    %7 = bitcast <8 x i32> %6 to <4 x double>
+
+    ret <4 x double> %7
+}
+
+; GLSL: bool subgroupBroadcast(bool, uint)
+define spir_func i1 @_Z24GroupNonUniformBroadcastibi(i32 %scope, i1 %value, i32 %id)
+{
+    %1 = zext i1 %value to i32
+    %2 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id)
+    %3 = trunc i32 %2 to i1
+
+    ret i1 %3
+}
+
+; GLSL: bvec2 subgroupBroadcast(bvec2, uint)
+define spir_func <2 x i1> @_Z24GroupNonUniformBroadcastiDv2_bi(i32 %scope, <2 x i1> %value, i32 %id)
+{
+    %1 = zext <2 x i1> %value to <2 x i32>
+    %2 = call <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id)
+    %3 = trunc <2 x i32> %2 to <2 x i1>
+
+    ret <2 x i1> %3
+}
+
+; GLSL: bvec3 subgroupBroadcast(bvec3, uint)
+define spir_func <3 x i1> @_Z24GroupNonUniformBroadcastiDv3_bi(i32 %scope, <3 x i1> %value, i32 %id)
+{
+    %1 = zext <3 x i1> %value to <3 x i32>
+    %2 = call <3 x i32> @_Z24GroupNonUniformBroadcastiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id)
+    %3 = trunc <3 x i32> %2 to <3 x i1>
+
+    ret <3 x i1> %3
+}
+
+; GLSL: bvec4 subgroupBroadcast(bvec4, uint)
+define spir_func <4 x i1> @_Z24GroupNonUniformBroadcastiDv4_bi(i32 %scope, <4 x i1> %value, i32 %id)
+{
+    %1 = zext <4 x i1> %value to <4 x i32>
+    %2 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id)
+    %3 = trunc <4 x i32> %2 to <4 x i1>
+
+    ret <4 x i1> %3
+}
+
+; GLSL: int/uint subgroupBroadcastFirst(int/uint)
+define spir_func i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %value)
+{
+    %1 = call i32 @_Z26SubgroupFirstInvocationKHRi(i32 %value)
+    ret i32 %1
+}
+
+; GLSL: ivec2/uvec2 subgroupBroadcastFirst(ivec2/uvec2)
+define spir_func <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %value)
+{
+    %1 = extractelement <2 x i32> %value, i32 0
+    %2 = extractelement <2 x i32> %value, i32 1
+
+    %3 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1)
+    %4 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %2)
+
+    %5 = insertelement <2 x i32> undef, i32 %3, i32 0
+    %6 = insertelement <2 x i32> %5, i32 %4, i32 1
+
+    ret <2 x i32> %6
+}
+
+; GLSL: ivec3/uvec3 subgroupBroadcastFirst(ivec3/uvec3)
+define spir_func <3 x i32> @_Z29GroupNonUniformBroadcastFirstiDv3_i(i32 %scope, 
<3 x i32> %value) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1) + %5 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %2) + %6 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %3) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupBroadcastFirst(ivec4/uvec4) +define spir_func <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %value) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1) + %6 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %2) + %7 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %3) + %8 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %4) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupBroadcastFirst(float) +define spir_func float @_Z29GroupNonUniformBroadcastFirstif(i32 %scope, float %value) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupBroadcastFirst(vec2) +define spir_func <2 x float> @_Z29GroupNonUniformBroadcastFirstiDv2_f(i32 %scope, <2 x float> %value) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %1) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupBroadcastFirst(vec3) +define spir_func <3 x float> @_Z29GroupNonUniformBroadcastFirstiDv3_f(i32 %scope, <3 x float> %value) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z29GroupNonUniformBroadcastFirstiDv3_i(i32 %scope, <3 x i32> %1) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupBroadcastFirst(vec4) +define spir_func <4 x float> @_Z29GroupNonUniformBroadcastFirstiDv4_f(i32 %scope, <4 x float> %value) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %1) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupBroadcastFirst(double) +define spir_func double @_Z29GroupNonUniformBroadcastFirstid(i32 %scope, double %value) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %1) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupBroadcastFirst(dvec2) +define spir_func <2 x double> @_Z29GroupNonUniformBroadcastFirstiDv2_d(i32 %scope, <2 x double> %value) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %1) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 subgroupBroadcastFirst(dvec3) +define spir_func <3 x 
double> @_Z29GroupNonUniformBroadcastFirstiDv3_d(i32 %scope, <3 x double> %value)
+{
+    %1 = bitcast <3 x double> %value to <6 x i32>
+    %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> <i32 4, i32 5>
+
+    %4 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %2)
+    %5 = call <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %3)
+    %6 = shufflevector <2 x i32> %5, <2 x i32> <i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+
+    %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+    %8 = bitcast <6 x i32> %7 to <3 x double>
+
+    ret <3 x double> %8
+}
+
+; GLSL: dvec4 subgroupBroadcastFirst(dvec4)
+define spir_func <4 x double> @_Z29GroupNonUniformBroadcastFirstiDv4_d(i32 %scope, <4 x double> %value)
+{
+    %1 = bitcast <4 x double> %value to <8 x i32>
+    %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+    %4 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %2)
+    %5 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %3)
+
+    %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    %7 = bitcast <8 x i32> %6 to <4 x double>
+
+    ret <4 x double> %7
+}
+
+; GLSL: bool subgroupBroadcastFirst(bool)
+define spir_func i1 @_Z29GroupNonUniformBroadcastFirstib(i32 %scope, i1 %value)
+{
+    %1 = zext i1 %value to i32
+    %2 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1)
+    %3 = trunc i32 %2 to i1
+
+    ret i1 %3
+}
+
+; GLSL: bvec2 subgroupBroadcastFirst(bvec2)
+define spir_func <2 x i1> @_Z29GroupNonUniformBroadcastFirstiDv2_b(i32 %scope, <2 x i1> %value)
+{
+    %1 = zext <2 x i1> %value to <2 x i32>
+    %2 = call <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %1)
+    %3 = trunc <2 x i32> %2 to <2 x i1>
+
+    ret <2 x i1> %3
+}
+
+; GLSL: bvec3 subgroupBroadcastFirst(bvec3)
+define spir_func <3 x i1> @_Z29GroupNonUniformBroadcastFirstiDv3_b(i32 %scope, <3 x i1> %value)
+{
+    %1 = zext <3 x i1> %value to <3 x i32>
+    %2 = call <3 x i32> @_Z29GroupNonUniformBroadcastFirstiDv3_i(i32 %scope, <3 x i32> %1)
+    %3 = trunc <3 x i32> %2 to <3 x i1>
+
+    ret <3 x i1> %3
+}
+
+; GLSL: bvec4 subgroupBroadcastFirst(bvec4)
+define spir_func <4 x i1> @_Z29GroupNonUniformBroadcastFirstiDv4_b(i32 %scope, <4 x i1> %value)
+{
+    %1 = zext <4 x i1> %value to <4 x i32>
+    %2 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %1)
+    %3 = trunc <4 x i32> %2 to <4 x i1>
+
+    ret <4 x i1> %3
+}
+
+; GLSL: uvec4 subgroupBallot(bool)
+define spir_func <4 x i32> @_Z21GroupNonUniformBallotib(i32 %scope, i1 %value)
+{
+    %1 = call <4 x i32> @_Z17SubgroupBallotKHRb(i1 %value)
+    ret <4 x i32> %1
+}
+
+; GLSL: bool subgroupInverseBallot(uvec4)
+define spir_func i1 @_Z28GroupNonUniformInverseBallotiDv4_i(i32 %scope, <4 x i32> %value)
+{
+    %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
+    %2 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1) #1
+    %3 = zext i32 %2 to i64
+    %4 = shl i64 1, %3
+
+    %5 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %6 = bitcast <2 x i32> %5 to i64
+    %7 = and i64 %4, %6
+    %8 = icmp ne i64 %7, 0
+
+    ret i1 %8
+}
+
+; GLSL: bool subgroupBallotBitExtract(uvec4, uint)
+define spir_func i1 @_Z31GroupNonUniformBallotBitExtractiDv4_ii(i32 %scope, <4 x i32> %value, i32 %index)
+{
+    %1 = zext i32 %index to i64
+    %2 = shl i64 1, %1
+
+    %3 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %4 = bitcast <2 x i32> %3 to i64
+    %5 = and i64 %2, %4
+    %6 = icmp ne i64 %5, 0
+
+    ret i1 %6
+}
+
+; GLSL: uint subgroupBallotBitCount(uvec4)
+; uint subgroupBallotInclusiveBitCount(uvec4)
+; uint subgroupBallotExclusiveBitCount(uvec4)
+define spir_func i32 @_Z29GroupNonUniformBallotBitCountiiDv4_i(i32 %scope, i32 %operation, <4 x i32> %value)
+{
+    %1 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %2 = bitcast <2 x i32> %1 to i64
+    %3 = extractelement <2 x i32> %1, i32 0
+    %4 = extractelement <2 x i32> %1, i32 1
+
+    switch i32 %operation, label %.default [ i32 0, label %.reduce
+                                             i32 1, label %.inclusive
+                                             i32 2, label %.exclusive ]
+
+.reduce:
+    %5 = call i64 @llvm.ctpop.i64(i64 %2)
+    %6 = trunc i64 %5 to i32
+    ret i32 %6
+
+.inclusive:
+    %7 = call i32 @llvm.amdgcn.mbcnt.lo(i32 %3, i32 0)
+    %8 = call i32 @llvm.amdgcn.mbcnt.hi(i32 %4, i32 %7)
+    %9 = add i32 %8, 1
+
+    %10 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+    %11 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %10)
+    %12 = zext i32 %11 to i64
+    %13 = shl i64 1, %12
+
+    %14 = and i64 %13, %2
+    %15 = icmp ne i64 %14, 0
+    %16 = select i1 %15, i32 %9, i32 %8
+
+    ret i32 %16
+
+.exclusive:
+    %17 = call i32 @llvm.amdgcn.mbcnt.lo(i32 %3, i32 0)
+    %18 = call i32 @llvm.amdgcn.mbcnt.hi(i32 %4, i32 %17)
+
+    ret i32 %18
+
+.default:
+    ret i32 0
+}
+
+; GLSL: uint subgroupBallotFindLSB(uvec4)
+define spir_func i32 @_Z28GroupNonUniformBallotFindLSBiDv4_i(i32 %scope, <4 x i32> %value)
+{
+    %1 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %2 = bitcast <2 x i32> %1 to i64
+
+    %3 = call i64 @llvm.cttz.i64(i64 %2, i1 true)
+    %4 = trunc i64 %3 to i32
+
+    ret i32 %4
+}
+
+; GLSL: uint subgroupBallotFindMSB(uvec4)
+define spir_func i32 @_Z28GroupNonUniformBallotFindMSBiDv4_i(i32 %scope, <4 x i32> %value)
+{
+    %1 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %2 = bitcast <2 x i32> %1 to i64
+
+    %3 = call i64 @llvm.ctlz.i64(i64 %2, i1 true)
+    %4 = trunc i64 %3 to i32
+    %5 = sub i32 63, %4
+
+    ret i32 %5
+}
+
+; GLSL: int/uint subgroupShuffle(int/uint, uint)
+define spir_func i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %value, i32 %id)
+{
+    %1 = mul i32 %id, 4
+    %2 = call i32 @llvm.amdgcn.ds.bpermute(i32 %1, i32 %value)
+
+    ret i32 %2
+}
+
+; GLSL: ivec2/uvec2 subgroupShuffle(ivec2/uvec2, uint)
+define spir_func <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %value, i32 %id)
+{
+    %1 = extractelement <2 x i32> %value, i32 0
+    %2 = extractelement <2 x i32> %value, i32 1
+
+    %3 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id)
+    %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %2, i32 %id)
+
+    %5 = insertelement <2 x i32> undef, i32 %3, i32 0
+    %6 = insertelement <2 x i32> %5, i32 %4, i32 1
+
+    ret <2 x i32> %6
+}
+
+; GLSL: ivec3/uvec3 subgroupShuffle(ivec3/uvec3, uint)
+define spir_func <3 x i32> @_Z22GroupNonUniformShuffleiDv3_ii(i32 %scope, <3 x i32> %value, i32 %id)
+{
+    %1 = extractelement <3 x i32> %value, i32 0
+    %2 = extractelement <3 x i32> %value, i32 1
+    %3 = extractelement <3 x i32> %value, i32 2
+
+    %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id)
+    %5 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %2, i32 %id)
+    %6 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %3, i32 %id)
+
+    %7 = insertelement <3 x i32> undef, i32 %4, i32 0
+    %8 = insertelement <3 x i32> %7, i32 %5, i32 1
+    %9 = insertelement <3 x i32> %8, i32 %6, i32 2
+
+    ret <3 x i32> %9
+}
+
+; GLSL: ivec4/uvec4 subgroupShuffle(ivec4/uvec4, uint)
+define spir_func <4 x i32> 
@_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupShuffle(float, uint) +define spir_func float @_Z22GroupNonUniformShuffleifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupShuffle(vec2, uint) +define spir_func <2 x float> @_Z22GroupNonUniformShuffleiDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupShuffle(vec3, uint) +define spir_func <3 x float> @_Z22GroupNonUniformShuffleiDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z22GroupNonUniformShuffleiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupShuffle(vec4, uint) +define spir_func <4 x float> @_Z22GroupNonUniformShuffleiDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupShuffle(double, uint) +define spir_func double @_Z22GroupNonUniformShuffleidi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupShuffle(dvec2, uint) +define spir_func <2 x double> @_Z22GroupNonUniformShuffleiDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 subgroupShuffle(dvec3, uint) +define spir_func <3 x double> @_Z22GroupNonUniformShuffleiDv3_di(i32 %scope, <3 x double> %value, i32 %id) +{ + %1 = bitcast <3 x double> %value to <6 x i32> + %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> + %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> + + %4 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id) + %5 = call <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %3, i32 %id) + %6 = shufflevector <2 x i32> %5, <2 x i32> , <4 x i32> + + %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> + %8 = bitcast <6 x i32> %7 to <3 x 
double> + + ret <3 x double> %8 +} + +; GLSL: dvec4 subgroupShuffle(dvec4, uint) +define spir_func <4 x double> @_Z22GroupNonUniformShuffleiDv4_di(i32 %scope, <4 x double> %value, i32 %id) +{ + %1 = bitcast <4 x double> %value to <8 x i32> + %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + + %4 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id) + %5 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %3, i32 %id) + + %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> + %7 = bitcast <8 x i32> %6 to <4 x double> + + ret <4 x double> %7 +} + +; GLSL: bool subgroupShuffle(bool, uint) +define spir_func i1 @_Z22GroupNonUniformShuffleibi(i32 %scope, i1 %value, i32 %id) +{ + %1 = zext i1 %value to i32 + %2 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id) + %3 = trunc i32 %2 to i1 + + ret i1 %3 +} + +; GLSL: bvec2 subgroupShuffle(bvec2, uint) +define spir_func <2 x i1> @_Z22GroupNonUniformShuffleiDv2_bi(i32 %scope, <2 x i1> %value, i32 %id) +{ + %1 = zext <2 x i1> %value to <2 x i32> + %2 = call <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = trunc <2 x i32> %2 to <2 x i1> + + ret <2 x i1> %3 +} + +; GLSL: bvec3 subgroupShuffle(bvec3, uint) +define spir_func <3 x i1> @_Z22GroupNonUniformShuffleiDv3_bi(i32 %scope, <3 x i1> %value, i32 %id) +{ + %1 = zext <3 x i1> %value to <3 x i32> + %2 = call <3 x i32> @_Z22GroupNonUniformShuffleiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = trunc <3 x i32> %2 to <3 x i1> + + ret <3 x i1> %3 +} + +; GLSL: bvec4 subgroupShuffle(bvec4, uint) +define spir_func <4 x i1> @_Z22GroupNonUniformShuffleiDv4_bi(i32 %scope, <4 x i1> %value, i32 %id) +{ + %1 = zext <4 x i1> %value to <4 x i32> + %2 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = trunc <4 x i32> %2 to <4 x i1> + + ret <4 x i1> %3 +} + +; GLSL: int/uint subgroupShuffleXor(int/uint, uint) +define spir_func i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %value, i32 %mask) +{ + %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %2 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1) + %3 = xor i32 %2, %mask + %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %value, i32 %3) + + ret i32 %4 +} + +; GLSL: ivec2/uvec2 subgroupShuffleXor(ivec2/uvec2, uint) +define spir_func <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %value, i32 %id) +{ + %1 = extractelement <2 x i32> %value, i32 0 + %2 = extractelement <2 x i32> %value, i32 1 + + %3 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %4 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %2, i32 %id) + + %5 = insertelement <2 x i32> undef, i32 %3, i32 0 + %6 = insertelement <2 x i32> %5, i32 %4, i32 1 + + ret <2 x i32> %6 +} + +; GLSL: ivec3/uvec3 subgroupShuffleXor(ivec3/uvec3, uint) +define spir_func <3 x i32> @_Z25GroupNonUniformShuffleXoriDv3_ii(i32 %scope, <3 x i32> %value, i32 %id) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %5 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %2, i32 %id) + %6 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %3, i32 %id) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, 
i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupShuffleXor(ivec4/uvec4, uint) +define spir_func <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupShuffleXor(float, uint) +define spir_func float @_Z25GroupNonUniformShuffleXorifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupShuffleXor(vec2, uint) +define spir_func <2 x float> @_Z25GroupNonUniformShuffleXoriDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupShuffleXor(vec3, uint) +define spir_func <3 x float> @_Z25GroupNonUniformShuffleXoriDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z25GroupNonUniformShuffleXoriDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupShuffleXor(vec4, uint) +define spir_func <4 x float> @_Z25GroupNonUniformShuffleXoriDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupShuffleXor(double, uint) +define spir_func double @_Z25GroupNonUniformShuffleXoridi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupShuffleXor(dvec2, uint) +define spir_func <2 x double> @_Z25GroupNonUniformShuffleXoriDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 subgroupShuffleXor(dvec3, uint) +define spir_func <3 x double> @_Z25GroupNonUniformShuffleXoriDv3_di(i32 %scope, <3 x double> %value, i32 %id) +{ + %1 = bitcast <3 x double> %value to <6 x i32> + %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> + %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> + + %4 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %2, i32 %id) + %5 
= call <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %3, i32 %id) + %6 = shufflevector <2 x i32> %5, <2 x i32> , <4 x i32> + + %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> + %8 = bitcast <6 x i32> %7 to <3 x double> + + ret <3 x double> %8 +} + +; GLSL: dvec4 subgroupShuffleXor(dvec4, uint) +define spir_func <4 x double> @_Z25GroupNonUniformShuffleXoriDv4_di(i32 %scope, <4 x double> %value, i32 %id) +{ + %1 = bitcast <4 x double> %value to <8 x i32> + %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + + %4 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %2, i32 %id) + %5 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %3, i32 %id) + + %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> + %7 = bitcast <8 x i32> %6 to <4 x double> + + ret <4 x double> %7 +} + +; GLSL: bool subgroupShuffleXor(bool, uint) +define spir_func i1 @_Z25GroupNonUniformShuffleXoribi(i32 %scope, i1 %value, i32 %id) +{ + %1 = zext i1 %value to i32 + %2 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %3 = trunc i32 %2 to i1 + + ret i1 %3 +} + +; GLSL: bvec2 subgroupShuffleXor(bvec2, uint) +define spir_func <2 x i1> @_Z25GroupNonUniformShuffleXoriDv2_bi(i32 %scope, <2 x i1> %value, i32 %id) +{ + %1 = zext <2 x i1> %value to <2 x i32> + %2 = call <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = trunc <2 x i32> %2 to <2 x i1> + + ret <2 x i1> %3 +} + +; GLSL: bvec3 subgroupShuffleXor(bvec3, uint) +define spir_func <3 x i1> @_Z25GroupNonUniformShuffleXoriDv3_bi(i32 %scope, <3 x i1> %value, i32 %id) +{ + %1 = zext <3 x i1> %value to <3 x i32> + %2 = call <3 x i32> @_Z25GroupNonUniformShuffleXoriDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = trunc <3 x i32> %2 to <3 x i1> + + ret <3 x i1> %3 +} + +; GLSL: bvec4 subgroupShuffleXor(bvec4, uint) +define spir_func <4 x i1> @_Z25GroupNonUniformShuffleXoriDv4_bi(i32 %scope, <4 x i1> %value, i32 %id) +{ + %1 = zext <4 x i1> %value to <4 x i32> + %2 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = trunc <4 x i32> %2 to <4 x i1> + + ret <4 x i1> %3 +} + +; GLSL: int/uint subgroupShuffleUp(int/uint, uint) +define spir_func i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %value, i32 %delta) +{ + %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %2 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1) + %3 = sub i32 %2, %delta + %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %value, i32 %3) + + ret i32 %4 +} + +; GLSL: ivec2/uvec2 subgroupShuffleUp(ivec2/uvec2, uint) +define spir_func <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %value, i32 %id) +{ + %1 = extractelement <2 x i32> %value, i32 0 + %2 = extractelement <2 x i32> %value, i32 1 + + %3 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id) + %4 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %2, i32 %id) + + %5 = insertelement <2 x i32> undef, i32 %3, i32 0 + %6 = insertelement <2 x i32> %5, i32 %4, i32 1 + + ret <2 x i32> %6 +} + +; GLSL: ivec3/uvec3 subgroupShuffleUp(ivec3/uvec3, uint) +define spir_func <3 x i32> @_Z24GroupNonUniformShuffleUpiDv3_ii(i32 %scope, <3 x i32> %value, i32 %id) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 
@_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id) + %5 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %2, i32 %id) + %6 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %3, i32 %id) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupShuffleUp(ivec4/uvec4, uint) +define spir_func <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupShuffleUp(float, uint) +define spir_func float @_Z24GroupNonUniformShuffleUpifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupShuffleUp(vec2, uint) +define spir_func <2 x float> @_Z24GroupNonUniformShuffleUpiDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupShuffleUp(vec3, uint) +define spir_func <3 x float> @_Z24GroupNonUniformShuffleUpiDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z24GroupNonUniformShuffleUpiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupShuffleUp(vec4, uint) +define spir_func <4 x float> @_Z24GroupNonUniformShuffleUpiDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupShuffleUp(double, uint) +define spir_func double @_Z24GroupNonUniformShuffleUpidi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupShuffleUp(dvec2, uint) +define spir_func <2 x double> @_Z24GroupNonUniformShuffleUpiDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 subgroupShuffleUp(dvec3, uint) +define spir_func <3 x double> @_Z24GroupNonUniformShuffleUpiDv3_di(i32 %scope, <3 x double> 
%value, i32 %id)
+{
+    %1 = bitcast <3 x double> %value to <6 x i32>
+    %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> <i32 4, i32 5>
+
+    %4 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %3, i32 %id)
+    %6 = shufflevector <2 x i32> %5, <2 x i32> <i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+
+    %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+    %8 = bitcast <6 x i32> %7 to <3 x double>
+
+    ret <3 x double> %8
+}
+
+; GLSL: dvec4 subgroupShuffleUp(dvec4, uint)
+define spir_func <4 x double> @_Z24GroupNonUniformShuffleUpiDv4_di(i32 %scope, <4 x double> %value, i32 %id)
+{
+    %1 = bitcast <4 x double> %value to <8 x i32>
+    %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+    %4 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %3, i32 %id)
+
+    %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    %7 = bitcast <8 x i32> %6 to <4 x double>
+
+    ret <4 x double> %7
+}
+
+; GLSL: bool subgroupShuffleUp(bool, uint)
+define spir_func i1 @_Z24GroupNonUniformShuffleUpibi(i32 %scope, i1 %value, i32 %id)
+{
+    %1 = zext i1 %value to i32
+    %2 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id)
+    %3 = trunc i32 %2 to i1
+
+    ret i1 %3
+}
+
+; GLSL: bvec2 subgroupShuffleUp(bvec2, uint)
+define spir_func <2 x i1> @_Z24GroupNonUniformShuffleUpiDv2_bi(i32 %scope, <2 x i1> %value, i32 %id)
+{
+    %1 = zext <2 x i1> %value to <2 x i32>
+    %2 = call <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id)
+    %3 = trunc <2 x i32> %2 to <2 x i1>
+
+    ret <2 x i1> %3
+}
+
+; GLSL: bvec3 subgroupShuffleUp(bvec3, uint)
+define spir_func <3 x i1> @_Z24GroupNonUniformShuffleUpiDv3_bi(i32 %scope, <3 x i1> %value, i32 %id)
+{
+    %1 = zext <3 x i1> %value to <3 x i32>
+    %2 = call <3 x i32> @_Z24GroupNonUniformShuffleUpiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id)
+    %3 = trunc <3 x i32> %2 to <3 x i1>
+
+    ret <3 x i1> %3
+}
+
+; GLSL: bvec4 subgroupShuffleUp(bvec4, uint)
+define spir_func <4 x i1> @_Z24GroupNonUniformShuffleUpiDv4_bi(i32 %scope, <4 x i1> %value, i32 %id)
+{
+    %1 = zext <4 x i1> %value to <4 x i32>
+    %2 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id)
+    %3 = trunc <4 x i32> %2 to <4 x i1>
+
+    ret <4 x i1> %3
+}
+
+; GLSL: int/uint subgroupShuffleDown(int/uint, uint)
+define spir_func i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %value, i32 %delta)
+{
+    %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+    %2 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1)
+    %3 = add i32 %2, %delta
+    %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %value, i32 %3)
+
+    ret i32 %4
+}
+
+; GLSL: ivec2/uvec2 subgroupShuffleDown(ivec2/uvec2, uint)
+define spir_func <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %value, i32 %id)
+{
+    %1 = extractelement <2 x i32> %value, i32 0
+    %2 = extractelement <2 x i32> %value, i32 1
+
+    %3 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id)
+    %4 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %2, i32 %id)
+
+    %5 = insertelement <2 x i32> undef, i32 %3, i32 0
+    %6 = insertelement <2 x i32> %5, i32 %4, i32 1
+
+    ret <2 x i32> %6
+}
+
+; GLSL: ivec3/uvec3 subgroupShuffleDown(ivec3/uvec3, uint)
+define 
spir_func <3 x i32> @_Z26GroupNonUniformShuffleDowniDv3_ii(i32 %scope, <3 x i32> %value, i32 %id) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id) + %5 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %2, i32 %id) + %6 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %3, i32 %id) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupShuffleDown(ivec4/uvec4, uint) +define spir_func <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupShuffleDown(float, uint) +define spir_func float @_Z26GroupNonUniformShuffleDownifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupShuffleDown(vec2, uint) +define spir_func <2 x float> @_Z26GroupNonUniformShuffleDowniDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupShuffleDown(vec3, uint) +define spir_func <3 x float> @_Z26GroupNonUniformShuffleDowniDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z26GroupNonUniformShuffleDowniDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupShuffleDown(vec4, uint) +define spir_func <4 x float> @_Z26GroupNonUniformShuffleDowniDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupShuffleDown(double, uint) +define spir_func double @_Z26GroupNonUniformShuffleDownidi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupShuffleDown(dvec2, uint) +define spir_func <2 x double> @_Z26GroupNonUniformShuffleDowniDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x 
i32>
+    %2 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %1, i32 %id)
+    %3 = bitcast <4 x i32> %2 to <2 x double>
+
+    ret <2 x double> %3
+}
+
+; GLSL: dvec3 subgroupShuffleDown(dvec3, uint)
+define spir_func <3 x double> @_Z26GroupNonUniformShuffleDowniDv3_di(i32 %scope, <3 x double> %value, i32 %id)
+{
+    %1 = bitcast <3 x double> %value to <6 x i32>
+    %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> <i32 4, i32 5>
+
+    %4 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %3, i32 %id)
+    %6 = shufflevector <2 x i32> %5, <2 x i32> <i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+
+    %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+    %8 = bitcast <6 x i32> %7 to <3 x double>
+
+    ret <3 x double> %8
+}
+
+; GLSL: dvec4 subgroupShuffleDown(dvec4, uint)
+define spir_func <4 x double> @_Z26GroupNonUniformShuffleDowniDv4_di(i32 %scope, <4 x double> %value, i32 %id)
+{
+    %1 = bitcast <4 x double> %value to <8 x i32>
+    %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+    %4 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %3, i32 %id)
+
+    %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    %7 = bitcast <8 x i32> %6 to <4 x double>
+
+    ret <4 x double> %7
+}
+
+; GLSL: bool subgroupShuffleDown(bool, uint)
+define spir_func i1 @_Z26GroupNonUniformShuffleDownibi(i32 %scope, i1 %value, i32 %id)
+{
+    %1 = zext i1 %value to i32
+    %2 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id)
+    %3 = trunc i32 %2 to i1
+
+    ret i1 %3
+}
+
+; GLSL: bvec2 subgroupShuffleDown(bvec2, uint)
+define spir_func <2 x i1> @_Z26GroupNonUniformShuffleDowniDv2_bi(i32 %scope, <2 x i1> %value, i32 %id)
+{
+    %1 = zext <2 x i1> %value to <2 x i32>
+    %2 = call <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %1, i32 %id)
+    %3 = trunc <2 x i32> %2 to <2 x i1>
+
+    ret <2 x i1> %3
+}
+
+; GLSL: bvec3 subgroupShuffleDown(bvec3, uint)
+define spir_func <3 x i1> @_Z26GroupNonUniformShuffleDowniDv3_bi(i32 %scope, <3 x i1> %value, i32 %id)
+{
+    %1 = zext <3 x i1> %value to <3 x i32>
+    %2 = call <3 x i32> @_Z26GroupNonUniformShuffleDowniDv3_ii(i32 %scope, <3 x i32> %1, i32 %id)
+    %3 = trunc <3 x i32> %2 to <3 x i1>
+
+    ret <3 x i1> %3
+}
+
+; GLSL: bvec4 subgroupShuffleDown(bvec4, uint)
+define spir_func <4 x i1> @_Z26GroupNonUniformShuffleDowniDv4_bi(i32 %scope, <4 x i1> %value, i32 %id)
+{
+    %1 = zext <4 x i1> %value to <4 x i32>
+    %2 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %1, i32 %id)
+    %3 = trunc <4 x i32> %2 to <4 x i1>
+
+    ret <4 x i1> %3
+}
+
+; GLSL: ivec/uvec subgroupAdd(ivec/uvec)
+; ivec/uvec subgroupInclusiveAdd(ivec/uvec)
+; ivec/uvec subgroupExclusiveAdd(ivec/uvec)
+
+; GLSL: vec subgroupAdd(vec)
+; vec subgroupInclusiveAdd(vec)
+; vec subgroupExclusiveAdd(vec)
+
+; GLSL: dvec subgroupAdd(dvec)
+; dvec subgroupInclusiveAdd(dvec)
+; dvec subgroupExclusiveAdd(dvec)
+
+; GLSL: ivec/uvec subgroupMul(ivec/uvec)
+; ivec/uvec subgroupInclusiveMul(ivec/uvec)
+; ivec/uvec subgroupExclusiveMul(ivec/uvec)
+
+; GLSL: vec subgroupMul(vec)
+; vec subgroupInclusiveMul(vec)
+; vec subgroupExclusiveMul(vec)
+
+; GLSL: dvec subgroupMul(dvec)
+; dvec subgroupInclusiveMul(dvec)
+; dvec 
subgroupExclusiveMul(dvec) + +; GLSL: ivec subgroupMin(ivec) +; ivec subgroupInclusiveMin(ivec) +; ivec subgroupExclusiveMin(ivec) + +; GLSL: uvec subgroupMin(uvec) +; uvec subgroupInclusiveMin(uvec) +; uvec subgroupExclusiveMin(uvec) + +; GLSL: vec subgroupMin(vec) +; vec subgroupInclusiveMin(vec) +; vec subgroupExclusiveMin(vec) + +; GLSL: dvec subgroupMin(dvec) +; dvec subgroupInclusiveMin(dvec) +; dvec subgroupExclusiveMin(dvec) + +; GLSL: ivec subgroupMax(ivec) +; ivec subgroupInclusiveMax(ivec) +; ivec subgroupExclusiveMax(ivec) + +; GLSL: uvec subgroupMax(uvec) +; uvec subgroupInclusiveMax(uvec) +; uvec subgroupExclusiveMax(uvec) + +; GLSL: vec subgroupMax(vec) +; vec subgroupInclusiveMax(vec) +; vec subgroupExclusiveMax(vec) + +; GLSL: dvec subgroupMax(dvec) +; dvec subgroupInclusiveMax(dvec) +; dvec subgroupExclusiveMax(dvec) + +; GLSL: ivec/uvec subgroupAnd(ivec/uvec) +; ivec/uvec subgroupInclusiveAnd(ivec/uvec) +; ivec/uvec subgroupExclusiveAnd(ivec/uvec) + +; GLSL: ivec/uvec subgroupOr(ivec/uvec) +; ivec/uvec subgroupInclusiveOr(ivec/uvec) +; ivec/uvec subgroupExclusiveOr(ivec/uvec) + +; GLSL: ivec/uvec subgroupXor(ivec/uvec) +; ivec/uvec subgroupInclusiveXor(ivec/uvec) +; ivec/uvec subgroupExclusiveXor(ivec/uvec) + +; GLSL: bvec subgroupAnd(bvec) +; bvec subgroupInclusiveAnd(bvec) +; bvec subgroupExclusiveAnd(bvec) + +; GLSL: bvec subgroupOr(bvec) +; bvec subgroupInclusiveOr(bvec) +; bvec subgroupExclusiveOr(bvec) + +; GLSL: bvec subgroupXor(bvec) +; bvec subgroupInclusiveXor(bvec) +; bvec subgroupExclusiveXor(bvec) + +; GLSL: gvec subgroupQuadBroadcast(gvec, uint) + +; GLSL: gvec subgroupQuadSwapHorizontal(gvec) +; gvec subgroupQuadSwapVertical(gvec) +; gvec subgroupQuadSwapDiagonal(gvec) + ; ===================================================================================================================== ; >>> Interpolation Functions ; ===================================================================================================================== @@ -483,6 +2060,9 @@ declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #2 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 declare i64 @llvm.cttz.i64(i64, i1) #0 +declare i64 @llvm.ctlz.i64(i64, i1) #0 +declare i64 @llvm.ctpop.i64(i64) #0 +declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/icd/api/llpc/patch/generate/glslSpecialOpEmuF16.ll b/icd/api/llpc/patch/generate/glslSpecialOpEmuF16.ll index 6eaaac95..9849eeb4 100644 --- a/icd/api/llpc/patch/generate/glslSpecialOpEmuF16.ll +++ b/icd/api/llpc/patch/generate/glslSpecialOpEmuF16.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2018, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. 
-;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslSpecialOpEmuF16.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL special graphics-specific operations (float16). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.py b/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.py index bd958a69..5ceb4471 100644 --- a/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.py +++ b/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.py @@ -40,7 +40,9 @@ LLVM_DECLS = {} # Image opcode traits are encoded in function name using these tokens -SPIRV_IMAGE_PREFIX = "llpc.image" +SPIRV_IMAGE_PREFIX = "llpc" +SPIRV_IMAGE_MODIFIER = "image" +SPIRV_IMAGE_SPARSE_MODIFIER = "sparse" SPIRV_IMAGE_OPERAND_DREF_MODIFIER = "dref" SPIRV_IMAGE_OPERAND_PROJ_MODIFIER = "proj" SPIRV_IMAGE_OPERAND_BIAS_MODIFIER = "bias" @@ -259,7 +261,8 @@ def copyFrom(self, other): self._cubeId = other._cubeId self._atomicData = other._atomicData - self._lzOptimization = other._lzOptimization + self._supportLzOptimization = other._supportLzOptimization + self._supportSparse = other._supportSparse self._mangledName = other._mangledName pass @@ -303,7 +306,8 @@ def __init__(self, mangledName, sampledType): # For zero-LOD optimization, will generate 2 version of function, a lz optimized version which uses # zero-LOD instruction, and a normal version uses lod instruction. 
- self._lzOptimization = (mangledName.find(SPIRV_IMAGE_OPERAND_LODLODZ_MODIFIER) != -1) + self._supportLzOptimization = (mangledName.find(SPIRV_IMAGE_OPERAND_LODLODZ_MODIFIER) != -1) + self._supportSparse = (mangledName.find(SPIRV_IMAGE_SPARSE_MODIFIER) != -1) self._mangledName = mangledName pass @@ -312,13 +316,13 @@ def parse(self): # Gets each image opcode trait token from function's mangled name tokens = self._mangledName.split('.') # Parses SpirvImageOpKind - opKind = tokens[1] + opKind = tokens[2] assert opKind in SPIRV_IMAGE_INST_KIND_DICT, "Error: " + self._mangledName self._opKind = SPIRV_IMAGE_INST_KIND_DICT[opKind] self._attr = SPIRV_IMAGE_INST_KIND_ATTR_DICT[self._opKind] # Parses dimension - dimName = tokens[2] + dimName = tokens[3] arrayed = False if dimName.find(SPIRV_IMAGE_ARRAY_MODIFIER) != -1: arrayed = True @@ -328,7 +332,7 @@ def parse(self): self._arrayed = arrayed # Parses other traits - for t in tokens[3:]: + for t in tokens[4:]: if t == SPIRV_IMAGE_OPERAND_DREF_MODIFIER: self._hasDref = True elif t == SPIRV_IMAGE_OPERAND_PROJ_MODIFIER: @@ -400,10 +404,29 @@ def __init__(self, funcDefBase, gfxLevel): self._gfxLevel = gfxLevel pass - # Generates image function implementation, will detect zero-LOD optimization and generate both normal - # and optimized version. + # Start code generation def gen(self, irOut): - if self._lzOptimization: + self._genWithSparse(irOut) + pass + + # Generate both normal and sparse version + def _genWithSparse(self, irOut): + if self._supportSparse: + # Generate sparse version + codeGen = CodeGen(self, self._gfxLevel) + codeGen._genWithLzOptimization(irOut) + + # Turn off sparse support + self._supportSparse = False + + # Generate normal version + codeGen = CodeGen(self, self._gfxLevel) + codeGen._genWithLzOptimization(irOut) + pass + + # Generate both normal and zero-LOD optimized version + def _genWithLzOptimization(self, irOut): + if self._supportLzOptimization: # Generate zero-LOD optimized version self._mangledName = self._mangledName.replace(SPIRV_IMAGE_OPERAND_LODLODZ_MODIFIER, SPIRV_IMAGE_OPERAND_LODZ_MODIFIER) @@ -411,21 +434,21 @@ def gen(self, irOut): codeGen._genInternal(irOut) # Turn off zero-LOD optimization - self._lzOptimization = False + self._supportLzOptimization = False self._mangledName = self._mangledName.replace(SPIRV_IMAGE_OPERAND_LODZ_MODIFIER, SPIRV_IMAGE_OPERAND_LOD_MODIFIER) - + # Generate normal version codeGen = CodeGen(self, self._gfxLevel) codeGen._genInternal(irOut) pass # Generates image function implementation. def _genInternal(self, irOut): - retType = self._opKind == SpirvImageOpKind.write and "void" or self.getReturnType() + retType = self._supportSparse and self.getSparseReturnType(self.getReturnType()) or self.getReturnType() irFuncDef = "define %s @%s(%s) %s\n" % (retType, - self.getFunctionName(), - self.getParamList(), - self._attr) + self.getFunctionName(), + self.getParamList(), + self._attr) irOut.write(irFuncDef) irOut.write('{\n') self.genLoadSamplerAndResource(irOut) @@ -478,31 +501,46 @@ def _genInternal(self, irOut): # Gets return type of image operation, which is type of texel. 
def getReturnType(self): - if self.isAtomicOp(): - return self._sampledType == SpirvSampledType.f32 and "float" or "i32" + ret = "void" + if self._opKind == SpirvImageOpKind.write: + pass + elif self.isAtomicOp(): + ret = self._sampledType == SpirvSampledType.f32 and "float" or "i32" elif self._opKind == SpirvImageOpKind.querylod: - return "<2 x float>" + ret = "<2 x float>" elif self._hasDref and self._opKind != SpirvImageOpKind.gather: assert self._sampledType == SpirvSampledType.f32 - return "float" + ret = "float" else: if self._sampledType == SpirvSampledType.f32: - return "<4 x float>" + ret = "<4 x float>" elif self._sampledType == SpirvSampledType.i32: - return "<4 x i32>" + ret = "<4 x i32>" elif self._sampledType == SpirvSampledType.u32: - return "<4 x i32>" + ret = "<4 x i32>" else: shouldNeverCall() + return ret + + def getSparseReturnType(self, dataReturnType): + assert self._supportSparse + return "{ i32, %s }" % (dataReturnType) + # Gets image function name. def getFunctionName(self): tokens = self._mangledName.split('.') - tokens[0] = SPIRV_IMAGE_PREFIX - assert tokens[2].startswith(SPIRV_IMAGE_DIM_PREFIX) - tokens[2] = tokens[2][len(SPIRV_IMAGE_DIM_PREFIX):] + assert tokens[0] == SPIRV_IMAGE_PREFIX + assert tokens[3].startswith(SPIRV_IMAGE_DIM_PREFIX) + + # Setup image sparse modifier in function name + tokens[1] = self._supportSparse and SPIRV_IMAGE_MODIFIER + SPIRV_IMAGE_SPARSE_MODIFIER \ + or SPIRV_IMAGE_MODIFIER + + # Remove dim prefix in function name + tokens[3] = tokens[3][len(SPIRV_IMAGE_DIM_PREFIX):] sampledTypeName = rFind(SPIRV_SAMPLED_TYPE_DICT, self._sampledType) - tokens.insert(2, sampledTypeName) + tokens.insert(3, sampledTypeName) funcName = '.'.join(tokens) # For atomic operations, atomic.xxx has been changed to atomic_xxx to ease python process, @@ -652,8 +690,21 @@ def processReturn(self, retVal, intrinGen, irOut): irOut.write(" %s = extractelement %s %s, i32 0\n" % (retVal, \ intrinGen.getBackendRetType(), \ oldRetVal)) - retType = self._opKind == SpirvImageOpKind.write and "void" or self.getReturnType() - irOut.write(" ret %s %s\n" % (retType, retVal)) + retType = self.getReturnType() + + if self._supportSparse: + # Return value of sparse instruction is struct + sparseRetType = self.getSparseReturnType(retType) + tempRetVal = self.acquireLocalVar() + irOut.write(" %s = insertvalue %s undef, i32 1, 0\n" % (tempRetVal, sparseRetType)) + dataRetVal = retVal + retVal = self.acquireLocalVar() + irOut.write(" %s = insertvalue %s %s, %s %s, 1\n" % (retVal, sparseRetType, tempRetVal, retType, dataRetVal)) + irOut.write(" ret %s %s\n" % (sparseRetType, retVal)) + pass + else: + irOut.write(" ret %s %s\n" % (retType, retVal)) + pass # Generates coordinate parameters. 
def genCoord(self, irOut): @@ -1154,7 +1205,7 @@ def genFillVAddrReg(self, constOffsetsIndex, isFetchingFromFmask, irOut): index += 1 irOut.write(ret[1]) - if self._hasLod and not self._lzOptimization: + if self._hasLod and not self._supportLzOptimization: ret = self.getInsertElement(vaddrReg, vaddrRegType, vaddrRegCompType, self._lod, index) vaddrReg = ret[0] index += 1 @@ -1168,7 +1219,7 @@ def getVAddrRegSize(self): size += 1 if self._hasBias: size += 1 - if self._hasLod and not self._lzOptimization: + if self._hasLod and not self._supportLzOptimization: size += 1 if self._hasGrad: size += self.getCoordNumComponents(False, False, False) * 2 @@ -1839,9 +1890,9 @@ def getFuncName(self): if self._hasBias: funcName += ".b" - elif self._hasLod and not self._lzOptimization: + elif self._hasLod and not self._supportLzOptimization: funcName += ".l" - elif self._hasLod and self._lzOptimization: + elif self._hasLod and self._supportLzOptimization: funcName += ".lz" elif self._hasGrad: funcName += ".d" @@ -1870,7 +1921,11 @@ def processLine(irOut, funcConfig, gfxLevel): # A mangled function configuration looks like: # llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.bias.constoffset # Supported configuration tokens: (All tokens must follow this order) - # 1. One of: (mandatory) + # 0. llpc (mandatory) + # 1. image or image|sparse (mandatory) + # image|sparse means sparse instruction is supported for this function, an additional sparse version + # will be generated. + # 2. One of: (mandatory) # sample # fetch # gather @@ -1879,7 +1934,7 @@ def processLine(irOut, funcConfig, gfxLevel): # write # atomic.exchange # atomic.compExchange - # atomic_iincrement + # atomic.iincrement # atomic.idecrement # atomic.iadd # atomic.isub @@ -1890,40 +1945,47 @@ def processLine(irOut, funcConfig, gfxLevel): # atomic.and # atomic.or # atomic.xor - # 2. Dimension string (mandatory, see below) - # 3. proj (optional) - # 4. dref (optional) - # 5. bias (optional) - # 6. lod (optional) - # 7. grad (optional) - # 8. constoffset (optional) - # 9. offset (optional) - # 10. constoffsets (optional) - # 11. sample (optional) - # 12. minlod (optional) - # 13. fmaskbased (optional) - # 14. fmaskonly (optional) + # 3. Dimension string (mandatory, see below) + # 4. proj (optional) + # 5. dref (optional) + # 6. bias (optional) + # 7. lod or lod|lodz (optional) + # lod|lodz means lz optimization is enabled for this function, besides normal lod version, an additional + # lodz version will also be generated, which leverages hardware lz instructions. + # 8. grad (optional) + # 9. constoffset (optional) + # 10. offset (optional) + # 11. constoffsets (optional) + # 12. sample (optional) + # 13. minlod (optional) + # 14. fmaskbased (optional) + # 15. fmaskonly (optional) # Dimension string: All supported dimensions are packed in a dimension string, as a configuration token. 
# Dimension string format: - # Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray|Rect|Buffer + # Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray|Rect|Buffer|SubpassData print(">>> %s" % (funcConfig)) - assert funcConfig.startswith(SPIRV_IMAGE_PREFIX), "Error: " + funcConfig # For atomic operations, replace atomic.xxx to atomic_xxx to ease python process funcConfig = funcConfig.replace('atomic.', 'atomic_') - funcConfig = funcConfig[(len(SPIRV_IMAGE_PREFIX)):] mangledTokens = funcConfig.split('.') + # check token 0 + assert funcConfig.startswith(SPIRV_IMAGE_PREFIX), "Error prefix: " + funcConfig + + # check token 1 + assert mangledTokens[1] in (SPIRV_IMAGE_MODIFIER, SPIRV_IMAGE_MODIFIER + '|' + SPIRV_IMAGE_SPARSE_MODIFIER), \ + "Error image modifier" + funcConfig + # Extract dimensions from dimension string - dimString = mangledTokens[2] + dimString = mangledTokens[3] assert dimString.startswith(SPIRV_IMAGE_DIM_PREFIX), "" + dimString dims = dimString[3:].split('|') - opKind = SPIRV_IMAGE_INST_KIND_DICT[mangledTokens[1]] + opKind = SPIRV_IMAGE_INST_KIND_DICT[mangledTokens[2]] # Generate function definition for each dimension for dim in dims: - mangledTokens[2] = SPIRV_IMAGE_DIM_PREFIX + dim + mangledTokens[3] = SPIRV_IMAGE_DIM_PREFIX + dim mangledName = '.'.join(mangledTokens) if opKind in (SpirvImageOpKind.sample, SpirvImageOpKind.fetch, SpirvImageOpKind.gather, \ SpirvImageOpKind.querylod, SpirvImageOpKind.read, SpirvImageOpKind.write, \ diff --git a/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.txt b/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.txt index 5726b7c6..4b345708 100644 --- a/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.txt +++ b/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.txt @@ -1,6 +1,10 @@ # Configuration to generate LLVM IR implementation for all SPIR-V image instructions # Supported configuration tokens: (All tokens must follow this order) -# 1. One of: (mandatory) +# 0. llpc (mandatory) +# 1. image or image|sparse (mandatory) +# image|sparse means sparse instruction is supported for this function, an additional sparse version +# will be generated. +# 2. One of: (mandatory) # sample # fetch # gather @@ -20,21 +24,21 @@ # atomic.and # atomic.or # atomic.xor -# 2. Dimension string (mandatory, see below) -# 3. proj (optional) -# 4. dref (optional) -# 5. bias (optional) -# 6. lod or lod|lodz (optional) +# 3. Dimension string (mandatory, see below) +# 4. proj (optional) +# 5. dref (optional) +# 6. bias (optional) +# 7. lod or lod|lodz (optional) # lod|lodz means lz optimization is enabled for this function, besides normal lod version, an additional # lodz version will also be generated, which leverages hardware lz instructions. -# 7. grad (optional) -# 8. constoffset (optional) -# 9. offset (optional) -# 10. constoffsets (optional) -# 11. sample (optional) -# 12. minlod (optional) -# 13. fmaskbased (optional) -# 14. fmaskonly (optional) +# 8. grad (optional) +# 9. constoffset (optional) +# 10. offset (optional) +# 11. constoffsets (optional) +# 12. sample (optional) +# 13. minlod (optional) +# 14. fmaskbased (optional) +# 15. fmaskonly (optional) # Dimension string: All supported dimensions are packed in a dimension string, as a configuration token. 
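# Illustrative example (assumed, not an entry from the list below): a configuration line such as
#     llpc.image|sparse.fetch.Dim1D|2D.lod
# is expanded once per dimension, and the "image|sparse" modifier additionally produces a sparse
# variant of each expansion, giving emulation functions named roughly
#     llpc.image.fetch.f32.1D.lod        llpc.imagesparse.fetch.f32.1D.lod
#     llpc.image.fetch.f32.2D.lod        llpc.imagesparse.fetch.f32.2D.lod
# (likewise for the other sampled types), where the sparse variants return the residency struct
# described for getSparseReturnType() in the generator script.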
# Dimension string format: # Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray|Rect|Buffer|SubpassData @@ -42,66 +46,66 @@ # llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.bias.constoffset # Sampling instructions -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray.dref -llpc.image.sample.Dim1D|2D|3D|Rect.proj -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.lod|lodz -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.bias -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.dref.lod|lodz -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Cube.dref.bias -llpc.image.sample.Dim1D|2D|3D|Rect.proj.lod|lodz -llpc.image.sample.Dim1D|2D|3D|Rect.proj.bias -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.lod|lodz -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.bias +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray.dref +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.lod|lodz +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.bias +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.dref.lod|lodz +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Cube.dref.bias +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.lod|lodz +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.bias +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref.lod|lodz +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref.bias -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray.grad -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube.dref.grad -llpc.image.sample.Dim1D|2D|3D|Rect.proj.grad -llpc.image.sample.Dim1D|2D|3D.proj.dref.grad +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray.grad +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube.dref.grad +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.grad +llpc.image|sparse.sample.Dim1D|2D|3D.proj.dref.grad -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.dref.constoffset -llpc.image.sample.Dim1D|2D|3D|Rect.proj.constoffset -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray.lod|lodz.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray.bias.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray.dref.lod|lodz.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray.dref.bias.constoffset -llpc.image.sample.Dim1D|2D|3D.proj.lod|lodz.constoffset -llpc.image.sample.Dim1D|2D|3D.proj.bias.constoffset -llpc.image.sample.Dim1D|2D|3D.proj.dref.lod|lodz.constoffset -llpc.image.sample.Dim1D|2D|3D.proj.dref.bias.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.dref.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray.lod|lodz.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray.bias.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray.dref.lod|lodz.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray.dref.bias.constoffset 
+llpc.image|sparse.sample.Dim1D|2D|3D.proj.lod|lodz.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D.proj.bias.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D.proj.dref.lod|lodz.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D.proj.dref.bias.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.grad.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.dref.grad.constoffset -llpc.image.sample.Dim1D|2D|3D|Rect.proj.grad.constoffset -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.grad.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.grad.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.dref.grad.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.grad.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref.grad.constoffset # Fetch instructions -llpc.image.fetch.Dim1D|2D|3D|1DArray|2DArray.lod -llpc.image.fetch.Dim1D|2D|3D|1DArray|2DArray|Rect|Buffer -llpc.image.fetch.Dim1D|2D|3D|1DArray|2DArray.lod.constoffset -llpc.image.fetch.DimRect.constoffset -llpc.image.fetch.Dim2D|2DArray.sample -llpc.image.fetch.Dim2D|2DArray.sample.fmaskbased +llpc.image|sparse.fetch.Dim1D|2D|3D|1DArray|2DArray.lod +llpc.image|sparse.fetch.Dim1D|2D|3D|1DArray|2DArray|Rect|Buffer +llpc.image|sparse.fetch.Dim1D|2D|3D|1DArray|2DArray.lod.constoffset +llpc.image|sparse.fetch.DimRect.constoffset +llpc.image|sparse.fetch.Dim2D|2DArray.sample +llpc.image|sparse.fetch.Dim2D|2DArray.sample.fmaskbased llpc.image.fetch.Dim2D|2DArray.sample.fmaskonly # Gather instructions -llpc.image.gather.Dim2D|2DArray|Cube|CubeArray|Rect -llpc.image.gather.Dim2D|2DArray|Cube|CubeArray|Rect.dref -llpc.image.gather.Dim2D|2DArray|Rect.constoffset -llpc.image.gather.Dim2D|2DArray|Rect.dref.constoffset -llpc.image.gather.Dim2D|2DArray|Rect.offset -llpc.image.gather.Dim2D|2DArray|Rect.dref.offset -llpc.image.gather.Dim2D|2DArray|Rect.constoffsets -llpc.image.gather.Dim2D|2DArray|Rect.dref.constoffsets +llpc.image|sparse.gather.Dim2D|2DArray|Cube|CubeArray|Rect +llpc.image|sparse.gather.Dim2D|2DArray|Cube|CubeArray|Rect.dref +llpc.image|sparse.gather.Dim2D|2DArray|Rect.constoffset +llpc.image|sparse.gather.Dim2D|2DArray|Rect.dref.constoffset +llpc.image|sparse.gather.Dim2D|2DArray|Rect.offset +llpc.image|sparse.gather.Dim2D|2DArray|Rect.dref.offset +llpc.image|sparse.gather.Dim2D|2DArray|Rect.constoffsets +llpc.image|sparse.gather.Dim2D|2DArray|Rect.dref.constoffsets # Image read and write instructions -llpc.image.read.Dim1D|2D|3D|Rect|Cube|Buffer|1DArray|2DArray|CubeArray|SubpassData|SubpassDataArray -llpc.image.read.Dim2D|2DArray|SubpassData.sample -llpc.image.read.DimSubpassData.sample.fmaskbased -llpc.image.read.Dim1D|2D|3D|Cube|1DArray|2DArray|CubeArray.lod +llpc.image|sparse.read.Dim1D|2D|3D|Rect|Cube|Buffer|1DArray|2DArray|CubeArray|SubpassData|SubpassDataArray +llpc.image|sparse.read.Dim2D|2DArray|SubpassData.sample +llpc.image|sparse.read.DimSubpassData.sample.fmaskbased +llpc.image|sparse.read.Dim1D|2D|3D|Cube|1DArray|2DArray|CubeArray.lod llpc.image.write.Dim1D|2D|3D|Rect|Cube|Buffer|1DArray|2DArray|CubeArray llpc.image.write.Dim2D|2DArray.sample llpc.image.write.Dim1D|2D|3D|Cube|1DArray|2DArray|CubeArray.lod diff --git a/icd/api/llpc/patch/llpcCodeGenManager.cpp b/icd/api/llpc/patch/llpcCodeGenManager.cpp index 3415eb7e..03e70b7f 100644 --- a/icd/api/llpc/patch/llpcCodeGenManager.cpp +++ b/icd/api/llpc/patch/llpcCodeGenManager.cpp @@ -45,13 +45,10 @@ #include "spirv.hpp" #include "llpcCodeGenManager.h" -#include "llpcGfx6ConfigBuilder.h" -#ifdef 
LLPC_BUILD_GFX9 -#include "llpcGfx9ConfigBuilder.h" -#endif #include "llpcContext.h" #include "llpcElf.h" -#include "llpcGfx6Chip.h" +#include "llpcGfx6ConfigBuilder.h" +#include "llpcGfx9ConfigBuilder.h" #include "llpcInternal.h" namespace llvm @@ -193,7 +190,7 @@ Result CodeGenManager::GenerateCode( if (cl::EmitLlvm) { - WriteBitcodeToFile(pModule, outStream); + WriteBitcodeToFile(*pModule, outStream); return result; } @@ -316,11 +313,7 @@ Result CodeGenManager::BuildGraphicsPipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineVsFsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } } else if (hasTs && (hasGs == false)) @@ -332,11 +325,7 @@ Result CodeGenManager::BuildGraphicsPipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineVsTsFsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } } else if ((hasTs == false) && hasGs) @@ -348,11 +337,7 @@ Result CodeGenManager::BuildGraphicsPipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineVsGsFsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } } else @@ -364,11 +349,7 @@ Result CodeGenManager::BuildGraphicsPipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineVsTsGsFsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } } @@ -393,11 +374,7 @@ Result CodeGenManager::BuildComputePipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineCsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } return result; diff --git a/icd/api/llpc/patch/llpcIntrinsDefs.h b/icd/api/llpc/patch/llpcIntrinsDefs.h index 9f3453ee..3baaf467 100644 --- a/icd/api/llpc/patch/llpcIntrinsDefs.h +++ b/icd/api/llpc/patch/llpcIntrinsDefs.h @@ -49,8 +49,8 @@ static const uint32_t GS_EMIT_STREAM0 = 0x22; // [3:0] = 2 (GS), [5:4] = 2 (emit enum AddrSpace { ADDR_SPACE_GLOBAL = 1, // Global memory - ADDR_SPACE_CONST = 2, // Constant memory ADDR_SPACE_LOCAL = 3, // Local memory + ADDR_SPACE_CONST = 4, // Constant memory }; // Enumerates the target for "export" instruction. diff --git a/icd/api/llpc/patch/llpcPatchAddrSpaceMutate.cpp b/icd/api/llpc/patch/llpcPatchAddrSpaceMutate.cpp index b7d12cac..596b155c 100644 --- a/icd/api/llpc/patch/llpcPatchAddrSpaceMutate.cpp +++ b/icd/api/llpc/patch/llpcPatchAddrSpaceMutate.cpp @@ -83,20 +83,48 @@ bool PatchAddrSpaceMutate::runOnModule( m_addrSpaceMap[SPIRAS_Constant] = ADDR_SPACE_CONST; m_addrSpaceMap[SPIRAS_Local] = ADDR_SPACE_LOCAL; - // We are not expecting any global variables that need their types mutating, other than unused ones - // left behind by previous passes. -#ifndef NDEBUG + // Gather the globals and then process them. We do not want to reprocess globals that we create + // here. Ignore unused globals left behind by lowering passes. + SmallVector globalVars; for (auto globalIt = module.global_begin(), globalItEnd = module.global_end(); globalIt != globalItEnd; ++globalIt) { auto pGlobalVar = dyn_cast(&*globalIt); if ((pGlobalVar != nullptr) && (pGlobalVar->use_empty() == false)) { - auto pGlobalType = globalIt->getType(); - LLPC_ASSERT(pGlobalType == MapType(pGlobalType)); + globalVars.push_back(pGlobalVar); + } + } + + // For any global variable whose type needs to change, create a new one. 
We only cope with the + // case where the top level address space changes, so we do not need to worry about modifying + // any initializer. + for (uint32_t globalVarIdx = 0; globalVarIdx != globalVars.size(); ++globalVarIdx) + { + auto pOldGlobalVar = globalVars[globalVarIdx]; + auto pOldGlobalVarType = cast(pOldGlobalVar->getType()); + auto pNewGlobalVarType = cast(MapType(pOldGlobalVarType)); + + if (pOldGlobalVarType != pNewGlobalVarType) + { + LLPC_ASSERT(pOldGlobalVarType->getElementType() == pNewGlobalVarType->getElementType()); + + auto pNewGlobalVar = new GlobalVariable(module, + pOldGlobalVarType->getElementType(), + pOldGlobalVar->isConstant(), + pOldGlobalVar->getLinkage(), + pOldGlobalVar->hasInitializer() ? + pOldGlobalVar->getInitializer() : nullptr, + "", + nullptr, + pOldGlobalVar->getThreadLocalMode(), + pNewGlobalVarType->getAddressSpace(), + pOldGlobalVar->isExternallyInitialized()); + + pNewGlobalVar->takeName(pOldGlobalVar); + m_globalMap[pOldGlobalVar] = pNewGlobalVar; } } -#endif // NDEBUG // Gather the functions and then process them. We do not want to reprocess functions that we create here. SmallVector funcs; diff --git a/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp b/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp index 8b3833fd..ac219b26 100644 --- a/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp +++ b/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp @@ -38,9 +38,7 @@ #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcGfx6Chip.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcGfx9Chip.h" -#endif #include "llpcIntrinsDefs.h" #include "llpcPatchEntryPointMutate.h" diff --git a/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp b/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp index b024b462..b2ef00bd 100644 --- a/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp +++ b/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp @@ -65,10 +65,8 @@ PatchInOutImportExport::PatchInOutImportExport() m_pFragDepth(nullptr), m_pFragStencilRef(nullptr), m_pSampleMask(nullptr), -#ifdef LLPC_BUILD_GFX9 m_pViewportIndex(nullptr), m_pLayer(nullptr), -#endif m_hasTs(false), m_hasGs(false), m_pLds(nullptr), @@ -1290,7 +1288,6 @@ void PatchInOutImportExport::visitReturnInst( // Export gl_Layer and gl_ViewportIndex before entry-point returns if ((m_gfxIp.major >= 9) && (useLayer || useViewportIndex)) { -#ifdef LLPC_BUILD_GFX9 Value* pViewportIndexAndLayer = ConstantInt::get(m_pContext->Int32Ty(), 0); if (useViewportIndex) @@ -1425,7 +1422,6 @@ void PatchInOutImportExport::visitReturnInst( ++inOutUsage.expCount; } } -#endif } // NOTE: If no generic outputs are present in this shader, we have to export a dummy one @@ -2174,11 +2170,6 @@ Value* PatchInOutImportExport::PatchVsBuiltInInputImport( pInput = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.viewIndex); break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -2281,11 +2272,6 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( pInput = inoutUsage.tcs.pInvocationId; break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -2447,11 +2433,6 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( pInput = 
GetFunctionArgument(m_pEntryPoint, entryArgIdxs.viewIndex); break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -2542,11 +2523,6 @@ Value* PatchInOutImportExport::PatchGsBuiltInInputImport( pInput = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.viewIndex); break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -2846,11 +2822,6 @@ Value* PatchInOutImportExport::PatchFsBuiltInInputImport( args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 4)); pInput = EmitCall(m_pModule, "llvm.amdgcn.ubfe.i32", pInputTy, args, NoAttrib, pInsertPos); - break; - } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); break; } case BuiltInDeviceIndex: @@ -2969,11 +2940,6 @@ Value* PatchInOutImportExport::PatchCsBuiltInInputImport( pInput = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.localInvocationId); break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -3368,10 +3334,8 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_Layer is delayed and is done before entry-point returns. m_pLayer = pOutput; -#endif } } @@ -3393,10 +3357,8 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_ViewportIndex is delayed and is done before entry-point returns. m_pViewportIndex = pOutput; -#endif } } @@ -3820,10 +3782,8 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_Layer is delayed and is done before entry-point returns. m_pLayer = pOutput; -#endif } } @@ -3845,10 +3805,8 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_ViewportIndex is delayed and is done before entry-point returns. m_pViewportIndex = pOutput; -#endif } } @@ -4137,10 +4095,8 @@ void PatchInOutImportExport::PatchCopyShaderBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_Layer is delayed and is done before entry-point returns. m_pLayer = pOutput; -#endif } break; @@ -4153,10 +4109,8 @@ void PatchInOutImportExport::PatchCopyShaderBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_ViewportIndex is delayed and is done before entry-point returns. m_pViewportIndex = pOutput; -#endif } break; @@ -5445,12 +5399,9 @@ uint32_t PatchInOutImportExport::CalcPatchCountPerThreadGroup( // NOTE: Performance analysis shows that 16 patches per thread group is an optimal upper-bound. The value is only // an experimental number. For GFX9. 64 is an optimal number instead. -#ifdef LLPC_BUILD_GFX9 const auto gfxIp = m_pContext->GetGfxIpVersion(); const uint32_t optimalPatchCountPerThreadGroup = (gfxIp.major >= 9) ? 
64 : 16; -#else - const uint32_t optimalPatchCountPerThreadGroup = 16; -#endif + patchCountPerThreadGroup = std::min(patchCountPerThreadGroup, optimalPatchCountPerThreadGroup); if (m_pContext->IsTessOffChip()) diff --git a/icd/api/llpc/patch/llpcPatchInOutImportExport.h b/icd/api/llpc/patch/llpcPatchInOutImportExport.h index 6e688637..a5c120ac 100644 --- a/icd/api/llpc/patch/llpcPatchInOutImportExport.h +++ b/icd/api/llpc/patch/llpcPatchInOutImportExport.h @@ -264,12 +264,10 @@ class PatchInOutImportExport: llvm::Value* m_pFragDepth; // Correspond to "out float gl_FragDepth" llvm::Value* m_pFragStencilRef; // Correspond to "out int gl_FragStencilRef" llvm::Value* m_pSampleMask; // Correspond to "out int gl_SampleMask[]" -#ifdef LLPC_BUILD_GFX9 // NOTE: For GFX9, gl_ViewportIndex and gl_Layer are packed with one channel (gl_ViewpoertInex is 16-bit high part // and gl_Layer is 16-bit low part). Thus, the export is delayed with them merged together. llvm::Value* m_pViewportIndex; // Correspond to "out int gl_ViewportIndex" llvm::Value* m_pLayer; // Correspond to "out int gl_Layer" -#endif bool m_hasTs; // Whether the pipeline has tessellation shaders diff --git a/icd/api/llpc/patch/llpcPatchResourceCollect.cpp b/icd/api/llpc/patch/llpcPatchResourceCollect.cpp index 2e20c4f4..7211a203 100644 --- a/icd/api/llpc/patch/llpcPatchResourceCollect.cpp +++ b/icd/api/llpc/patch/llpcPatchResourceCollect.cpp @@ -167,8 +167,6 @@ void PatchResourceCollect::visitCallInst( else if (mangledName.startswith(LlpcName::ImageCallPrefix)) { // Image operations - auto opName = mangledName.substr(strlen(LlpcName::ImageCallPrefix)); - ShaderImageCallMetadata imageCallMeta = {}; LLPC_ASSERT(callInst.getNumArgOperands() >= 2); uint32_t metaOperandIndex = callInst.getNumArgOperands() - 1; @@ -187,18 +185,11 @@ void PatchResourceCollect::visitCallInst( DescriptorPair descPair = { descSet, binding }; m_pResUsage->descPairs.insert(descPair.u64All); - std::string imageSampleName; - std::string imageGatherName; - std::string imageQueryLodName; - SPIRV::SPIRVImageOpKindNameMap::find(ImageOpSample, &imageSampleName); - SPIRV::SPIRVImageOpKindNameMap::find(ImageOpGather, &imageGatherName); - SPIRV::SPIRVImageOpKindNameMap::find(ImageOpQueryLod, &imageQueryLodName); - // NOTE: For image sampling operations, we have to add both resource descriptor and sampler descriptor info // to descriptor usages, operand 0 and 1 are sampler descriptor, 3 and 4 are resource descriptor - if (opName.startswith(imageSampleName) || - opName.startswith(imageGatherName) || - opName.startswith(imageQueryLodName)) + if ((imageOp == ImageOpSample) || + (imageOp == ImageOpGather) || + (imageOp == ImageOpQueryLod)) { uint32_t descSet = cast(callInst.getOperand(3))->getZExtValue(); uint32_t binding = cast(callInst.getOperand(4))->getZExtValue(); diff --git a/icd/api/llpc/tool/amdllpc.cpp b/icd/api/llpc/tool/amdllpc.cpp index bb394b4d..8999e22c 100644 --- a/icd/api/llpc/tool/amdllpc.cpp +++ b/icd/api/llpc/tool/amdllpc.cpp @@ -1149,8 +1149,7 @@ int32_t main( // Translate LLVM module to LLVM bitcode llvm::SmallString<1024> bitcodeBuf; raw_svector_ostream bitcodeStream(bitcodeBuf); - WriteBitcodeToFile(pModule.get(), bitcodeStream); - + WriteBitcodeToFile(*pModule.get(), bitcodeStream); void* pCode = new uint8_t[bitcodeBuf.size()]; memcpy(pCode, bitcodeBuf.data(), bitcodeBuf.size()); compileInfo.spirvBin[shaderStage].codeSize = bitcodeBuf.size(); diff --git a/icd/api/llpc/translator/SPIRVInternal.h b/icd/api/llpc/translator/SPIRVInternal.h index 
056c5565..e1930c7a 100644 --- a/icd/api/llpc/translator/SPIRVInternal.h +++ b/icd/api/llpc/translator/SPIRVInternal.h @@ -346,7 +346,8 @@ namespace kSPIRVName { } namespace gSPIRVName { - const static char ImageCallPrefix[] = "spirv.image."; + const static char ImageCallPrefix[] = "spirv.image"; + const static char ImageCallModSparse[] = "sparse"; const static char ImageCallModDref[] = ".dref"; const static char ImageCallModProj[] = ".proj"; const static char ImageCallModBias[] = ".bias"; diff --git a/icd/api/llpc/translator/SPIRVReader.cpp b/icd/api/llpc/translator/SPIRVReader.cpp index 8f545d0b..fe298ce8 100644 --- a/icd/api/llpc/translator/SPIRVReader.cpp +++ b/icd/api/llpc/translator/SPIRVReader.cpp @@ -1771,8 +1771,6 @@ SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F, // load/store when we visit this bool variable. This issue exists in // DOOM4 released version, we have to keep the workaround. if (Dst->getType()->getPointerElementType() != Src->getType()) { - assert(Src->getType()->isAggregateType() && - Dst->getType()->getPointerElementType()->isAggregateType()); SI = transSPIRVBuiltinFromInst(BS, BB); } else { // NOTE: For those storage classes that will not involve memory @@ -2305,7 +2303,19 @@ SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F, case OpImageQueryLevels: case OpImageQuerySamples: case OpImageRead: - case OpImageWrite: { + case OpImageWrite: + case OpImageSparseSampleImplicitLod: + case OpImageSparseSampleExplicitLod: + case OpImageSparseSampleDrefImplicitLod: + case OpImageSparseSampleDrefExplicitLod: + case OpImageSparseSampleProjImplicitLod: + case OpImageSparseSampleProjExplicitLod: + case OpImageSparseSampleProjDrefImplicitLod: + case OpImageSparseSampleProjDrefExplicitLod: + case OpImageSparseFetch: + case OpImageSparseGather: + case OpImageSparseDrefGather: + case OpImageSparseRead: { return mapValue(BV, transSPIRVImageOpFromInst( static_cast(BV), @@ -2347,6 +2357,27 @@ SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F, false, 0, BB); return mapValue(BV, LI); } + case OpImageSparseTexelsResident: { + SPIRVImageSparseTexelsResident *BI = static_cast(BV); + auto ResidentCode = transValue(BI->getResidentCode(), F, BB); + + std::string FuncName("llpc.imagesparse.texel.resident"); + SmallVector Arg; + Arg.push_back(ResidentCode); + + Function *Func = M->getFunction(FuncName); + if (!Func) { + SmallVector ArgTy; + ArgTy.push_back(Type::getInt32Ty(*Context)); + FunctionType *FuncTy = FunctionType::get(Type::getInt1Ty(*Context), ArgTy, false); + Func = Function::Create(FuncTy, GlobalValue::ExternalLinkage, FuncName, M); + Func->setCallingConv(CallingConv::SPIR_FUNC); + if (isFuncNoUnwind()) + Func->addFnAttr(Attribute::NoUnwind); + } + + return mapValue(BV, CallInst::Create(Func, Arg, "", BB)); + } default: { auto OC = BV->getOpCode(); if (isSPIRVCmpInstTransToLLVMInst(static_cast(BV))) { @@ -2676,13 +2707,19 @@ SPIRVToLLVM::transSPIRVImageOpFromInst(SPIRVInstruction *BI, BasicBlock*BB) if (Info.OpKind != ImageOpQueryNonLod) { // Generate name strings for image calls: - // Format: prefix.op.[f32|i32|u32].dim[.proj][.dref][.bias][.lod][.grad] - // [.constoffset][.offset] - // [.constoffsets][.sample][.minlod] + // Format: prefix.image[sparse].op.[f32|i32|u32].dim[.proj][.dref][.bias][.lod][.grad] + // [.constoffset][.offset] + // [.constoffsets][.sample][.minlod] // Add call prefix SS << gSPIRVName::ImageCallPrefix; + // Add sparse modifier + if (Info.IsSparse) + SS << gSPIRVName::ImageCallModSparse; + + SS << 
"."; + // Add image operation kind std::string S; SPIRVImageOpKindNameMap::find(Info.OpKind, &S); @@ -2853,7 +2890,7 @@ SPIRVToLLVM::transSPIRVImageOpFromInst(SPIRVInstruction *BI, BasicBlock*BB) // Format: prefix.query.op.dim[.cubearray][.buffer].returntype // Add call prefix - SS << gSPIRVName::ImageCallPrefix; + SS << gSPIRVName::ImageCallPrefix << "."; // Add image operation kind: query std::string S; diff --git a/icd/api/llpc/translator/SPIRVUtil.cpp b/icd/api/llpc/translator/SPIRVUtil.cpp index 3cc64c75..af887599 100644 --- a/icd/api/llpc/translator/SPIRVUtil.cpp +++ b/icd/api/llpc/translator/SPIRVUtil.cpp @@ -108,7 +108,7 @@ saveLLVMModule(Module *M, const std::string &OutputFile) { return; } - WriteBitcodeToFile(M, Out.os()); + WriteBitcodeToFile(*M, Out.os()); Out.keep(); } diff --git a/icd/api/llpc/translator/libSPIRV/SPIRVEntry.h b/icd/api/llpc/translator/libSPIRV/SPIRVEntry.h index 4d33adca..950bc9f9 100644 --- a/icd/api/llpc/translator/libSPIRV/SPIRVEntry.h +++ b/icd/api/llpc/translator/libSPIRV/SPIRVEntry.h @@ -730,19 +730,6 @@ T* bcast(SPIRVEntry *E) { // Each time a new class is implemented, remove the corresponding typedef. // This is also an indication of how much work is left. #define _SPIRV_OP(x, ...) typedef SPIRVEntryOpCodeOnly SPIRV##x; -_SPIRV_OP(ImageSparseSampleImplicitLod, 305) -_SPIRV_OP(ImageSparseSampleExplicitLod, 306) -_SPIRV_OP(ImageSparseSampleDrefImplicitLod, 307) -_SPIRV_OP(ImageSparseSampleDrefExplicitLod, 308) -_SPIRV_OP(ImageSparseSampleProjImplicitLod, 309) -_SPIRV_OP(ImageSparseSampleProjExplicitLod, 310) -_SPIRV_OP(ImageSparseSampleProjDrefImplicitLod, 311) -_SPIRV_OP(ImageSparseSampleProjDrefExplicitLod, 312) -_SPIRV_OP(ImageSparseFetch, 313) -_SPIRV_OP(ImageSparseGather, 314) -_SPIRV_OP(ImageSparseDrefGather, 315) -_SPIRV_OP(ImageSparseTexelsResident, 316) -_SPIRV_OP(ImageSparseRead, 320) _SPIRV_OP(TypeNamedBarrier) _SPIRV_OP(NamedBarrierInitialize) _SPIRV_OP(MemoryNamedBarrier) @@ -756,41 +743,6 @@ _SPIRV_OP(NamedBarrierInitialize) _SPIRV_OP(MemoryNamedBarrier) _SPIRV_OP(ExecutionModeId) _SPIRV_OP(DecorateId) -#ifdef ICD_VULKAN_1_1 -_SPIRV_OP(GroupNonUniformAll) -_SPIRV_OP(GroupNonUniformAny) -_SPIRV_OP(GroupNonUniformAllEqual) -_SPIRV_OP(GroupNonUniformBroadcast) -_SPIRV_OP(GroupNonUniformBroadcastFirst) -_SPIRV_OP(GroupNonUniformBallot) -_SPIRV_OP(GroupNonUniformInverseBallot) -_SPIRV_OP(GroupNonUniformBallotBitExtract) -_SPIRV_OP(GroupNonUniformBallotBitCount) -_SPIRV_OP(GroupNonUniformBallotFindLSB) -_SPIRV_OP(GroupNonUniformBallotFindMSB) -_SPIRV_OP(GroupNonUniformShuffle) -_SPIRV_OP(GroupNonUniformShuffleXor) -_SPIRV_OP(GroupNonUniformShuffleUp) -_SPIRV_OP(GroupNonUniformShuffleDown) -_SPIRV_OP(GroupNonUniformIAdd) -_SPIRV_OP(GroupNonUniformFAdd) -_SPIRV_OP(GroupNonUniformIMul) -_SPIRV_OP(GroupNonUniformFMul) -_SPIRV_OP(GroupNonUniformSMin) -_SPIRV_OP(GroupNonUniformUMin) -_SPIRV_OP(GroupNonUniformFMin) -_SPIRV_OP(GroupNonUniformSMax) -_SPIRV_OP(GroupNonUniformUMax) -_SPIRV_OP(GroupNonUniformFMax) -_SPIRV_OP(GroupNonUniformBitwiseAnd) -_SPIRV_OP(GroupNonUniformBitwiseOr) -_SPIRV_OP(GroupNonUniformBitwiseXor) -_SPIRV_OP(GroupNonUniformLogicalAnd) -_SPIRV_OP(GroupNonUniformLogicalOr) -_SPIRV_OP(GroupNonUniformLogicalXor) -_SPIRV_OP(GroupNonUniformQuadBroadcast) -_SPIRV_OP(GroupNonUniformQuadSwap) -#endif _SPIRV_OP(GroupIAddNonUniformAMD) _SPIRV_OP(GroupFAddNonUniformAMD) _SPIRV_OP(GroupFMinNonUniformAMD) diff --git a/icd/api/llpc/translator/libSPIRV/SPIRVEnum.h b/icd/api/llpc/translator/libSPIRV/SPIRVEnum.h index 89e586af..fdbcc245 100644 
--- a/icd/api/llpc/translator/libSPIRV/SPIRVEnum.h +++ b/icd/api/llpc/translator/libSPIRV/SPIRVEnum.h @@ -190,6 +190,15 @@ SPIRVMap::init() { ADD_VEC_INIT(CapabilityStencilExportEXT, { CapabilityShader }); ADD_VEC_INIT(CapabilityShaderViewportIndexLayerEXT, { CapabilityMultiViewport }); ADD_VEC_INIT(CapabilityUniformAndStorageBuffer16BitAccess, { CapabilityStorageBuffer16BitAccess }); +#ifdef ICD_VULKAN_1_1 + ADD_VEC_INIT(CapabilityGroupNonUniformVote, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformArithmetic, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformBallot, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformShuffle, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformShuffleRelative, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformClustered, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformQuad, { CapabilityGroupNonUniform }); +#endif } template<> inline void diff --git a/icd/api/llpc/translator/libSPIRV/SPIRVInstruction.h b/icd/api/llpc/translator/libSPIRV/SPIRVInstruction.h index 010debfc..7ade919d 100644 --- a/icd/api/llpc/translator/libSPIRV/SPIRVInstruction.h +++ b/icd/api/llpc/translator/libSPIRV/SPIRVInstruction.h @@ -553,6 +553,28 @@ class SPIRVImageTexelPointer : public SPIRVInstruction { SPIRVId Sample; }; +class SPIRVImageSparseTexelsResident : public SPIRVInstruction { +public: + const static Op OC = OpImageSparseTexelsResident; + + // Incomplete constructor + SPIRVImageSparseTexelsResident() :SPIRVInstruction(OC), ResidentCode(SPIRVID_INVALID) + {} + + SPIRVValue *getResidentCode() { return getValue(ResidentCode); } +protected: + _SPIRV_DEF_ENCDEC3(Type, Id, ResidentCode) + + void validate()const { + assert(Type->isTypeBool() && Type->isTypeScalar()); + + auto ResidentCodeTy = getValueType(ResidentCode); + assert(ResidentCodeTy->isTypeInt() && ResidentCodeTy->isTypeScalar()); + } + + SPIRVId ResidentCode; +}; + class SPIRVStore:public SPIRVInstruction, public SPIRVMemoryAccess { public: const static SPIRVWord FixedWords = 3; @@ -2212,6 +2234,39 @@ _SPIRV_OP(GroupCommitReadPipe, false, 6) _SPIRV_OP(GroupCommitWritePipe, false, 6) #ifdef ICD_VULKAN_1_1 _SPIRV_OP(GroupNonUniformElect, true, 4) +_SPIRV_OP(GroupNonUniformAll, true, 5) +_SPIRV_OP(GroupNonUniformAny, true, 5) +_SPIRV_OP(GroupNonUniformAllEqual, true, 5) +_SPIRV_OP(GroupNonUniformBroadcast, true, 6) +_SPIRV_OP(GroupNonUniformBroadcastFirst, true, 5) +_SPIRV_OP(GroupNonUniformBallot, true, 5) +_SPIRV_OP(GroupNonUniformInverseBallot, true, 5) +_SPIRV_OP(GroupNonUniformBallotBitExtract, true, 6) +_SPIRV_OP(GroupNonUniformBallotBitCount, true, 6, false, 1) +_SPIRV_OP(GroupNonUniformBallotFindLSB, true, 5) +_SPIRV_OP(GroupNonUniformBallotFindMSB, true, 5) +_SPIRV_OP(GroupNonUniformShuffle, true, 6) +_SPIRV_OP(GroupNonUniformShuffleXor, true, 6) +_SPIRV_OP(GroupNonUniformShuffleUp, true, 6) +_SPIRV_OP(GroupNonUniformShuffleDown, true, 6) +_SPIRV_OP(GroupNonUniformIAdd, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformFAdd, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformIMul, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformFMul, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformSMin, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformUMin, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformFMin, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformSMax, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformUMax, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformFMax, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformBitwiseAnd, true, 6, true, 
1) +_SPIRV_OP(GroupNonUniformBitwiseOr, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformBitwiseXor, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformLogicalAnd, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformLogicalOr, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformLogicalXor, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformQuadBroadcast, true, 6) +_SPIRV_OP(GroupNonUniformQuadSwap, true, 6) #endif #undef _SPIRV_OP @@ -2280,6 +2335,18 @@ _SPIRV_OP(ImageQuerySize, true, 4) _SPIRV_OP(ImageQueryLod, true, 5) _SPIRV_OP(ImageQueryLevels, true, 4) _SPIRV_OP(ImageQuerySamples, true, 4) +_SPIRV_OP(ImageSparseSampleImplicitLod, true, 5, true) +_SPIRV_OP(ImageSparseSampleExplicitLod, true, 7, true, 2) +_SPIRV_OP(ImageSparseSampleDrefImplicitLod, true, 6, true, 3) +_SPIRV_OP(ImageSparseSampleDrefExplicitLod, true, 7, true, 3) +_SPIRV_OP(ImageSparseSampleProjImplicitLod, true, 5, true, 2) +_SPIRV_OP(ImageSparseSampleProjExplicitLod, true, 7, true, 2) +_SPIRV_OP(ImageSparseSampleProjDrefImplicitLod, true, 6, true, 3) +_SPIRV_OP(ImageSparseSampleProjDrefExplicitLod, true, 7, true, 3) +_SPIRV_OP(ImageSparseFetch, true, 4, true, 2) +_SPIRV_OP(ImageSparseGather, true, 6, true, 3) +_SPIRV_OP(ImageSparseDrefGather, true, 6, true, 3) +_SPIRV_OP(ImageSparseRead, true, 5, true, 2) #undef _SPIRV_OP // SpecConstantOp instruction diff --git a/icd/api/llpc/util/llpcDebug.cpp b/icd/api/llpc/util/llpcDebug.cpp index 90f20810..87668521 100644 --- a/icd/api/llpc/util/llpcDebug.cpp +++ b/icd/api/llpc/util/llpcDebug.cpp @@ -41,9 +41,7 @@ #include "llpcDebug.h" #include "llpcElf.h" #include "llpcGfx6Chip.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcGfx9Chip.h" -#endif #include "llpcInternal.h" #include "llpcMetroHash.h" diff --git a/icd/api/llpc/util/llpcInternal.h b/icd/api/llpc/util/llpcInternal.h index 0a2a197d..facf932a 100644 --- a/icd/api/llpc/util/llpcInternal.h +++ b/icd/api/llpc/util/llpcInternal.h @@ -99,7 +99,7 @@ namespace LlpcName const static char DescriptorLoadSpillTable[] = "llpc.descriptor.load.spilltable"; const static char DescriptorLoadGsVsRingBuffer[] = "llpc.descriptor.load.gsvsringbuffer"; - const static char ImageCallPrefix[] = "llpc.image."; + const static char ImageCallPrefix[] = "llpc.image"; const static char GlobalProxyPrefix[] = "__llpc_global_proxy_"; const static char InputProxyPrefix[] = "__llpc_input_proxy_"; diff --git a/icd/api/llpc/util/llpcPipelineDumper.cpp b/icd/api/llpc/util/llpcPipelineDumper.cpp index 996ff94b..b8d503b5 100644 --- a/icd/api/llpc/util/llpcPipelineDumper.cpp +++ b/icd/api/llpc/util/llpcPipelineDumper.cpp @@ -136,7 +136,7 @@ void VKAPI_CALL IPipelineDumper::DumpPipelineBinary( // ===================================================================================================================== // Calculates graphics pipeline hash code. -uint64_t VKAPI_CALL IPipelineDumper::GetGraphicsPipelineHash( +uint64_t VKAPI_CALL IPipelineDumper::GetPipelineHash( const GraphicsPipelineBuildInfo* pPipelineInfo) // [in] Info to build this graphics pipeline { return PipelineDumper::GetGraphicsPipelineHash(pPipelineInfo); @@ -144,7 +144,7 @@ uint64_t VKAPI_CALL IPipelineDumper::GetGraphicsPipelineHash( // ===================================================================================================================== // Calculates compute pipeline hash code. 
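// Aside (illustrative, not part of this patch): with both renames the graphics and compute hash
// queries become a single overloaded GetPipelineHash, selected by the build-info pointer type, e.g.
//     uint64_t gfxHash  = Llpc::IPipelineDumper::GetPipelineHash(&gfxPipelineBuildInfo);
//     uint64_t compHash = Llpc::IPipelineDumper::GetPipelineHash(&computePipelineBuildInfo);
// where gfxPipelineBuildInfo and computePipelineBuildInfo are hypothetical
// Llpc::GraphicsPipelineBuildInfo and Llpc::ComputePipelineBuildInfo instances.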
-uint64_t VKAPI_CALL IPipelineDumper::GetComputePipelineHash( +uint64_t VKAPI_CALL IPipelineDumper::GetPipelineHash( const ComputePipelineBuildInfo* pPipelineInfo) // [in] Info to build this compute pipeline { return PipelineDumper::GetComputePipelineHash(pPipelineInfo); @@ -655,7 +655,7 @@ MetroHash::Hash PipelineDumper::GenerateHashForComputePipeline( MetroHash64 hasher; UpdateHashForPipelineShaderInfo(ShaderStageCompute, &pPipeline->cs, &hasher); - + hasher.Update(pPipeline->deviceIndex); MetroHash::Hash hash = {}; hasher.Finalize(hash.bytes); @@ -924,11 +924,7 @@ OStream& operator<<( } else { -#ifdef LLPC_BUILD_GFX9 pRegName = Gfx9::GetRegisterNameString(gfxIp, pConfig[i].key * 4); -#else - pRegName = "UNKNOWN"; -#endif } auto length = snprintf(formatBuf, sizeof(formatBuf), @@ -981,11 +977,7 @@ OStream& operator<<( } else { -#ifdef LLPC_BUILD_GFX9 pRegName = Gfx9::GetRegisterNameString(gfxIp, pConfig[2 * i]); -#else - pRegName = "UNKNOWN"; -#endif } auto length = snprintf(formatBuf, sizeof(formatBuf), " %-45s = 0x%08X\n", pRegName, pConfig[2 * i + 1]); out << formatBuf; diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp new file mode 100644 index 00000000..7293b0b8 --- /dev/null +++ b/icd/api/pipeline_compiler.cpp @@ -0,0 +1,766 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file pipeline_compiler.cpp + * @brief Contains implementation of Vulkan pipeline compiler + *********************************************************************************************************************** + */ + +#include "include/pipeline_compiler.h" +#include "include/vk_device.h" +#include "include/vk_shader.h" +#include "include/vk_pipeline_cache.h" +#include "include/vk_pipeline_layout.h" +#include "include/vk_render_pass.h" + +namespace vk +{ + +extern bool IsSrcAlphaUsedInBlend(VkBlendFactor blend); + +// ===================================================================================================================== +PipelineCompiler::PipelineCompiler(PhysicalDevice* pPhysicalDevice) + : + m_pPhysicalDevice(pPhysicalDevice) + , m_pLlpc(nullptr) +{ + +} + +// ===================================================================================================================== +PipelineCompiler::~PipelineCompiler() +{ + VK_ASSERT(m_pLlpc == nullptr); +} + +// ===================================================================================================================== +// Initializes pipeline compiler. +VkResult PipelineCompiler::Initialize() +{ + Pal::IDevice* pPalDevice = m_pPhysicalDevice->PalDevice(); + + // Initialzie GfxIp informations per PAL device properties + Pal::DeviceProperties info; + pPalDevice->GetProperties(&info); + m_gfxIpLevel = info.gfxLevel; + + switch (info.gfxLevel) + { + case Pal::GfxIpLevel::GfxIp6: + m_gfxIp.major = 6; + m_gfxIp.minor = 0; + break; + case Pal::GfxIpLevel::GfxIp7: + m_gfxIp.major = 7; + m_gfxIp.minor = 0; + break; + case Pal::GfxIpLevel::GfxIp8: + m_gfxIp.major = 8; + m_gfxIp.minor = 0; + break; + case Pal::GfxIpLevel::GfxIp8_1: + m_gfxIp.major = 8; + m_gfxIp.minor = 1; + break; + case Pal::GfxIpLevel::GfxIp9: + m_gfxIp.major = 9; + m_gfxIp.minor = 0; + break; + default: + VK_NEVER_CALLED(); + break; + } + + m_gfxIp.stepping = info.gfxStepping; + + // Create compiler objects + VkResult result = VK_SUCCESS; + result = CreateLlpcCompiler(); + + return result; +} + +// ===================================================================================================================== +// Destroies all compiler instance. +void PipelineCompiler::Destroy() +{ + if (m_pLlpc) + { + m_pLlpc->Destroy(); + m_pLlpc = nullptr; + } + +} + +// ===================================================================================================================== +// Creates LLPC compiler instance. 
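// Illustrative usage of the lifecycle above (a minimal sketch, not code from this patch;
// error handling elided):
//     PipelineCompiler compiler(pPhysicalDevice);
//     if (compiler.Initialize() == VK_SUCCESS)
//     {
//         // ... CreateGraphicsPipelineBinary / CreateComputePipelineBinary calls ...
//         compiler.Destroy();
//     }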
+VkResult PipelineCompiler::CreateLlpcCompiler() +{ + const uint32_t OptionBufferSize = 4096; + const uint32_t MaxLlpcOptions = 32; + Llpc::ICompiler* pCompiler = nullptr; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); +#ifdef ICD_BUILD_APPPROFILE + AppProfile appProfile = m_pPhysicalDevice->GetAppProfile(); +#endif + // Get the executable name and path + char executableNameBuffer[PATH_MAX]; + char* pExecutablePtr; + Pal::Result palResult = Util::GetExecutableName(&executableNameBuffer[0], + &pExecutablePtr, + sizeof(executableNameBuffer)); + VK_ASSERT(palResult == Pal::Result::Success); + + // Initialize LLPC options according to runtime settings + const char* llpcOptions[MaxLlpcOptions] = {}; + char optionBuffers[OptionBufferSize] = {}; + + char* pOptionBuffer = &optionBuffers[0]; + size_t bufSize = OptionBufferSize; + int optionLength = 0; + uint32_t numOptions = 0; + // Identify for Icd and stanalone compiler + llpcOptions[numOptions++] = Llpc::VkIcdName; + + // LLPC log options + llpcOptions[numOptions++] = (settings.enableLog & 1) ? "-enable-errs=1" : "-enable-errs=0"; + llpcOptions[numOptions++] = (settings.enableLog & 2) ? "-enable-outs=1" : "-enable-outs=0"; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-log-file-outs=%s", settings.logFileName); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-log-file-dbgs=%s", settings.debugLogFileName); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + // Generate ELF binary, not assembly text + llpcOptions[numOptions++] = "-filetype=obj"; + + // LLPC debug options + if (settings.enableDebug) + { + llpcOptions[numOptions++] = "-debug"; + } + + if (settings.llpcOptions[0] != '\0') + { + const char* pOptions = &settings.llpcOptions[0]; + VK_ASSERT(pOptions[0] == '-'); + + // Split options + while (pOptions) + { + const char* pNext = strchr(pOptions, ' '); + if (pNext) + { + // Copy options to option buffer + optionLength = static_cast(pNext - pOptions); + memcpy(pOptionBuffer, pOptions, optionLength); + pOptionBuffer[optionLength] = 0; + + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += (optionLength + 1); + + bufSize -= (optionLength + 1); + pOptions = strchr(pOptions + optionLength, '-'); + } + else + { + // Use pOptions directly for last option + llpcOptions[numOptions++] = pOptions; + pOptions = nullptr; + } + } + } + + // LLPC pipeline dump options + if (settings.enablePipelineDump) + { + llpcOptions[numOptions++] = "-enable-pipeline-dump"; + } + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-pipeline-dump-dir=%s", settings.pipelineDumpDir); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + if (settings.enableLlpc == LlpcModeAutoFallback) + { + llpcOptions[numOptions++] = "-disable-WIP-features=1"; + } + + // NOTE: For testing consistency, these options should be kept the same as those of + // "amdllpc" (Init()). 
+ llpcOptions[numOptions++] = "-pragma-unroll-threshold=4096"; + llpcOptions[numOptions++] = "-unroll-allow-partial"; + llpcOptions[numOptions++] = "-lower-dyn-index"; + llpcOptions[numOptions++] = "-simplifycfg-sink-common=false"; + llpcOptions[numOptions++] = "-amdgpu-vgpr-index-mode"; // force VGPR indexing on GFX8 + + ShaderCacheMode shaderCacheMode = settings.shaderCacheMode; +#ifdef ICD_BUILD_APPPROFILE + if ((appProfile == AppProfile::Talos) || + (appProfile == AppProfile::MadMax) || + (appProfile == AppProfile::SeriousSamFusion)) + { + llpcOptions[numOptions++] = "-enable-si-scheduler"; + } + + // Force enable cache to disk to improve user experience + if ((shaderCacheMode == ShaderCacheEnableRuntimeOnly) && + ((appProfile == AppProfile::MadMax) || + (appProfile == AppProfile::SeriousSamFusion) || + (appProfile == AppProfile::F1_2017))) + { + // Force to use internal disk cache. + shaderCacheMode = ShaderCacheForceInternalCacheOnDisk; + } +#endif + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-executable-name=%s", pExecutablePtr); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-cache-mode=%d", shaderCacheMode); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + if (settings.shaderReplaceMode != 0) + { + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-mode=%d", settings.shaderReplaceMode); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-dir=%s", settings.shaderReplaceDir); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-pipeline-hashes=%s", settings.shaderReplacePipelineHashes); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + } + + VK_ASSERT(numOptions <= MaxLlpcOptions); + + // Create LLPC compiler + Llpc::Result llpcResult = Llpc::ICompiler::Create(m_gfxIp, numOptions, llpcOptions, &pCompiler); + VK_ASSERT(llpcResult == Llpc::Result::Success); + + m_pLlpc = pCompiler; + + return (llpcResult == Llpc::Result::Success) ? VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED; +} + +// ===================================================================================================================== +// Creates graphics pipeline binary. 
+VkResult PipelineCompiler::CreateGraphicsPipelineBinary( + Device* pDevice, + uint32_t deviceIdx, + PipelineCache* pPipelineCache, + GraphicsPipelineCreateInfo* pCreateInfo, + size_t* pPipelineBinarySize, + const void** ppPipelineBinary) +{ + VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + // Build the LLPC pipeline + Llpc::GraphicsPipelineBuildOut pipelineOut = {}; + void* pLlpcPipelineBuffer = nullptr; + + { + // Fill pipeline create info for LLPC + auto pPipelineBuildInfo = &pCreateInfo->pipelineInfo; + pPipelineBuildInfo->pInstance = pInstance; + pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; + pPipelineBuildInfo->pUserData = &pLlpcPipelineBuffer; + pPipelineBuildInfo->iaState.deviceIndex = deviceIdx; + + if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCacheType() == PipelineCacheTypeLlpc)) + { + pPipelineBuildInfo->pShaderCache = pPipelineCache->GetShaderCache(deviceIdx).pLlpcShaderCache; + } + + auto llpcResult = m_pLlpc->BuildGraphicsPipeline(pPipelineBuildInfo, &pipelineOut); + if (llpcResult != Llpc::Result::Success) + { + // There shouldn't be anything to free for the failure case + VK_ASSERT(pLlpcPipelineBuffer == nullptr); + + { + result = VK_ERROR_INITIALIZATION_FAILED; + } + } + else + { + *ppPipelineBinary = pipelineOut.pipelineBin.pCode; + *pPipelineBinarySize = pipelineOut.pipelineBin.codeSize; + } + } + + return result; +} + +// ===================================================================================================================== +// Creates compute pipeline binary. +VkResult PipelineCompiler::CreateComputePipelineBinary( + Device* pDevice, + uint32_t deviceIdx, + PipelineCache* pPipelineCache, + ComputePipelineCreateInfo* pCreateInfo, + size_t* pPipelineBinarySize, + const void** ppPipelineBinary) +{ + VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + const ShaderModule* pShader = ShaderModule::ObjectFromHandle(pCreateInfo->pStage->module); + + // Build the LLPC pipeline + Llpc::ComputePipelineBuildOut pipelineOut = {}; + void* pLlpcPipelineBuffer = nullptr; + + { + // Fill pipeline create info for LLPC + Llpc::ComputePipelineBuildInfo* pPipelineBuildInfo = &pCreateInfo->pipelineInfo; + + pPipelineBuildInfo->pInstance = pInstance; + pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; + pPipelineBuildInfo->pUserData = &pLlpcPipelineBuffer; + pPipelineBuildInfo->deviceIndex = deviceIdx; + + if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCacheType() == PipelineCacheTypeLlpc)) + { + pPipelineBuildInfo->pShaderCache = pPipelineCache->GetShaderCache(deviceIdx).pLlpcShaderCache; + } + + // Build pipline binary + auto llpcResult = m_pLlpc->BuildComputePipeline(pPipelineBuildInfo, &pipelineOut); + if (llpcResult != Llpc::Result::Success) + { + // There shouldn't be anything to free for the failure case + VK_ASSERT(pLlpcPipelineBuffer == nullptr); + + { + result = VK_ERROR_INITIALIZATION_FAILED; + } + } + else + { + *ppPipelineBinary = pipelineOut.pipelineBin.pCode; + *pPipelineBinarySize = pipelineOut.pipelineBin.codeSize; + } + VK_ASSERT(*ppPipelineBinary == pLlpcPipelineBuffer); + } + + return VK_SUCCESS; +} + +// ===================================================================================================================== +// Converts Vulkan graphics pipeline 
parameters to an internal structure +VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( + Device* pDevice, + const VkGraphicsPipelineCreateInfo* pIn, + GraphicsPipelineCreateInfo* pCreateInfo, + VbBindingInfo* pVbInfo) +{ + VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + EXTRACT_VK_STRUCTURES_0( + gfxPipeline, + GraphicsPipelineCreateInfo, + pIn, + GRAPHICS_PIPELINE_CREATE_INFO) + + // Fill in necessary non-zero defaults in case some information is missing + const RenderPass* pRenderPass = nullptr; + + if (pGraphicsPipelineCreateInfo != nullptr) + { + for (uint32_t i = 0; i < pGraphicsPipelineCreateInfo->stageCount; ++i) + { + ShaderStage stage = ShaderFlagBitToStage(pGraphicsPipelineCreateInfo->pStages[i].stage); + VK_ASSERT(stage < ShaderGfxStageCount); + pCreateInfo->pStages[stage] = &pGraphicsPipelineCreateInfo->pStages[i]; + } + + VK_IGNORE(pGraphicsPipelineCreateInfo->flags & VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT); + + pRenderPass = RenderPass::ObjectFromHandle(pGraphicsPipelineCreateInfo->renderPass); + + if (pGraphicsPipelineCreateInfo->layout != VK_NULL_HANDLE) + { + pCreateInfo->pLayout = PipelineLayout::ObjectFromHandle(pGraphicsPipelineCreateInfo->layout); + } + + pCreateInfo->pipelineInfo.pVertexInput = pGraphicsPipelineCreateInfo->pVertexInputState; + + const VkPipelineInputAssemblyStateCreateInfo* pIa = pGraphicsPipelineCreateInfo->pInputAssemblyState; + // According to the spec this should never be null + VK_ASSERT(pIa != nullptr); + + pCreateInfo->pipelineInfo.iaState.enableMultiView = pRenderPass->IsMultiviewEnabled(); + pCreateInfo->pipelineInfo.iaState.topology = pIa->topology; + pCreateInfo->pipelineInfo.iaState.disableVertexReuse = false; + + EXTRACT_VK_STRUCTURES_1( + Tess, + PipelineTessellationStateCreateInfo, + PipelineTessellationDomainOriginStateCreateInfoKHR, + pGraphicsPipelineCreateInfo->pTessellationState, + PIPELINE_TESSELLATION_STATE_CREATE_INFO, + PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO_KHR) + + if (pPipelineTessellationStateCreateInfo != nullptr) + { + pCreateInfo->pipelineInfo.iaState.patchControlPoints = pPipelineTessellationStateCreateInfo->patchControlPoints; + } + + if (pPipelineTessellationDomainOriginStateCreateInfoKHR) + { + // Vulkan 1.0 incorrectly specified the tessellation u,v coordinate origin as lower left even though + // framebuffer and image coordinate origins are in the upper left. This has since been fixed, but + // an extension exists to use the previous behavior. Doing so with flat shading would likely appear + // incorrect, but Vulkan specifies that the provoking vertex is undefined when tessellation is active. 
+ if (pPipelineTessellationDomainOriginStateCreateInfoKHR->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT_KHR) + { + pCreateInfo->pipelineInfo.iaState.switchWinding = true; + } + } + + const VkPipelineRasterizationStateCreateInfo* pRs = pGraphicsPipelineCreateInfo->pRasterizationState; + // By default rasterization is disabled, unless rasterization creation info is present + pCreateInfo->pipelineInfo.rsState.rasterizerDiscardEnable = true; + if (pRs != nullptr) + { + pCreateInfo->pipelineInfo.vpState.depthClipEnable = (pRs->depthClampEnable == VK_FALSE); + pCreateInfo->pipelineInfo.rsState.rasterizerDiscardEnable = (pRs->rasterizerDiscardEnable != VK_FALSE); + } + + bool multisampleEnable = false; + uint32_t rasterizationSampleCount = 0; + const VkPipelineMultisampleStateCreateInfo* pMs = pGraphicsPipelineCreateInfo->pMultisampleState; + + pCreateInfo->pipelineInfo.rsState.numSamples = 1; + if (pMs != nullptr) + { + multisampleEnable = (pMs->rasterizationSamples != 1); + + if (multisampleEnable) + { + VK_ASSERT(pRenderPass != nullptr); + + rasterizationSampleCount = pMs->rasterizationSamples; + uint32_t subpassCoverageSampleCount = pRenderPass->GetSubpassMaxSampleCount(pGraphicsPipelineCreateInfo->subpass); + uint32_t subpassColorSampleCount = pRenderPass->GetSubpassColorSampleCount(pGraphicsPipelineCreateInfo->subpass); + + // subpassCoverageSampleCount would be equal to zero if there are zero attachments. + subpassCoverageSampleCount = subpassCoverageSampleCount == 0 ? rasterizationSampleCount : subpassCoverageSampleCount; + + subpassColorSampleCount = subpassColorSampleCount == 0 ? subpassCoverageSampleCount : subpassColorSampleCount; + + if (pMs->sampleShadingEnable && (pMs->minSampleShading > 0.0f)) + { + pCreateInfo->pipelineInfo.rsState.perSampleShading =((subpassColorSampleCount * pMs->minSampleShading) > 1.0f); + } + else + { + pCreateInfo->pipelineInfo.rsState.perSampleShading = false; + } + + pCreateInfo->pipelineInfo.rsState.numSamples = rasterizationSampleCount; + + // NOTE: The sample pattern index here is actually the offset of sample position pair. This is + // different from the field of creation info of image view. For image view, the sample pattern + // index is really table index of the sample pattern. + pCreateInfo->pipelineInfo.rsState.samplePatternIdx = + Device::GetDefaultSamplePatternIndex(subpassCoverageSampleCount) * Pal::MaxMsaaRasterizerSamples; + } + pCreateInfo->pipelineInfo.cbState.alphaToCoverageEnable = (pMs->alphaToCoverageEnable == VK_TRUE); + } + + const VkPipelineColorBlendStateCreateInfo* pCb = pGraphicsPipelineCreateInfo->pColorBlendState; + bool dualSourceBlend = false; + + if (pCb != nullptr) + { + const uint32_t numColorTargets = Util::Min(pCb->attachmentCount, Pal::MaxColorTargets); + + for (uint32_t i = 0; i < numColorTargets; ++i) + { + const VkPipelineColorBlendAttachmentState& src = pCb->pAttachments[i]; + auto pLlpcCbDst = &pCreateInfo->pipelineInfo.cbState.target[i]; + if (pRenderPass) + { + auto cbFormat = pRenderPass->GetColorAttachmentFormat(pGraphicsPipelineCreateInfo->subpass, i); + // If the sub pass attachment format is UNDEFINED, then it means that that subpass does not + // want to write to any attachment for that output (VK_ATTACHMENT_UNUSED). Under such cases, + // disable shader writes through that target. 
+                    if (cbFormat != VK_FORMAT_UNDEFINED)
+                    {
+                        pLlpcCbDst->format               = cbFormat;
+                        pLlpcCbDst->blendEnable          = (src.blendEnable == VK_TRUE);
+                        pLlpcCbDst->blendSrcAlphaToColor = IsSrcAlphaUsedInBlend(src.srcAlphaBlendFactor) ||
+                                                           IsSrcAlphaUsedInBlend(src.dstAlphaBlendFactor) ||
+                                                           IsSrcAlphaUsedInBlend(src.srcColorBlendFactor) ||
+                                                           IsSrcAlphaUsedInBlend(src.dstColorBlendFactor);
+                        pLlpcCbDst->channelWriteMask     = src.colorWriteMask;
+                    }
+                }
+
+                dualSourceBlend |= IsDualSourceBlend(src.srcAlphaBlendFactor);
+                dualSourceBlend |= IsDualSourceBlend(src.dstAlphaBlendFactor);
+                dualSourceBlend |= IsDualSourceBlend(src.srcColorBlendFactor);
+                dualSourceBlend |= IsDualSourceBlend(src.dstColorBlendFactor);
+            }
+        }
+        pCreateInfo->pipelineInfo.cbState.dualSourceBlendEnable = dualSourceBlend;
+
+        VkFormat dbFormat = { };
+        if (pRenderPass != nullptr)
+        {
+            dbFormat = pRenderPass->GetDepthStencilAttachmentFormat(pGraphicsPipelineCreateInfo->subpass);
+            pCreateInfo->dbFormat = dbFormat;
+        }
+    }
+
+    // Allocate space to create the LLPC/SCPC pipeline resource mappings
+    if (pCreateInfo->pLayout != nullptr)
+    {
+        size_t tempBufferSize = pCreateInfo->pLayout->GetPipelineInfo()->tempBufferSize;
+
+        // Allocate the temp buffer
+        if (tempBufferSize > 0)
+        {
+            pCreateInfo->pMappingBuffer = pInstance->AllocMem(
+                tempBufferSize,
+                VK_DEFAULT_MEM_ALIGN,
+                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+
+            if (pCreateInfo->pMappingBuffer == nullptr)
+            {
+                result = VK_ERROR_OUT_OF_HOST_MEMORY;
+            }
+        }
+    }
+
+    // Build the LLPC pipeline
+    Llpc::PipelineShaderInfo* shaderInfos[] =
+    {
+        &pCreateInfo->pipelineInfo.vs,
+        &pCreateInfo->pipelineInfo.tcs,
+        &pCreateInfo->pipelineInfo.tes,
+        &pCreateInfo->pipelineInfo.gs,
+        &pCreateInfo->pipelineInfo.fs
+    };
+
+    // Apply patches
+    pCreateInfo->pipelineInfo.pInstance      = pInstance;
+    pCreateInfo->pipelineInfo.pfnOutputAlloc = AllocateShaderOutput;
+
+    for (uint32_t stage = 0; stage < ShaderGfxStageCount; ++stage)
+    {
+        auto pStage = pCreateInfo->pStages[stage];
+        if (pStage == nullptr) continue;
+        auto pScpcShader = ShaderModule::ObjectFromHandle(pStage->module);
+        auto pShaderInfo = shaderInfos[stage];
+
+        pShaderInfo->pModuleData        = pScpcShader->GetShaderData(true);
+        pShaderInfo->pSpecializatonInfo = pStage->pSpecializationInfo;
+        pShaderInfo->pEntryTarget       = pStage->pName;
+
+        // Build the resource mapping description for LLPC. This data contains things about how shader
+        // inputs like descriptor set bindings are communicated to this pipeline in a form that LLPC can
+        // understand.
+        if (pCreateInfo->pLayout != nullptr)
+        {
+            const bool vertexShader = (stage == ShaderStageVertex);
+            result = pCreateInfo->pLayout->BuildLlpcPipelineMapping(
+                static_cast<ShaderStage>(stage),
+                pCreateInfo->pMappingBuffer,
+                vertexShader ? pCreateInfo->pipelineInfo.pVertexInput : nullptr,
+                pShaderInfo,
+                vertexShader ? pVbInfo : nullptr);
+        }
+    }
+
+    return result;
+}
+
+// =====================================================================================================================
+// Checks whether dual source blend is needed.
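// For context: the SRC1 blend factors matched below consume the fragment shader's second color
// output for attachment 0 (layout(location = 0, index = 1) in GLSL), which is why the pipeline
// is flagged up front through cbState.dualSourceBlendEnable above.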
+bool PipelineCompiler::IsDualSourceBlend( + VkBlendFactor blend) +{ + bool result = false; + switch(blend) + { + case VK_BLEND_FACTOR_SRC1_COLOR: + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: + case VK_BLEND_FACTOR_SRC1_ALPHA: + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: + result = true; + break; + default: + result = false; + break; + } + return result; +} + +// ===================================================================================================================== +// Converts Vulkan compute pipeline parameters to an internal structure +VkResult PipelineCompiler::ConvertComputePipelineInfo( + const VkComputePipelineCreateInfo* pIn, + ComputePipelineCreateInfo* pCreateInfo) +{ + VkResult result = VK_SUCCESS; + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + VK_ASSERT(pIn->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO); + + if (pIn->layout != VK_NULL_HANDLE) + { + pCreateInfo->pLayout = PipelineLayout::ObjectFromHandle(pIn->layout); + } + + pCreateInfo->flags = pIn->flags; + pCreateInfo->pStage = &pIn->stage; + // Allocate space to create the LLPC/SCPC pipeline resource mappings + if (pCreateInfo->pLayout != nullptr) + { + size_t tempBufferSize = pCreateInfo->pLayout->GetPipelineInfo()->tempBufferSize; + + // Allocate the temp buffer + if (tempBufferSize > 0) + { + pCreateInfo->pMappingBuffer = pInstance->AllocMem( + tempBufferSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + if (pCreateInfo->pMappingBuffer == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + } + + const ShaderModule* pScpcShader = ShaderModule::ObjectFromHandle(pCreateInfo->pStage->module); + pCreateInfo->pipelineInfo.cs.pModuleData = pScpcShader->GetShaderData(true); + pCreateInfo->pipelineInfo.cs.pSpecializatonInfo = pCreateInfo->pStage->pSpecializationInfo; + pCreateInfo->pipelineInfo.cs.pEntryTarget = pCreateInfo->pStage->pName; + + // Build the resource mapping description for LLPC. This data contains things about how shader + // inputs like descriptor set bindings interact with this pipeline in a form that LLPC can + // understand. 
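// (Assumed detail for context: on the LLPC path this call is expected to fill the shader info's
//  pUserDataNodes/userDataNodeCount with Llpc::ResourceMappingNode entries, one per descriptor
//  set/binding used by the layout, so the exact node layout is owned by PipelineLayout rather
//  than by this compiler wrapper.)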
+ if (pCreateInfo->pLayout != nullptr) + { + result = pCreateInfo->pLayout->BuildLlpcPipelineMapping( + ShaderStageCompute, + pCreateInfo->pMappingBuffer, + nullptr, + &pCreateInfo->pipelineInfo.cs, + nullptr); + } + + return result; +} + +// ===================================================================================================================== +// Free compute pipeline binary +void PipelineCompiler::FreeComputePipelineBinary( + ComputePipelineCreateInfo* pCreateInfo, + const void* pPipelineBinary, + size_t binarySize) +{ + { + m_pPhysicalDevice->Manager()->VkInstance()->FreeMem(const_cast(pPipelineBinary)); + } +} + +// ===================================================================================================================== +// Free graphics pipeline binary +void PipelineCompiler::FreeGraphicsPipelineBinary( + GraphicsPipelineCreateInfo* pCreateInfo, + const void* pPipelineBinary, + size_t binarySize) +{ + { + m_pPhysicalDevice->Manager()->VkInstance()->FreeMem(const_cast(pPipelineBinary)); + } +} + +// ===================================================================================================================== +// Free the temp memories in compute pipeline create info +void PipelineCompiler::FreeComputePipelineCreateInfo( + ComputePipelineCreateInfo* pCreateInfo) +{ + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + if (pCreateInfo->pMappingBuffer != nullptr) + { + pInstance->FreeMem(pCreateInfo->pMappingBuffer); + pCreateInfo->pMappingBuffer = nullptr; + } +} + +// ===================================================================================================================== +// Free the temp memories in graphics pipeline create info +void PipelineCompiler::FreeGraphicsPipelineCreateInfo( + GraphicsPipelineCreateInfo* pCreateInfo) +{ + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + if (pCreateInfo->pMappingBuffer != nullptr) + { + pInstance->FreeMem(pCreateInfo->pMappingBuffer); + pCreateInfo->pMappingBuffer = nullptr; + } +} + +} + diff --git a/icd/api/strings/base_entry_points.txt b/icd/api/strings/base_entry_points.txt index e82f7c4a..b936eb16 100644 --- a/icd/api/strings/base_entry_points.txt +++ b/icd/api/strings/base_entry_points.txt @@ -1,298 +1,299 @@ ############################################################################### -# Entry point name type version/extension - -vkGetDeviceProcAddr @none -vkGetInstanceProcAddr @none - -vkCreateInstance @none -vkEnumerateInstanceExtensionProperties @none -vkEnumerateInstanceLayerProperties @none - -vkAllocateCommandBuffers @core 1.0 -vkAllocateDescriptorSets @core 1.0 -vkAllocateMemory @core 1.0 -vkBeginCommandBuffer @core 1.0 -vkBindBufferMemory @core 1.0 -vkBindImageMemory @core 1.0 -vkCmdBeginRenderPass @core 1.0 -vkCmdBeginQuery @core 1.0 -vkCmdBindDescriptorSets @core 1.0 -vkCmdBindIndexBuffer @core 1.0 -vkCmdBindPipeline @core 1.0 -vkCmdBindVertexBuffers @core 1.0 -vkCmdBlitImage @core 1.0 -vkCmdClearAttachments @core 1.0 -vkCmdClearColorImage @core 1.0 -vkCmdClearDepthStencilImage @core 1.0 -vkCmdCopyBuffer @core 1.0 -vkCmdCopyBufferToImage @core 1.0 -vkCmdCopyImage @core 1.0 -vkCmdCopyImageToBuffer @core 1.0 -vkCmdCopyQueryPoolResults @core 1.0 -vkCmdDraw @core 1.0 -vkCmdDrawIndexed @core 1.0 -vkCmdDrawIndexedIndirect @core 1.0 -vkCmdDrawIndirect @core 1.0 -vkCmdDispatch @core 1.0 -vkCmdDispatchIndirect @core 1.0 -vkCmdEndRenderPass @core 1.0 -vkCmdEndQuery @core 1.0 -vkCmdExecuteCommands @core 1.0 -vkCmdFillBuffer @core 1.0 -vkCmdNextSubpass @core 1.0 
-vkCmdPipelineBarrier @core 1.0 -vkCmdPushConstants @core 1.0 -vkCmdResetEvent @core 1.0 -vkCmdResetQueryPool @core 1.0 -vkCmdResolveImage @core 1.0 -vkCmdSetBlendConstants @core 1.0 -vkCmdSetDepthBias @core 1.0 -vkCmdSetDepthBounds @core 1.0 -vkCmdSetEvent @core 1.0 -vkCmdSetLineWidth @core 1.0 -vkCmdSetScissor @core 1.0 -vkCmdSetStencilCompareMask @core 1.0 -vkCmdSetStencilReference @core 1.0 -vkCmdSetStencilWriteMask @core 1.0 -vkCmdSetViewport @core 1.0 -vkCmdUpdateBuffer @core 1.0 -vkCmdWaitEvents @core 1.0 -vkCmdWriteTimestamp @core 1.0 -vkCreateBuffer @core 1.0 -vkCreateBufferView @core 1.0 -vkCreateCommandPool @core 1.0 -vkCreateComputePipelines @core 1.0 -vkCreateDescriptorPool @core 1.0 -vkCreateDescriptorSetLayout @core 1.0 -vkCreateDevice @core 1.0 -vkCreateEvent @core 1.0 -vkCreateFence @core 1.0 -vkCreateFramebuffer @core 1.0 -vkCreateGraphicsPipelines @core 1.0 -vkCreateImage @core 1.0 -vkCreateImageView @core 1.0 -vkCreatePipelineCache @core 1.0 -vkCreatePipelineLayout @core 1.0 -vkCreateQueryPool @core 1.0 -vkCreateRenderPass @core 1.0 -vkCreateSampler @core 1.0 -vkCreateSemaphore @core 1.0 -vkCreateShaderModule @core 1.0 -vkDestroyBuffer @core 1.0 -vkDestroyBufferView @core 1.0 -vkDestroyCommandPool @core 1.0 -vkDestroyDescriptorPool @core 1.0 -vkDestroyDescriptorSetLayout @core 1.0 -vkDestroyDevice @core 1.0 -vkDestroyEvent @core 1.0 -vkDestroyFence @core 1.0 -vkDestroyFramebuffer @core 1.0 -vkDestroyImage @core 1.0 -vkDestroyImageView @core 1.0 -vkDestroyInstance @core 1.0 -vkDestroyPipeline @core 1.0 -vkDestroyPipelineCache @core 1.0 -vkDestroyPipelineLayout @core 1.0 -vkDestroyQueryPool @core 1.0 -vkDestroyRenderPass @core 1.0 -vkDestroySampler @core 1.0 -vkDestroySemaphore @core 1.0 -vkDestroyShaderModule @core 1.0 -vkDeviceWaitIdle @core 1.0 -vkEndCommandBuffer @core 1.0 -vkEnumerateDeviceExtensionProperties @core 1.0 -vkEnumerateDeviceLayerProperties @core 1.0 -vkEnumeratePhysicalDevices @core 1.0 -vkFlushMappedMemoryRanges @core 1.0 -vkFreeCommandBuffers @core 1.0 -vkFreeDescriptorSets @core 1.0 -vkFreeMemory @core 1.0 -vkGetBufferMemoryRequirements @core 1.0 -vkGetDeviceMemoryCommitment @core 1.0 -vkGetDeviceQueue @core 1.0 -vkGetEventStatus @core 1.0 -vkGetFenceStatus @core 1.0 -vkGetImageMemoryRequirements @core 1.0 -vkGetImageSparseMemoryRequirements @core 1.0 -vkGetImageSubresourceLayout @core 1.0 -vkGetPhysicalDeviceFeatures @core 1.0 -vkGetPhysicalDeviceFormatProperties @core 1.0 -vkGetPhysicalDeviceImageFormatProperties @core 1.0 -vkGetPhysicalDeviceMemoryProperties @core 1.0 -vkGetPhysicalDeviceProperties @core 1.0 -vkGetPhysicalDeviceQueueFamilyProperties @core 1.0 -vkGetPhysicalDeviceSparseImageFormatProperties @core 1.0 -vkGetPipelineCacheData @core 1.0 -vkGetQueryPoolResults @core 1.0 -vkGetRenderAreaGranularity @core 1.0 -vkInvalidateMappedMemoryRanges @core 1.0 -vkMapMemory @core 1.0 -vkMergePipelineCaches @core 1.0 -vkQueueBindSparse @core 1.0 -vkQueueSubmit @core 1.0 -vkQueueWaitIdle @core 1.0 -vkResetCommandBuffer @core 1.0 -vkResetCommandPool @core 1.0 -vkResetDescriptorPool @core 1.0 -vkResetEvent @core 1.0 -vkResetFences @core 1.0 -vkSetEvent @core 1.0 -vkUnmapMemory @core 1.0 -vkUpdateDescriptorSets @core 1.0 -vkWaitForFences @core 1.0 - -vkEnumerateInstanceVersion @none 1.1 - -vkBindBufferMemory2 @core 1.1 -vkBindImageMemory2 @core 1.1 -vkCmdSetDeviceMask @core 1.1 -vkCmdDispatchBase @core 1.1 -vkCreateDescriptorUpdateTemplate @core 1.1 -vkCreateSamplerYcbcrConversion @core 1.1 -vkDestroyDescriptorUpdateTemplate @core 1.1 
-vkDestroySamplerYcbcrConversion @core 1.1 -vkEnumeratePhysicalDeviceGroups @core 1.1 -vkGetBufferMemoryRequirements2 @core 1.1 -vkGetDescriptorSetLayoutSupport @core 1.1 -vkGetDeviceGroupPeerMemoryFeatures @core 1.1 -vkGetDeviceQueue2 @core 1.1 -vkGetImageMemoryRequirements2 @core 1.1 -vkGetImageSparseMemoryRequirements2 @core 1.1 -vkGetPhysicalDeviceExternalBufferProperties @core 1.1 -vkGetPhysicalDeviceExternalFenceProperties @core 1.1 -vkGetPhysicalDeviceExternalSemaphoreProperties @core 1.1 -vkGetPhysicalDeviceFeatures2 @core 1.1 -vkGetPhysicalDeviceFormatProperties2 @core 1.1 -vkGetPhysicalDeviceImageFormatProperties2 @core 1.1 -vkGetPhysicalDeviceMemoryProperties2 @core 1.1 -vkGetPhysicalDeviceProperties2 @core 1.1 -vkGetPhysicalDeviceQueueFamilyProperties2 @core 1.1 -vkGetPhysicalDeviceSparseImageFormatProperties2 @core 1.1 -vkTrimCommandPool @core 1.1 -vkUpdateDescriptorSetWithTemplate @core 1.1 - -vkGetPhysicalDeviceFeatures2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceProperties2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceFormatProperties2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceImageFormatProperties2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceQueueFamilyProperties2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceMemoryProperties2KHR @iext KHR_get_physical_device_properties2 - -vkGetPhysicalDeviceSparseImageFormatProperties2KHR @iext KHR_get_physical_device_properties2 - -vkEnumeratePhysicalDeviceGroupsKHR @iext KHR_device_group_creation - -vkGetDescriptorSetLayoutSupportKHR @dext KHR_maintenance3 - -vkGetPhysicalDevicePresentRectanglesKHR @dext KHR_device_group -vkGetDeviceGroupPeerMemoryFeaturesKHR @dext KHR_device_group -vkCmdSetDeviceMaskKHR @dext KHR_device_group -vkGetDeviceGroupPresentCapabilitiesKHR @dext KHR_device_group -vkGetDeviceGroupSurfacePresentModesKHR @dext KHR_device_group -vkAcquireNextImage2KHR @dext KHR_device_group -vkCmdDispatchBaseKHR @dext KHR_device_group - -vkBindBufferMemory2KHR @dext KHR_bind_memory2 -vkBindImageMemory2KHR @dext KHR_bind_memory2 - -vkCreateDescriptorUpdateTemplateKHR @dext KHR_descriptor_update_template -vkDestroyDescriptorUpdateTemplateKHR @dext KHR_descriptor_update_template -vkUpdateDescriptorSetWithTemplateKHR @dext KHR_descriptor_update_template - -vkEnumeratePhysicalDeviceGroupsKHX @iext KHX_device_group_creation -vkGetPhysicalDevicePresentRectanglesKHX @dext KHX_device_group -vkGetDeviceGroupPeerMemoryFeaturesKHX @dext KHX_device_group -vkCmdSetDeviceMaskKHX @dext KHX_device_group -vkGetDeviceGroupPresentCapabilitiesKHX @dext KHX_device_group -vkGetDeviceGroupSurfacePresentModesKHX @dext KHX_device_group -vkAcquireNextImage2KHX @dext KHX_device_group -vkCmdDispatchBaseKHX @dext KHX_device_group - -vkGetPhysicalDeviceExternalBufferPropertiesKHR @iext KHR_external_memory_capabilities - -vkGetMemoryFdPropertiesKHR @dext KHR_external_memory_fd -vkGetMemoryFdKHR @dext KHR_external_memory_fd - -vkGetMemoryWin32HandleKHR @dext KHR_external_memory_win32 -vkGetMemoryWin32HandlePropertiesKHR @dext KHR_external_memory_win32 - -vkGetPhysicalDeviceExternalSemaphorePropertiesKHR @iext KHR_external_semaphore_capabilities -vkImportSemaphoreFdKHR @dext KHR_external_semaphore_fd -vkGetSemaphoreFdKHR @dext KHR_external_semaphore_fd - -vkImportSemaphoreWin32HandleKHR @dext KHR_external_semaphore_win32 -vkGetSemaphoreWin32HandleKHR @dext KHR_external_semaphore_win32 - -vkTrimCommandPoolKHR @dext KHR_maintenance1 - -vkDestroySurfaceKHR @iext 
KHR_surface -vkGetPhysicalDeviceSurfaceCapabilitiesKHR @iext KHR_surface -vkGetPhysicalDeviceSurfaceFormatsKHR @iext KHR_surface -vkGetPhysicalDeviceSurfacePresentModesKHR @iext KHR_surface -vkGetPhysicalDeviceSurfaceSupportKHR @iext KHR_surface - -vkGetPhysicalDeviceSurfaceCapabilities2KHR @iext KHR_get_surface_capabilities2 -vkGetPhysicalDeviceSurfaceFormats2KHR @iext KHR_get_surface_capabilities2 - -vkCreateXcbSurfaceKHR @iext KHR_xcb_surface -vkGetPhysicalDeviceXcbPresentationSupportKHR @iext KHR_xcb_surface - -vkCreateXlibSurfaceKHR @iext KHR_xlib_surface -vkGetPhysicalDeviceXlibPresentationSupportKHR @iext KHR_xlib_surface - -vkAcquireNextImageKHR @dext KHR_swapchain -vkCreateSwapchainKHR @dext KHR_swapchain -vkDestroySwapchainKHR @dext KHR_swapchain -vkGetSwapchainImagesKHR @dext KHR_swapchain -vkQueuePresentKHR @dext KHR_swapchain - -vkCmdDrawIndexedIndirectCountAMD @dext AMD_draw_indirect_count -vkCmdDrawIndirectCountAMD @dext AMD_draw_indirect_count - -vkGetMultiDevicePropertiesAMDInternal @none $win32_only -vkOpenWin32BufferAMDInternal @none $win32_only -vkOpenWin32ImageAMDInternal @none $win32_only -vkOpenWin32SemaphoreAMDInternal @none $win32_only - -vkGetShaderInfoAMD @dext AMD_shader_info - -vkCmdDebugMarkerBeginEXT @dext EXT_debug_marker -vkCmdDebugMarkerEndEXT @dext EXT_debug_marker -vkCmdDebugMarkerInsertEXT @dext EXT_debug_marker -vkDebugMarkerSetObjectTagEXT @dext EXT_debug_marker -vkDebugMarkerSetObjectNameEXT @dext EXT_debug_marker - -vkCreateGpaSessionAMD @dext AMD_gpa_interface -vkDestroyGpaSessionAMD @dext AMD_gpa_interface -vkSetGpaDeviceClockModeAMD @dext AMD_gpa_interface -vkCmdBeginGpaSessionAMD @dext AMD_gpa_interface -vkCmdEndGpaSessionAMD @dext AMD_gpa_interface -vkCmdBeginGpaSampleAMD @dext AMD_gpa_interface -vkCmdEndGpaSampleAMD @dext AMD_gpa_interface -vkGetGpaSessionStatusAMD @dext AMD_gpa_interface -vkGetGpaSessionResultsAMD @dext AMD_gpa_interface -vkResetGpaSessionAMD @dext AMD_gpa_interface -vkCmdCopyGpaSessionResultsAMD @dext AMD_gpa_interface +# Entry point name type version/extension + +vkGetDeviceProcAddr @none +vkGetInstanceProcAddr @none + +vkCreateInstance @none +vkEnumerateInstanceExtensionProperties @none +vkEnumerateInstanceLayerProperties @none + +vkCreateDevice @icore 1.0 +vkDestroyInstance @icore 1.0 +vkEnumerateDeviceExtensionProperties @icore 1.0 +vkEnumerateDeviceLayerProperties @icore 1.0 +vkEnumeratePhysicalDevices @icore 1.0 +vkGetPhysicalDeviceFeatures @icore 1.0 +vkGetPhysicalDeviceFormatProperties @icore 1.0 +vkGetPhysicalDeviceImageFormatProperties @icore 1.0 +vkGetPhysicalDeviceMemoryProperties @icore 1.0 +vkGetPhysicalDeviceProperties @icore 1.0 +vkGetPhysicalDeviceQueueFamilyProperties @icore 1.0 +vkGetPhysicalDeviceSparseImageFormatProperties @icore 1.0 + +vkAllocateCommandBuffers @dcore 1.0 +vkAllocateDescriptorSets @dcore 1.0 +vkAllocateMemory @dcore 1.0 +vkBeginCommandBuffer @dcore 1.0 +vkBindBufferMemory @dcore 1.0 +vkBindImageMemory @dcore 1.0 +vkCmdBeginQuery @dcore 1.0 +vkCmdBeginRenderPass @dcore 1.0 +vkCmdBindDescriptorSets @dcore 1.0 +vkCmdBindIndexBuffer @dcore 1.0 +vkCmdBindPipeline @dcore 1.0 +vkCmdBindVertexBuffers @dcore 1.0 +vkCmdBlitImage @dcore 1.0 +vkCmdClearAttachments @dcore 1.0 +vkCmdClearColorImage @dcore 1.0 +vkCmdClearDepthStencilImage @dcore 1.0 +vkCmdCopyBuffer @dcore 1.0 +vkCmdCopyBufferToImage @dcore 1.0 +vkCmdCopyImage @dcore 1.0 +vkCmdCopyImageToBuffer @dcore 1.0 +vkCmdCopyQueryPoolResults @dcore 1.0 +vkCmdDispatch @dcore 1.0 +vkCmdDispatchIndirect @dcore 1.0 +vkCmdDraw @dcore 1.0 
+vkCmdDrawIndexed @dcore 1.0 +vkCmdDrawIndexedIndirect @dcore 1.0 +vkCmdDrawIndirect @dcore 1.0 +vkCmdEndQuery @dcore 1.0 +vkCmdEndRenderPass @dcore 1.0 +vkCmdExecuteCommands @dcore 1.0 +vkCmdFillBuffer @dcore 1.0 +vkCmdNextSubpass @dcore 1.0 +vkCmdPipelineBarrier @dcore 1.0 +vkCmdPushConstants @dcore 1.0 +vkCmdResetEvent @dcore 1.0 +vkCmdResetQueryPool @dcore 1.0 +vkCmdResolveImage @dcore 1.0 +vkCmdSetBlendConstants @dcore 1.0 +vkCmdSetDepthBias @dcore 1.0 +vkCmdSetDepthBounds @dcore 1.0 +vkCmdSetEvent @dcore 1.0 +vkCmdSetLineWidth @dcore 1.0 +vkCmdSetScissor @dcore 1.0 +vkCmdSetStencilCompareMask @dcore 1.0 +vkCmdSetStencilReference @dcore 1.0 +vkCmdSetStencilWriteMask @dcore 1.0 +vkCmdSetViewport @dcore 1.0 +vkCmdUpdateBuffer @dcore 1.0 +vkCmdWaitEvents @dcore 1.0 +vkCmdWriteTimestamp @dcore 1.0 +vkCreateBuffer @dcore 1.0 +vkCreateBufferView @dcore 1.0 +vkCreateCommandPool @dcore 1.0 +vkCreateComputePipelines @dcore 1.0 +vkCreateDescriptorPool @dcore 1.0 +vkCreateDescriptorSetLayout @dcore 1.0 +vkCreateEvent @dcore 1.0 +vkCreateFence @dcore 1.0 +vkCreateFramebuffer @dcore 1.0 +vkCreateGraphicsPipelines @dcore 1.0 +vkCreateImage @dcore 1.0 +vkCreateImageView @dcore 1.0 +vkCreatePipelineCache @dcore 1.0 +vkCreatePipelineLayout @dcore 1.0 +vkCreateQueryPool @dcore 1.0 +vkCreateRenderPass @dcore 1.0 +vkCreateSampler @dcore 1.0 +vkCreateSemaphore @dcore 1.0 +vkCreateShaderModule @dcore 1.0 +vkDestroyBuffer @dcore 1.0 +vkDestroyBufferView @dcore 1.0 +vkDestroyCommandPool @dcore 1.0 +vkDestroyDescriptorPool @dcore 1.0 +vkDestroyDescriptorSetLayout @dcore 1.0 +vkDestroyDevice @dcore 1.0 +vkDestroyEvent @dcore 1.0 +vkDestroyFence @dcore 1.0 +vkDestroyFramebuffer @dcore 1.0 +vkDestroyImage @dcore 1.0 +vkDestroyImageView @dcore 1.0 +vkDestroyPipeline @dcore 1.0 +vkDestroyPipelineCache @dcore 1.0 +vkDestroyPipelineLayout @dcore 1.0 +vkDestroyQueryPool @dcore 1.0 +vkDestroyRenderPass @dcore 1.0 +vkDestroySampler @dcore 1.0 +vkDestroySemaphore @dcore 1.0 +vkDestroyShaderModule @dcore 1.0 +vkDeviceWaitIdle @dcore 1.0 +vkEndCommandBuffer @dcore 1.0 +vkFlushMappedMemoryRanges @dcore 1.0 +vkFreeCommandBuffers @dcore 1.0 +vkFreeDescriptorSets @dcore 1.0 +vkFreeMemory @dcore 1.0 +vkGetBufferMemoryRequirements @dcore 1.0 +vkGetDeviceMemoryCommitment @dcore 1.0 +vkGetDeviceQueue @dcore 1.0 +vkGetEventStatus @dcore 1.0 +vkGetFenceStatus @dcore 1.0 +vkGetImageMemoryRequirements @dcore 1.0 +vkGetImageSparseMemoryRequirements @dcore 1.0 +vkGetImageSubresourceLayout @dcore 1.0 +vkGetPipelineCacheData @dcore 1.0 +vkGetQueryPoolResults @dcore 1.0 +vkGetRenderAreaGranularity @dcore 1.0 +vkInvalidateMappedMemoryRanges @dcore 1.0 +vkMapMemory @dcore 1.0 +vkMergePipelineCaches @dcore 1.0 +vkQueueBindSparse @dcore 1.0 +vkQueueSubmit @dcore 1.0 +vkQueueWaitIdle @dcore 1.0 +vkResetCommandBuffer @dcore 1.0 +vkResetCommandPool @dcore 1.0 +vkResetDescriptorPool @dcore 1.0 +vkResetEvent @dcore 1.0 +vkResetFences @dcore 1.0 +vkSetEvent @dcore 1.0 +vkUnmapMemory @dcore 1.0 +vkUpdateDescriptorSets @dcore 1.0 +vkWaitForFences @dcore 1.0 + +vkEnumerateInstanceVersion @none 1.1 + +vkEnumeratePhysicalDeviceGroups @icore 1.1 +vkGetPhysicalDeviceExternalBufferProperties @icore 1.1 +vkGetPhysicalDeviceExternalFenceProperties @icore 1.1 +vkGetPhysicalDeviceExternalSemaphoreProperties @icore 1.1 +vkGetPhysicalDeviceFeatures2 @icore 1.1 +vkGetPhysicalDeviceFormatProperties2 @icore 1.1 +vkGetPhysicalDeviceImageFormatProperties2 @icore 1.1 +vkGetPhysicalDeviceMemoryProperties2 @icore 1.1 +vkGetPhysicalDeviceProperties2 @icore 1.1 
+vkGetPhysicalDeviceQueueFamilyProperties2 @icore 1.1 +vkGetPhysicalDeviceSparseImageFormatProperties2 @icore 1.1 + +vkBindBufferMemory2 @dcore 1.1 +vkBindImageMemory2 @dcore 1.1 +vkCmdDispatchBase @dcore 1.1 +vkCmdSetDeviceMask @dcore 1.1 +vkCreateDescriptorUpdateTemplate @dcore 1.1 +vkCreateSamplerYcbcrConversion @dcore 1.1 +vkDestroyDescriptorUpdateTemplate @dcore 1.1 +vkDestroySamplerYcbcrConversion @dcore 1.1 +vkGetBufferMemoryRequirements2 @dcore 1.1 +vkGetDescriptorSetLayoutSupport @dcore 1.1 +vkGetDeviceGroupPeerMemoryFeatures @dcore 1.1 +vkGetDeviceQueue2 @dcore 1.1 +vkGetImageMemoryRequirements2 @dcore 1.1 +vkGetImageSparseMemoryRequirements2 @dcore 1.1 +vkTrimCommandPool @dcore 1.1 +vkUpdateDescriptorSetWithTemplate @dcore 1.1 + +vkGetPhysicalDeviceFeatures2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceFormatProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceImageFormatProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceQueueFamilyProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceMemoryProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceSparseImageFormatProperties2KHR @iext KHR_get_physical_device_properties2 + +vkEnumeratePhysicalDeviceGroupsKHR @iext KHR_device_group_creation +vkGetPhysicalDevicePresentRectanglesKHR @dext KHR_device_group +vkGetDeviceGroupPeerMemoryFeaturesKHR @dext KHR_device_group +vkCmdSetDeviceMaskKHR @dext KHR_device_group +vkGetDeviceGroupPresentCapabilitiesKHR @dext KHR_device_group +vkGetDeviceGroupSurfacePresentModesKHR @dext KHR_device_group +vkAcquireNextImage2KHR @dext KHR_device_group +vkCmdDispatchBaseKHR @dext KHR_device_group + +vkBindBufferMemory2KHR @dext KHR_bind_memory2 +vkBindImageMemory2KHR @dext KHR_bind_memory2 + +vkCreateDescriptorUpdateTemplateKHR @dext KHR_descriptor_update_template +vkDestroyDescriptorUpdateTemplateKHR @dext KHR_descriptor_update_template +vkUpdateDescriptorSetWithTemplateKHR @dext KHR_descriptor_update_template + +vkEnumeratePhysicalDeviceGroupsKHX @iext KHX_device_group_creation +vkGetPhysicalDevicePresentRectanglesKHX @dext KHX_device_group +vkGetDeviceGroupPeerMemoryFeaturesKHX @dext KHX_device_group +vkCmdSetDeviceMaskKHX @dext KHX_device_group +vkGetDeviceGroupPresentCapabilitiesKHX @dext KHX_device_group +vkGetDeviceGroupSurfacePresentModesKHX @dext KHX_device_group +vkAcquireNextImage2KHX @dext KHX_device_group +vkCmdDispatchBaseKHX @dext KHX_device_group + +vkGetPhysicalDeviceExternalBufferPropertiesKHR @iext KHR_external_memory_capabilities + +vkGetMemoryFdPropertiesKHR @dext KHR_external_memory_fd +vkGetMemoryFdKHR @dext KHR_external_memory_fd + +vkGetMemoryWin32HandleKHR @dext KHR_external_memory_win32 +vkGetMemoryWin32HandlePropertiesKHR @dext KHR_external_memory_win32 + +vkGetPhysicalDeviceExternalSemaphorePropertiesKHR @iext KHR_external_semaphore_capabilities + +vkImportSemaphoreFdKHR @dext KHR_external_semaphore_fd +vkGetSemaphoreFdKHR @dext KHR_external_semaphore_fd + +vkImportSemaphoreWin32HandleKHR @dext KHR_external_semaphore_win32 +vkGetSemaphoreWin32HandleKHR @dext KHR_external_semaphore_win32 + +vkTrimCommandPoolKHR @dext KHR_maintenance1 + +vkGetDescriptorSetLayoutSupportKHR @dext KHR_maintenance3 + +vkDestroySurfaceKHR @iext KHR_surface +vkGetPhysicalDeviceSurfaceCapabilitiesKHR @iext KHR_surface +vkGetPhysicalDeviceSurfaceFormatsKHR @iext KHR_surface +vkGetPhysicalDeviceSurfacePresentModesKHR 
@iext KHR_surface +vkGetPhysicalDeviceSurfaceSupportKHR @iext KHR_surface + +vkGetPhysicalDeviceSurfaceCapabilities2KHR @iext KHR_get_surface_capabilities2 +vkGetPhysicalDeviceSurfaceFormats2KHR @iext KHR_get_surface_capabilities2 + +vkCreateXcbSurfaceKHR @iext KHR_xcb_surface +vkGetPhysicalDeviceXcbPresentationSupportKHR @iext KHR_xcb_surface + +vkCreateXlibSurfaceKHR @iext KHR_xlib_surface +vkGetPhysicalDeviceXlibPresentationSupportKHR @iext KHR_xlib_surface + +vkAcquireNextImageKHR @dext KHR_swapchain +vkCreateSwapchainKHR @dext KHR_swapchain +vkDestroySwapchainKHR @dext KHR_swapchain +vkGetSwapchainImagesKHR @dext KHR_swapchain +vkQueuePresentKHR @dext KHR_swapchain + +vkCmdDrawIndexedIndirectCountAMD @dext AMD_draw_indirect_count +vkCmdDrawIndirectCountAMD @dext AMD_draw_indirect_count + +vkGetMultiDevicePropertiesAMDInternal @none $win32_only +vkOpenWin32BufferAMDInternal @none $win32_only +vkOpenWin32ImageAMDInternal @none $win32_only +vkOpenWin32SemaphoreAMDInternal @none $win32_only + +vkGetShaderInfoAMD @dext AMD_shader_info + +vkCmdDebugMarkerBeginEXT @dext EXT_debug_marker +vkCmdDebugMarkerEndEXT @dext EXT_debug_marker +vkCmdDebugMarkerInsertEXT @dext EXT_debug_marker +vkDebugMarkerSetObjectTagEXT @dext EXT_debug_marker +vkDebugMarkerSetObjectNameEXT @dext EXT_debug_marker + +vkCreateGpaSessionAMD @dext AMD_gpa_interface +vkDestroyGpaSessionAMD @dext AMD_gpa_interface +vkSetGpaDeviceClockModeAMD @dext AMD_gpa_interface +vkCmdBeginGpaSessionAMD @dext AMD_gpa_interface +vkCmdEndGpaSessionAMD @dext AMD_gpa_interface +vkCmdBeginGpaSampleAMD @dext AMD_gpa_interface +vkCmdEndGpaSampleAMD @dext AMD_gpa_interface +vkGetGpaSessionStatusAMD @dext AMD_gpa_interface +vkGetGpaSessionResultsAMD @dext AMD_gpa_interface +vkResetGpaSessionAMD @dext AMD_gpa_interface +vkCmdCopyGpaSessionResultsAMD @dext AMD_gpa_interface -vkGetImageMemoryRequirements2KHR @dext KHR_get_memory_requirements2 -vkGetBufferMemoryRequirements2KHR @dext KHR_get_memory_requirements2 -vkGetImageSparseMemoryRequirements2KHR @dext KHR_get_memory_requirements2 +vkGetImageMemoryRequirements2KHR @dext KHR_get_memory_requirements2 +vkGetBufferMemoryRequirements2KHR @dext KHR_get_memory_requirements2 +vkGetImageSparseMemoryRequirements2KHR @dext KHR_get_memory_requirements2 -vkCmdSetSampleLocationsEXT @dext EXT_sample_locations -vkGetPhysicalDeviceMultisamplePropertiesEXT @dext EXT_sample_locations +vkCmdSetSampleLocationsEXT @dext EXT_sample_locations +vkGetPhysicalDeviceMultisamplePropertiesEXT @dext EXT_sample_locations -vkGetPhysicalDeviceExternalFencePropertiesKHR @iext KHR_external_fence_capabilities +vkGetPhysicalDeviceExternalFencePropertiesKHR @iext KHR_external_fence_capabilities -vkImportFenceFdKHR @dext KHR_external_fence_fd -vkGetFenceFdKHR @dext KHR_external_fence_fd +vkImportFenceFdKHR @dext KHR_external_fence_fd +vkGetFenceFdKHR @dext KHR_external_fence_fd -vkImportFenceWin32HandleKHR @dext KHR_external_fence_win32 -vkGetFenceWin32HandleKHR @dext KHR_external_fence_win32 +vkImportFenceWin32HandleKHR @dext KHR_external_fence_win32 +vkGetFenceWin32HandleKHR @dext KHR_external_fence_win32 -vkCmdWriteBufferMarkerAMD @dext AMD_buffer_marker +vkCmdWriteBufferMarkerAMD @dext AMD_buffer_marker -vkCreateDebugReportCallbackEXT @iext EXT_debug_report -vkDestroyDebugReportCallbackEXT @iext EXT_debug_report -vkDebugReportMessageEXT @iext EXT_debug_report +vkCreateDebugReportCallbackEXT @iext EXT_debug_report +vkDestroyDebugReportCallbackEXT @iext EXT_debug_report +vkDebugReportMessageEXT @iext EXT_debug_report 
-vkGetMemoryHostPointerPropertiesEXT @dext EXT_external_memory_host +vkGetMemoryHostPointerPropertiesEXT @dext EXT_external_memory_host diff --git a/icd/api/strings/base_extensions.txt b/icd/api/strings/base_extensions.txt index 863772c9..b19a50ee 100644 --- a/icd/api/strings/base_extensions.txt +++ b/icd/api/strings/base_extensions.txt @@ -48,6 +48,7 @@ VK_AMD_texture_gather_bias_lod VK_AMD_mixed_attachment_samples VK_EXT_debug_marker VK_AMD_gpu_shader_int16 +VK_EXT_shader_subgroup_ballot VK_EXT_shader_subgroup_vote VK_KHR_16bit_storage VK_KHR_storage_buffer_storage_class diff --git a/icd/api/strings/generate_strings.py b/icd/api/strings/generate_strings.py index c5ca9166..26b5782c 100644 --- a/icd/api/strings/generate_strings.py +++ b/icd/api/strings/generate_strings.py @@ -148,8 +148,10 @@ def generate_string_file_pass(string_file_prefix, header_file_prefix, gentype): if gentype == 'decl': if type == 'none': generate_entry_point_condition(f, name, "NONE", 0) - elif type == 'core': - generate_entry_point_condition(f, name, "CORE", make_version(value)) + elif type == 'icore': + generate_entry_point_condition(f, name, "CORE_INSTANCE", make_version(value)) + elif type == 'dcore': + generate_entry_point_condition(f, name, "CORE_DEVICE", make_version(value)) elif type == 'iext': generate_entry_point_condition(f, name, "INSTANCE_EXTENSION", "vk::InstanceExtensions::%s" % value.upper()) elif type == 'dext': diff --git a/icd/api/strings/strings.h b/icd/api/strings/strings.h index 1c462817..a1406af3 100644 --- a/icd/api/strings/strings.h +++ b/icd/api/strings/strings.h @@ -47,7 +47,8 @@ namespace secure enum EntryPointCondition : uint32_t { ENTRY_POINT_NONE, // First-class entry point without any condition - ENTRY_POINT_CORE, // Core entry point specific to a core Vulkan version + ENTRY_POINT_CORE_INSTANCE, // Core instance entry point specific to a core Vulkan version + ENTRY_POINT_CORE_DEVICE, // Core device entry point specific to a core Vulkan version ENTRY_POINT_INSTANCE_EXTENSION, // Instance extension specific entry point ENTRY_POINT_DEVICE_EXTENSION, // Device extension specific entry point }; diff --git a/icd/api/vk_buffer.cpp b/icd/api/vk_buffer.cpp index 5be28d13..f9464ef9 100644 --- a/icd/api/vk_buffer.cpp +++ b/icd/api/vk_buffer.cpp @@ -469,7 +469,8 @@ VKAPI_ATTR void VKAPI_CALL vkGetBufferMemoryRequirements2KHR( VkMemoryRequirements2KHR* pMemoryRequirements) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - VK_ASSERT(pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); + VK_ASSERT((pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); union { diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index 6c3ce94e..20ceb2fd 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -297,6 +297,46 @@ Pal::Result CreateClearSubresRanges( return palResult; } +// ===================================================================================================================== +// Returns attachment's PAL subresource ranges defined by clearInfo for LoadOp Clear. +// When multiview is enabled, layer ranges are modified according active views during a renderpass. +Util::Vector +LoadOpClearSubresRanges( + const Framebuffer::Attachment& attachment, + const RPLoadOpClearInfo& clearInfo, + const RenderPass& renderPass) +{ + // Note that no allocation will be performed, so Util::Vector allocator is nullptr. 
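// Worked example (view mask chosen for illustration): an active view mask of 0b1011 makes
// RangesOfOnesInBitMask() return the layer ranges {offset 0, extent 2} and {offset 3, extent 1};
// each attachment subresource range is then pushed once per layer range below, with its
// arraySlice offset and numSlices rewritten so the clear touches only the active views.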
+ Util::Vector clearSubresRanges { nullptr }; + + const auto attachmentSubresRanges = attachment.FindSubresRanges(clearInfo.aspect); + + if (renderPass.IsMultiviewEnabled()) + { + const auto activeViews = renderPass.GetActiveViewsBitMask(); + const auto layerRanges = RangesOfOnesInBitMask(activeViews); + + for (uint32_t rangeIndex = 0; rangeIndex < attachmentSubresRanges.NumElements(); ++rangeIndex) + { + for (auto layerRangeIt = layerRanges.Begin(); layerRangeIt.IsValid(); layerRangeIt.Next()) + { + clearSubresRanges.PushBack(attachmentSubresRanges.At(rangeIndex)); + clearSubresRanges.Back().startSubres.arraySlice += layerRangeIt.Get().offset; + clearSubresRanges.Back().numSlices = layerRangeIt.Get().extent; + } + } + } + else + { + for (uint32_t rangeIndex = 0; rangeIndex < attachmentSubresRanges.NumElements(); ++rangeIndex) + { + clearSubresRanges.PushBack(attachmentSubresRanges.At(rangeIndex)); + } + } + + return clearSubresRanges; +} + // ===================================================================================================================== // Populate a vector with PAL rects created from Vulkan clear rects. // Returns Pal::Result::Success if completed successfully. @@ -1826,22 +1866,30 @@ void CmdBuffer::CopyBuffer( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)); + auto regionBatch = Util::Min(regionCount, maxRegions); + // Allocate space to store memory copy regions - Pal::MemoryCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionCount); + Pal::MemoryCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { Buffer* pSrcBuffer = Buffer::ObjectFromHandle(srcBuffer); Buffer* pDstBuffer = Buffer::ObjectFromHandle(destBuffer); - for (uint32_t i = 0; i < regionCount; ++i) + for (uint32_t regionIdx = 0; regionIdx < regionCount; regionIdx += regionBatch) { - pPalRegions[i].srcOffset = pSrcBuffer->MemOffset() + pRegions[i].srcOffset; - pPalRegions[i].dstOffset = pDstBuffer->MemOffset() + pRegions[i].dstOffset; - pPalRegions[i].copySize = pRegions[i].size; - } + regionBatch = Util::Min(regionCount - regionIdx, maxRegions); - PalCmdCopyBuffer(pSrcBuffer, pDstBuffer, regionCount, pPalRegions); + for (uint32_t i = 0; i < regionBatch; ++i) + { + pPalRegions[i].srcOffset = pSrcBuffer->MemOffset() + pRegions[regionIdx + i].srcOffset; + pPalRegions[i].dstOffset = pDstBuffer->MemOffset() + pRegions[regionIdx + i].dstOffset; + pPalRegions[i].copySize = pRegions[regionIdx + i].size; + } + + PalCmdCopyBuffer(pSrcBuffer, pDstBuffer, regionBatch, pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -1866,8 +1914,11 @@ void CmdBuffer::CopyImage( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)), MaxPalAspectsPerMask); + auto regionBatch = Util::Min(regionCount * MaxPalAspectsPerMask, maxRegions); + Pal::ImageCopyRegion* pPalRegions = - virtStackFrame.AllocArray(regionCount * MaxPalAspectsPerMask); + virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { @@ -1880,14 +1931,21 @@ void CmdBuffer::CopyImage( const Pal::ImageLayout palSrcImgLayout = pSrcImage->GetTransferLayout(srcImageLayout, this); const Pal::ImageLayout palDstImgLayout = pDstImage->GetTransferLayout(destImageLayout, this); - uint32_t palRegionCount = 0; - - for (uint32_t i = 0; i < regionCount; ++i) + for (uint32_t regionIdx = 0; regionIdx < regionCount;) { - 
VkToPalImageCopyRegion(pRegions[i], srcFormat.format, dstFormat.format, pPalRegions, palRegionCount); - } + uint32_t palRegionCount = 0; - PalCmdCopyImage(pSrcImage, palSrcImgLayout, pDstImage, palDstImgLayout, palRegionCount, pPalRegions); + while ((regionIdx < regionCount) && + (palRegionCount <= (regionBatch - MaxPalAspectsPerMask))) + { + VkToPalImageCopyRegion(pRegions[regionIdx], srcFormat.format, dstFormat.format, + pPalRegions, palRegionCount); + + ++regionIdx; + } + + PalCmdCopyImage(pSrcImage, palSrcImgLayout, pDstImage, palDstImgLayout, palRegionCount, pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -1911,39 +1969,50 @@ void CmdBuffer::BlitImage( { DbgBarrierPreCmd(DbgBarrierCopyImage); - const Image* const pSrcImage = Image::ObjectFromHandle(srcImage); - const Image* const pDstImage = Image::ObjectFromHandle(destImage); - - const Pal::SwizzledFormat srcFormat = VkToPalFormat(pSrcImage->GetFormat()); - const Pal::SwizzledFormat dstFormat = VkToPalFormat(pDstImage->GetFormat()); - - Pal::ScaledCopyInfo palCopyInfo = {}; - - palCopyInfo.srcImageLayout = pSrcImage->GetTransferLayout(srcImageLayout, this); - palCopyInfo.dstImageLayout = pDstImage->GetTransferLayout(destImageLayout, this); - VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)), MaxPalAspectsPerMask); + auto regionBatch = Util::Min(regionCount * MaxPalAspectsPerMask, maxRegions); + // Allocate space to store scaled image copy regions (we need a separate region per PAL aspect) Pal::ImageScaledCopyRegion* pPalRegions = - virtStackFrame.AllocArray(regionCount * MaxPalAspectsPerMask); + virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { - for (uint32_t i = 0; i < regionCount; ++i) - { - VkToPalImageScaledCopyRegion(pRegions[i], srcFormat.format, dstFormat.format, pPalRegions, palCopyInfo.regionCount); - } + const Image* const pSrcImage = Image::ObjectFromHandle(srcImage); + const Image* const pDstImage = Image::ObjectFromHandle(destImage); - palCopyInfo.pRegions = pPalRegions; + const Pal::SwizzledFormat srcFormat = VkToPalFormat(pSrcImage->GetFormat()); + const Pal::SwizzledFormat dstFormat = VkToPalFormat(pDstImage->GetFormat()); - // Maps blit filters to their PAL equivalent - palCopyInfo.filter = VkToPalTexFilter(VK_FALSE, filter, filter, VK_SAMPLER_MIPMAP_MODE_NEAREST); + Pal::ScaledCopyInfo palCopyInfo = {}; + + palCopyInfo.srcImageLayout = pSrcImage->GetTransferLayout(srcImageLayout, this); + palCopyInfo.dstImageLayout = pDstImage->GetTransferLayout(destImageLayout, this); + // Maps blit filters to their PAL equivalent + palCopyInfo.filter = VkToPalTexFilter(VK_FALSE, filter, filter, VK_SAMPLER_MIPMAP_MODE_NEAREST); palCopyInfo.rotation = Pal::ImageRotation::Ccw0; - // This will do a scaled blit - PalCmdScaledCopyImage(pSrcImage, pDstImage, palCopyInfo); + palCopyInfo.pRegions = pPalRegions; + + for (uint32_t regionIdx = 0; regionIdx < regionCount;) + { + palCopyInfo.regionCount = 0; + + while ((regionIdx < regionCount) && + (palCopyInfo.regionCount <= (regionBatch - MaxPalAspectsPerMask))) + { + VkToPalImageScaledCopyRegion(pRegions[regionIdx], srcFormat.format, dstFormat.format, + pPalRegions, palCopyInfo.regionCount); + + ++regionIdx; + } + + // This will do a scaled blit + PalCmdScaledCopyImage(pSrcImage, pDstImage, palCopyInfo); + } virtStackFrame.FreeArray(pPalRegions); } @@ -1955,6 +2024,9 @@ void CmdBuffer::BlitImage( DbgBarrierPostCmd(DbgBarrierCopyImage); } +// PAL version 391.1 adds 
support for mis-aligned buffer-image/image-buffer copies +#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 391) || \ + ((PAL_CLIENT_INTERFACE_MAJOR_VERSION == 391) && (PAL_CLIENT_INTERFACE_MINOR_VERSION < 1)) // ===================================================================================================================== // Align memory to image copy region void CmdBuffer::AlignMemoryImageCopyRegion( @@ -1981,6 +2053,7 @@ void CmdBuffer::AlignMemoryImageCopyRegion( const uint32_t copySizePixelsHeight = copySizeBytesHeight / bytesPerPixel; pRegion->imageExtent.height = copySizePixelsHeight; } +#endif // ===================================================================================================================== // Copies from a buffer of linear data to a region of an image (vkCopyBufferToImage) @@ -1995,8 +2068,11 @@ void CmdBuffer::CopyBufferToImage( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)); + auto regionBatch = Util::Min(regionCount, maxRegions); + // Allocate space to store memory image copy regions - Pal::MemoryImageCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionCount); + Pal::MemoryImageCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { @@ -2006,26 +2082,35 @@ void CmdBuffer::CopyBufferToImage( const Pal::ImageLayout layout = pDstImage->GetTransferLayout(destImageLayout, this); - for (uint32_t i = 0; i < regionCount; ++i) + for (uint32_t regionIdx = 0; regionIdx < regionCount; regionIdx += regionBatch) { - // For image-buffer copies we have to override the format for depth-only and stencil-only copies - Pal::SwizzledFormat dstFormat = VkToPalFormat(Formats::GetAspectFormat( - pDstImage->GetFormat(), pRegions[i].imageSubresource.aspectMask)); - - pPalRegions[i] = VkToPalMemoryImageCopyRegion(pRegions[i], dstFormat.format, srcMemOffset); - - if (!GpuUtil::ValidateMemoryImageRegion( - m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties(), - m_palEngineType, - *pDstImage->PalImage(), - *pSrcBuffer->PalMemory(), - pPalRegions[i])) + regionBatch = Util::Min(regionCount - regionIdx, maxRegions); + + for (uint32_t i = 0; i < regionBatch; ++i) { - AlignMemoryImageCopyRegion(pDstImage->PalImage(), &pPalRegions[i]); + // For image-buffer copies we have to override the format for depth-only and stencil-only copies + Pal::SwizzledFormat dstFormat = VkToPalFormat(Formats::GetAspectFormat( + pDstImage->GetFormat(), pRegions[regionIdx + i].imageSubresource.aspectMask)); + + pPalRegions[i] = VkToPalMemoryImageCopyRegion(pRegions[regionIdx + i], dstFormat.format, srcMemOffset); + + // PAL version 391.1 adds support for mis-aligned buffer-image/image-buffer copies +#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 391) || \ + ((PAL_CLIENT_INTERFACE_MAJOR_VERSION == 391) && (PAL_CLIENT_INTERFACE_MINOR_VERSION < 1)) + if (!GpuUtil::ValidateMemoryImageRegion( + m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties(), + m_palEngineType, + *pDstImage->PalImage(), + *pSrcBuffer->PalMemory(), + pPalRegions[i])) + { + AlignMemoryImageCopyRegion(pDstImage->PalImage(), &pPalRegions[i]); + } +#endif } - } - PalCmdCopyMemoryToImage(pSrcBuffer, pDstImage, layout, regionCount, pPalRegions); + PalCmdCopyMemoryToImage(pSrcBuffer, pDstImage, layout, regionBatch, pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -2050,8 +2135,11 @@ void CmdBuffer::CopyImageToBuffer( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = 
EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)); + auto regionBatch = Util::Min(regionCount, maxRegions); + // Allocate space to store memory image copy regions - Pal::MemoryImageCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionCount); + Pal::MemoryImageCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { @@ -2063,18 +2151,21 @@ void CmdBuffer::CopyImageToBuffer( const Pal::ImageLayout layout = pSrcImage->GetTransferLayout(srcImageLayout, this); - uint32_t engineCopyCount = 0; - - for (uint32_t i = 0; i < regionCount; ++i) + for (uint32_t regionIdx = 0; regionIdx < regionCount; regionIdx += regionBatch) { - // For image-buffer copies we have to override the format for depth-only and stencil-only copies - Pal::SwizzledFormat srcFormat = VkToPalFormat(Formats::GetAspectFormat(pSrcImage->GetFormat(), - pRegions[i].imageSubresource.aspectMask)); + regionBatch = Util::Min(regionCount - regionIdx, maxRegions); - pPalRegions[engineCopyCount++] = VkToPalMemoryImageCopyRegion(pRegions[i], srcFormat.format, dstMemOffset); - } + for (uint32_t i = 0; i < regionBatch; ++i) + { + // For image-buffer copies we have to override the format for depth-only and stencil-only copies + Pal::SwizzledFormat srcFormat = VkToPalFormat(Formats::GetAspectFormat(pSrcImage->GetFormat(), + pRegions[regionIdx + i].imageSubresource.aspectMask)); - PalCmdCopyImageToMemory(pSrcImage, pDstBuffer, layout, regionCount, pPalRegions); + pPalRegions[i] = VkToPalMemoryImageCopyRegion(pRegions[regionIdx + i], srcFormat.format, dstMemOffset); + } + + PalCmdCopyImageToMemory(pSrcImage, pDstBuffer, layout, regionBatch, pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -2147,37 +2238,46 @@ void CmdBuffer::ClearColorImage( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRanges = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRanges)), MaxPalColorAspectsPerMask); + auto rangeBatch = Util::Min(rangeCount * MaxPalColorAspectsPerMask, maxRanges); + // Allocate space to store image subresource ranges - Pal::SubresRange* pPalRanges = virtStackFrame.AllocArray(rangeCount * MaxPalColorAspectsPerMask); + Pal::SubresRange* pPalRanges = virtStackFrame.AllocArray(rangeBatch); if (pPalRanges != nullptr) { - uint32_t palRangeCount = 0; - const Pal::ImageLayout layout = pImage->GetTransferLayout(imageLayout, this); - for (uint32_t i = 0; i < rangeCount; ++i) + for (uint32_t rangeIdx = 0; rangeIdx < rangeCount;) { - // Only color aspect is allowed here - VK_ASSERT(pRanges[i].aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); - - VkToPalSubresRange(palFormat.format, - pRanges[i], - pImage->GetMipLevels(), - pImage->GetArraySize(), - pPalRanges, - palRangeCount); - } + uint32_t palRangeCount = 0; - PalCmdClearColorImage( - *pImage, - layout, - VkToPalClearColor(pColor, palFormat.format), - palRangeCount, - pPalRanges, - 0, - nullptr, - 0); + while ((rangeIdx < rangeCount) && + (palRangeCount <= (rangeBatch - MaxPalColorAspectsPerMask))) + { + // Only color aspect is allowed here + VK_ASSERT(pRanges[rangeIdx].aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); + + VkToPalSubresRange(palFormat.format, + pRanges[rangeIdx], + pImage->GetMipLevels(), + pImage->GetArraySize(), + pPalRanges, + palRangeCount); + + ++rangeIdx; + } + + PalCmdClearColorImage( + *pImage, + layout, + VkToPalClearColor(pColor, palFormat.format), + palRangeCount, + pPalRanges, + 0, + nullptr, + 0); + } virtStackFrame.FreeArray(pPalRanges); } @@ -2236,40 +2336,49 @@ void CmdBuffer::ClearDepthStencilImage( { 
VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRanges = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRanges)), MaxPalDepthAspectsPerMask); + auto rangeBatch = Util::Min(rangeCount * MaxPalDepthAspectsPerMask, maxRanges); + // Allocate space to store image subresource ranges (we need a separate region per PAL aspect) - Pal::SubresRange* pPalRanges = virtStackFrame.AllocArray(rangeCount * MaxPalDepthAspectsPerMask); + Pal::SubresRange* pPalRanges = virtStackFrame.AllocArray(rangeBatch); if (pPalRanges != nullptr) { - uint32_t palRangeCount = 0; - const Image* pImage = Image::ObjectFromHandle(image); const Pal::ImageLayout layout = pImage->GetTransferLayout(imageLayout, this); - for (uint32_t i = 0; i < rangeCount; ++i) + for (uint32_t rangeIdx = 0; rangeIdx < rangeCount;) { - // Only depth or stencil aspect is allowed here - VK_ASSERT((pRanges[i].aspectMask & ~(VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) == 0); - - VkToPalSubresRange(VkToPalFormat(pImage->GetFormat()).format, - pRanges[i], - pImage->GetMipLevels(), - pImage->GetArraySize(), - pPalRanges, - palRangeCount); - } + uint32_t palRangeCount = 0; - PalCmdClearDepthStencil( - *pImage, - layout, - layout, - VkToPalClearDepth(depth), - stencil, - palRangeCount, - pPalRanges, - 0, - nullptr, - 0); + while ((rangeIdx < rangeCount) && + (palRangeCount <= (rangeBatch - MaxPalDepthAspectsPerMask))) + { + // Only depth or stencil aspect is allowed here + VK_ASSERT((pRanges[rangeIdx].aspectMask & ~(VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) == 0); + + VkToPalSubresRange(VkToPalFormat(pImage->GetFormat()).format, + pRanges[rangeIdx], + pImage->GetMipLevels(), + pImage->GetArraySize(), + pPalRanges, + palRangeCount); + + ++rangeIdx; + } + + PalCmdClearDepthStencil( + *pImage, + layout, + layout, + VkToPalClearDepth(depth), + stencil, + palRangeCount, + pPalRanges, + 0, + nullptr, + 0); + } virtStackFrame.FreeArray(pPalRanges); } @@ -2316,11 +2425,9 @@ void CmdBuffer::ClearBoundAttachments( Util::Vector clearRegions { &virtStackFrame }; Util::Vector colorTargets { &virtStackFrame }; - const auto palResult1 = CreateClearRegions( - rectCount, pRects, - *pRenderPass, subpass, - &clearRegions); - + const auto maxRects = EstimateMaxObjectsOnVirtualStack(sizeof(*pRects)); + auto rectBatch = Util::Min(rectCount, maxRects); + const auto palResult1 = clearRegions.Reserve(rectBatch); const auto palResult2 = colorTargets.Reserve(attachmentCount); if ((palResult1 != Pal::Result::Success) || @@ -2373,15 +2480,25 @@ void CmdBuffer::ClearBoundAttachments( DbgBarrierPreCmd(DbgBarrierClearDepth); - // Clear the bound depth stencil target immediately - PalCmdBuffer(DefaultDeviceIndex)->CmdClearBoundDepthStencilTargets( - VkToPalClearDepth(clearInfo.clearValue.depthStencil.depth), - clearInfo.clearValue.depthStencil.stencil, - pRenderPass->GetDepthStencilAttachmentSamples(subpass), - pRenderPass->GetDepthStencilAttachmentSamples(subpass), - selectFlags, - clearRegions.NumElements(), - clearRegions.Data()); + for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch) + { + rectBatch = Util::Min(rectCount - rectIdx, maxRects); + + CreateClearRegions( + rectBatch, pRects + rectIdx, + *pRenderPass, subpass, + &clearRegions); + + // Clear the bound depth stencil target immediately + PalCmdBuffer(DefaultDeviceIndex)->CmdClearBoundDepthStencilTargets( + VkToPalClearDepth(clearInfo.clearValue.depthStencil.depth), + clearInfo.clearValue.depthStencil.stencil, + 
pRenderPass->GetDepthStencilAttachmentSamples(subpass), + pRenderPass->GetDepthStencilAttachmentSamples(subpass), + selectFlags, + clearRegions.NumElements(), + clearRegions.Data()); + } DbgBarrierPostCmd(DbgBarrierClearDepth); } @@ -2392,12 +2509,22 @@ void CmdBuffer::ClearBoundAttachments( { DbgBarrierPreCmd(DbgBarrierClearColor); - // Clear the bound color targets - PalCmdBuffer(DefaultDeviceIndex)->CmdClearBoundColorTargets( - colorTargets.NumElements(), - colorTargets.Data(), - clearRegions.NumElements(), - clearRegions.Data()); + for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch) + { + rectBatch = Util::Min(rectCount - rectIdx, maxRects); + + CreateClearRegions( + rectBatch, pRects + rectIdx, + *pRenderPass, subpass, + &clearRegions); + + // Clear the bound color targets + PalCmdBuffer(DefaultDeviceIndex)->CmdClearBoundColorTargets( + colorTargets.NumElements(), + colorTargets.Data(), + clearRegions.NumElements(), + clearRegions.Data()); + } DbgBarrierPostCmd(DbgBarrierClearColor); } @@ -2578,7 +2705,8 @@ void CmdBuffer::ClearImageAttachments( // Get the current renderpass and subpass const RenderPass* pRenderPass = m_state.allGpuState.pRenderPass; - const uint32_t subpass = m_renderPassInstance.subpass; + const uint32_t subpass = m_renderPassInstance.subpass; + const auto maxRects = EstimateMaxObjectsOnVirtualStack(sizeof(*pRects)); // Go through each of the clear attachment infos for (uint32_t idx = 0; idx < attachmentCount; ++idx) @@ -2609,29 +2737,38 @@ void CmdBuffer::ClearImageAttachments( Util::Vector clearBoxes { &virtStackFrame }; Util::Vector clearSubresRanges { &virtStackFrame }; - const auto palResult1 = CreateClearRegions( - rectCount, pRects, - *pRenderPass, subpass, - &clearBoxes); - - const auto palResult2 = CreateClearSubresRanges( - attachment, clearInfo, - rectCount, pRects, - *pRenderPass, subpass, - &clearSubresRanges); + auto rectBatch = Util::Min(rectCount, maxRects); + const auto palResult1 = clearBoxes.Reserve(rectBatch); + const auto palResult2 = clearSubresRanges.Reserve(rectBatch); if ((palResult1 == Pal::Result::Success) && (palResult2 == Pal::Result::Success)) { - PalCmdClearColorImage( - *attachment.pImage, - targetLayout, - VkToPalClearColor(&clearInfo.clearValue.color, attachment.viewFormat.format), - clearSubresRanges.NumElements(), - clearSubresRanges.Data(), - clearBoxes.NumElements(), - clearBoxes.Data(), - Pal::ClearColorImageFlags::ColorClearAutoSync); + for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch) + { + rectBatch = Util::Min(rectCount - rectIdx, maxRects); + + CreateClearRegions( + rectCount, pRects + rectIdx, + *pRenderPass, subpass, + &clearBoxes); + + CreateClearSubresRanges( + attachment, clearInfo, + rectCount, pRects + rectIdx, + *pRenderPass, subpass, + &clearSubresRanges); + + PalCmdClearColorImage( + *attachment.pImage, + targetLayout, + VkToPalClearColor(&clearInfo.clearValue.color, attachment.viewFormat.format), + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), + clearBoxes.NumElements(), + clearBoxes.Data(), + Pal::ClearColorImageFlags::ColorClearAutoSync); + } } else { @@ -2660,30 +2797,39 @@ void CmdBuffer::ClearImageAttachments( Util::Vector clearRects { &virtStackFrame }; Util::Vector clearSubresRanges { &virtStackFrame }; - const auto palResult1 = CreateClearRects( - rectCount, pRects, - &clearRects); - - const auto palResult2 = CreateClearSubresRanges( - attachment, clearInfo, - rectCount, pRects, - *pRenderPass, subpass, - &clearSubresRanges); + auto rectBatch = 
Util::Min(rectCount, maxRects); + const auto palResult1 = clearRects.Reserve(rectBatch); + const auto palResult2 = clearSubresRanges.Reserve(rectBatch); if ((palResult1 == Pal::Result::Success) && (palResult2 == Pal::Result::Success)) { - PalCmdClearDepthStencil( - *attachment.pImage, - depthLayout, - stencilLayout, - VkToPalClearDepth(clearInfo.clearValue.depthStencil.depth), - clearInfo.clearValue.depthStencil.stencil, - clearSubresRanges.NumElements(), - clearSubresRanges.Data(), - clearRects.NumElements(), - clearRects.Data(), - Pal::ClearDepthStencilFlags::DsClearAutoSync); + for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch) + { + rectBatch = Util::Min(rectCount - rectIdx, maxRects); + + CreateClearRects( + rectCount, pRects + rectIdx, + &clearRects); + + CreateClearSubresRanges( + attachment, clearInfo, + rectCount, pRects + rectIdx, + *pRenderPass, subpass, + &clearSubresRanges); + + PalCmdClearDepthStencil( + *attachment.pImage, + depthLayout, + stencilLayout, + VkToPalClearDepth(clearInfo.clearValue.depthStencil.depth), + clearInfo.clearValue.depthStencil.stencil, + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), + clearRects.NumElements(), + clearRects.Data(), + Pal::ClearDepthStencilFlags::DsClearAutoSync); + } } else { @@ -2705,9 +2851,12 @@ void CmdBuffer::ResolveImage( { VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRects = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRects)), MaxPalAspectsPerMask); + auto rectBatch = Util::Min(rectCount * MaxPalAspectsPerMask, maxRects); + // Allocate space to store image resolve regions (we need a separate region per PAL aspect) Pal::ImageResolveRegion* pPalRegions = - virtStackFrame.AllocArray(rectCount * MaxPalAspectsPerMask); + virtStackFrame.AllocArray(rectBatch); if (pPalRegions != nullptr) { @@ -2718,23 +2867,29 @@ void CmdBuffer::ResolveImage( const Pal::SwizzledFormat srcFormat = VkToPalFormat(pSrcImage->GetFormat()); const Pal::SwizzledFormat dstFormat = VkToPalFormat(pDstImage->GetFormat()); - uint32_t palRegionCount = 0; - - for (uint32_t i = 0; i < rectCount; ++i) + for (uint32_t rectIdx = 0; rectIdx < rectCount;) { - // We expect MSAA images to never have mipmaps - VK_ASSERT(pRects[i].srcSubresource.mipLevel == 0); + uint32_t palRegionCount = 0; - VkToPalImageResolveRegion(pRects[i], srcFormat.format, dstFormat.format, pPalRegions, palRegionCount); - } + while ((rectIdx < rectCount) && + (palRegionCount <= (rectBatch - MaxPalAspectsPerMask))) + { + // We expect MSAA images to never have mipmaps + VK_ASSERT(pRects[rectIdx].srcSubresource.mipLevel == 0); + + VkToPalImageResolveRegion(pRects[rectIdx], srcFormat.format, dstFormat.format, pPalRegions, palRegionCount); - PalCmdResolveImage( - *pSrcImage, - palSrcImageLayout, - *pDstImage, - palDestImageLayout, - palRegionCount, - pPalRegions); + ++rectIdx; + } + + PalCmdResolveImage( + *pSrcImage, + palSrcImageLayout, + *pDstImage, + palDestImageLayout, + palRegionCount, + pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -4622,6 +4777,10 @@ void CmdBuffer::RPLoadOpClearColor( &m_renderPassInstance.pAttachments[clear.attachment].clearValue.color, attachment.viewFormat.format); + const auto clearSubresRanges = LoadOpClearSubresRanges( + attachment, clear, + *m_state.allGpuState.pRenderPass); + utils::IterateMask deviceGroup(GetDeviceMask()); while (deviceGroup.Iterate()) @@ -4634,8 +4793,8 @@ void CmdBuffer::RPLoadOpClearColor( *attachment.pImage->PalImage(deviceIdx), clearLayout, clearColor, - 
attachment.subresRangeCount, - attachment.subresRange, + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), 1, &clearBox, Pal::ColorClearAutoSync); } @@ -4676,24 +4835,9 @@ void CmdBuffer::RPLoadOpClearDepthStencil( float clearDepth = VkToPalClearDepth(clearValue.depthStencil.depth); Pal::uint8 clearStencil = clearValue.depthStencil.stencil; - Pal::SubresRange clearRanges[2]; - uint32_t clearRangeCount = 0; - - for (uint32_t sr = 0; sr < attachment.subresRangeCount; ++sr) - { - VK_ASSERT(clearRangeCount < 2); - - if ((clear.aspect & VK_IMAGE_ASPECT_DEPTH_BIT) && - (attachment.subresRange[sr].startSubres.aspect == Pal::ImageAspect::Depth)) - { - clearRanges[clearRangeCount++] = attachment.subresRange[sr]; - } - else if ((clear.aspect & VK_IMAGE_ASPECT_STENCIL_BIT) && - (attachment.subresRange[sr].startSubres.aspect == Pal::ImageAspect::Stencil)) - { - clearRanges[clearRangeCount++] = attachment.subresRange[sr]; - } - } + const auto clearSubresRanges = LoadOpClearSubresRanges( + attachment, clear, + *m_state.allGpuState.pRenderPass); utils::IterateMask deviceGroup(GetDeviceMask()); @@ -4709,8 +4853,8 @@ void CmdBuffer::RPLoadOpClearDepthStencil( stencilLayout, clearDepth, clearStencil, - clearRangeCount, - clearRanges, + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), 1, &clearRect, Pal::DsClearAutoSync); } @@ -5099,7 +5243,8 @@ void CmdBuffer::SetViewport( DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); - const bool khrMaintenance1 = m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1); + const bool khrMaintenance1 = ((m_pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1)); for (uint32_t i = 0; i < viewportCount; ++i) { @@ -5356,6 +5501,14 @@ void CmdBuffer::SetStencilReference( DbgBarrierPostCmd(DbgBarrierSetDynamicPipelineState); } +// ===================================================================================================================== +// Get a safe number of objects that can be allocated by the virtual stack frame allocator without risking OOM error. +uint32_t CmdBuffer::EstimateMaxObjectsOnVirtualStack(size_t objectSize) const +{ + // Return at least 1 and use only 50% of the remaining space. 
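The batching introduced throughout the clear and resolve paths all hangs off this helper, so a rough, self-contained sketch of the pattern may help: estimate how many objects safely fit on the virtual stack, reserve one batch up front, then walk the rect list in chunks of that size. This is illustrative only; remainingBytes and the batch body are stand-ins, not the driver's API.

#include <algorithm>
#include <cstdint>
#include <vector>

// Half of the remaining space, but never less than one object.
static uint32_t EstimateMaxObjects(size_t remainingBytes, size_t objectSize)
{
    return 1 + static_cast<uint32_t>((remainingBytes / objectSize) >> 1);
}

static void ClearInBatches(const std::vector<uint32_t>& rects, size_t remainingBytes)
{
    const uint32_t rectCount = static_cast<uint32_t>(rects.size());
    const uint32_t maxRects  = EstimateMaxObjects(remainingBytes, sizeof(rects[0]));
    uint32_t       rectBatch = std::min(rectCount, maxRects);

    for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch)
    {
        rectBatch = std::min(rectCount - rectIdx, maxRects);
        // Build the per-batch clear regions from rects[rectIdx .. rectIdx + rectBatch)
        // and issue a single PAL clear for them, exactly as the loops above do.
    }
}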
+ return 1 + static_cast((m_pStackAllocator->Remaining() / objectSize) >> 1); +} + #if VK_ENABLE_DEBUG_BARRIERS // ===================================================================================================================== // This function inserts a command before or after a particular Vulkan command if the given runtime settings are asking diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index dce956ab..6e298627 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -55,137 +55,6 @@ void ComputePipeline::ConvertComputePipelineInfo( pOutInfo->pLayout = PipelineLayout::ObjectFromHandle(pIn->layout); } - pOutInfo->flags = pIn->flags; - pOutInfo->pStage = &pIn->stage; - -} - -// ===================================================================================================================== -// Creates a compute pipeline binary for each PAL device -VkResult ComputePipeline::CreateComputePipelineBinaries( - Device* pDevice, - PipelineCache* pPipelineCache, - CreateInfo* pCreateInfo, - size_t pipelineBinarySizes[MaxPalDevices], - void* pPipelineBinaries[MaxPalDevices]) -{ - VkResult result = VK_SUCCESS; - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - const ShaderModule* pShader = ShaderModule::ObjectFromHandle(pCreateInfo->pStage->module); - - // Allocate space to create the LLPC/SCPC pipeline resource mappings - void* pMappingBuffer = nullptr; - - if (pCreateInfo->pLayout != nullptr) - { - size_t tempBufferSize = pCreateInfo->pLayout->GetPipelineInfo()->tempBufferSize; - - // Allocate the temp buffer - if (tempBufferSize > 0) - { - pMappingBuffer = pDevice->VkInstance()->AllocMem( - tempBufferSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - - if (pMappingBuffer == nullptr) - { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - } - } - } - - // Build the LLPC pipeline - Llpc::ComputePipelineBuildInfo pipelineBuildInfo = {}; - Llpc::ComputePipelineBuildOut pipelineOut = {}; - void* pLlpcPipelineBuffer = nullptr; - - if ((result == VK_SUCCESS) - ) - { - // Fill pipeline create info for LLPC - pipelineBuildInfo.pInstance = pDevice->VkPhysicalDevice()->VkInstance(); - pipelineBuildInfo.pfnOutputAlloc = AllocateShaderOutput; - pipelineBuildInfo.pUserData = &pLlpcPipelineBuffer; - auto pShaderInfo = &pipelineBuildInfo.cs; - - pShaderInfo->pModuleData = pShader->GetShaderData(true); - pShaderInfo->pSpecializatonInfo = pCreateInfo->pStage->pSpecializationInfo; - pShaderInfo->pEntryTarget = pCreateInfo->pStage->pName; - - // Build the resource mapping description for LLPC. This data contains things about how shader - // inputs like descriptor set bindings interact with this pipeline in a form that LLPC can - // understand. 
- if (pCreateInfo->pLayout != nullptr) - { - result = pCreateInfo->pLayout->BuildLlpcPipelineMapping( - ShaderStageCompute, - pMappingBuffer, - nullptr, - pShaderInfo, - nullptr); - } - } - - uint64_t pipeHash = 0; - - bool enableLlpc = false; - enableLlpc = true; - - if (result == VK_SUCCESS) - { - if (enableLlpc) - { - if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCacheType() == PipelineCacheTypeLlpc)) - { - pipelineBuildInfo.pShaderCache = pPipelineCache->GetShaderCache(DefaultDeviceIndex).pLlpcShaderCache; - } - - auto llpcResult = pDevice->GetLlpcCompiler()->BuildComputePipeline(&pipelineBuildInfo, &pipelineOut); - if (llpcResult != Llpc::Result::Success) - { - // There shouldn't be anything to free for the failure case - VK_ASSERT(pLlpcPipelineBuffer == nullptr); - - { - result = VK_ERROR_INITIALIZATION_FAILED; - } - } - } - else - if (settings.enablePipelineDump) - { - // LLPC isn't enabled but pipeline dump is required, call LLPC dump interface explicitly - void* pHandle = Llpc::IPipelineDumper::BeginPipelineDump(settings.pipelineDumpDir, &pipelineBuildInfo, nullptr); - Llpc::IPipelineDumper::EndPipelineDump(pHandle); - } - } - - // Update PAL pipeline create info with LLPC output - if (enableLlpc) - { - if (result == VK_SUCCESS) - { - - // Make sure that this is the same pointer we will free once the PAL pipeline is created - VK_ASSERT(pLlpcPipelineBuffer == pipelineOut.pipelineBin.pCode); - - pPipelineBinaries[DefaultDeviceIndex] = pLlpcPipelineBuffer; - pipelineBinarySizes[DefaultDeviceIndex] = pipelineOut.pipelineBin.codeSize; - } - } - else - { - result = VK_SUCCESS; - } - - // Free the memory for the LLPC/SCPC pipeline resource mappings - if (pMappingBuffer != nullptr) - { - pDevice->VkInstance()->FreeMem(pMappingBuffer); - } - - return result; } // ===================================================================================================================== @@ -242,22 +111,33 @@ VkResult ComputePipeline::Create( // Setup PAL create info from Vulkan inputs CreateInfo createInfo = {}; size_t pipelineBinarySizes[MaxPalDevices] = {}; - void* pPipelineBinaries[MaxPalDevices] = {}; - - ConvertComputePipelineInfo(pDevice, pCreateInfo, &createInfo); + const void* pPipelineBinaries[MaxPalDevices] = {}; + PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(); + PipelineCompiler::ComputePipelineCreateInfo binaryCreateInfo = {}; + VkResult result = pDefaultCompiler->ConvertComputePipelineInfo(pCreateInfo, &binaryCreateInfo); - VkResult result = CreateComputePipelineBinaries( - pDevice, - pPipelineCache, - &createInfo, - pipelineBinarySizes, - pPipelineBinaries); + for (uint32_t deviceIdx = 0; (result == VK_SUCCESS) && (deviceIdx < pDevice->NumPalDevices()); deviceIdx++) + { + result = pDevice->GetCompiler(deviceIdx)->CreateComputePipelineBinary( + pDevice, + deviceIdx, + pPipelineCache, + &binaryCreateInfo, + &pipelineBinarySizes[deviceIdx], + &pPipelineBinaries[deviceIdx]); + } if (result != VK_SUCCESS) { return result; } + if (result == VK_SUCCESS) + { + ConvertComputePipelineInfo(pDevice, pCreateInfo, &createInfo); + + } + size_t pipelineSize = 0; void* pSystemMem = nullptr; @@ -347,11 +227,11 @@ VkResult ComputePipeline::Create( { if (pPipelineBinaries[deviceIdx] != nullptr) { - { - pDevice->VkInstance()->FreeMem(pPipelineBinaries[deviceIdx]); - } + pDevice->GetCompiler(deviceIdx)->FreeComputePipelineBinary( + &binaryCreateInfo, pPipelineBinaries[deviceIdx], pipelineBinarySizes[deviceIdx]); } } + 
pDefaultCompiler->FreeComputePipelineCreateInfo(&binaryCreateInfo); // Something went wrong with creating the PAL object. Free memory and return error. if (result != VK_SUCCESS) diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index 78e6259d..93da913b 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp @@ -228,7 +228,7 @@ VkResult DescriptorPool::AllocDescriptorSets( DescriptorSet* pSet = DescriptorSet::StateFromHandle(pDescriptorSets[allocCount]); pSet->Reassign(pLayout, setGpuMemOffset, m_gpuAddressCached, m_pCpuAddressCached, m_pDevice->NumPalDevices(), - &m_internalMem, pSetAllocHandle, &pDescriptorSets[allocCount]); + pSetAllocHandle); } else { @@ -838,7 +838,6 @@ void* DescriptorGpuMemHeap::GetDescriptorSetMappedAddress( DescriptorSetHeap::DescriptorSetHeap() : m_nextFreeHandle(0), m_maxSets(0), -m_pHandles(nullptr), m_pFreeIndexStack(nullptr), m_freeIndexStackCount(0), m_pSetMemory(nullptr) @@ -859,7 +858,7 @@ VkResult DescriptorSetHeap::Init( m_maxSets = maxSets; // Allocate memory for all sets - size_t setSize = Util::Pow2Align(sizeof(DescriptorSet), VK_DEFAULT_MEM_ALIGN); + size_t setSize = SetSize(); bool oneShot = (poolUsage & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT) == 0; @@ -873,17 +872,6 @@ VkResult DescriptorSetHeap::Init( return VK_ERROR_OUT_OF_HOST_MEMORY; } - // Allocate memory for all handles - m_pHandles = reinterpret_cast(pDevice->VkInstance()->AllocMem( - sizeof(VkDescriptorSet) * maxSets, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)); - - if (m_pHandles == nullptr) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - // Allocate memory for the free index stack if (oneShot == false) //dynamic usage { @@ -908,8 +896,6 @@ VkResult DescriptorSetHeap::Init( flags.robustBufferAccess = pDevice->GetEnabledFeatures().robustBufferAccess ? 1 : 0; VK_PLACEMENT_NEW (pSetMem) DescriptorSet(pPool, index, flags); - - m_pHandles[index] = DescriptorSet::HandleFromVoidPointer(pSetMem); } return VK_SUCCESS; @@ -921,10 +907,19 @@ void DescriptorSetHeap::Destroy( Device* pDevice) { pDevice->VkInstance()->FreeMem(m_pSetMemory); - pDevice->VkInstance()->FreeMem(m_pHandles); pDevice->VkInstance()->FreeMem(m_pFreeIndexStack); } +// ===================================================================================================================== +// Compute a descriptor set handle from an index in the heap +VkDescriptorSet DescriptorSetHeap::DescriptorSetHandleFromIndex( + uint32_t idx) const +{ + void* pMem = Util::VoidPtrInc(m_pSetMemory, (SetSize() * idx)); + + return DescriptorSet::HandleFromVoidPointer(pMem); +} + // ===================================================================================================================== // Allocates a new VkDescriptorSet instance and returns a handle to it. 
bool DescriptorSetHeap::AllocSetState( @@ -933,7 +928,7 @@ bool DescriptorSetHeap::AllocSetState( // First try to allocate through free range start index since it is by far fastest if (m_nextFreeHandle < m_maxSets) { - *pSet = m_pHandles[m_nextFreeHandle++]; + *pSet = DescriptorSetHandleFromIndex(m_nextFreeHandle++); return true; } @@ -943,7 +938,7 @@ bool DescriptorSetHeap::AllocSetState( { --m_freeIndexStackCount; - *pSet = m_pHandles[m_pFreeIndexStack[m_freeIndexStackCount]]; + *pSet = DescriptorSetHandleFromIndex(m_pFreeIndexStack[m_freeIndexStackCount]); return true; } @@ -962,13 +957,14 @@ void DescriptorSetHeap::FreeSetState( { DescriptorSet* pSet = DescriptorSet::StateFromHandle(set); + // We can compute this, but a divide might be a bad idea. uint32_t heapIndex = pSet->HeapIndex(); - VK_ASSERT((heapIndex < m_maxSets) && DescriptorSet::StateFromHandle(m_pHandles[heapIndex]) == pSet); + VK_ASSERT(heapIndex < m_maxSets); #if DEBUG // Clear the descriptor set state for debugging purposes - pSet->Reassign(nullptr, 0, 0, 0, MaxPalDevices, nullptr, nullptr, nullptr); + pSet->Reset(); #endif m_pFreeIndexStack[m_freeIndexStackCount++] = heapIndex; @@ -987,14 +983,14 @@ void DescriptorSetHeap::Reset() #if DEBUG // Clear the descriptor set states for debugging purposes - size_t setSize = Util::Pow2Align(sizeof(DescriptorSet), VK_DEFAULT_MEM_ALIGN); + size_t setSize = SetSize(); for (uint32_t index = 0; index < m_maxSets; ++index) { VkDescriptorSet setHandle = DescriptorSet::HandleFromVoidPointer(Util::VoidPtrInc(m_pSetMemory, index * setSize)); - DescriptorSet::ObjectFromHandle(setHandle)->Reassign(nullptr, 0, 0, 0, MaxPalDevices, nullptr, nullptr, nullptr); + DescriptorSet::ObjectFromHandle(setHandle)->Reset(); } #endif } diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index d00dc160..5be1f8c0 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp @@ -77,51 +77,36 @@ void DescriptorSet::Reassign( Pal::gpusize* gpuBaseAddress, uint32_t** cpuBaseAddress, uint32_t numPalDevices, - const InternalMemory* const pInternalMem, - void* pAllocHandle, - VkDescriptorSet* pHandle) + void* pAllocHandle) { m_pLayout = pLayout; m_pAllocHandle = pAllocHandle; - if (pInternalMem != nullptr) + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) { - for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) - { - m_gpuAddress[deviceIdx] = gpuBaseAddress[deviceIdx] + gpuMemOffset; - } + m_gpuAddress[deviceIdx] = gpuBaseAddress[deviceIdx] + gpuMemOffset; + + // When memory is assigned to this descriptor set let's cache its mapped CPU address as we anyways use + // persistent mapped memory for descriptor pools. + m_pCpuAddress[deviceIdx] = + static_cast(Util::VoidPtrInc(cpuBaseAddress[deviceIdx], static_cast(gpuMemOffset))); + VK_ASSERT(Util::IsPow2Aligned(reinterpret_cast(m_pCpuAddress[deviceIdx]), sizeof(uint32_t))); } - if (pHandle != nullptr) - { - if (pInternalMem != nullptr) - { - // When memory is assigned to this descriptor set let's cache its mapped CPU address as we anyways use - // persistent mapped memory for descriptor pools. 
- for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) - { - m_pCpuAddress[deviceIdx] = static_cast(Util::VoidPtrInc(cpuBaseAddress[deviceIdx], static_cast(gpuMemOffset))); - VK_ASSERT(Util::IsPow2Aligned(reinterpret_cast(m_pCpuAddress[deviceIdx]), sizeof(uint32_t))); - } + // In this case we also have to copy the immutable sampler data from the descriptor set layout to the + // descriptor set's appropriate memory locations. + InitImmutableDescriptors(pLayout, numPalDevices); - // In this case we also have to copy the immutable sampler data from the descriptor set layout to the - // descriptor set's appropriate memory locations. - InitImmutableDescriptors(pLayout, numPalDevices); - } - else - { - // This path can only be hit if the set doesn't need GPU memory - // i.e. it doesn't have static section and fmask section data - VK_ASSERT((pLayout->Info().sta.dwSize + pLayout->Info().fmask.dwSize) == 0); +} - memset(m_pCpuAddress, 0, sizeof(m_pCpuAddress[0]) * numPalDevices); - } - } - else - { - memset(m_pCpuAddress, 0, sizeof(m_pCpuAddress[0]) * numPalDevices); - } +// ===================================================================================================================== +// Resets a DescriptorSet to an intial state +void DescriptorSet::Reset() +{ + m_pLayout = nullptr; + m_pAllocHandle = nullptr; + memset(m_pCpuAddress, 0, sizeof(m_pCpuAddress)); } // ===================================================================================================================== @@ -132,57 +117,62 @@ void DescriptorSet::InitImmutableDescriptors( { VK_ASSERT(m_pLayout == pLayout); - const size_t imageDescDwSize = pLayout->VkDevice()->GetProperties().descriptorSizes.imageView / sizeof(uint32_t); - const size_t samplerDescSize = pLayout->VkDevice()->GetProperties().descriptorSizes.sampler; - uint32_t immutableSamplersLeft = pLayout->Info().imm.numImmutableSamplers; - uint32_t binding = 0; - - uint32_t* pSrcData = pLayout->Info().imm.pImmutableSamplerData; - while (immutableSamplersLeft > 0) + if (immutableSamplersLeft > 0) { - const DescriptorSetLayout::BindingInfo& bindingInfo = pLayout->Binding(binding); - uint32_t desCount = bindingInfo.info.descriptorCount; + const size_t imageDescDwSize = pLayout->VkDevice()->GetProperties().descriptorSizes.imageView / sizeof(uint32_t); + const size_t samplerDescSize = pLayout->VkDevice()->GetProperties().descriptorSizes.sampler; + + uint32_t binding = 0; + + uint32_t* pSrcData = pLayout->Info().imm.pImmutableSamplerData; - if (bindingInfo.imm.dwSize > 0) + do { - for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) + const DescriptorSetLayout::BindingInfo& bindingInfo = pLayout->Binding(binding); + uint32_t desCount = bindingInfo.info.descriptorCount; + + if (bindingInfo.imm.dwSize > 0) { - if (bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) - { - // If it's a pure immutable sampler descriptor binding then we can copy all descriptors in one shot. - memcpy(m_pCpuAddress[deviceIdx] + bindingInfo.sta.dwOffset, - pSrcData + bindingInfo.imm.dwOffset, - bindingInfo.imm.dwSize * sizeof(uint32_t)); - } - else + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) { - // Otherwise, if it's a combined image sampler descriptor with immutable sampler then we have to - // copy each element individually because the source and destination strides don't match. 
- VK_ASSERT(bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); - - for (uint32_t i = 0; i < desCount; ++i) + if (bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) + { + // If it's a pure immutable sampler descriptor binding then we can copy all descriptors in one shot. + memcpy(m_pCpuAddress[deviceIdx] + bindingInfo.sta.dwOffset, + pSrcData + bindingInfo.imm.dwOffset, + bindingInfo.imm.dwSize * sizeof(uint32_t)); + } + else { - memcpy(m_pCpuAddress[deviceIdx] + bindingInfo.sta.dwOffset + - (i * bindingInfo.sta.dwArrayStride) + imageDescDwSize, - pSrcData + bindingInfo.imm.dwOffset + (i * bindingInfo.imm.dwArrayStride), - samplerDescSize); + // Otherwise, if it's a combined image sampler descriptor with immutable sampler then we have to + // copy each element individually because the source and destination strides don't match. + VK_ASSERT(bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); + + for (uint32_t i = 0; i < desCount; ++i) + { + memcpy(m_pCpuAddress[deviceIdx] + bindingInfo.sta.dwOffset + + (i * bindingInfo.sta.dwArrayStride) + imageDescDwSize, + pSrcData + bindingInfo.imm.dwOffset + (i * bindingInfo.imm.dwArrayStride), + samplerDescSize); + } } } + // Update the remaining number of immutable samplers to copy. + immutableSamplersLeft -= desCount; } - // Update the remaining number of immutable samplers to copy. - immutableSamplersLeft -= desCount; - } - binding++; + binding++; + } + while (immutableSamplersLeft > 0); } } // ===================================================================================================================== // Write sampler descriptors -VK_INLINE void DescriptorSet::WriteSamplerDescriptors( - const Device::Properties& deviceProperties, +template +void DescriptorSet::WriteSamplerDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t* pDestAddr, uint32_t count, @@ -192,7 +182,6 @@ VK_INLINE void DescriptorSet::WriteSamplerDescriptors( const VkDescriptorImageInfo* pImageInfo = pDescriptors; const size_t imageInfoStride = (descriptorStrideInBytes != 0) ? descriptorStrideInBytes : sizeof(VkDescriptorImageInfo); - const size_t samplerDescSize = deviceProperties.descriptorSizes.sampler; for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { @@ -206,8 +195,8 @@ VK_INLINE void DescriptorSet::WriteSamplerDescriptors( // ===================================================================================================================== // Write combined image-sampler descriptors -VK_INLINE void DescriptorSet::WriteImageSamplerDescriptors( - const Device::Properties& deviceProperties, +template +void DescriptorSet::WriteImageSamplerDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -218,8 +207,6 @@ VK_INLINE void DescriptorSet::WriteImageSamplerDescriptors( const VkDescriptorImageInfo* pImageInfo = pDescriptors; const size_t imageInfoStride = (descriptorStrideInBytes != 0) ? 
descriptorStrideInBytes : sizeof(VkDescriptorImageInfo); - const size_t imageDescSize = deviceProperties.descriptorSizes.imageView; - const size_t samplerDescSize = deviceProperties.descriptorSizes.sampler; for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { @@ -236,9 +223,8 @@ VK_INLINE void DescriptorSet::WriteImageSamplerDescriptors( // ===================================================================================================================== // Write image view descriptors (including input attachments) -VK_INLINE void DescriptorSet::WriteImageDescriptors( - VkDescriptorType descType, - const Device::Properties& deviceProperties, +template +void DescriptorSet::WriteImageDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -249,7 +235,6 @@ VK_INLINE void DescriptorSet::WriteImageDescriptors( const VkDescriptorImageInfo* pImageInfo = pDescriptors; const size_t imageInfoStride = (descriptorStrideInBytes != 0) ? descriptorStrideInBytes : sizeof(VkDescriptorImageInfo); - const size_t imageDescSize = deviceProperties.descriptorSizes.imageView; for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { @@ -264,8 +249,8 @@ VK_INLINE void DescriptorSet::WriteImageDescriptors( // ===================================================================================================================== // Write fmask descriptors -VK_INLINE void DescriptorSet::WriteFmaskDescriptors( - const Device* pDevice, +template +void DescriptorSet::WriteFmaskDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -276,16 +261,12 @@ VK_INLINE void DescriptorSet::WriteFmaskDescriptors( const VkDescriptorImageInfo* pImageInfo = pDescriptors; const size_t imageInfoStride = (descriptorStrideInBytes != 0) ? descriptorStrideInBytes : sizeof(VkDescriptorImageInfo); - const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; - VK_ASSERT((pDevice->GetProperties().descriptorSizes.fmaskView / sizeof(uint32_t)) == dwStride); for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { const ImageView* const pImageView = ImageView::ObjectFromHandle(pImageInfo->imageView); const void* pImageDesc = pImageView->Descriptor(pImageInfo->imageLayout, deviceIdx, 0); - VK_ASSERT(FmaskBasedMsaaReadEnabled() == true); - if (pImageView->NeedsFmaskViewSrds()) { // Copy over FMASK descriptor @@ -306,9 +287,8 @@ VK_INLINE void DescriptorSet::WriteFmaskDescriptors( // ===================================================================================================================== // Write buffer descriptors -VK_INLINE void DescriptorSet::WriteBufferDescriptors( - const Device::Properties& deviceProperties, - VkDescriptorType type, +template +void DescriptorSet::WriteBufferDescriptors( const VkBufferView* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -319,7 +299,6 @@ VK_INLINE void DescriptorSet::WriteBufferDescriptors( const VkBufferView* pBufferView = pDescriptors; const size_t bufferViewStride = (descriptorStrideInBytes != 0) ? 
descriptorStrideInBytes : sizeof(VkBufferView); - const size_t bufferDescSize = deviceProperties.descriptorSizes.bufferView; for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { @@ -333,9 +312,9 @@ VK_INLINE void DescriptorSet::WriteBufferDescriptors( // ===================================================================================================================== // Write buffer descriptors using bufferInfo field used with uniform and storage buffers -VK_INLINE void DescriptorSet::WriteBufferInfoDescriptors( +template +void DescriptorSet::WriteBufferInfoDescriptors( const Device* pDevice, - VkDescriptorType type, const VkDescriptorBufferInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -349,20 +328,14 @@ VK_INLINE void DescriptorSet::WriteBufferInfoDescriptors( Pal::BufferViewInfo info = {}; - switch (type) - { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - // Setup and create SRD for storage buffer case - info.swizzledFormat = Pal::UndefinedSwizzledFormat; - info.stride = 0; // Raw buffers have a zero byte stride - break; - default: - VK_NEVER_CALLED(); - break; - } + VK_ASSERT((type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) || + (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) || + (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER) || + (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)); + + // Setup and create SRD for storage buffer case + info.swizzledFormat = Pal::UndefinedSwizzledFormat; + info.stride = 0; // Raw buffers have a zero byte stride Pal::IDevice* pPalDevice = pDevice->PalDevice(deviceIdx); @@ -401,10 +374,10 @@ VK_INLINE void DescriptorSet::WriteBufferInfoDescriptors( // // NOTE: descriptorStrideInBytes is used for VK_KHR_descriptor_update_template's sparsely packed imageInfo, bufferInfo, // or bufferView array elements and defaults to 0, i.e. vkUpdateDescriptorSets behavior +template void DescriptorSet::WriteDescriptorSets( const Device* pDevice, uint32_t deviceIdx, - const Device::Properties& deviceProperties, uint32_t descriptorWriteCount, const VkWriteDescriptorSet* pDescriptorWrites, size_t descriptorStrideInBytes) @@ -418,11 +391,11 @@ void DescriptorSet::WriteDescriptorSets( DescriptorSet* pDestSet = DescriptorSet::ObjectFromHandle(params.dstSet); const DescriptorSetLayout::BindingInfo& destBinding = pDestSet->Layout()->Binding(params.dstBinding); - uint32_t* pDestAddr = pDestSet->CpuAddress(deviceIdx) + destBinding.sta.dwOffset - + (params.dstArrayElement * destBinding.sta.dwArrayStride); + uint32_t* pDestAddr = pDestSet->CpuAddress(deviceIdx) + + pDestSet->Layout()->GetDstStaOffset(destBinding, params.dstArrayElement); - uint32_t* pDestFmaskAddr = pDestSet->CpuAddress(deviceIdx) + pDestSet->Layout()->Info().sta.dwSize - + destBinding.fmask.dwOffset + (params.dstArrayElement * destBinding.fmask.dwArrayStride); + uint32_t* pDestFmaskAddr = pDestSet->CpuAddress(deviceIdx) + + pDestSet->Layout()->GetDstFmaskOffset(destBinding, params.dstArrayElement); // Determine whether the binding has immutable sampler descriptors. 
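The write path now leans on layout helpers (GetDstStaOffset / GetDstFmaskOffset / GetDstDynOffset) instead of repeating the offset arithmetic at every call site. Judging from the inline expressions they replace in this hunk, they presumably reduce to something like the sketch below; the struct and function names here are illustrative, not the driver's.

#include <cstdint>

struct BindingSectionSketch
{
    uint32_t dwOffset;        // binding start within its section, in DWORDs
    uint32_t dwArrayStride;   // per-array-element stride, in DWORDs
};

// Static-section destination: binding offset plus the element offset within the array.
static uint32_t DstStaOffsetSketch(const BindingSectionSketch& sta, uint32_t dstArrayElement)
{
    return sta.dwOffset + (dstArrayElement * sta.dwArrayStride);
}

// Fmask data lives after the whole static section, so its size is added on top.
static uint32_t DstFmaskOffsetSketch(uint32_t staSectionDwSize,
                                     const BindingSectionSketch& fmask,
                                     uint32_t dstArrayElement)
{
    return staSectionDwSize + fmask.dwOffset + (dstArrayElement * fmask.dwArrayStride);
}

Centralizing this arithmetic is also what lets the descriptor update templates later in the patch precompute the same offsets once at template-creation time.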
bool hasImmutableSampler = (destBinding.imm.dwSize != 0); @@ -436,8 +409,7 @@ void DescriptorSet::WriteDescriptorSets( } else { - pDestSet->WriteSamplerDescriptors( - deviceProperties, + WriteSamplerDescriptors( params.pImageInfo, pDestAddr, params.descriptorCount, @@ -451,9 +423,7 @@ void DescriptorSet::WriteDescriptorSets( { // If the sampler part of the combined image sampler is immutable then we should only update the image // descriptors, but have to make sure to still use the appropriate stride. - pDestSet->WriteImageDescriptors( - params.descriptorType, - deviceProperties, + WriteImageDescriptors( params.pImageInfo, deviceIdx, pDestAddr, @@ -463,8 +433,7 @@ void DescriptorSet::WriteDescriptorSets( } else { - pDestSet->WriteImageSamplerDescriptors( - deviceProperties, + WriteImageSamplerDescriptors( params.pImageInfo, deviceIdx, pDestAddr, @@ -475,8 +444,7 @@ void DescriptorSet::WriteDescriptorSets( if (pDestSet->FmaskBasedMsaaReadEnabled() && (destBinding.fmask.dwSize > 0)) { - pDestSet->WriteFmaskDescriptors( - pDevice, + WriteFmaskDescriptors( params.pImageInfo, deviceIdx, pDestFmaskAddr, @@ -490,9 +458,7 @@ void DescriptorSet::WriteDescriptorSets( case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - pDestSet->WriteImageDescriptors( - params.descriptorType, - deviceProperties, + WriteImageDescriptors( params.pImageInfo, deviceIdx, pDestAddr, @@ -502,8 +468,7 @@ void DescriptorSet::WriteDescriptorSets( if (pDestSet->FmaskBasedMsaaReadEnabled() && (destBinding.fmask.dwSize > 0)) { - pDestSet->WriteFmaskDescriptors( - pDevice, + pDestSet->WriteFmaskDescriptors( params.pImageInfo, deviceIdx, pDestFmaskAddr, @@ -514,51 +479,79 @@ void DescriptorSet::WriteDescriptorSets( break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - pDestSet->WriteBufferDescriptors( - deviceProperties, - params.descriptorType, + pDestSet->WriteBufferDescriptors( params.pTexelBufferView, deviceIdx, pDestAddr, params.descriptorCount, destBinding.sta.dwArrayStride, descriptorStrideInBytes); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + WriteBufferDescriptors( + params.pTexelBufferView, + deviceIdx, + pDestAddr, + params.descriptorCount, + destBinding.sta.dwArrayStride, + descriptorStrideInBytes); break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - pDestSet->WriteBufferInfoDescriptors( + WriteBufferInfoDescriptors( pDevice, - params.descriptorType, params.pBufferInfo, deviceIdx, pDestAddr, params.descriptorCount, destBinding.sta.dwArrayStride, descriptorStrideInBytes); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + WriteBufferInfoDescriptors( + pDevice, + params.pBufferInfo, + deviceIdx, + pDestAddr, + params.descriptorCount, + destBinding.sta.dwArrayStride, + descriptorStrideInBytes); break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: // We need to treat dynamic buffer descriptors specially as we store the base buffer SRDs in // client memory. // NOTE: Nuke this once we have proper support for dynamic descriptors in SC. 
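A pattern visible throughout this switch: cases that used to share a label (uniform/storage, texel/non-texel, dynamic/non-dynamic) are split so the descriptor type can travel as a compile-time template argument rather than a runtime parameter. A minimal sketch of that pattern, with made-up names:

#include <cstdint>

enum class DescTypeSketch { UniformBuffer, StorageBuffer, UniformBufferDynamic, StorageBufferDynamic };

// With the type known at compile time, the old runtime switch over it collapses
// to an assertion and straight-line SRD setup.
template <DescTypeSketch type>
static void WriteBufferInfoSketch(uint32_t* pDestAddr, uint32_t count, uint32_t dwStride)
{
    static_assert((type == DescTypeSketch::UniformBuffer)        ||
                  (type == DescTypeSketch::StorageBuffer)        ||
                  (type == DescTypeSketch::UniformBufferDynamic) ||
                  (type == DescTypeSketch::StorageBufferDynamic),
                  "unexpected descriptor type");

    for (uint32_t i = 0; i < count; ++i, pDestAddr += dwStride)
    {
        // Build a raw-buffer SRD (undefined format, zero byte stride) into pDestAddr here.
    }
}

// Example instantiation, mirroring the split case labels:
//     WriteBufferInfoSketch<DescTypeSketch::StorageBuffer>(pDestAddr, count, stride);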
- pDestAddr = pDestSet->DynamicDescriptorData() + destBinding.dyn.dwOffset - + params.dstArrayElement * destBinding.dyn.dwArrayStride; + pDestAddr = pDestSet->DynamicDescriptorData() + + pDestSet->Layout()->GetDstDynOffset(destBinding, params.dstArrayElement); - pDestSet->WriteBufferInfoDescriptors( + WriteBufferInfoDescriptors( pDevice, - params.descriptorType, params.pBufferInfo, deviceIdx, pDestAddr, params.descriptorCount, destBinding.dyn.dwArrayStride, descriptorStrideInBytes); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + // We need to treat dynamic buffer descriptors specially as we store the base buffer SRDs in + // client memory. + // NOTE: Nuke this once we have proper support for dynamic descriptors in SC. + pDestAddr = pDestSet->DynamicDescriptorData() + + pDestSet->Layout()->GetDstDynOffset(destBinding, params.dstArrayElement); + + WriteBufferInfoDescriptors( + pDevice, + params.pBufferInfo, + deviceIdx, + pDestAddr, + params.descriptorCount, + destBinding.dyn.dwArrayStride, + descriptorStrideInBytes); break; default: @@ -570,10 +563,10 @@ void DescriptorSet::WriteDescriptorSets( // ===================================================================================================================== // Copy from one descriptor set to another +template void DescriptorSet::CopyDescriptorSets( const Device* pDevice, uint32_t deviceIdx, - const Device::Properties& deviceProperties, uint32_t descriptorCopyCount, const VkCopyDescriptorSet* pDescriptorCopies) { @@ -636,7 +629,6 @@ void DescriptorSet::CopyDescriptorSets( { // If we have immutable samplers inline with the image data to copy then we have to do a per array // element copy to ensure we don't overwrite the immutable sampler data - const size_t imageDescSize = deviceProperties.descriptorSizes.imageView; for (uint32_t j = 0; j < count; ++j) { @@ -669,6 +661,85 @@ void DescriptorSet::CopyDescriptorSets( } } +// ===================================================================================================================== +template +VKAPI_ATTR void VKAPI_CALL DescriptorSet::UpdateDescriptorSets( + VkDevice device, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet* pDescriptorCopies) +{ + const Device* pDevice = ApiDevice::ObjectFromHandle(device); + + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) + { + WriteDescriptorSets( + pDevice, + deviceIdx, + descriptorWriteCount, + pDescriptorWrites); + + CopyDescriptorSets(pDevice, + deviceIdx, + descriptorCopyCount, + pDescriptorCopies); + } +} + +// ===================================================================================================================== +PFN_vkUpdateDescriptorSets DescriptorSet::GetUpdateDescriptorSetsFunc( + const Device* pDevice) +{ + PFN_vkUpdateDescriptorSets pFunc = nullptr; + + switch (pDevice->NumPalDevices()) + { + case 1: + pFunc = GetUpdateDescriptorSetsFunc<1>(pDevice); + break; + case 2: + pFunc = GetUpdateDescriptorSetsFunc<2>(pDevice); + break; + case 3: + pFunc = GetUpdateDescriptorSetsFunc<3>(pDevice); + break; + case 4: + pFunc = GetUpdateDescriptorSetsFunc<4>(pDevice); + break; + default: + break; + } + + return pFunc; +} + +// ===================================================================================================================== +template +PFN_vkUpdateDescriptorSets DescriptorSet::GetUpdateDescriptorSetsFunc( + const Device* pDevice) +{ + const size_t imageDescSize = 
pDevice->GetProperties().descriptorSizes.imageView; + const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; + const size_t bufferDescSize = pDevice->GetProperties().descriptorSizes.bufferView; + + PFN_vkUpdateDescriptorSets pFunc = nullptr; + + if ((imageDescSize == 32) && + (samplerDescSize == 16) && + (bufferDescSize == 16)) + { + pFunc = &UpdateDescriptorSets<32, 16, 16, numPalDevices>; + } + else + { + VK_NEVER_CALLED(); + pFunc = nullptr; + } + + return pFunc; +} + namespace entry { @@ -681,24 +752,108 @@ VKAPI_ATTR void VKAPI_CALL vkUpdateDescriptorSets( const VkCopyDescriptorSet* pDescriptorCopies) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - const Device::Properties& deviceProperties = pDevice->GetProperties(); - for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) - { - DescriptorSet::WriteDescriptorSets(pDevice, - deviceIdx, - deviceProperties, - descriptorWriteCount, - pDescriptorWrites); - - DescriptorSet::CopyDescriptorSets(pDevice, - deviceIdx, - deviceProperties, - descriptorCopyCount, - pDescriptorCopies); - } + PFN_vkUpdateDescriptorSets pFunc = pDevice->GetUpdateDescriptorSetsFunc(); + + (*pFunc)(device, descriptorWriteCount, pDescriptorWrites, descriptorCopyCount, pDescriptorCopies); } } // namespace entry +// ===================================================================================================================== +// Template instantiation needed for references in other files. Linux complains if we don't do this. + +template +void DescriptorSet::WriteFmaskDescriptors<32>( + const VkDescriptorImageInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteSamplerDescriptors<16>( + const VkDescriptorImageInfo* pDescriptors, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteImageSamplerDescriptors<32, 16>( + const VkDescriptorImageInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteImageDescriptors<32>( + const VkDescriptorImageInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferDescriptors<16, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER>( + const VkBufferView* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferDescriptors<16, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER>( + const VkBufferView* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferInfoDescriptors( + const Device* pDevice, + const VkDescriptorBufferInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferInfoDescriptors( + const Device* pDevice, + const VkDescriptorBufferInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferInfoDescriptors( + const Device* pDevice, + const 
VkDescriptorBufferInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferInfoDescriptors( + const Device* pDevice, + const VkDescriptorBufferInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + } // namespace vk diff --git a/icd/api/vk_descriptor_update_template.cpp b/icd/api/vk_descriptor_update_template.cpp index 3596ec5f..5177813f 100644 --- a/icd/api/vk_descriptor_update_template.cpp +++ b/icd/api/vk_descriptor_update_template.cpp @@ -39,14 +39,17 @@ namespace vk // ===================================================================================================================== VkResult DescriptorUpdateTemplate::Create( + const Device* pDevice, const VkDescriptorUpdateTemplateCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDescriptorUpdateTemplateKHR* pDescriptorUpdateTemplate) { - VkResult result = VK_SUCCESS; - const size_t apiSize = sizeof(DescriptorUpdateTemplate); - const size_t entriesSize = pCreateInfo->descriptorUpdateEntryCount * sizeof(VkDescriptorUpdateTemplateEntryKHR); - const size_t objSize = apiSize + entriesSize; + VkResult result = VK_SUCCESS; + const uint32_t numEntries = pCreateInfo->descriptorUpdateEntryCount; + const DescriptorSetLayout* pLayout = DescriptorSetLayout::ObjectFromHandle(pCreateInfo->descriptorSetLayout); + const size_t apiSize = sizeof(DescriptorUpdateTemplate); + const size_t entriesSize = numEntries * sizeof(TemplateUpdateInfo); + const size_t objSize = apiSize + entriesSize; void* pSysMem = pAllocator->pfnAllocation(pAllocator->pUserData, objSize, @@ -64,12 +67,30 @@ VkResult DescriptorUpdateTemplate::Create( // we don't support VK_KHR_push_descriptors. 
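The rework below front-loads the expensive part of template updates into Create(): each VkDescriptorUpdateTemplateEntryKHR is baked into a small per-entry record holding the resolved destination offsets, strides, and a pointer to a specialized write routine, so Update() becomes a tight loop with no per-call layout lookups. A rough sketch of that shape (field and type names are illustrative, not the driver's):

#include <cstddef>
#include <cstdint>
#include <vector>

struct TemplateEntrySketch
{
    uint32_t descriptorCount;
    size_t   srcOffset;       // where this entry's source data starts inside pData
    size_t   srcStride;       // stride between source array elements
    uint32_t dstStaOffset;    // destination offset resolved at template-creation time
    void   (*pfnUpdate)(const TemplateEntrySketch& entry, const void* pSrcData);
};

static void UpdateWithTemplateSketch(const std::vector<TemplateEntrySketch>& entries, const void* pData)
{
    for (const TemplateEntrySketch& entry : entries)
    {
        const void* pSrcData = static_cast<const uint8_t*>(pData) + entry.srcOffset;
        entry.pfnUpdate(entry, pSrcData);   // specialized per descriptor type and descriptor size
    }
}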
VK_ASSERT(pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR); - VkDescriptorUpdateTemplateEntryKHR* pEntries = static_cast( - Util::VoidPtrInc(pSysMem, apiSize)); + TemplateUpdateInfo* pEntries = static_cast(Util::VoidPtrInc(pSysMem, apiSize)); - memcpy(pEntries, pCreateInfo->pDescriptorUpdateEntries, entriesSize); + for (uint32_t ii = 0; ii < numEntries; ii++) + { + const VkDescriptorUpdateTemplateEntryKHR& srcEntry = pCreateInfo->pDescriptorUpdateEntries[ii]; + const DescriptorSetLayout::BindingInfo& dstBinding = pLayout->Binding(srcEntry.dstBinding); - VK_PLACEMENT_NEW(pSysMem) DescriptorUpdateTemplate(pEntries, pCreateInfo->descriptorUpdateEntryCount); + pEntries[ii].descriptorCount = srcEntry.descriptorCount; + pEntries[ii].srcOffset = srcEntry.offset; + pEntries[ii].srcStride = srcEntry.stride; + pEntries[ii].dstBindStaDwArrayStride = dstBinding.sta.dwArrayStride; + pEntries[ii].dstBindFmaskDwArrayStride = dstBinding.fmask.dwArrayStride; + pEntries[ii].dstBindDynDataDwArrayStride = dstBinding.dyn.dwArrayStride; + pEntries[ii].dstStaOffset = + pLayout->GetDstStaOffset(dstBinding, srcEntry.dstArrayElement); + pEntries[ii].dstFmaskOffset = + pLayout->GetDstFmaskOffset(dstBinding, srcEntry.dstArrayElement); + pEntries[ii].dstDynOffset = + pLayout->GetDstDynOffset(dstBinding, srcEntry.dstArrayElement); + pEntries[ii].pFunc = + GetUpdateEntryFunc(pDevice, srcEntry.descriptorType, dstBinding); + } + + VK_PLACEMENT_NEW(pSysMem) DescriptorUpdateTemplate(pCreateInfo->descriptorUpdateEntryCount); *pDescriptorUpdateTemplate = DescriptorUpdateTemplate::HandleFromVoidPointer(pSysMem); } @@ -77,12 +98,113 @@ VkResult DescriptorUpdateTemplate::Create( return result; } +// ===================================================================================================================== +template +DescriptorUpdateTemplate::PfnUpdateEntry DescriptorUpdateTemplate::GetUpdateEntryFunc( + const Device* pDevice, + VkDescriptorType descriptorType, + const DescriptorSetLayout::BindingInfo& dstBinding) +{ + PfnUpdateEntry pFunc = NULL; + + switch (descriptorType) + { + case VK_DESCRIPTOR_TYPE_SAMPLER: + pFunc = &UpdateEntrySampler; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + if (pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead && (dstBinding.fmask.dwSize > 0)) + { + if (dstBinding.imm.dwSize != 0) + { + pFunc = &UpdateEntryCombinedImageSampler; + } + else + { + pFunc = &UpdateEntryCombinedImageSampler; + } + } + else + { + if (dstBinding.imm.dwSize != 0) + { + pFunc = &UpdateEntryCombinedImageSampler; + } + else + { + pFunc = &UpdateEntryCombinedImageSampler; + } + } + break; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + if (pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead && (dstBinding.fmask.dwSize > 0)) + { + pFunc = &UpdateEntrySampledImage; + } + else + { + pFunc = &UpdateEntrySampledImage; + } + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + pFunc = &UpdateEntryTexelBuffer; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + pFunc = &UpdateEntryTexelBuffer; + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + pFunc = &UpdateEntryBuffer; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + pFunc = &UpdateEntryBuffer; + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + pFunc = &UpdateEntryBuffer; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + pFunc = &UpdateEntryBuffer; + break; + default: + VK_ASSERT(!"Unexpected 
descriptor type"); + break; + } + + return pFunc; +} + +// ===================================================================================================================== +DescriptorUpdateTemplate::PfnUpdateEntry DescriptorUpdateTemplate::GetUpdateEntryFunc( + const Device* pDevice, + VkDescriptorType descriptorType, + const DescriptorSetLayout::BindingInfo& dstBinding) +{ + const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; + const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; + const size_t bufferDescSize = pDevice->GetProperties().descriptorSizes.bufferView; + + DescriptorUpdateTemplate::PfnUpdateEntry pFunc = nullptr; + + if ((imageDescSize == 32) && + (samplerDescSize == 16) && + (bufferDescSize == 16)) + { + pFunc = GetUpdateEntryFunc<32, 16, 16>(pDevice, descriptorType, dstBinding); + } + else + { + VK_NEVER_CALLED(); + pFunc = nullptr; + } + + return pFunc; +} + // ===================================================================================================================== DescriptorUpdateTemplate::DescriptorUpdateTemplate( - const VkDescriptorUpdateTemplateEntryKHR* pEntries, - uint32_t numEntries) + uint32_t numEntries) : - m_pEntries(pEntries), m_numEntries(numEntries) { } @@ -106,39 +228,195 @@ VkResult DescriptorUpdateTemplate::Destroy( // ===================================================================================================================== void DescriptorUpdateTemplate::Update( - Device* pDevice, + const Device* pDevice, uint32_t deviceIdx, VkDescriptorSet descriptorSet, const void* pData) { - const Device::Properties& deviceProperties = pDevice->GetProperties(); + auto pEntries = GetEntries(); - // Use descriptor write structure as params to share write code path with vkUpdateDescriptorSets. - VkWriteDescriptorSet descriptorWrite; + for (uint32_t i = 0; i < m_numEntries; ++i) + { + const void* pDescriptorInfo = Util::VoidPtrInc(pData, pEntries[i].srcOffset); - descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - descriptorWrite.pNext = nullptr; - descriptorWrite.dstSet = descriptorSet; + pEntries[i].pFunc(pDevice, descriptorSet, deviceIdx, pDescriptorInfo, pEntries[i]); + } +} - for (uint32_t i = 0; i < m_numEntries; ++i) +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntryCombinedImageSampler( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkDescriptorImageInfo* pImageInfo = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + + if (immutable) { - const void* pDescriptorInfo = Util::VoidPtrInc(pData, m_pEntries[i].offset); - - descriptorWrite.dstBinding = m_pEntries[i].dstBinding; - descriptorWrite.dstArrayElement = m_pEntries[i].dstArrayElement; - descriptorWrite.descriptorCount = m_pEntries[i].descriptorCount; - descriptorWrite.descriptorType = m_pEntries[i].descriptorType; - // Decide which descriptor info is relevant later using descriptorType. 
- descriptorWrite.pImageInfo = static_cast(pDescriptorInfo); - descriptorWrite.pBufferInfo = static_cast(pDescriptorInfo); - descriptorWrite.pTexelBufferView = static_cast(pDescriptorInfo); - - DescriptorSet::WriteDescriptorSets(pDevice, - deviceIdx, - deviceProperties, - 1, - &descriptorWrite, - m_pEntries[i].stride); + // If the sampler part of the combined image sampler is immutable then we should only update the image + // descriptors, but have to make sure to still use the appropriate stride. + DescriptorSet::WriteImageDescriptors( + pImageInfo, + deviceIdx, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); + } + else + { + DescriptorSet::WriteImageSamplerDescriptors( + pImageInfo, + deviceIdx, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); + } + + if (updateFmask) + { + uint32_t* pDestFmaskAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstFmaskOffset; + + DescriptorSet::WriteFmaskDescriptors( + pImageInfo, + deviceIdx, + pDestFmaskAddr, + entry.descriptorCount, + entry.dstBindFmaskDwArrayStride, + entry.srcStride); + } +} + +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntryTexelBuffer( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkBufferView* pTexelBufferView = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + + DescriptorSet::WriteBufferDescriptors( + pTexelBufferView, + deviceIdx, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); +} + +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntryBuffer( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkDescriptorBufferInfo* pBufferInfo = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr; + uint32_t stride; + + if ((descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) || + (descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)) + { + // We need to treat dynamic buffer descriptors specially as we store the base buffer SRDs in + // client memory. + // NOTE: Nuke this once we have proper support for dynamic descriptors in SC. 
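Worth noting: descriptorType is a template parameter in the new UpdateEntryBuffer, so the dynamic-versus-static branch below is decided at compile time and the untaken side is dead code the optimizer drops. On a C++17 compiler the same intent could be spelled out with if constexpr; this is a sketch only, the driver keeps a plain if:

#include <cstdint>
#include <vulkan/vulkan.h>

template <VkDescriptorType descriptorType>
static uint32_t* SelectDestSketch(uint32_t* pDynamicData, uint32_t* pStaticData,
                                  uint32_t dynOffset, uint32_t staOffset)
{
    if constexpr ((descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) ||
                  (descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC))
    {
        return pDynamicData + dynOffset;   // dynamic buffer SRDs live in client memory
    }
    else
    {
        return pStaticData + staOffset;    // everything else targets the set's static section
    }
}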
+ pDestAddr = pDstSet->DynamicDescriptorData() + entry.dstDynOffset; + stride = entry.dstBindDynDataDwArrayStride; + } + else + { + pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + stride = entry.dstBindStaDwArrayStride; + } + + DescriptorSet::WriteBufferInfoDescriptors( + pDevice, + pBufferInfo, + deviceIdx, + pDestAddr, + entry.descriptorCount, + stride, + entry.srcStride); +} + +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntrySampler( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkDescriptorImageInfo* pImageInfo = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + + DescriptorSet::WriteSamplerDescriptors( + pImageInfo, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); +} + +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntrySampledImage( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkDescriptorImageInfo* pImageInfo = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + + DescriptorSet::WriteImageDescriptors( + pImageInfo, + deviceIdx, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); + + if (updateFmask) + { + uint32_t* pDestFmaskAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstFmaskOffset; + + DescriptorSet::WriteFmaskDescriptors( + pImageInfo, + deviceIdx, + pDestFmaskAddr, + entry.descriptorCount, + entry.dstBindFmaskDwArrayStride, + entry.srcStride); } } @@ -170,10 +448,13 @@ VKAPI_ATTR void VKAPI_CALL vkUpdateDescriptorSetWithTemplateKHR( Device* pDevice = ApiDevice::ObjectFromHandle(device); DescriptorUpdateTemplate* pTemplate = DescriptorUpdateTemplate::ObjectFromHandle(descriptorUpdateTemplate); - for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); ++deviceIdx) + uint32_t deviceIdx = 0; + do { pTemplate->Update(pDevice, deviceIdx, descriptorSet, pData); + deviceIdx++; } + while (deviceIdx < pDevice->NumPalDevices()); } } // namespace entry diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 6b03c379..fd487ddb 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -36,6 +36,7 @@ #include "include/vk_buffer.h" #include "include/vk_buffer_view.h" #include "include/vk_descriptor_pool.h" +#include "include/vk_descriptor_set.h" #include "include/vk_descriptor_set_layout.h" #include "include/vk_descriptor_update_template.h" #include "include/vk_device.h" @@ -178,7 +179,8 @@ Device::Device( m_renderStateCache(this), m_enabledExtensions(enabledExtensions), m_pSqttMgr(nullptr), - m_pipelineCacheCount(0) + m_pipelineCacheCount(0), + m_pfnUpdateDescriptorSets(nullptr) { memcpy(m_pPhysicalDevices, pPhysicalDevices, sizeof(pPhysicalDevices[DefaultDeviceIndex]) * palDeviceCount); memcpy(m_pPalDevices, pPalDevices, sizeof(pPalDevices[0]) * palDeviceCount); @@ -203,8 +205,6 @@ Device::Device( m_allocatedCount = 0; 
m_maxAllocations = pPhysicalDevices[DefaultDeviceIndex]->GetLimits().maxMemoryAllocationCount; - memset(m_pLlpcCompiler, 0, sizeof(m_pLlpcCompiler)); - } // ===================================================================================================================== @@ -268,30 +268,6 @@ VkResult Device::Create( DeviceExtensions::Enabled enabledDeviceExtensions; -#ifdef ICD_VULKAN_1_1 - // Implicitly enable device extensions that are core in the API version - if (pPhysicalDevice->VkInstance()->GetAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) - { - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_16BIT_STORAGE); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_BIND_MEMORY2); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_DEDICATED_ALLOCATION); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_DESCRIPTOR_UPDATE_TEMPLATE); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_DEVICE_GROUP); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_EXTERNAL_FENCE); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_EXTERNAL_MEMORY); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_EXTERNAL_SEMAPHORE); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_MAINTENANCE1); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_MAINTENANCE2); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_MAINTENANCE3); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_MULTIVIEW); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_RELAXED_BLOCK_LAYOUT); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_SHADER_DRAW_PARAMETERS); -// enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_SAMPLER_YCBCR_CONVERSION); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_STORAGE_BUFFER_STORAGE_CLASS); - } -#endif - VK_ASSERT(pCreateInfo != nullptr); // Make sure the caller only requests extensions we actually support. @@ -444,7 +420,7 @@ VkResult Device::Create( vkResult = VK_ERROR_OUT_OF_HOST_MEMORY; - if (pMemory != nullptr) + if ((pCreateInfo != nullptr) && (pMemory != nullptr)) { vkResult = PalToVkResult(palResult); @@ -627,7 +603,7 @@ VkResult Device::Initialize( uint8_t* pPalQueueMemory) { // Initialize the internal memory manager - VkResult result = m_internalMemMgr.Init(); + VkResult result = m_internalMemMgr.Init(); // Initialize the render state cache if (result == VK_SUCCESS) @@ -737,19 +713,6 @@ VkResult Device::Initialize( } } - if (result == VK_SUCCESS) - { - for (uint32_t i = 0; i < m_palDeviceCount; ++i) - { - result = CreateLlpcCompiler(i); - - if (result != VK_SUCCESS) - { - break; - } - } - } - if (result == VK_SUCCESS) { result = CreateLlpcInternalPipelines(); @@ -798,9 +761,21 @@ VkResult Device::Initialize( } #endif + if (result == VK_SUCCESS) + { + InitEntryPointFuncs(); + } + return result; } +// ===================================================================================================================== +// Initialize the entry point functions for paths known at device init time +void Device::InitEntryPointFuncs() +{ + m_pfnUpdateDescriptorSets = DescriptorSet::GetUpdateDescriptorSetsFunc(this); +} + // ===================================================================================================================== // Initialize the specified sample pattern palette with default values. 
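InitEntryPointFuncs, added just above, resolves per-device specialization once at device creation: GetUpdateDescriptorSetsFunc switches on NumPalDevices() and on the hardware descriptor sizes, and the cached pointer is what vkUpdateDescriptorSets ultimately calls. A stripped-down sketch of that selection follows; the names are hypothetical and only the 32/16/16 descriptor-size layout is wired up, as in the patch.

#include <cstddef>
#include <cstdint>

using PfnUpdateSketch = void (*)();

template <size_t imageDescSize, size_t samplerDescSize, size_t bufferDescSize, uint32_t numPalDevices>
static void UpdateDescriptorSetsSketch() { /* specialized write/copy loops */ }

template <uint32_t numPalDevices>
static PfnUpdateSketch SelectBySizesSketch(size_t imageDescSize, size_t samplerDescSize, size_t bufferDescSize)
{
    // Only the descriptor-size combination the patch expects is supported.
    if ((imageDescSize == 32) && (samplerDescSize == 16) && (bufferDescSize == 16))
    {
        return &UpdateDescriptorSetsSketch<32, 16, 16, numPalDevices>;
    }
    return nullptr;
}

static PfnUpdateSketch SelectByDeviceCountSketch(uint32_t numPalDevices,
                                                 size_t imageDescSize,
                                                 size_t samplerDescSize,
                                                 size_t bufferDescSize)
{
    switch (numPalDevices)
    {
    case 1:  return SelectBySizesSketch<1>(imageDescSize, samplerDescSize, bufferDescSize);
    case 2:  return SelectBySizesSketch<2>(imageDescSize, samplerDescSize, bufferDescSize);
    case 3:  return SelectBySizesSketch<3>(imageDescSize, samplerDescSize, bufferDescSize);
    case 4:  return SelectBySizesSketch<4>(imageDescSize, samplerDescSize, bufferDescSize);
    default: return nullptr;
    }
}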
void Device::InitSamplePatternPalette( @@ -1012,15 +987,6 @@ VkResult Device::Destroy(const VkAllocationCallbacks* pAllocator) DestroyInternalPipelines(); - for (uint32_t i = 0; i < MaxPalDevices; ++i) - { - if (m_pLlpcCompiler[i] != nullptr) - { - m_pLlpcCompiler[i]->Destroy(); - m_pLlpcCompiler[i] = nullptr; - } - } - if (m_settings.useSharedCmdAllocator) { for (uint32_t deviceIdx = 0; deviceIdx < NumPalDevices(); deviceIdx++) @@ -1071,7 +1037,7 @@ VkResult Device::CreateLlpcInternalComputePipeline( shaderInfo.shaderBin.pCode = pCode; shaderInfo.shaderBin.codeSize = codeByteSize; - llpcResult = GetLlpcCompiler()->BuildShaderModule(&shaderInfo, &shaderOut); + llpcResult = GetCompiler()->GetLlpcCompiler()->BuildShaderModule(&shaderInfo, &shaderOut); if ((llpcResult != Llpc::Result::Success) && (llpcResult != Llpc::Result::Delayed)) { result = VK_ERROR_INITIALIZATION_FAILED; @@ -1091,7 +1057,7 @@ VkResult Device::CreateLlpcInternalComputePipeline( pShaderInfo->pEntryTarget = "main"; pShaderInfo->pUserDataNodes = pUserDataNodes; pShaderInfo->userDataNodeCount = numUserDataNodes; - llpcResult = GetLlpcCompiler()->BuildComputePipeline(&pipelineBuildInfo, &pipelineOut); + llpcResult = GetCompiler()->GetLlpcCompiler()->BuildComputePipeline(&pipelineBuildInfo, &pipelineOut); if (llpcResult != Llpc::Result::Success) { result = VK_ERROR_INITIALIZATION_FAILED; @@ -1413,7 +1379,7 @@ VkResult Device::CreateDescriptorUpdateTemplate( const VkAllocationCallbacks* pAllocator, VkDescriptorUpdateTemplateKHR* pDescriptorUpdateTemplate) { - return DescriptorUpdateTemplate::Create(pCreateInfo, pAllocator, pDescriptorUpdateTemplate); + return DescriptorUpdateTemplate::Create(this, pCreateInfo, pAllocator, pDescriptorUpdateTemplate); } // ===================================================================================================================== @@ -1830,6 +1796,8 @@ VkResult Device::BindImageMemory( return VK_SUCCESS; } +// ===================================================================================================================== + // ===================================================================================================================== VkResult Device::CreateSampler( const VkSamplerCreateInfo* pCreateInfo, @@ -1951,216 +1919,6 @@ VkDeviceSize Device::GetMemoryBaseAddrAlignment( return minAlignment; } -// ===================================================================================================================== -// Create LLPC compiler handle -VkResult Device::CreateLlpcCompiler( - int32_t deviceIdx) // Device index -{ - const uint32_t OptionBufferSize = 4096; - const uint32_t MaxLlpcOptions = 32; - Llpc::GfxIpVersion gfxIp = {}; - Llpc::ICompiler* pCompiler = nullptr; - - // Initialzie GfxIpVersion according to PAL gfxLevel - Pal::DeviceProperties info; - PalDevice(deviceIdx)->GetProperties(&info); - - switch (info.gfxLevel) - { - case Pal::GfxIpLevel::GfxIp6: - gfxIp.major = 6; - gfxIp.minor = 0; - break; - case Pal::GfxIpLevel::GfxIp7: - gfxIp.major = 7; - gfxIp.minor = 0; - break; - case Pal::GfxIpLevel::GfxIp8: - gfxIp.major = 8; - gfxIp.minor = 0; - break; - case Pal::GfxIpLevel::GfxIp8_1: - gfxIp.major = 8; - gfxIp.minor = 1; - break; - case Pal::GfxIpLevel::GfxIp9: - gfxIp.major = 9; - gfxIp.minor = 0; - break; - default: - VK_NEVER_CALLED(); - break; - } - - gfxIp.stepping = info.gfxStepping; - - // Get the executable name and path - char executableNameBuffer[PATH_MAX]; - - char* pExecutablePtr; - Pal::Result palResult = 
Util::GetExecutableName(&executableNameBuffer[0], - &pExecutablePtr, - sizeof(executableNameBuffer)); - VK_ASSERT(palResult == Pal::Result::Success); - - // Initialize LLPC options according to runtime settings - auto settings = GetRuntimeSettings(); - const char* llpcOptions[MaxLlpcOptions] = {}; - char optionBuffers[OptionBufferSize] = {}; - - char* pOptionBuffer = &optionBuffers[0]; - size_t bufSize = OptionBufferSize; - int optionLength = 0; - uint32_t numOptions = 0; - // Identify for Icd and stanalone compiler - llpcOptions[numOptions++] = Llpc::VkIcdName; - - // Generate ELF binary, not assembly text - llpcOptions[numOptions++] = "-filetype=obj"; - - // LLPC log options - llpcOptions[numOptions++] = (settings.enableLog & 1) ? "-enable-errs=1" : "-enable-errs=0"; - llpcOptions[numOptions++] = (settings.enableLog & 2) ? "-enable-outs=1" : "-enable-outs=0"; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-log-file-outs=%s", settings.logFileName); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-log-file-dbgs=%s", settings.debugLogFileName); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - // LLPC debug options - if (settings.enableDebug) - { - llpcOptions[numOptions++] = "-debug"; - } - - if (settings.llpcOptions[0] != '\0') - { - const char* pOptions = &settings.llpcOptions[0]; - VK_ASSERT(pOptions[0] == '-'); - - // Split options - while (pOptions) - { - const char* pNext = strchr(pOptions, ' '); - if (pNext) - { - // Copy options to option buffer - optionLength = static_cast(pNext - pOptions); - memcpy(pOptionBuffer, pOptions, optionLength); - pOptionBuffer[optionLength] = 0; - - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += (optionLength + 1); - - bufSize -= (optionLength + 1); - pOptions = strchr(pOptions + optionLength, '-'); - } - else - { - // Use pOptions directly for last option - llpcOptions[numOptions++] = pOptions; - pOptions = nullptr; - } - } - } - - // LLPC pipeline dump options - if (settings.enablePipelineDump) - { - llpcOptions[numOptions++] = "-enable-pipeline-dump"; - } - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-pipeline-dump-dir=%s", settings.pipelineDumpDir); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - if (settings.enableLlpc == LlpcModeAutoFallback) - { - llpcOptions[numOptions++] = "-disable-WIP-features=1"; - } - - // NOTE: For testing consistency, these options should be kept the same as those of - // "amdllpc" (Init()). 
- llpcOptions[numOptions++] = "-pragma-unroll-threshold=4096"; - llpcOptions[numOptions++] = "-unroll-allow-partial"; - llpcOptions[numOptions++] = "-lower-dyn-index"; - llpcOptions[numOptions++] = "-simplifycfg-sink-common=false"; - llpcOptions[numOptions++] = "-amdgpu-vgpr-index-mode"; // force VGPR indexing on GFX8 - - ShaderCacheMode shaderCacheMode = m_settings.shaderCacheMode; -#ifdef ICD_BUILD_APPPROFILE - const AppProfile appProfile = GetAppProfile(); - if ((appProfile == AppProfile::Talos) || - (appProfile == AppProfile::MadMax) || - (appProfile == AppProfile::SeriousSamFusion)) - { - llpcOptions[numOptions++] = "-enable-si-scheduler"; - } - - // Force enable cache to disk to improve user experience - if ((shaderCacheMode == ShaderCacheEnableRuntimeOnly) && - ((appProfile == AppProfile::MadMax) || - (appProfile == AppProfile::SeriousSamFusion) || - (appProfile == AppProfile::F1_2017))) - { - // Force to use internal disk cache. - shaderCacheMode = ShaderCacheForceInternalCacheOnDisk; - } -#endif - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-executable-name=%s", pExecutablePtr); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-cache-mode=%d", shaderCacheMode); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - if (settings.shaderReplaceMode != 0) - { - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-mode=%d", settings.shaderReplaceMode); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-dir=%s", settings.shaderReplaceDir); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-pipeline-hashes=%s", settings.shaderReplacePipelineHashes); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - } - - VK_ASSERT(numOptions <= MaxLlpcOptions); - - // Create LLPC compiler - Llpc::Result llpcResult = Llpc::ICompiler::Create(gfxIp, numOptions, llpcOptions, &pCompiler); - VK_ASSERT(llpcResult == Llpc::Result::Success); - - m_pLlpcCompiler[deviceIdx] = pCompiler; - - return (llpcResult == Llpc::Result::Success) ? VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED; -} - // ===================================================================================================================== // Gets default pipeline cache expected entry count based on current existing pipeline cache count. 
uint32_t Device::GetPipelineCacheExpectedEntryCount() @@ -2820,6 +2578,8 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorSetLayoutSupportKHR( } #endif +// ===================================================================================================================== + // ===================================================================================================================== VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryHostPointerPropertiesEXT( VkDevice device, diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index c3fa0b15..54ff0dd6 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -103,7 +103,8 @@ void* GetIcdProcAddr( pFunc = pEntry->pFunc; break; } - case vk::secure::entry::ENTRY_POINT_CORE: + case vk::secure::entry::ENTRY_POINT_CORE_INSTANCE: + case vk::secure::entry::ENTRY_POINT_CORE_DEVICE: { // Check version requested against the required version. if ((pInstance != nullptr) && (pInstance->GetAPIVersion() >= pEntry->conditionValue)) @@ -591,7 +592,7 @@ VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetDeviceProcAddr( return vk::entry::vkGetDeviceProcAddr(device, pName); } -} // namespace vk +} // extern "C" struct VK_LAYER_DISPATCH_TABLE { diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 14094dfd..23b24877 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -101,7 +101,6 @@ void GraphicsPipeline::BuildRasterizationState( }; // By default rasterization is disabled, unless rasterization creation info is present - pInfo->pipelineLlpc.rsState.rasterizerDiscardEnable = true; const VkPhysicalDeviceLimits& limits = pDevice->VkPhysicalDevice()->GetLimits(); @@ -116,9 +115,6 @@ void GraphicsPipeline::BuildRasterizationState( { pInfo->pipeline.rsState.depthClampDisable = (pRs->depthClampEnable == VK_FALSE); // When depth clamping is enabled, depth clipping should be disabled, and vice versa - pInfo->pipelineLlpc.vpState.depthClipEnable = (pRs->depthClampEnable == VK_FALSE); - pInfo->pipelineLlpc.rsState.rasterizerDiscardEnable = (pRs->rasterizerDiscardEnable != VK_FALSE); - pInfo->immedInfo.triangleRasterState.fillMode = VkToPalFillMode(pRs->polygonMode); pInfo->immedInfo.triangleRasterState.cullMode = VkToPalCullMode(pRs->cullMode); pInfo->immedInfo.triangleRasterState.frontFace = VkToPalFaceOrientation(pRs->frontFace); @@ -208,22 +204,20 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( if (pGraphicsPipelineCreateInfo != nullptr) { - pInfo->activeStageCount = pGraphicsPipelineCreateInfo->stageCount; - pInfo->pActiveStages = pGraphicsPipelineCreateInfo->pStages; - + for (uint32_t i = 0; i < pGraphicsPipelineCreateInfo->stageCount; ++i) + { + pInfo->activeStages = static_cast<VkShaderStageFlagBits>( + pInfo->activeStages | pGraphicsPipelineCreateInfo->pStages[i].stage); + } VK_IGNORE(pGraphicsPipelineCreateInfo->flags & VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT); pRenderPass = RenderPass::ObjectFromHandle(pGraphicsPipelineCreateInfo->renderPass); - pInfo->isMultiviewEnabled = pRenderPass->IsMultiviewEnabled(); - if (pGraphicsPipelineCreateInfo->layout != VK_NULL_HANDLE) { pInfo->pLayout = PipelineLayout::ObjectFromHandle(pGraphicsPipelineCreateInfo->layout); } - pInfo->pVertexInput = pGraphicsPipelineCreateInfo->pVertexInputState; - const VkPipelineInputAssemblyStateCreateInfo* pIa = pGraphicsPipelineCreateInfo->pInputAssemblyState; // According to the spec this should never be null @@ -239,36 +233,17 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo(
&pInfo->pipeline.iaState.topologyInfo.primitiveType, &pInfo->pipeline.iaState.topologyInfo.adjacency); - pInfo->pipelineLlpc.iaState.topology = pIa->topology; - pInfo->pipelineLlpc.iaState.disableVertexReuse = false; - - EXTRACT_VK_STRUCTURES_1( + EXTRACT_VK_STRUCTURES_0( Tess, PipelineTessellationStateCreateInfo, - PipelineTessellationDomainOriginStateCreateInfoKHR, pGraphicsPipelineCreateInfo->pTessellationState, - PIPELINE_TESSELLATION_STATE_CREATE_INFO, - PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO_KHR) + PIPELINE_TESSELLATION_STATE_CREATE_INFO) if (pPipelineTessellationStateCreateInfo != nullptr) { pInfo->pipeline.iaState.topologyInfo.patchControlPoints = pPipelineTessellationStateCreateInfo->patchControlPoints; - - pInfo->pipelineLlpc.iaState.patchControlPoints = pInfo->pipeline.iaState.topologyInfo.patchControlPoints; } - if (pPipelineTessellationDomainOriginStateCreateInfoKHR) - { - // Vulkan 1.0 incorrectly specified the tessellation u,v coordinate origin as lower left even though - // framebuffer and image coordinate origins are in the upper left. This has since been fixed, but - // an extension exists to use the previous behavior. Doing so with flat shading would likely appear - // incorrect, but Vulkan specifies that the provoking vertex is undefined when tessellation is active. - if (pPipelineTessellationDomainOriginStateCreateInfoKHR->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT_KHR) - { - pInfo->pipelineLlpc.iaState.switchWinding = true; - } - } - pInfo->immedInfo.staticStateMask = 0; const VkPipelineDynamicStateCreateInfo* pDy = pGraphicsPipelineCreateInfo->pDynamicState; @@ -313,12 +288,16 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( { VK_ASSERT(pVp->pViewports != nullptr); + const bool khrMaintenance1 = + ((pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + pDevice->IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1)); + for (uint32_t i = 0; i < pVp->viewportCount; ++i) { VkToPalViewport( pVp->pViewports[i], i, - pDevice->IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1), + khrMaintenance1, &pInfo->immedInfo.viewportParams); } @@ -410,14 +389,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( pInfo->msaa.pixelShaderSamples = 1; } - pInfo->pipelineLlpc.rsState.numSamples = rasterizationSampleCount; - - // NOTE: The sample pattern index here is actually the offset of sample position pair. This is - // different from the field of creation info of image view. For image view, the sample pattern - // index is really table index of the sample pattern. 
- pInfo->pipelineLlpc.rsState.samplePatternIdx = - Device::GetDefaultSamplePatternIndex(subpassCoverageSampleCount) * Pal::MaxMsaaRasterizerSamples; - pInfo->msaa.depthStencilSamples = subpassDepthSampleCount; pInfo->msaa.shaderExportMaskSamples = subpassCoverageSampleCount; pInfo->msaa.sampleMask = (pMs->pSampleMask != nullptr) @@ -476,7 +447,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( } pInfo->pipeline.cbState.alphaToCoverageEnable = (pMs->alphaToCoverageEnable == VK_TRUE); - pInfo->pipelineLlpc.cbState.alphaToCoverageEnable = (pMs->alphaToCoverageEnable == VK_TRUE); } const VkPipelineColorBlendStateCreateInfo* pCb = pGraphicsPipelineCreateInfo->pColorBlendState; @@ -500,7 +470,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( const VkPipelineColorBlendAttachmentState& src = pCb->pAttachments[i]; auto pCbDst = &pInfo->pipeline.cbState.target[i]; - auto pLlpcCbDst = &pInfo->pipelineLlpc.cbState.target[i]; auto pBlendDst = &pInfo->blend.targets[i]; if (pRenderPass) @@ -515,20 +484,10 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( if (pCbDst->swizzledFormat.format != Pal::ChNumFormat::Undefined) { pCbDst->channelWriteMask = src.colorWriteMask; - pLlpcCbDst->format = cbFormat[i]; - pLlpcCbDst->blendEnable = (src.blendEnable == VK_TRUE); - pLlpcCbDst->blendSrcAlphaToColor = IsSrcAlphaUsedInBlend(src.srcAlphaBlendFactor) || - IsSrcAlphaUsedInBlend(src.dstAlphaBlendFactor) || - IsSrcAlphaUsedInBlend(src.srcColorBlendFactor) || - IsSrcAlphaUsedInBlend(src.dstColorBlendFactor); - blendingEnabled |= pLlpcCbDst->blendEnable; - } - else - { - pLlpcCbDst->blendEnable = false; + blendingEnabled |= (src.blendEnable == VK_TRUE); } - pBlendDst->blendEnable = pLlpcCbDst->blendEnable; + pBlendDst->blendEnable = (src.blendEnable == VK_TRUE); pBlendDst->srcBlendColor = VkToPalBlend(src.srcColorBlendFactor); pBlendDst->dstBlendColor = VkToPalBlend(src.dstColorBlendFactor); pBlendDst->blendFuncColor = VkToPalBlendFunc(src.colorBlendOp); @@ -544,7 +503,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( } pInfo->pipeline.cbState.dualSourceBlendEnable = dualSourceBlend; - pInfo->pipelineLlpc.cbState.dualSourceBlendEnable = dualSourceBlend; if (blendingEnabled == true && dynamicStateFlags[VK_DYNAMIC_STATE_BLEND_CONSTANTS] == false) { @@ -650,147 +608,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( } -// ===================================================================================================================== -// Creates a graphics pipeline binary for each PAL device -VkResult GraphicsPipeline::CreateGraphicsPipelineBinaries( - Device* pDevice, - PipelineCache* pPipelineCache, - CreateInfo* pInfo, - VbBindingInfo* pVbInfo, - size_t pipelineBinarySizes[MaxPalDevices], - void* pPipelineBinaries[MaxPalDevices]) -{ - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - - VkResult result = VK_SUCCESS; - void* pMappingBuffer = nullptr; - - // Allocate space to create the LLPC/SCPC pipeline resource mappings - if (pInfo->pLayout != nullptr) - { - size_t tempBufferSize = pInfo->pLayout->GetPipelineInfo()->tempBufferSize; - - // Allocate the temp buffer - if (tempBufferSize > 0) - { - pMappingBuffer = pDevice->VkInstance()->AllocMem( - tempBufferSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - - if (pMappingBuffer == nullptr) - { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - } - } - } - - bool enableLlpc = false; - - if (result == VK_SUCCESS) - { - // Build the LLPC pipeline - Llpc::GraphicsPipelineBuildOut pipelineOut = {}; - void* 
pLlpcPipelineBuffer = nullptr; - { - Llpc::PipelineShaderInfo* shaderInfos[] = - { - &pInfo->pipelineLlpc.vs, - &pInfo->pipelineLlpc.tcs, - &pInfo->pipelineLlpc.tes, - &pInfo->pipelineLlpc.gs, - &pInfo->pipelineLlpc.fs - }; - - // Apply patches - pInfo->pipelineLlpc.pInstance = pDevice->VkPhysicalDevice()->VkInstance(); - pInfo->pipelineLlpc.pfnOutputAlloc = AllocateShaderOutput; - pInfo->pipelineLlpc.pUserData = &pLlpcPipelineBuffer; - pInfo->pipelineLlpc.pVertexInput = pInfo->pVertexInput; - - pInfo->pipelineLlpc.iaState.enableMultiView = pInfo->pipeline.viewInstancingDesc.enableMasking; - pInfo->pipelineLlpc.rsState.perSampleShading = (pInfo->msaa.pixelShaderSamples > 1); - - for (uint32_t stage = 0; stage < pInfo->activeStageCount; ++stage) - { - auto pStage = &pInfo->pActiveStages[stage]; - auto pShader = ShaderModule::ObjectFromHandle(pStage->module); - auto shaderStage = ShaderFlagBitToStage(pStage->stage); - auto pShaderInfo = shaderInfos[shaderStage]; - - pShaderInfo->pModuleData = pShader->GetShaderData(true); - pShaderInfo->pSpecializatonInfo = pStage->pSpecializationInfo; - pShaderInfo->pEntryTarget = pStage->pName; - - // Build the resource mapping description for LLPC. This data contains things about how shader - // inputs like descriptor set bindings are communicated to this pipeline in a form that LLPC can - // understand. - if (pInfo->pLayout != nullptr) - { - const bool vertexShader = (shaderStage == ShaderStageVertex); - result = pInfo->pLayout->BuildLlpcPipelineMapping( - shaderStage, - pMappingBuffer, - vertexShader ? pInfo->pVertexInput : nullptr, - pShaderInfo, - vertexShader ? pVbInfo : nullptr); - } - } - } - - uint64_t pipeHash = 0; - enableLlpc = true; - - if (result == VK_SUCCESS) - { - if (enableLlpc) - { - if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCacheType() == PipelineCacheTypeLlpc)) - { - pInfo->pipelineLlpc.pShaderCache = pPipelineCache->GetShaderCache(DefaultDeviceIndex).pLlpcShaderCache; - } - Llpc::Result llpcResult = pDevice->GetLlpcCompiler()->BuildGraphicsPipeline(&pInfo->pipelineLlpc, &pipelineOut); - if (llpcResult != Llpc::Result::Success) - { - // There shouldn't be anything to free for the failure case - VK_ASSERT(pLlpcPipelineBuffer == nullptr); - - { - result = VK_ERROR_INITIALIZATION_FAILED; - } - } - } - else - if (settings.enablePipelineDump) - { - // LLPC isn't enabled but pipeline dump is required, call LLPC dump interface explicitly - void* pHandle = Llpc::IPipelineDumper::BeginPipelineDump(settings.pipelineDumpDir, nullptr, &pInfo->pipelineLlpc); - Llpc::IPipelineDumper::EndPipelineDump(pHandle); - } - - if (enableLlpc) - { - if (result == VK_SUCCESS) - { - // Make sure that this is the same pointer we will free once the PAL pipeline is created - VK_ASSERT(pLlpcPipelineBuffer == pipelineOut.pipelineBin.pCode); - - // Update pipeline create info with the pipeline binary - pPipelineBinaries[DefaultDeviceIndex] = pLlpcPipelineBuffer; - pipelineBinarySizes[DefaultDeviceIndex] = pipelineOut.pipelineBin.codeSize; - } - } - } - } - - if (pMappingBuffer != nullptr) - { - pDevice->VkInstance()->FreeMem(pMappingBuffer); - } - - return result; -} - // ===================================================================================================================== // Create a graphics pipeline object. 
VkResult GraphicsPipeline::Create( @@ -803,21 +620,30 @@ VkResult GraphicsPipeline::Create( // Parse the create info and build patched AMDIL shaders CreateInfo createInfo = {}; VbBindingInfo vbInfo = {}; + PipelineCompiler::GraphicsPipelineCreateInfo binaryCreateInfo = {}; size_t pipelineBinarySizes[MaxPalDevices] = {}; - void* pPipelineBinaries[MaxPalDevices] = {}; + const void* pPipelineBinaries[MaxPalDevices] = {}; Pal::Result palResult = Pal::Result::Success; + PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(); - ConvertGraphicsPipelineInfo(pDevice, pCreateInfo, &createInfo); - - VkResult result = CreateGraphicsPipelineBinaries( + VkResult result = pDefaultCompiler->ConvertGraphicsPipelineInfo(pDevice, pCreateInfo, &binaryCreateInfo, &vbInfo); + const uint32_t numPalDevices = pDevice->NumPalDevices(); + for (uint32_t i = 0; (result == VK_SUCCESS) && (i < numPalDevices); ++i) + { + result = pDevice->GetCompiler(i)->CreateGraphicsPipelineBinary( pDevice, + i, pPipelineCache, - &createInfo, - &vbInfo, - pipelineBinarySizes, - pPipelineBinaries); + &binaryCreateInfo, + &pipelineBinarySizes[i], + &pPipelineBinaries[i]); + } - const uint32_t numPalDevices = pDevice->NumPalDevices(); + if (result == VK_SUCCESS) + { + ConvertGraphicsPipelineInfo(pDevice, pCreateInfo, &createInfo); + + } RenderStateCache* pRSCache = pDevice->GetRenderStateCache(); @@ -934,7 +760,7 @@ VkResult GraphicsPipeline::Create( const bool viewIndexFromDeviceIndex = Util::TestAnyFlagSet( pCreateInfo->flags, - VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT); + VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHX); // On success, wrap it up in a Vulkan object. if (result == VK_SUCCESS) @@ -960,11 +786,11 @@ VkResult GraphicsPipeline::Create( { if (pPipelineBinaries[deviceIdx] != nullptr) { - { - pDevice->VkInstance()->FreeMem(pPipelineBinaries[deviceIdx]); - } + pDevice->GetCompiler(deviceIdx)->FreeGraphicsPipelineBinary( + &binaryCreateInfo, pPipelineBinaries[deviceIdx], pipelineBinarySizes[deviceIdx]); } } + pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfo); if (result != VK_SUCCESS) { diff --git a/icd/api/vk_image.cpp b/icd/api/vk_image.cpp index d4ad84fe..91c0fdc2 100644 --- a/icd/api/vk_image.cpp +++ b/icd/api/vk_image.cpp @@ -1745,7 +1745,8 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageMemoryRequirements2KHR( VkMemoryRequirements2KHR* pMemoryRequirements) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - VK_ASSERT(pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); + VK_ASSERT((pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); union { @@ -1785,7 +1786,8 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageSparseMemoryRequirements2KHR( VkSparseImageMemoryRequirements2KHR* pSparseMemoryRequirements) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - VK_ASSERT(pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); + VK_ASSERT((pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); union { diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp index 44b53b59..5227a5b3 100644 --- a/icd/api/vk_instance.cpp +++ b/icd/api/vk_instance.cpp @@ -104,9 +104,7 @@ VkResult Instance::EnumerateVersion( uint32_t* pApiVersion) { // Report 1.1 support - *pApiVersion = VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, - 
VULKAN_API_MINOR_VERSION, - VULKAN_API_BUILD_VERSION); + *pApiVersion = (VK_API_VERSION_1_1 | VK_HEADER_VERSION); return VK_SUCCESS; } @@ -162,9 +160,7 @@ VkResult Instance::Create( uint32_t apiVersion = VK_MAKE_VERSION(1,0,0); #else // Default to the highest supported API version - uint32_t apiVersion = VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, - VULKAN_API_MINOR_VERSION, - VULKAN_API_BUILD_VERSION); + uint32_t apiVersion = (VK_API_VERSION_1_0 | VK_HEADER_VERSION); #endif if ((pAppInfo != nullptr) && (pAppInfo->apiVersion != 0)) @@ -182,18 +178,6 @@ VkResult Instance::Create( apiVersion = pAppInfo->apiVersion; } -#ifdef ICD_VULKAN_1_1 - // Implicitly enable instance extensions that are core in the API version - if (apiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_DEVICE_GROUP_CREATION); - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_EXTERNAL_FENCE_CAPABILITIES); - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_EXTERNAL_MEMORY_CAPABILITIES); - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_EXTERNAL_SEMAPHORE_CAPABILITIES); - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_GET_PHYSICAL_DEVICE_PROPERTIES2); - }; -#endif - // pAllocCb is never NULL here because the entry point will fill it in if the // application doesn't. VK_ASSERT(pAllocCb != nullptr); diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp index 9bacecf2..059b9743 100644 --- a/icd/api/vk_memory.cpp +++ b/icd/api/vk_memory.cpp @@ -719,7 +719,8 @@ VkResult Memory::OpenExternalMemory( // Returns the external shared handle of the memory object. Pal::OsExternalHandle Memory::GetShareHandle(VkExternalMemoryHandleTypeFlagBitsKHR handleType) { - VK_ASSERT(m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_EXTERNAL_MEMORY_FD) || + VK_ASSERT((m_pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_EXTERNAL_MEMORY_FD) || m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_EXTERNAL_MEMORY_WIN32)); return PalMemory()->GetSharedExternalHandle(); diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index 49fe6502..44b7ad7b 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -256,14 +256,15 @@ PhysicalDevice::PhysicalDevice( #ifdef ICD_BUILD_APPPROFILE m_appProfile(appProfile), #endif - m_supportedExtensions() + m_supportedExtensions(), + m_compiler(this) { memset(&m_limits, 0, sizeof(m_limits)); memset(m_formatFeatureMsaaTarget, 0, sizeof(m_formatFeatureMsaaTarget)); memset(&m_queueFamilies, 0, sizeof(m_queueFamilies)); memset(&m_memoryProperties, 0, sizeof(m_memoryProperties)); memset(&m_gpaProps, 0, sizeof(m_gpaProps)); - for (uint32_t i = 0; i< VK_MEMORY_TYPE_NUM; i++) + for (uint32_t i = 0; i < VK_MEMORY_TYPE_NUM; i++) { m_memoryPalHeapToVkIndex[i] = VK_MEMORY_TYPE_NUM; // invalid index m_memoryVkIndexToPalHeap[i] = Pal::GpuHeapCount; // invalid index @@ -353,37 +354,18 @@ static void GetFormatFeatureFlags( VkFormatFeatureFlags depthFlags = PalToVkFormatFeatureFlags( formatProperties.features[depthFormatIdx][tilingIdx]); - if (depthFlags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) + if ((depthFlags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) != 0) { retFlags |= (depthFlags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT); } - } - - const uint32_t minMaxFeatureBits = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT; - - // Handle the various special cases 
for Min\Max Image Filtering support - if ((retFlags & minMaxFeatureBits) != 0) - { - const auto& info = Pal::Formats::FormatInfoTable[static_cast(swizzledFormat.format)]; - - // min/max filtering is supported only for single-component formats unless multiChannelMinMaxFilter == true - // Depth-stencil is considered a single-component format because stencil and depth are separate, single - // channel images and in Hw, you can only sample from one of them at a time. - bool supported = (info.componentCount == 1) || - Formats::IsDepthStencilFormat(format) || - (multiChannelMinMaxFilter == true); - if ((Formats::IsDepthStencilFormat(format) == false) && - ((info.numericSupport == Pal::Formats::NumericSupportFlags::Uint) || - (info.numericSupport == Pal::Formats::NumericSupportFlags::Sint))) + // According to the Vulkan Spec (section 32.2.0) + // Re: VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT - If the format is a depth / stencil format, + // this bit only indicates that the depth aspect(not the stencil aspect) of an image of this format + // supports min/max filtering. + if ((depthFlags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT) != 0) { - // TODO: Disable Uint and Sint via Pal. - supported = false; - } - - if (supported == false) - { - retFlags &= ~minMaxFeatureBits; + retFlags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT; } } @@ -586,7 +568,13 @@ VkResult PhysicalDevice::Initialize() PopulateGpaProperties(); } - return PalToVkResult(result); + VkResult vkResult = PalToVkResult(result); + if (vkResult == VK_SUCCESS) + { + vkResult = m_compiler.Initialize(); + } + + return vkResult; } // ===================================================================================================================== @@ -648,7 +636,7 @@ void PhysicalDevice::PopulateFormatProperties() Pal::MergedFormatPropertiesTable fmtProperties = {}; m_pPalDevice->GetFormatProperties(&fmtProperties); - const bool multiChannelMinMaxFilter = m_properties.gfxipProperties.flags.supportPerChannelMinMaxFilter != 0; + const bool multiChannelMinMaxFilter = IsPerChannelMinMaxFilteringSupported(); for (uint32_t i = 0; i < VK_SUPPORTED_FORMAT_COUNT; i++) { @@ -732,6 +720,7 @@ void PhysicalDevice::LateInitialize() // ===================================================================================================================== VkResult PhysicalDevice::Destroy(void) { + m_compiler.Destroy(); this->~PhysicalDevice(); VkInstance()->FreeMem(ApiPhysicalDevice::FromObject(this)); @@ -1291,9 +1280,7 @@ void PhysicalDevice::GetSparseImageFormatProperties( uint32_t PhysicalDevice::GetSupportedAPIVersion() const { // Currently all of our HW supports Vulkan 1.1 - uint32_t apiVersion = VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, - VULKAN_API_MINOR_VERSION, - VULKAN_API_BUILD_VERSION); + uint32_t apiVersion = (VK_API_VERSION_1_1 | VK_HEADER_VERSION); // For sanity check we do at least want to make sure that all the necessary extensions are supported and exposed. 
// The spec does not require Vulkan 1.1 implementations to expose the corresponding 1.0 extensions, but we'll @@ -1343,9 +1330,7 @@ VkResult PhysicalDevice::GetDeviceProperties( #ifdef ICD_VULKAN_1_1 pProperties->apiVersion = GetSupportedAPIVersion(); #else - pProperties->apiVersion = VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, - VULKAN_API_MINOR_VERSION, - VULKAN_API_BUILD_VERSION); + pProperties->apiVersion = (VK_API_VERSION_1_0 | VK_HEADER_VERSION); #endif // Radeon Settings UI diplays driverVersion using sizes 10.10.12 like apiVersion, but our driverVersion uses 10.22. @@ -2461,7 +2446,8 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(AMD_RASTERIZATION_ORDER)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(AMD_DRAW_INDIRECT_COUNT)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(AMD_NEGATIVE_VIEWPORT_HEIGHT)); - + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_SUBGROUP_BALLOT)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_SUBGROUP_VOTE)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_STENCIL_EXPORT)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_VIEWPORT_INDEX_LAYER)); @@ -2636,11 +2622,11 @@ void PhysicalDevice::PopulateQueueFamilies() } // Determine the queue family to PAL engine type mapping and populate its properties - for (uint32_t i = 0; i < Pal::EngineTypeCount; ++i) + for (uint32_t engineType = 0; engineType < Pal::EngineTypeCount; ++engineType) { // Only add queue families for PAL engine types that have at least one queue present and that supports some // functionality exposed in Vulkan. - const auto& engineProps = m_properties.engineProperties[i]; + const auto& engineProps = m_properties.engineProperties[engineType]; // Update supportedQueueFlags based on what is enabled, as well as specific engine properties. // In particular, sparse binding support requires the engine to support virtual memory remap. 
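Several hunks in this change replace VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, VULKAN_API_MINOR_VERSION, VULKAN_API_BUILD_VERSION) with the packed value (VK_API_VERSION_1_1 | VK_HEADER_VERSION) (or the 1.0 variant). This works because VK_MAKE_VERSION stores the patch level in the low 12 bits, so OR-ing in VK_HEADER_VERSION (currently well below 4096) yields the same value as VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION). A small stand-alone sketch of the encoding, mirroring the layout documented in vulkan_core.h (the value 70 is only a stand-in for VK_HEADER_VERSION):

#include <cstdint>
#include <cstdio>

// Same bit layout as VK_MAKE_VERSION: [31:22] major, [21:12] minor, [11:0] patch.
constexpr uint32_t MakeVersion(uint32_t major, uint32_t minor, uint32_t patch)
{
    return (major << 22) | (minor << 12) | patch;
}

int main()
{
    constexpr uint32_t apiVersion1_1 = MakeVersion(1, 1, 0); // VK_API_VERSION_1_1
    constexpr uint32_t headerVersion = 70;                   // stand-in for VK_HEADER_VERSION

    // Patch field is the low 12 bits, so OR-ing is equivalent to MakeVersion(1, 1, headerVersion).
    constexpr uint32_t reported = apiVersion1_1 | headerVersion;

    std::printf("%u.%u.%u\n",
                static_cast<unsigned>(reported >> 22),
                static_cast<unsigned>((reported >> 12) & 0x3FF),
                static_cast<unsigned>(reported & 0xFFF));
    return 0;
}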
@@ -2650,35 +2636,52 @@ void PhysicalDevice::PopulateQueueFamilies() supportedQueueFlags &= ~VK_QUEUE_SPARSE_BINDING_BIT; } - if ((engineProps.engineCount != 0) && ((vkQueueFlags[i] & supportedQueueFlags) != 0)) + if ((engineProps.engineCount != 0) && ((vkQueueFlags[engineType] & supportedQueueFlags) != 0)) { - m_queueFamilies[m_queueFamilyCount].palEngineType = static_cast<Pal::EngineType>(i); + m_queueFamilies[m_queueFamilyCount].palEngineType = static_cast<Pal::EngineType>(engineType); const Pal::QueueType primaryQueueType = palQueueTypes[GetQueueFamilyPalEngineType(m_queueFamilyCount)]; VK_ASSERT((engineProps.queueSupport & (1 << primaryQueueType)) != 0); m_queueFamilies[m_queueFamilyCount].palQueueType = primaryQueueType; uint32_t palImageLayoutFlag = 0; - switch (i) + uint32_t transferGranularityOverride = 0; + + switch (engineType) { - case Pal::EngineTypeUniversal: palImageLayoutFlag = Pal::LayoutUniversalEngine; break; - case Pal::EngineTypeCompute: palImageLayoutFlag = Pal::LayoutComputeEngine; break; - case Pal::EngineTypeExclusiveCompute: palImageLayoutFlag = Pal::LayoutComputeEngine; break; - case Pal::EngineTypeDma: palImageLayoutFlag = Pal::LayoutDmaEngine; break; - default: break; // no-op + case Pal::EngineTypeUniversal: + palImageLayoutFlag = Pal::LayoutUniversalEngine; + transferGranularityOverride = m_settings.transferGranularityUniversalOverride; + break; + case Pal::EngineTypeCompute: + case Pal::EngineTypeExclusiveCompute: + palImageLayoutFlag = Pal::LayoutComputeEngine; + transferGranularityOverride = m_settings.transferGranularityComputeOverride; + break; + case Pal::EngineTypeDma: + palImageLayoutFlag = Pal::LayoutDmaEngine; + transferGranularityOverride = m_settings.transferGranularityDmaOverride; + break; + default: + break; // no-op } m_queueFamilies[m_queueFamilyCount].palImageLayoutFlag = palImageLayoutFlag; - VkQueueFamilyProperties& queueFamilyProps = m_queueFamilies[m_queueFamilyCount].properties; - - queueFamilyProps.queueFlags = (vkQueueFlags[i] & supportedQueueFlags); - - queueFamilyProps.queueCount = engineProps.engineCount; + VkQueueFamilyProperties* pQueueFamilyProps = &m_queueFamilies[m_queueFamilyCount].properties; - queueFamilyProps.timestampValidBits = (engineProps.flags.supportsTimestamps != 0) ? 64 : 0; + pQueueFamilyProps->queueFlags = (vkQueueFlags[engineType] & supportedQueueFlags); + pQueueFamilyProps->queueCount = engineProps.engineCount; + pQueueFamilyProps->timestampValidBits = (engineProps.flags.supportsTimestamps != 0) ?
64 : 0; + pQueueFamilyProps->minImageTransferGranularity = PalToVkExtent3d(engineProps.minTiledImageCopyAlignment); - queueFamilyProps.minImageTransferGranularity = PalToVkExtent3d(engineProps.minTiledImageCopyAlignment); + // Override reported transfer granularity via panel setting + if ((transferGranularityOverride & 0xf0000000) != 0) + { + pQueueFamilyProps->minImageTransferGranularity.width = ((transferGranularityOverride >> 0) & 0xff); + pQueueFamilyProps->minImageTransferGranularity.height = ((transferGranularityOverride >> 8) & 0xff); + pQueueFamilyProps->minImageTransferGranularity.depth = ((transferGranularityOverride >> 16) & 0xff); + } m_queueFamilyCount++; } @@ -3055,7 +3058,7 @@ void PhysicalDevice::GetDeviceProperties2( VkPhysicalDeviceSubgroupProperties* pSubgroupProperties; #endif VkPhysicalDeviceExternalMemoryHostPropertiesEXT* pExternalMemoryHostProperties; - VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT* pSamplerFilterMinmaxPropertiesEXT; + VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT* pMinMaxProperties; VkPhysicalDeviceShaderCorePropertiesAMD* pShaderCoreProperties; }; @@ -3137,7 +3140,9 @@ void PhysicalDevice::GetDeviceProperties2( VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT; - pSubgroupProperties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT; + pSubgroupProperties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_BALLOT_BIT; pSubgroupProperties->quadOperationsInAllStages = VK_TRUE; break; @@ -3146,8 +3151,8 @@ void PhysicalDevice::GetDeviceProperties2( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT: { - pSamplerFilterMinmaxPropertiesEXT->filterMinmaxImageComponentMapping = VK_FALSE; - pSamplerFilterMinmaxPropertiesEXT->filterMinmaxSingleComponentFormats = VK_TRUE; + pMinMaxProperties->filterMinmaxImageComponentMapping = IsPerChannelMinMaxFilteringSupported(); + pMinMaxProperties->filterMinmaxSingleComponentFormats = VK_TRUE; break; } diff --git a/icd/api/vk_pipeline_cache.cpp b/icd/api/vk_pipeline_cache.cpp index bcce0e5e..f692b24d 100644 --- a/icd/api/vk_pipeline_cache.cpp +++ b/icd/api/vk_pipeline_cache.cpp @@ -151,7 +151,7 @@ VkResult PipelineCache::Create( Llpc::ShaderCacheCreateInfo createInfo = {}; for (uint32_t i = 0; i < numPalDevices; i++) { - auto pCompiler = pDevice->GetLlpcCompiler(); + auto pCompiler = pDevice->GetCompiler()->GetLlpcCompiler(); if (useInitialData) { diff --git a/icd/api/vk_shader.cpp b/icd/api/vk_shader.cpp index a2272ca5..a881649e 100644 --- a/icd/api/vk_shader.cpp +++ b/icd/api/vk_shader.cpp @@ -174,7 +174,7 @@ VkResult ShaderModule::Init(const Device* pDevice) moduleInfo.shaderBin.pCode = m_pCode; moduleInfo.shaderBin.codeSize = m_codeSize; - Llpc::Result llpcResult = pDevice->GetLlpcCompiler()->BuildShaderModule(&moduleInfo, &m_llpcConvertOut); + Llpc::Result llpcResult = pDevice->GetCompiler()->GetLlpcCompiler()->BuildShaderModule(&moduleInfo, &m_llpcConvertOut); if ((llpcResult != Llpc::Result::Success) && (llpcResult != Llpc::Result::Delayed)) { diff --git a/icd/make/importdefs b/icd/make/importdefs index bdf63ef2..e94b91fe 100644 --- a/icd/make/importdefs +++ b/icd/make/importdefs @@ -1,13 +1,13 @@ # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. 
It must # be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -ICD_PAL_CLIENT_MAJOR_VERSION = 387 +ICD_PAL_CLIENT_MAJOR_VERSION = 388 # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. It describes # the interface version of the gpuopen shared module (part of PAL) that the ICD supports. ICD_GPUOPEN_CLIENT_MAJOR_VERSION = 26 ICD_GPUOPEN_CLIENT_MINOR_VERSION = 0 -# This will become the value of SCPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_SCPC=1. It describes the verson of the +# This will become the value of SCPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_SCPC=1. It describes the version of the # interface version of SCPC (currently part of PAL) that the ICD supports. ICD_SCPC_CLIENT_MAJOR_VERSION = 2 diff --git a/icd/res/ver.h b/icd/res/ver.h index 058c2559..52b792e4 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -29,14 +29,6 @@ #define MKSTR(x) #x #define MAKE_VERSION_STRING(x) MKSTR(x) -#define VULKAN_API_MAJOR_VERSION 1 -#ifdef ICD_VULKAN_1_1 -#define VULKAN_API_MINOR_VERSION 1 -#else -#define VULKAN_API_MINOR_VERSION 0 -#endif -#define VULKAN_API_BUILD_VERSION 70 - // This value is used for the VkPhysicalDeviceProperties uint32 driverVersion which is OS agnostic #define VULKAN_ICD_MAJOR_VERSION 2 @@ -44,7 +36,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 18 +#define VULKAN_ICD_BUILD_VERSION 19 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION diff --git a/icd/settings/settings.cfg b/icd/settings/settings.cfg index 9b4a9169..cf05f6a8 100644 --- a/icd/settings/settings.cfg +++ b/icd/settings/settings.cfg @@ -1243,6 +1243,47 @@ Node = "Memory" VariableDefault = "false"; SettingScope = "PrivateDriverKey"; } + + Leaf + { + SettingName = "TransferGranularityUniversalOverride"; + SettingType = "HEX_STR"; + Description = "Override reported minImageTransferGranularity field for graphics queue families. This\r\n + is encoded as a hex string of the form 0xb0zzyyxx, where 'xx', 'yy', and 'zz' are the\r\n + reported transfer granularities in the X, Y and Z extents respectively, and 'b' is\r\n + a control flag: if 'b' is non-zero, this override is applied; otherwise the standard\r\n + transfer granularity is used.\r\n"; + VariableName = "transferGranularityUniversalOverride"; + VariableType = "uint32_t"; + VariableDefault = "0"; + SettingScope = "PrivateDriverKey"; + } + + Leaf + { + SettingName = "TransferGranularityComputeOverride"; + SettingType = "HEX_STR"; + Description = "Override reported minImageTransferGranularity field for compute queue families. For how\r\n + this value is interpreted by the driver, see the description for\r\n + TransferGranularityUniversalOverride.\r\n"; + VariableName = "transferGranularityComputeOverride"; + VariableType = "uint32_t"; + VariableDefault = "0"; + SettingScope = "PrivateDriverKey"; + } + + Leaf + { + SettingName = "TransferGranularityDmaOverride"; + SettingType = "HEX_STR"; + Description = "Override reported minImageTransferGranularity field for DMA (i.e.
SDMA) queue families.\r\n + For how this value is interpreted by the driver, see the description for\r\n + TransferGranularityUniversalOverride.\r\n"; + VariableName = "transferGranularityDmaOverride"; + VariableType = "uint32_t"; + VariableDefault = "0"; + SettingScope = "PrivateDriverKey"; + } } Node = "Optimization"
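The three TransferGranularity*Override settings added above all use the packed encoding that the PopulateQueueFamilies() change decodes: the top nibble acts as the enable flag, and the X, Y and Z granularities live in bits [7:0], [15:8] and [23:16]. A small sketch of that decoding, assuming only the layout shown in the hunk above (the struct and function names here are illustrative, not the driver's):

#include <cstdint>
#include <cstdio>

struct Extent3d { uint32_t width, height, depth; };

// Returns true and fills pOut when the override is enabled (top nibble non-zero).
static bool DecodeTransferGranularityOverride(uint32_t value, Extent3d* pOut)
{
    if ((value & 0xf0000000) == 0)
    {
        return false; // override disabled; keep the PAL-reported granularity
    }

    pOut->width  = (value >>  0) & 0xff;
    pOut->height = (value >>  8) & 0xff;
    pOut->depth  = (value >> 16) & 0xff;
    return true;
}

int main()
{
    Extent3d granularity = {};

    // Example: 0x80010204 -> enabled, X = 4, Y = 2, Z = 1.
    if (DecodeTransferGranularityOverride(0x80010204, &granularity))
    {
        std::printf("%ux%ux%u\n",
                    static_cast<unsigned>(granularity.width),
                    static_cast<unsigned>(granularity.height),
                    static_cast<unsigned>(granularity.depth));
    }

    return 0;
}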