diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index 8ecb6d0f..db718ea5 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -159,6 +159,7 @@ target_sources(xgl PRIVATE api/color_space_helper.cpp api/gpu_event_mgr.cpp api/internal_mem_mgr.cpp + api/pipeline_compiler.cpp api/stencil_ops_combiner.cpp api/vert_buf_binding_mgr.cpp api/virtual_stack_mgr.cpp diff --git a/icd/api/include/khronos/sdk-1.1/vk_layer_dispatch_table.h b/icd/api/include/khronos/sdk-1.1/vk_layer_dispatch_table.h index 2cc12a6b..572982df 100644 --- a/icd/api/include/khronos/sdk-1.1/vk_layer_dispatch_table.h +++ b/icd/api/include/khronos/sdk-1.1/vk_layer_dispatch_table.h @@ -499,6 +499,9 @@ typedef struct VkLayerDispatchTable_ { // ---- VK_EXT_external_memory_host extension commands PFN_vkGetMemoryHostPointerPropertiesEXT GetMemoryHostPointerPropertiesEXT; + + // ---- VK_AMD_buffer_marker extension commands + PFN_vkCmdWriteBufferMarkerAMD CmdWriteBufferMarkerAMD; } VkLayerDispatchTable; diff --git a/icd/api/include/pipeline_compiler.h b/icd/api/include/pipeline_compiler.h new file mode 100644 index 00000000..3d50c41f --- /dev/null +++ b/icd/api/include/pipeline_compiler.h @@ -0,0 +1,149 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file pipeline_compiler.h + * @brief Contains declaration of Vulkan pipeline compiler + *********************************************************************************************************************** + */ + +#pragma once + +#include "include/khronos/vulkan.h" +#include "include/vk_utils.h" +#include "include/vk_defines.h" +#include "include/vk_shader_code.h" + +#include "llpc.h" + +namespace Bil +{ + +struct BilConvertOptions; +struct BilShaderPatchOutput; +enum BilDescriptorType : uint32_t; + +} + +namespace vk +{ + +class PhysicalDevice; +class PipelineLayout; +class PipelineCache; +struct VbBindingInfo; + +// ===================================================================================================================== +// Represents Vulkan pipeline compiler, it wraps LLPC and SCPC, and hides the differences. +class PipelineCompiler +{ +public: + // Creation info parameters for all the necessary LLPC/SCPC state objects encapsulated + // by the Vulkan graphics pipeline. + struct GraphicsPipelineCreateInfo + { + Llpc::GraphicsPipelineBuildInfo pipelineInfo; + const PipelineLayout* pLayout; + const VkPipelineShaderStageCreateInfo* pStages[ShaderGfxStageCount]; + VkPipelineCreateFlags flags; + void* pMappingBuffer; + VkFormat dbFormat; + }; + + // Creation info parameters for all the necessary LLPC/SCPC state objects encapsulated + // by the Vulkan compute pipeline. + struct ComputePipelineCreateInfo + { + Llpc::ComputePipelineBuildInfo pipelineInfo; + const PipelineLayout* pLayout; + const VkPipelineShaderStageCreateInfo* pStage; + VkPipelineCreateFlags flags; + void* pMappingBuffer; + }; + + PipelineCompiler(PhysicalDevice* pPhysicalDevice); + ~PipelineCompiler(); + VkResult Initialize(); + void Destroy(); + + VkResult CreateGraphicsPipelineBinary( + Device* pDevice, + uint32_t deviceIndex, + PipelineCache* pPipelineCache, + GraphicsPipelineCreateInfo* pCreateInfo, + size_t* pPipelineBinarySize, + const void** ppPipelineBinary); + + VkResult CreateComputePipelineBinary( + Device* pDevice, + uint32_t deviceIndex, + PipelineCache* pPipelineCache, + ComputePipelineCreateInfo* pInfo, + size_t* pPipelineBinarySize, + const void** ppPipelineBinary); + + VkResult ConvertGraphicsPipelineInfo( + Device* pDevice, + const VkGraphicsPipelineCreateInfo* pIn, + GraphicsPipelineCreateInfo* pInfo, + VbBindingInfo* pVbInfo); + + VkResult ConvertComputePipelineInfo( + const VkComputePipelineCreateInfo* pIn, + ComputePipelineCreateInfo* pInfo); + + void FreeComputePipelineBinary( + ComputePipelineCreateInfo* pCreateInfo, + const void* pPipelineBinary, + size_t binarySize); + + void FreeGraphicsPipelineBinary( + GraphicsPipelineCreateInfo* pCreateInfo, + const void* pPipelineBinary, + size_t binarySize); + + void FreeComputePipelineCreateInfo(ComputePipelineCreateInfo* pCreateInfo); + + void FreeGraphicsPipelineCreateInfo(GraphicsPipelineCreateInfo* pCreateInfo); + // Get LLPC compiler explicitly. 
+ // TODO: Should be removed in the future + Llpc::ICompiler* GetLlpcCompiler() { return m_pLlpc; } + +private: + VkResult CreateLlpcCompiler(); + + static bool IsDualSourceBlend(VkBlendFactor blend); + + // ----------------------------------------------------------------------------------------------------------------- + + PhysicalDevice* m_pPhysicalDevice; // Vulkan physical device object + Llpc::GfxIpVersion m_gfxIp; // Graphics IP version info, used by LLPC + Pal::GfxIpLevel m_gfxIpLevel; // Graphics IP Level, used by SCPC + + Llpc::ICompiler* m_pLlpc; // LLPC compiler object + +}; // class PipelineCompiler + +} // namespce vk diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 29f8729b..23308d19 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -853,6 +853,8 @@ class CmdBuffer const uint32_t queryCount, const uint32_t timestampChunk); + VK_INLINE uint32_t EstimateMaxObjectsOnVirtualStack(size_t objectSize) const; + #if VK_ENABLE_DEBUG_BARRIERS void DbgCmdBarrier(bool preCmd); #endif diff --git a/icd/api/include/vk_compute_pipeline.h b/icd/api/include/vk_compute_pipeline.h index d4e45fc6..62412eff 100644 --- a/icd/api/include/vk_compute_pipeline.h +++ b/icd/api/include/vk_compute_pipeline.h @@ -110,8 +110,6 @@ class ComputePipeline : public Pipeline, public NonDispatchable { public: - VK_INLINE void WriteSamplerDescriptors( - const Device::Properties& deviceProperties, + template + static void WriteSamplerDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t* pDestAddr, uint32_t count, uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteImageSamplerDescriptors( - const Device::Properties& deviceProperties, + template + static void WriteImageSamplerDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -85,9 +85,8 @@ class DescriptorSet : public NonDispatchable uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteImageDescriptors( - VkDescriptorType descType, - const Device::Properties& deviceProperties, + template + static void WriteImageDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -95,8 +94,8 @@ class DescriptorSet : public NonDispatchable uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteFmaskDescriptors( - const Device* pDevice, + template + static void WriteFmaskDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -104,9 +103,9 @@ class DescriptorSet : public NonDispatchable uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteBufferInfoDescriptors( + template + static void WriteBufferInfoDescriptors( const Device* pDevice, - VkDescriptorType type, const VkDescriptorBufferInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -114,9 +113,8 @@ class DescriptorSet : public NonDispatchable uint32_t dwStride, size_t descriptorStrideInBytes); - VK_INLINE void WriteBufferDescriptors( - const Device::Properties& deviceProperties, - VkDescriptorType type, + template + static void WriteBufferDescriptors( const VkBufferView* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -161,39 +159,52 @@ class DescriptorSet : public NonDispatchable const uint32_t* pDynamicOffsets, uint32_t numDynamicDescriptors); + static PFN_vkUpdateDescriptorSets GetUpdateDescriptorSetsFunc(const Device* pDevice); + +protected: + DescriptorSet( + DescriptorPool* pPool, + uint32_t heapIndex, + 
DescriptorSetFlags flags); + + ~DescriptorSet() + { PAL_NEVER_CALLED(); } + + template + static PFN_vkUpdateDescriptorSets GetUpdateDescriptorSetsFunc(const Device* pDevice); + + template + static VKAPI_ATTR void VKAPI_CALL UpdateDescriptorSets( + VkDevice device, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet* pDescriptorCopies); + + template static void WriteDescriptorSets( const Device* pDevice, uint32_t deviceIdx, - const Device::Properties& deviceProperties, uint32_t descriptorWriteCount, const VkWriteDescriptorSet* pDescriptorWrites, - size_t descriptorStrideInBytes = 0); + size_t descriptorStrideInBytes = 0); + template static void CopyDescriptorSets( const Device* pDevice, uint32_t deviceIdx, - const Device::Properties& deviceProperties, uint32_t descriptorCopyCount, const VkCopyDescriptorSet* pDescriptorCopies); -protected: - DescriptorSet( - DescriptorPool* pPool, - uint32_t heapIndex, - DescriptorSetFlags flags); - - ~DescriptorSet() - { PAL_NEVER_CALLED(); } - void Reassign( const DescriptorSetLayout* pLayout, Pal::gpusize gpuMemOffset, Pal::gpusize* gpuBaseAddress, uint32_t** cpuBaseAddress, uint32_t numPalDevices, - const InternalMemory* const pInternalMem, - void* pAllocHandle, - VkDescriptorSet* pHandle); + void* pAllocHandle); + + void Reset(); void InitImmutableDescriptors( const DescriptorSetLayout* pLayout, diff --git a/icd/api/include/vk_descriptor_set_layout.h b/icd/api/include/vk_descriptor_set_layout.h index e4a3948a..4422cd70 100644 --- a/icd/api/include/vk_descriptor_set_layout.h +++ b/icd/api/include/vk_descriptor_set_layout.h @@ -126,6 +126,29 @@ class DescriptorSetLayout : public NonDispatchable(Util::VoidPtrInc(this, sizeof(*this))); + } + + template + static PfnUpdateEntry GetUpdateEntryFunc( + const Device* pDevice, + VkDescriptorType descriptorType, + const DescriptorSetLayout::BindingInfo& dstBinding); + + static PfnUpdateEntry GetUpdateEntryFunc( + const Device* pDevice, + VkDescriptorType descriptorType, + const DescriptorSetLayout::BindingInfo& dstBinding); + + template + static void UpdateEntrySampledImage( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + template + static void UpdateEntrySampler( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + template + static void UpdateEntryBuffer( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + template + static void UpdateEntryTexelBuffer( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + template + static void UpdateEntryCombinedImageSampler( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry); + + uint32_t m_numEntries; }; namespace entry diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h index e9ad6f1f..882fc906 100644 --- a/icd/api/include/vk_device.h +++ b/icd/api/include/vk_device.h @@ -465,8 +465,8 @@ class Device VK_INLINE Util::Mutex* GetTimerQueueMutex() { return &m_timerQueueMutex; } - VK_INLINE Llpc::ICompiler* GetLlpcCompiler(uint32_t idx = DefaultDeviceIndex) const - { return 
m_pLlpcCompiler[idx]; } + VK_INLINE PipelineCompiler* GetCompiler(uint32_t idx = DefaultDeviceIndex) const + { return m_pPhysicalDevices[idx]->GetCompiler(); } static const Pal::MsaaQuadSamplePattern* GetDefaultQuadSamplePattern(uint32_t sampleCount); static uint32_t GetDefaultSamplePatternIndex(uint32_t sampleCount); @@ -485,6 +485,9 @@ class Device VkExternalMemoryHandleTypeFlagBitsKHR handleType, const void* pExternalPtr) const; + PFN_vkUpdateDescriptorSets GetUpdateDescriptorSetsFunc() const + { return m_pfnUpdateDescriptorSets; } + protected: Device( uint32_t deviceCount, @@ -507,7 +510,7 @@ class Device void InitSamplePatternPalette(Pal::SamplePatternPalette* pPalette) const; - VkResult CreateLlpcCompiler(int32_t idx = DefaultDeviceIndex); + void InitEntryPointFuncs(); Instance* const m_pInstance; const RuntimeSettings& m_settings; @@ -547,11 +550,11 @@ class Device // The maximum allocations that can be created from the logical device uint32_t m_maxAllocations; - Llpc::ICompiler* m_pLlpcCompiler[MaxPalDevices]; - // Record pipeline cache count created on this device. Note this may be dropped once there isn't any test creating // excessive pipeline caches. volatile uint32_t m_pipelineCacheCount; + + PFN_vkUpdateDescriptorSets m_pfnUpdateDescriptorSets; }; // ===================================================================================================================== diff --git a/icd/api/include/vk_graphics_pipeline.h b/icd/api/include/vk_graphics_pipeline.h index d91609dc..ee66afa6 100644 --- a/icd/api/include/vk_graphics_pipeline.h +++ b/icd/api/include/vk_graphics_pipeline.h @@ -268,18 +268,13 @@ class GraphicsPipeline : public Pipeline, public NonDispatchableGetAPIVersion()); +#else + return VkInstance()->GetAPIVersion(); +#endif + } + #ifdef ICD_BUILD_APPPROFILE VK_INLINE AppProfile GetAppProfile() const { return m_appProfile; } @@ -391,6 +401,10 @@ class PhysicalDevice void LateInitialize(); + VK_FORCEINLINE PipelineCompiler* GetCompiler() + { + return &m_compiler; + } protected: PhysicalDevice(PhysicalDeviceManager* pPhysicalDeviceManager, Pal::IDevice* pPalDevice, @@ -406,6 +420,11 @@ class PhysicalDevice void PopulateExtensions(); void PopulateGpaProperties(); + VK_FORCEINLINE bool IsPerChannelMinMaxFilteringSupported() const + { + return m_properties.gfxipProperties.flags.supportPerChannelMinMaxFilter; + } + PhysicalDeviceManager* m_pPhysicalDeviceManager; Pal::IDevice* m_pPalDevice; Pal::DeviceProperties m_properties; @@ -436,6 +455,8 @@ class PhysicalDevice // Device properties related to the VK_AMD_gpu_perf_api_interface extension PhysicalDeviceGpaProperties m_gpaProps; + + PipelineCompiler m_compiler; }; VK_DEFINE_DISPATCHABLE(PhysicalDevice); diff --git a/icd/api/include/vk_render_pass.h b/icd/api/include/vk_render_pass.h index d69e8096..10df096c 100644 --- a/icd/api/include/vk_render_pass.h +++ b/icd/api/include/vk_render_pass.h @@ -128,6 +128,20 @@ class RenderPass : public NonDispatchable VK_INLINE uint32_t GetViewMask(uint32_t subpass) const { return m_createInfo.pSubpasses[subpass].viewMask; } + VK_INLINE uint32_t GetActiveViewsBitMask() const + { + uint32_t activeViewsBitMask = 0; + + // View is considered active when it is used in any subpass defined by RenderPass. + for (uint32_t subpass = 0; subpass < GetSubpassCount(); ++subpass) + { + activeViewsBitMask |= GetViewMask(subpass); + } + + // ActiveViewsBitMask can be understood as RenderPass ViewMask. 
+ return activeViewsBitMask; + } + VK_INLINE bool IsMultiviewEnabled() const { // When a subpass uses a non-zero view mask, diff --git a/icd/api/llpc/CMakeLists.txt b/icd/api/llpc/CMakeLists.txt index 10f1da50..2f94e123 100644 --- a/icd/api/llpc/CMakeLists.txt +++ b/icd/api/llpc/CMakeLists.txt @@ -87,7 +87,6 @@ message(STATUS "LLVM link options:" ${LLVM_LINK_FLAGS}) target_compile_definitions(llpc PRIVATE ${TARGET_ARCHITECTURE_ENDIANESS}ENDIAN_CPU) target_compile_definitions(llpc PRIVATE _SPIRV_LLVM_API) -target_compile_definitions(llpc PRIVATE LLPC_BUILD_GFX9) if(XGL_LLVM_UPSTREAM) target_compile_definitions(llpc PRIVATE XGL_LLVM_UPSTREAM=1) @@ -292,7 +291,6 @@ add_dependencies(amdllpc llpc) target_compile_definitions(amdllpc PRIVATE ${TARGET_ARCHITECTURE_ENDIANESS}ENDIAN_CPU) target_compile_definitions(amdllpc PRIVATE _SPIRV_LLVM_API) -target_compile_definitions(amdllpc PRIVATE LLPC_BUILD_GFX9) target_include_directories(amdllpc PUBLIC @@ -337,10 +335,5 @@ if(UNIX) endif() target_link_libraries(amdllpc PRIVATE llpc dl stdc++) - if(XGL_LLVM_UPSTREAM) - llvm_map_components_to_libnames(llvm_libs amdgpucodegen amdgpuinfo amdgpuasmparser amdgpudisassembler LTO ipo analysis bitreader bitwriter codegen irreader linker mc passes support target transformutils coroutines aggressiveinstcombine) - else() - llvm_map_components_to_libnames(llvm_libs amdgpucodegen amdgpuinfo amdgpuasmparser amdgpudisassembler LTO ipo analysis bitreader bitwriter codegen irreader linker mc passes support target transformutils coroutines) - endif() - + llvm_map_components_to_libnames(llvm_libs amdgpucodegen amdgpuinfo amdgpuasmparser amdgpudisassembler LTO ipo analysis bitreader bitwriter codegen irreader linker mc passes support target transformutils coroutines aggressiveinstcombine) target_link_libraries(amdllpc PRIVATE ${llvm_libs}) diff --git a/icd/api/llpc/context/llpcCompiler.cpp b/icd/api/llpc/context/llpcCompiler.cpp index 494c67e4..3d100335 100644 --- a/icd/api/llpc/context/llpcCompiler.cpp +++ b/icd/api/llpc/context/llpcCompiler.cpp @@ -51,16 +51,12 @@ #include "llpcContext.h" #include "llpcCopyShader.h" #include "llpcGfx6Chip.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcGfx9Chip.h" -#endif #include "llpcGraphicsContext.h" #include "llpcElf.h" #include "llpcFile.h" #include "llpcPatch.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcShaderMerger.h" -#endif #include "llpcPipelineDumper.h" #include "llpcSpirvLower.h" #include "llpcVertexFetch.h" @@ -278,9 +274,7 @@ Compiler::Compiler( } else { -#ifdef LLPC_BUILD_GFX9 Gfx9::InitRegisterNameMap(gfxIp); -#endif } } @@ -749,13 +743,11 @@ Result Compiler::BuildGraphicsPipeline( pContext->SetGsOnChip(gsOnChip); } -#ifdef LLPC_BUILD_GFX9 // Do user data node merge for merged shader if ((result == Result::Success) && (m_gfxIp.major >= 9)) { pContext->DoUserDataNodeMerge(); } -#endif // Do LLVM module patching (main patch work) for (int32_t stage = ShaderStageGfxCount - 1; (stage >= 0) && (result == Result::Success); --stage) @@ -783,7 +775,6 @@ Result Compiler::BuildGraphicsPipeline( } } -#ifdef LLPC_BUILD_GFX9 // Do shader merge operations if ((result == Result::Success) && (m_gfxIp.major >= 9)) { @@ -854,7 +845,6 @@ Result Compiler::BuildGraphicsPipeline( modules[ShaderStageGeometry] = pEsGsModule; } } -#endif // Build copy shader if necessary (has geometry shader) if ((result == Result::Success) && (modules[ShaderStageGeometry] != nullptr)) @@ -1757,14 +1747,10 @@ void Compiler::InitGpuProperty() } else if (m_gfxIp.major == 9) { -#ifdef LLPC_BUILD_GFX9 if (m_gfxIp.stepping == 0) 
{ m_gpuProperty.numShaderEngines = 4; } -#else - LLPC_NOT_IMPLEMENTED(); -#endif } else { diff --git a/icd/api/llpc/context/llpcComputeContext.h b/icd/api/llpc/context/llpcComputeContext.h index 49ddb6d5..5cae77e4 100644 --- a/icd/api/llpc/context/llpcComputeContext.h +++ b/icd/api/llpc/context/llpcComputeContext.h @@ -75,10 +75,8 @@ class ComputeContext: public PipelineContext // Enables GS on-chip mode virtual void SetGsOnChip(bool gsOnChip) { LLPC_NEVER_CALLED(); } -#ifdef LLPC_BUILD_GFX9 // Does user data node merge for merged shader virtual void DoUserDataNodeMerge() { LLPC_NEVER_CALLED(); } -#endif protected: virtual std::vector* GetDummyResourceMapNodes(ShaderStage shaderStage); diff --git a/icd/api/llpc/context/llpcContext.cpp b/icd/api/llpc/context/llpcContext.cpp index c0dce682..da89b9b9 100644 --- a/icd/api/llpc/context/llpcContext.cpp +++ b/icd/api/llpc/context/llpcContext.cpp @@ -66,12 +66,10 @@ const uint8_t Context::GlslEmuLibGfx8[] = #include "./generate/gfx8/g_llpcGlslEmuLibGfx8.h" }; -#ifdef LLPC_BUILD_GFX9 const uint8_t Context::GlslEmuLibGfx9[]= { #include "./generate/gfx9/g_llpcGlslEmuLibGfx9.h" }; -#endif // ===================================================================================================================== Context::Context( @@ -134,7 +132,6 @@ Context::Context( if (gfxIp.major >= 9) { -#ifdef LLPC_BUILD_GFX9 libBin.codeSize = sizeof(GlslEmuLibGfx9); libBin.pCode = GlslEmuLibGfx9; pGlslEmuLibGfx = LoadLibary(&libBin); @@ -143,9 +140,6 @@ Context::Context( { LLPC_ERRS("Fails to link LLVM libraries together\n"); } -#else - LLPC_NOT_IMPLEMENTED(); -#endif } // Do function inlining @@ -162,7 +156,7 @@ Context::Context( // Remove non-native function for native lib { - m_pNativeGlslEmuLib = CloneModule(m_pGlslEmuLib.get()); + m_pNativeGlslEmuLib = CloneModule(*m_pGlslEmuLib.get()); legacy::PassManager passMgr; passMgr.add(PassNonNativeFuncRemove::Create()); diff --git a/icd/api/llpc/context/llpcContext.h b/icd/api/llpc/context/llpcContext.h index 7624e65c..5f0241de 100644 --- a/icd/api/llpc/context/llpcContext.h +++ b/icd/api/llpc/context/llpcContext.h @@ -223,12 +223,10 @@ class Context : public llvm::LLVMContext m_pPipelineContext->SetGsOnChip(gsOnChip); } -#ifdef LLPC_BUILD_GFX9 void DoUserDataNodeMerge() { m_pPipelineContext->DoUserDataNodeMerge(); } -#endif uint64_t GetPiplineHashCode() const { @@ -294,10 +292,7 @@ class Context : public llvm::LLVMContext // GLSL emulation libraries static const uint8_t GlslEmuLib[]; static const uint8_t GlslEmuLibGfx8[]; -#ifdef LLPC_BUILD_GFX9 static const uint8_t GlslEmuLibGfx9[]; -#endif - }; } // Llpc diff --git a/icd/api/llpc/context/llpcGraphicsContext.cpp b/icd/api/llpc/context/llpcGraphicsContext.cpp index da70c9ad..2fa150f1 100644 --- a/icd/api/llpc/context/llpcGraphicsContext.cpp +++ b/icd/api/llpc/context/llpcGraphicsContext.cpp @@ -33,9 +33,7 @@ #include "SPIRVInternal.h" #include "llpcCompiler.h" #include "llpcGfx6Chip.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcGfx9Chip.h" -#endif #include "llpcGraphicsContext.h" #include "llpcInternal.h" @@ -80,13 +78,11 @@ GraphicsContext::GraphicsContext( m_tessOffchip(cl::EnableTessOffChip), m_gsOnChip(false) { -#ifdef LLPC_BUILD_GFX9 if (gfxIp.major >= 9) { // For GFX9+, always enable tessellation off-chip mode m_tessOffchip = true; } -#endif const PipelineShaderInfo* shaderInfo[ShaderStageGfxCount] = { @@ -124,12 +120,10 @@ GraphicsContext::GraphicsContext( // 
===================================================================================================================== GraphicsContext::~GraphicsContext() { -#ifdef LLPC_BUILD_GFX9 for (auto pAllocNodes : m_allocUserDataNodes) { delete pAllocNodes; } -#endif } // ===================================================================================================================== @@ -500,7 +494,6 @@ bool GraphicsContext::CheckGsOnChipValidity() } else { -#ifdef LLPC_BUILD_GFX9 uint32_t gsPrimsPerSubgroup = m_pGpuProperty->gsOnChipDefaultPrimsPerSubgroup; // NOTE: Make esGsItemSize odd by "| 1", to optimize ES -> GS ring layout for LDS bank conflicts @@ -597,9 +590,6 @@ bool GraphicsContext::CheckGsOnChipValidity() // TODO: GFX9 GS -> VS ring on chip is not supported yet gsOnChip = false; -#else - LLPC_NOT_IMPLEMENTED(); -#endif } LLPC_OUTS("===============================================================================\n"); @@ -626,7 +616,6 @@ bool GraphicsContext::CheckGsOnChipValidity() return gsOnChip; } -#ifdef LLPC_BUILD_GFX9 // ===================================================================================================================== // Does user data node merging for merged shader void GraphicsContext::DoUserDataNodeMerge() @@ -842,6 +831,5 @@ void GraphicsContext::MergeUserDataNode( *pMergedNodeCount = mergedNodeCount; *ppMergedNodes = pMergedNodes; } -#endif } // Llpc diff --git a/icd/api/llpc/context/llpcGraphicsContext.h b/icd/api/llpc/context/llpcGraphicsContext.h index 43f0c733..dbabade5 100644 --- a/icd/api/llpc/context/llpcGraphicsContext.h +++ b/icd/api/llpc/context/llpcGraphicsContext.h @@ -81,9 +81,7 @@ class GraphicsContext: public PipelineContext // Enables GS on-chip mode virtual void SetGsOnChip(bool gsOnChip) { m_gsOnChip = gsOnChip; } -#ifdef LLPC_BUILD_GFX9 virtual void DoUserDataNodeMerge(); -#endif void InitShaderInfoForNullFs(); @@ -104,14 +102,12 @@ class GraphicsContext: public PipelineContext LLPC_DISALLOW_DEFAULT_CTOR(GraphicsContext); LLPC_DISALLOW_COPY_AND_ASSIGN(GraphicsContext); -#ifdef LLPC_BUILD_GFX9 void MergeUserDataNode(uint32_t nodeCount1, const ResourceMappingNode* pNodes1, uint32_t nodeCount2, const ResourceMappingNode* pNodes2, uint32_t* pMergedNodeCount, const ResourceMappingNode** ppMergedNodes); -#endif const GraphicsPipelineBuildInfo* m_pPipelineInfo; // Info to build a graphics pipeline @@ -131,9 +127,7 @@ class GraphicsContext: public PipelineContext bool m_tessOffchip; // Whether to enable tessellation off-chip mode bool m_gsOnChip; // Whether to enable GS on-chip mode -#ifdef LLPC_BUILD_GFX9 std::vector m_allocUserDataNodes; // Allocated user data nodes for merged shader -#endif }; } // Llpc diff --git a/icd/api/llpc/context/llpcPipelineContext.h b/icd/api/llpc/context/llpcPipelineContext.h index f876a11e..11966918 100644 --- a/icd/api/llpc/context/llpcPipelineContext.h +++ b/icd/api/llpc/context/llpcPipelineContext.h @@ -670,10 +670,8 @@ class PipelineContext // Enables GS on-chip mode virtual void SetGsOnChip(bool gsOnChip) = 0; -#ifdef LLPC_BUILD_GFX9 // Does user data node merge for merged shader virtual void DoUserDataNodeMerge() = 0; -#endif const char* GetGpuNameString() const; const char* GetGpuNameAbbreviation() const; diff --git a/icd/api/llpc/include/llpc.h b/icd/api/llpc/include/llpc.h index 1b460b92..b75fd14c 100644 --- a/icd/api/llpc/include/llpc.h +++ b/icd/api/llpc/include/llpc.h @@ -269,6 +269,7 @@ struct GraphicsPipelineBuildInfo bool blendEnable; ///< Blend will be enabled for this target at draw time bool 
blendSrcAlphaToColor; ///< Whether source alpha is blended to color channels for this target /// at draw time + uint8_t channelWriteMask; ///< Write mask to specify destination channels VkFormat format; ///< Color attachment format } target[MaxColorTargets]; ///< Per-MRT color target info } cbState; ///< Color target state @@ -402,14 +403,14 @@ class IPipelineDumper /// @param [in] pPipelineInfo Info to build this graphics pipeline /// /// @returns Hash code associated this graphics pipeline. - static uint64_t VKAPI_CALL GetGraphicsPipelineHash(const GraphicsPipelineBuildInfo* pPipelineInfo); + static uint64_t VKAPI_CALL GetPipelineHash(const GraphicsPipelineBuildInfo* pPipelineInfo); /// Calculates compute pipeline hash code. /// /// @param [in] pPipelineInfo Info to build this compute pipeline /// /// @returns Hash code associated this compute pipeline. - static uint64_t VKAPI_CALL GetComputePipelineHash(const ComputePipelineBuildInfo* pPipelineInfo); + static uint64_t VKAPI_CALL GetPipelineHash(const ComputePipelineBuildInfo* pPipelineInfo); }; // ===================================================================================================================== diff --git a/icd/api/llpc/lower/llpcSpirvLowerAggregateLoadStore.cpp b/icd/api/llpc/lower/llpcSpirvLowerAggregateLoadStore.cpp index b3e24f12..76de5ef5 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerAggregateLoadStore.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerAggregateLoadStore.cpp @@ -152,7 +152,6 @@ void SpirvLowerAggregateLoadStore::visitCallInst( if (pStoreDest->getType()->getPointerAddressSpace() == SPIRAS_Private) { auto pStoreTy = pStoreDest->getType()->getPointerElementType(); - LLPC_ASSERT (pStoreTy->isArrayTy() || pStoreTy->isStructTy()); std::vector idxs; ExpandStoreInst(pStoreValue, pStoreDest, pStoreTy, idxs, &callInst); @@ -193,15 +192,27 @@ void SpirvLowerAggregateLoadStore::ExpandStoreInst( } else { - Value* pElemValue = ExtractValueInst::Create(pStoreValue, idxStack, "", pInsertPos); - std::vector idxs; - idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); - for (uint32_t i = 0, idxCount = idxStack.size(); i < idxCount; ++i) + Value* pElemValue = nullptr; + Value* pElemPtr = nullptr; + + if (idxStack.empty()) { - idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), idxStack[i])); + pElemValue = pStoreValue; + pElemPtr = pStorePtr; + } + else + { + pElemValue = ExtractValueInst::Create(pStoreValue, idxStack, "", pInsertPos); + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + for (uint32_t i = 0, idxCount = idxStack.size(); i < idxCount; ++i) + { + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), idxStack[i])); + } + + pElemPtr = GetElementPtrInst::CreateInBounds(pStorePtr, idxs,"", pInsertPos); } - auto pElemPtr = GetElementPtrInst::CreateInBounds(pStorePtr, idxs,"", pInsertPos); if (pElemPtr->getType()->getPointerElementType() != pElemValue->getType()) { // Type mismatch (only occurs for the store of uint32 <-> bool) diff --git a/icd/api/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp b/icd/api/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp index d4eeab6f..27417cdc 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp @@ -188,7 +188,7 @@ void SpirvLowerConstImmediateStore::ConvertAllocaToReadOnlyGlobal( "", nullptr, GlobalValue::NotThreadLocal, - ADDR_SPACE_CONST); + SPIRAS_Constant); pGlobal->takeName(pAlloca); // Change all uses of pAlloca to use pGlobal. 
We need to do it manually, as there is a change // of address space, and we also need to recreate "getelementptr"s. diff --git a/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp b/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp index b4f12510..25adc98c 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp @@ -370,16 +370,72 @@ void SpirvLowerGlobal::visitLoadInst( LLPC_ASSERT(pMetaNode != nullptr); auto pInOutMeta = mdconst::dyn_extract(pMetaNode->getOperand(0)); - auto pLoadValue = AddCallInstForInOutImport(pInOutTy, - addrSpace, - pInOutMeta, - nullptr, - nullptr, - nullptr, - InterpLocUnknown, - nullptr, - nullptr, - &loadInst); + Value* pLoadValue = UndefValue::get(pInOutTy); + bool hasVertexIdx = false; + + if (pInOutTy->isArrayTy()) + { + // Arrayed input/output + LLPC_ASSERT(pInOutMeta->getNumOperands() == 3); + ShaderInOutMetadata inOutMeta = {}; + inOutMeta.U32All = cast(pInOutMeta->getOperand(1))->getZExtValue(); + + // If the input/output is arrayed, the outermost dimension might for vertex indexing + if (inOutMeta.IsBuiltIn) + { + BuiltIn builtInId = static_cast(inOutMeta.Value); + hasVertexIdx = ((builtInId == BuiltInPerVertex) || // GLSL style per-vertex data + (builtInId == BuiltInPosition) || // HLSL style per-vertex data + (builtInId == BuiltInPointSize) || + (builtInId == BuiltInClipDistance) || + (builtInId == BuiltInCullDistance)); + } + else + { + hasVertexIdx = (inOutMeta.PerPatch == false); + } + } + + if (hasVertexIdx) + { + LLPC_ASSERT(pInOutTy->isArrayTy()); + + auto pElemTy = pInOutTy->getArrayElementType(); + auto pElemMeta = cast(pInOutMeta->getOperand(2)); + + const uint32_t elemCount = pInOutTy->getArrayNumElements(); + for (uint32_t i = 0; i < elemCount; ++i) + { + Value* pVertexIdx = ConstantInt::get(m_pContext->Int32Ty(), i); + auto pElemValue = AddCallInstForInOutImport(pElemTy, + addrSpace, + pElemMeta, + nullptr, + nullptr, + pVertexIdx, + InterpLocUnknown, + nullptr, + nullptr, + &loadInst); + + std::vector idxs; + idxs.push_back(i); + pLoadValue = InsertValueInst::Create(pLoadValue, pElemValue, idxs, "", &loadInst); + } + } + else + { + pLoadValue = AddCallInstForInOutImport(pInOutTy, + addrSpace, + pInOutMeta, + nullptr, + nullptr, + nullptr, + InterpLocUnknown, + nullptr, + nullptr, + &loadInst); + } m_loadInsts.insert(&loadInst); loadInst.replaceAllUsesWith(pLoadValue); @@ -478,7 +534,56 @@ void SpirvLowerGlobal::visitStoreInst( LLPC_ASSERT(pMetaNode != nullptr); auto pOutputMeta = mdconst::dyn_extract(pMetaNode->getOperand(0)); - AddCallInstForOutputExport(pStoreValue, pOutputMeta, nullptr, nullptr, nullptr, InvalidValue, &storeInst); + bool hasVertexIdx = false; + + // If the input/output is arrayed, the outermost dimension might for vertex indexing + if (pOutputy->isArrayTy()) + { + LLPC_ASSERT(pOutputMeta->getNumOperands() == 3); + ShaderInOutMetadata outputMeta = {}; + outputMeta.U32All = cast(pOutputMeta->getOperand(1))->getZExtValue(); + + if (outputMeta.IsBuiltIn) + { + BuiltIn builtInId = static_cast(outputMeta.Value); + hasVertexIdx = ((builtInId == BuiltInPerVertex) || // GLSL style per-vertex data + (builtInId == BuiltInPosition) || // HLSL style per-vertex data + (builtInId == BuiltInPointSize) || + (builtInId == BuiltInClipDistance) || + (builtInId == BuiltInCullDistance)); + } + else + { + hasVertexIdx = (outputMeta.PerPatch == false); + } + } + + if (hasVertexIdx) + { + LLPC_ASSERT(pOutputy->isArrayTy()); + auto pElemMeta = cast(pOutputMeta->getOperand(2)); + + const uint32_t elemCount = 
pOutputy->getArrayNumElements(); + for (uint32_t i = 0; i < elemCount; ++i) + { + std::vector idxs; + idxs.push_back(i); + auto pElemValue = ExtractValueInst::Create(pStoreValue, idxs, "", &storeInst); + + Value* pVertexIdx = ConstantInt::get(m_pContext->Int32Ty(), i); + AddCallInstForOutputExport(pElemValue, + pElemMeta, + nullptr, + nullptr, + pVertexIdx, + InvalidValue, + &storeInst); + } + } + else + { + AddCallInstForOutputExport(pStoreValue, pOutputMeta, nullptr, nullptr, nullptr, InvalidValue, &storeInst); + } m_storeInsts.insert(&storeInst); } @@ -861,10 +966,6 @@ void SpirvLowerGlobal::LowerInput() // Does lowering opertions for SPIR-V outputs, replaces outputs with proxy variables. void SpirvLowerGlobal::LowerOutput() { - // NOTE: For tessellation control shader, we invoke handling of "load"/"store" instructions and replace all those - // instructions with import/export calls in-place. - LLPC_ASSERT(m_shaderStage != ShaderStageTessControl); - m_pRetBlock = BasicBlock::Create(*m_pContext, "", m_pEntryPoint); // Invoke handling of "return" instructions or "emit" calls @@ -887,6 +988,16 @@ void SpirvLowerGlobal::LowerOutput() retInst->eraseFromParent(); } + if (m_outputProxyMap.empty()) + { + // Skip lowering if there is no output + return; + } + + // NOTE: For tessellation control shader, we invoke handling of "load"/"store" instructions and replace all those + // instructions with import/export calls in-place. + LLPC_ASSERT(m_shaderStage != ShaderStageTessControl); + // Export output from the proxy variable prior to "return" instruction or "emit" calls for (auto outputMap : m_outputProxyMap) { @@ -1523,6 +1634,7 @@ Value* SpirvLowerGlobal::AddCallInstForInOutImport( { BuiltIn builtInId = static_cast(inOutMeta.Value); if ((builtInId == BuiltInSubgroupLocalInvocationId) || + (builtInId == BuiltInSubgroupSize) || (builtInId == BuiltInSubgroupEqMaskKHR) || (builtInId == BuiltInSubgroupGeMaskKHR) || (builtInId == BuiltInSubgroupGtMaskKHR) || diff --git a/icd/api/llpc/patch/generate/gfx8/glslSpecialOpEmu.ll b/icd/api/llpc/patch/generate/gfx8/glslSpecialOpEmu.ll index 10cb2d35..61cafa3f 100644 --- a/icd/api/llpc/patch/generate/gfx8/glslSpecialOpEmu.ll +++ b/icd/api/llpc/patch/generate/gfx8/glslSpecialOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2018, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslSpecialOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL special graphics-specific operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. 
+ ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll b/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll index d1a4f8de..b0f60c1e 100755 --- a/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll +++ b/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (std32). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. 
+ ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/gfx9/glslImageOpEmu.ll b/icd/api/llpc/patch/generate/gfx9/glslImageOpEmu.ll index e4468210..eb06ecce 100644 --- a/icd/api/llpc/patch/generate/gfx9/glslImageOpEmu.ll +++ b/icd/api/llpc/patch/generate/gfx9/glslImageOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslImageOpEmu.ll -;* @brief LLVM IR file: contains emulation codes for GLSL image operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. 
+ ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmu.ll b/icd/api/llpc/patch/generate/glslArithOpEmu.ll index f4608693..af84e1b2 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (std32). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmuF16.ll b/icd/api/llpc/patch/generate/glslArithOpEmuF16.ll index 9817f1d9..62ccb888 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmuF16.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmuF16.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmuf16.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (float16). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmuF64.ll b/icd/api/llpc/patch/generate/glslArithOpEmuF64.ll index 58ba8338..5ed92521 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmuF64.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmuF64.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmuF64.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (float64). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmuI16.ll b/icd/api/llpc/patch/generate/glslArithOpEmuI16.ll index 0f12e613..5f10e5e5 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmuI16.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmuI16.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmuI16.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (int16). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslArithOpEmuI64.ll b/icd/api/llpc/patch/generate/glslArithOpEmuI64.ll index 311d4593..b71c1117 100644 --- a/icd/api/llpc/patch/generate/glslArithOpEmuI64.ll +++ b/icd/api/llpc/patch/generate/glslArithOpEmuI64.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslArithOpEmuI64.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL arithmetic operations (int64). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslBufferOpEmu.ll b/icd/api/llpc/patch/generate/glslBufferOpEmu.ll index 288f5da0..84f54fd4 100644 --- a/icd/api/llpc/patch/generate/glslBufferOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslBufferOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslBufferOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL buffer operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslBuiltInVarEmu.ll b/icd/api/llpc/patch/generate/glslBuiltInVarEmu.ll index 7922c61e..1ab65ddc 100644 --- a/icd/api/llpc/patch/generate/glslBuiltInVarEmu.ll +++ b/icd/api/llpc/patch/generate/glslBuiltInVarEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslBuiltInVarEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL built-in variables. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -21,6 +30,12 @@ target triple = "spir64-unknown-unknown" ; >>> Common Built-in Variables ; ===================================================================================================================== +; GLSL: in uint gl_SubGroupSize +define i32 @llpc.input.import.builtin.SubgroupSize(i32 %builtInId) #0 +{ + ret i32 64 +} + ; GLSL: in uint gl_SubGroupInvocation define i32 @llpc.input.import.builtin.SubgroupLocalInvocationId(i32 %builtInId) #0 { @@ -204,7 +219,6 @@ define <2 x float> @llpc.input.import.builtin.SamplePosition(i32 %builtInId) #0 declare <3 x i32> @llpc.input.import.builtin.WorkgroupSize(i32) #0 declare <3 x i32> @llpc.input.import.builtin.WorkgroupId(i32) #0 declare <3 x i32> @llpc.input.import.builtin.LocalInvocationId(i32) #0 -declare i32 @llpc.input.import.builtin.SubgroupSize(i32) #0 declare i32 @llpc.input.import.builtin.NumSamples(i32) #0 declare i32 @llpc.input.import.builtin.SamplePatternIdx(i32) #0 declare i32 @llpc.input.import.builtin.SampleId(i32) #0 diff --git a/icd/api/llpc/patch/generate/glslCopyShaderEmu.ll b/icd/api/llpc/patch/generate/glslCopyShaderEmu.ll index 0c26a52e..93278ac0 100644 --- a/icd/api/llpc/patch/generate/glslCopyShaderEmu.ll +++ b/icd/api/llpc/patch/generate/glslCopyShaderEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslNullFsEmul.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for copy shader. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. 
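Aside: in the glslBuiltInVarEmu.ll hunk above, gl_SubGroupSize stops being an external declaration that later passes had to resolve; the emulation library now defines it directly, so any shader importing the built-in simply receives the wave size (64). A minimal caller sketch, assuming the library above is linked into the shader module; the function name and the builtInId value 36 (intended as the SPIR-V SubgroupSize built-in id) are illustrative only:

declare i32 @llpc.input.import.builtin.SubgroupSize(i32) #0

; Hypothetical shader snippet: after the emulation library is linked and inlined,
; %size folds to the constant 64 returned by the definition in the hunk above.
define i32 @illustrative.use.subgroupsize() #0 {
  %size = call i32 @llpc.input.import.builtin.SubgroupSize(i32 36)
  ret i32 %size
}

attributes #0 = { nounwind }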
+ ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -52,6 +61,8 @@ define <4 x i32> @llpc.descriptor.load.gsvsringbuffer(i32 %internalTablePtrLow, %6 = bitcast <2 x i32> %5 to i64 %7 = shl i64 %ringOutOffset, 4 %8 = add i64 %6, %7 + ; This uses addrspace(2), which is SPIRAS::Constant. The PatchAddrSpaceMutate pass then changes + ; it to addrspace(4), which is AMDGPUAS::Constant. %9 = inttoptr i64 %8 to <4 x i32> addrspace(2)*, !amdgpu.uniform !1 %10 = load <4 x i32>, <4 x i32> addrspace(2)* %9 diff --git a/icd/api/llpc/patch/generate/glslImageOpEmu.ll b/icd/api/llpc/patch/generate/glslImageOpEmu.ll index 298e5cb5..f0374f9d 100644 --- a/icd/api/llpc/patch/generate/glslImageOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslImageOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslImageOpEmu.ll -;* @brief LLVM IR file: contains emulation codes for GLSL image operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. 
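Aside: the new comment in the copy-shader emulation above spells out the address-space convention that several later hunks rely on: the IR library is written against SPIR address spaces, and the PatchAddrSpaceMutate pass rewrites constant pointers to the AMDGPU numbering before code generation. A before/after sketch of the effect on a typical descriptor load (values are illustrative; only the two address-space numbers come from the comment above):

; Before patching: addrspace(2) is SPIRAS::Constant, as emitted by the library.
;   %ptr  = inttoptr i64 %addr to <4 x i32> addrspace(2)*, !amdgpu.uniform !1
;   %desc = load <4 x i32>, <4 x i32> addrspace(2)* %ptr
;
; After PatchAddrSpaceMutate: addrspace(4) is the AMDGPU constant address space.
;   %ptr  = inttoptr i64 %addr to <4 x i32> addrspace(4)*, !amdgpu.uniform !1
;   %desc = load <4 x i32>, <4 x i32> addrspace(4)* %ptr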
+ ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -317,6 +326,13 @@ define <4 x float> @llpc.patch.image.gather.texel.i32( ret <4 x float> %5 } +define i1 @llpc.imagesparse.texel.resident( + i32 %residentCode) #0 +{ + %1 = icmp eq i32 %residentCode, 0 + ret i1 %1 +} + declare <8 x i32> @llpc.descriptor.load.resource(i32 , i32 , i32) #0 declare <4 x i32> @llpc.descriptor.load.texelbuffer(i32 , i32 , i32) #0 diff --git a/icd/api/llpc/patch/generate/glslInlineConstOpEmu.ll b/icd/api/llpc/patch/generate/glslInlineConstOpEmu.ll index ecda2abe..f6e88d7c 100644 --- a/icd/api/llpc/patch/generate/glslInlineConstOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslInlineConstOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslInlineConstOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL inline constant buffer operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. 
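Aside: glslImageOpEmu.ll gains a small helper for sparse image operations in the hunk above; the hardware residency code is folded to the boolean that GLSL's sparseTexelsResident() expects, with a code of 0 meaning the texel is resident. A hedged usage sketch; only the helper's name and behaviour come from the hunk, the wrapper below is hypothetical:

define i1 @illustrative.sparse.resident(i32 %residentCode) {
  ; Forward the residency code to the new helper; true means the fetched texel is resident.
  %1 = call i1 @llpc.imagesparse.texel.resident(i32 %residentCode)
  ret i1 %1
}

declare i1 @llpc.imagesparse.texel.resident(i32)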
+ ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslMatrixOpEmu.ll b/icd/api/llpc/patch/generate/glslMatrixOpEmu.ll index a8bf0477..3c5c8a6e 100644 --- a/icd/api/llpc/patch/generate/glslMatrixOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslMatrixOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslMatrixOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL matrix operations (float). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslMatrixOpEmuF16.ll b/icd/api/llpc/patch/generate/glslMatrixOpEmuF16.ll index 122c2a3b..12b545ec 100644 --- a/icd/api/llpc/patch/generate/glslMatrixOpEmuF16.ll +++ b/icd/api/llpc/patch/generate/glslMatrixOpEmuF16.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslMatrixOpEmuF16.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL matrix operations (float16). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslMatrixOpEmuF64.ll b/icd/api/llpc/patch/generate/glslMatrixOpEmuF64.ll index 00034a8f..88f62910 100644 --- a/icd/api/llpc/patch/generate/glslMatrixOpEmuF64.ll +++ b/icd/api/llpc/patch/generate/glslMatrixOpEmuF64.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file g_glslMatrixOpEmuF64.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL matrix operations (double). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslNullFsEmu.ll b/icd/api/llpc/patch/generate/glslNullFsEmu.ll index 4977c811..445dded1 100644 --- a/icd/api/llpc/patch/generate/glslNullFsEmu.ll +++ b/icd/api/llpc/patch/generate/glslNullFsEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslNullFsEmul.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for null fragment shader. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslPushConstOpEmu.ll b/icd/api/llpc/patch/generate/glslPushConstOpEmu.ll index ef9f792e..c1062cbd 100644 --- a/icd/api/llpc/patch/generate/glslPushConstOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslPushConstOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslPushConstOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL push constant (spilled) operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
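Aside: the push-constant hunk that follows is mechanically the same change repeated for every load width. Spilled push constants live in a 512-byte table, the member offset indexes into it byte-wise, and the result is bitcast to the requested vector of i8; the only functional change is that the table pointer now carries the AMDGPU constant address space (4) instead of the SPIR one (2). A reduced sketch of the shared pattern (illustrative function name; width and alignment follow the v4i8 case below, with the cache-control parameters omitted):

define <4 x i8> @illustrative.pushconst.load.dword(i32 %memberOffset) #0 {
  ; Fetch the spill-table pointer, step to the member, and load one dword (4 x i8).
  %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable()
  %bytePtr = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset
  %dwordPtr = bitcast i8 addrspace(4)* %bytePtr to <4 x i8> addrspace(4)*
  %value = load <4 x i8>, <4 x i8> addrspace(4)* %dwordPtr, align 4
  ret <4 x i8> %value
}

declare [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() #0

attributes #0 = { nounwind }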
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -20,84 +29,84 @@ target triple = "spir64-unknown-unknown" ; GLSL: load float16/int16/uint16 (word) define <2 x i8> @llpc.pushconst.load.v2i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <2 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <2 x i8>, <2 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <2 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <2 x i8>, <2 x i8> addrspace(4)* %2, align 4 ret <2 x i8> %3 } ; GLSL: load f16vec2/i16vec2/u16vec2/float/int/uint (dword) define <4 x i8> @llpc.pushconst.load.v4i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <4 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <4 x i8>, <4 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <4 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <4 x i8>, <4 x i8> addrspace(4)* %2, align 4 ret <4 x i8> %3 } ; GLSL: load f16vec3/i16vec3/u16vec3 (wordx3) define <6 x i8> @llpc.pushconst.load.v6i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <6 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <6 x i8>, <6 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <6 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <6 x i8>, <6 x i8> addrspace(4)* %2, align 4 ret <6 x i8> %3 } ; GLSL: load f16vec4/i16vec4/u16vec4/vec2/ivec2/uvec2/double/int64/uint64 (dwordx2) define <8 x i8> @llpc.pushconst.load.v8i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <8 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <8 x i8>, <8 x i8> addrspace(2)* %2, align 8 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <8 x i8> 
addrspace(4)*, !amdgpu.uniform !0 + %3 = load <8 x i8>, <8 x i8> addrspace(4)* %2, align 8 ret <8 x i8> %3 } ; GLSL: load vec3/ivec3/uvec3 (dwordx3) define <12 x i8> @llpc.pushconst.load.v12i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <12 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <12 x i8>, <12 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <12 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <12 x i8>, <12 x i8> addrspace(4)* %2, align 4 ret <12 x i8> %3 } ; GLSL: load vec4/ivec4/uvec4/dvec2/i64vec2/u64vec2 (dwordx4) define <16 x i8> @llpc.pushconst.load.v16i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <16 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <16 x i8>, <16 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <16 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <16 x i8>, <16 x i8> addrspace(4)* %2, align 4 ret <16 x i8> %3 } ; GLSL: load dvec3/i64vec3/u64vec3 (dwordx6) define <24 x i8> @llpc.pushconst.load.v24i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <24 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <24 x i8>, <24 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <24 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <24 x i8>, <24 x i8> addrspace(4)* %2, align 4 ret <24 x i8> %3 } ; GLSL: load dvec4/i64vec4/u64vec4 (dwordx8) define <32 x i8> @llpc.pushconst.load.v32i8(i32 %memberOffset, i1 %glc, i1 %slc) #0 { - %spillTablePtr = call [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() - %1 = getelementptr [512 x i8], [512 x i8] addrspace(2)* %spillTablePtr, i32 0, i32 %memberOffset - %2 = bitcast i8 addrspace(2)* %1 to <32 x i8> addrspace(2)*, !amdgpu.uniform !0 - %3 = load <32 x i8>, <32 x i8> addrspace(2)* %2, align 4 + %spillTablePtr = call [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() + %1 = getelementptr [512 x i8], [512 x i8] addrspace(4)* %spillTablePtr, i32 0, i32 %memberOffset + %2 = bitcast i8 addrspace(4)* %1 to <32 x i8> addrspace(4)*, !amdgpu.uniform !0 + %3 = load <32 x i8>, <32 x i8> addrspace(4)* %2, align 4 ret <32 x i8> %3 } -declare [512 x i8] addrspace(2)* @llpc.descriptor.load.spilltable() #0 +declare [512 x i8] addrspace(4)* @llpc.descriptor.load.spilltable() #0 attributes #0 = { nounwind } diff --git a/icd/api/llpc/patch/generate/glslSharedVarOpEmu.ll 
b/icd/api/llpc/patch/generate/glslSharedVarOpEmu.ll index 32fabe94..b60f7973 100644 --- a/icd/api/llpc/patch/generate/glslSharedVarOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslSharedVarOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslSharedVarOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL shared variable operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/glslSpecialOpEmu.ll b/icd/api/llpc/patch/generate/glslSpecialOpEmu.ll index e1ca8526..c1fb6a31 100644 --- a/icd/api/llpc/patch/generate/glslSpecialOpEmu.ll +++ b/icd/api/llpc/patch/generate/glslSpecialOpEmu.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2017, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. 
This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. -;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslSpecialOpEmu.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL special graphics-specific operations. -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2017-2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. 
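Aside: the glslSpecialOpEmu.ll changes below reorganize the subgroup built-ins so that only the i32 forms talk to the lane intrinsics (llvm.amdgcn.readlane, readfirstlane, writelane); the float, double, vector and bool variants bitcast or split their operands, delegate to the i32 form, and reassemble the result, and the new GroupNonUniform* entry points reuse the existing SubgroupKHR helpers in the same way. A minimal sketch of the delegation pattern with hypothetical names:

; The i32 form is the only place that uses the lane intrinsic directly.
define i32 @illustrative.read.invocation.i32(i32 %value, i32 %invocationIndex) #0 {
  %1 = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %invocationIndex)
  ret i32 %1
}

; The float form bitcasts to i32, delegates, and bitcasts the result back.
define float @illustrative.read.invocation.f32(float %value, i32 %invocationIndex) #0 {
  %1 = bitcast float %value to i32
  %2 = call i32 @illustrative.read.invocation.i32(i32 %1, i32 %invocationIndex)
  %3 = bitcast i32 %2 to float
  ret float %3
}

declare i32 @llvm.amdgcn.readlane(i32, i32) #0

attributes #0 = { nounwind }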
+ ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" @@ -271,26 +280,32 @@ define spir_func <4 x i32> @_Z17SubgroupBallotKHRb(i1 %value) #0 { %1 = call i64 @llpc.ballot(i1 %value) %2 = bitcast i64 %1 to <2 x i32> - %3 = shufflevector <2 x i32> %2, <2 x i32> undef, <4 x i32> + %3 = shufflevector <2 x i32> %2, <2 x i32> , <4 x i32> ret <4 x i32> %3 } +; GLSL: int/uint readInvocation(int/uint, uint) +define spir_func i32 @_Z25SubgroupReadInvocationKHRii(i32 %value, i32 %invocationIndex) +{ + %1 = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %invocationIndex) + ret i32 %1 +} + ; GLSL: float readInvocation(float, uint) define spir_func float @_Z25SubgroupReadInvocationKHRfi(float %value, i32 %invocationIndex) { %1 = bitcast float %value to i32 - %2 = call i32 @llvm.amdgcn.readlane(i32 %1, i32 %invocationIndex) + %2 = call i32 @_Z25SubgroupReadInvocationKHRii(i32 %1, i32 %invocationIndex) %3 = bitcast i32 %2 to float ret float %3 } -; GLSL: int/uint readInvocation(int/uint, uint) -define spir_func i32 @_Z25SubgroupReadInvocationKHRii(i32 %value, i32 %invocationIndex) +; GLSL: int/uint readFirstInvocation(int/uint) +define spir_func i32 @_Z26SubgroupFirstInvocationKHRi(i32 %value) { - %1 = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %invocationIndex) - + %1 = call i32 @llvm.amdgcn.readfirstlane(i32 %value) ret i32 %1 } @@ -298,20 +313,12 @@ define spir_func i32 @_Z25SubgroupReadInvocationKHRii(i32 %value, i32 %invocatio define spir_func float @_Z26SubgroupFirstInvocationKHRf(float %value) { %1 = bitcast float %value to i32 - %2 = call i32 @llvm.amdgcn.readfirstlane(i32 %1) + %2 = call i32 @_Z26SubgroupFirstInvocationKHRi(i32 %1) %3 = bitcast i32 %2 to float ret float %3 } -; GLSL: int/uint readFirstInvocation(int/uint) -define spir_func i32 @_Z26SubgroupFirstInvocationKHRi(i32 %value) -{ - %1 = call i32 @llvm.amdgcn.readfirstlane(i32 %value) - - ret i32 %1 -} - ; GLSL: bool anyInvocation(bool) define spir_func i1 @_Z14SubgroupAnyKHRb(i1 %value) { @@ -343,21 +350,22 @@ define spir_func i1 @_Z19SubgroupAllEqualKHRb(i1 %value) ret i1 %5 } +; GLSL: int/uint writeInvocation(int/uint, int/uint, int/uint) +define spir_func i32 @_Z18WriteInvocationAMDiii(i32 %inputValue, i32 %writeValue, i32 %invocationIndex) +{ + %1 = call i32 @llvm.amdgcn.writelane(i32 %writeValue, i32 %invocationIndex, i32 %inputValue) + ret i32 %1 +} + ; GLSL: float writeInvocation(float, float, uint) define spir_func float @_Z18WriteInvocationAMDffi(float %inputValue, float %writeValue, i32 %invocationIndex) { %1 = bitcast float %writeValue to i32 %2 = bitcast float %inputValue to i32 - %3 = call i32 @llvm.amdgcn.writelane(i32 %1, i32 %invocationIndex, i32 %2) + %3 = call i32 @_Z18WriteInvocationAMDiii(i32 %1, i32 %invocationIndex, i32 %2) %4 = bitcast i32 %3 to float - ret float %4 -} -; GLSL: int/uint writeInvocation(int/uint, int/uint, int/uint) -define spir_func i32 @_Z18WriteInvocationAMDiii(i32 %inputValue, i32 %writeValue, i32 %invocationIndex) -{ - %1 = call i32 @llvm.amdgcn.writelane(i32 %writeValue, i32 %invocationIndex, i32 %inputValue) - ret i32 %1 + ret float %4 } ; GLSL: bool subgroupElect() @@ -374,6 +382,1575 @@ define spir_func i1 @_Z20GroupNonUniformElecti(i32 %scope) ret 
i1 %6 } +; GLSL: bool subgroupAll(bool) +define spir_func i1 @_Z18GroupNonUniformAllib(i32 %scope, i1 %value) +{ + %1 = call i1 @_Z14SubgroupAllKHRb(i1 %value) + ret i1 %1 +} + +; GLSL: bool subgroupAny(bool) +define spir_func i1 @_Z18GroupNonUniformAnyib(i32 %scope, i1 %value) +{ + %1 = call i1 @_Z14SubgroupAnyKHRb(i1 %value) + ret i1 %1 +} + +; GLSL: bool subgroupAllEqual(int/uint) +define spir_func i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %value) +{ + %1 = icmp ne i32 %value, 0 + %2 = call i1 @_Z19SubgroupAllEqualKHRb(i1 %1) + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(ivec2/uvec2) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %value) +{ + %1 = extractelement <2 x i32> %value, i32 0 + %2 = extractelement <2 x i32> %value, i32 1 + + %3 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + %4 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %2) + + %5 = and i1 %3, %4 + ret i1 %5 +} + +; GLSL: bool subgroupAllEqual(ivec3/uvec3) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %value) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 1 + + %4 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + %5 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %2) + %6 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %3) + + %7 = and i1 %4, %5 + %8 = and i1 %7, %6 + ret i1 %8 +} + +; GLSL: bool subgroupAllEqual(ivec4/uvec4) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %value) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + %6 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %2) + %7 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %3) + %8 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %4) + + %9 = and i1 %5, %6 + %10 = and i1 %9, %7 + %11 = and i1 %10, %8 + ret i1 %11 +} + +; GLSL: bool subgroupAllEqual(float) +define spir_func i1 @_Z23GroupNonUniformAllEqualif(i32 %scope, float %value) +{ + %1 = bitcast float %value to i32 + %2 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(vec2) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv2_f(i32 %scope, <2 x float> %value) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(vec3) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv3_f(i32 %scope, <3 x float> %value) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(vec4) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv4_f(i32 %scope, <4 x float> %value) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(double) +define spir_func i1 @_Z23GroupNonUniformAllEqualid(i32 %scope, double %value) +{ + %1 = bitcast double %value to <2 x i32> + %2 = extractelement <2 x i32> %1, i32 0 + %3 = extractelement <2 x i32> %1, i32 1 + + %4 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %2) + %5 = call i1 
@_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %3) + %6 = and i1 %4, %5 + + ret i1 %6 +} + +; GLSL: bool subgroupAllEqual(dvec2) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv2_d(i32 %scope, <2 x double> %value) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <2 x i32> + %3 = shufflevector <4 x i32> %1, <4 x i32> %1, <2 x i32> + + %4 = call i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %2) + %5 = call i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %3) + %6 = and i1 %4, %5 + + ret i1 %6 +} + +; GLSL: bool subgroupAllEqual(dvec3) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv3_d(i32 %scope, <3 x double> %value) +{ + %1 = bitcast <3 x double> %value to <6 x i32> + %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <3 x i32> + %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <3 x i32> + + %4 = call i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %2) + %5 = call i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %3) + %6 = and i1 %4, %5 + + ret i1 %6 +} + +; GLSL: bool subgroupAllEqual(dvec4) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv4_d(i32 %scope, <4 x double> %value) +{ + %1 = bitcast <4 x double> %value to <8 x i32> + %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + + %4 = call i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %2) + %5 = call i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %3) + %6 = and i1 %4, %5 + + ret i1 %6 +} + +; GLSL: bool subgroupAllEqual(bool) +define spir_func i1 @_Z23GroupNonUniformAllEqualib(i32 %scope, i1 %value) +{ + %1 = zext i1 %value to i32 + %2 = call i1 @_Z23GroupNonUniformAllEqualii(i32 %scope, i32 %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(bvec2) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv2_b(i32 %scope, <2 x i1> %value) +{ + %1 = zext <2 x i1> %value to <2 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv2_i(i32 %scope, <2 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(bvec3) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv3_b(i32 %scope, <3 x i1> %value) +{ + %1 = zext <3 x i1> %value to <3 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv3_i(i32 %scope, <3 x i32> %1) + + ret i1 %2 +} + +; GLSL: bool subgroupAllEqual(bvec4) +define spir_func i1 @_Z23GroupNonUniformAllEqualiDv4_b(i32 %scope, <4 x i1> %value) +{ + %1 = zext <4 x i1> %value to <4 x i32> + %2 = call i1 @_Z23GroupNonUniformAllEqualiDv4_i(i32 %scope, <4 x i32> %1) + + ret i1 %2 +} + +; GLSL: int/uint subgroupBroadcast(int/uint, uint) +define spir_func i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %value, i32 %id) +{ + %1 = call i32 @_Z25SubgroupReadInvocationKHRii(i32 %value, i32 %id) + ret i32 %1 +} + +; GLSL: ivec2/uvec2 subgroupBroadcast(ivec2/uvec2, uint) +define spir_func <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %value, i32 %id) +{ + %1 = extractelement <2 x i32> %value, i32 0 + %2 = extractelement <2 x i32> %value, i32 1 + + %3 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id) + %4 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %2, i32 %id) + + %5 = insertelement <2 x i32> undef, i32 %3, i32 0 + %6 = insertelement <2 x i32> %5, i32 %4, i32 1 + + ret <2 x i32> %6 +} + +; GLSL: ivec3/uvec3 subgroupBroadcast(ivec3/uvec3, uint) +define spir_func <3 x i32> @_Z24GroupNonUniformBroadcastiDv3_ii(i32 %scope, <3 x i32> %value, i32 %id) +{ + %1 = extractelement <3 x i32> 
%value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id) + %5 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %2, i32 %id) + %6 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %3, i32 %id) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupBroadcast(ivec4/uvec4, uint) +define spir_func <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupBroadcast(float, uint) +define spir_func float @_Z24GroupNonUniformBroadcastifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupBroadcast(vec2, uint) +define spir_func <2 x float> @_Z24GroupNonUniformBroadcastiDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupBroadcast(vec3, uint) +define spir_func <3 x float> @_Z24GroupNonUniformBroadcastiDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z24GroupNonUniformBroadcastiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupBroadcast(vec4, uint) +define spir_func <4 x float> @_Z24GroupNonUniformBroadcastiDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupBroadcast(double, uint) +define spir_func double @_Z24GroupNonUniformBroadcastidi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupBroadcast(dvec2, uint) +define spir_func <2 x double> @_Z24GroupNonUniformBroadcastiDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 
subgroupBroadcast(dvec3, uint)
+define spir_func <3 x double> @_Z24GroupNonUniformBroadcastiDv3_di(i32 %scope, <3 x double> %value, i32 %id)
+{
+    %1 = bitcast <3 x double> %value to <6 x i32>
+    %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> <i32 4, i32 5>
+
+    %4 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %3, i32 %id)
+    %6 = shufflevector <2 x i32> %5, <2 x i32> <i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+
+    %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+    %8 = bitcast <6 x i32> %7 to <3 x double>
+
+    ret <3 x double> %8
+}
+
+; GLSL: dvec4 subgroupBroadcast(dvec4, uint)
+define spir_func <4 x double> @_Z24GroupNonUniformBroadcastiDv4_di(i32 %scope, <4 x double> %value, i32 %id)
+{
+    %1 = bitcast <4 x double> %value to <8 x i32>
+    %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+    %4 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %3, i32 %id)
+
+    %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    %7 = bitcast <8 x i32> %6 to <4 x double>
+
+    ret <4 x double> %7
+}
+
+; GLSL: bool subgroupBroadcast(bool, uint)
+define spir_func i1 @_Z24GroupNonUniformBroadcastibi(i32 %scope, i1 %value, i32 %id)
+{
+    %1 = zext i1 %value to i32
+    %2 = call i32 @_Z24GroupNonUniformBroadcastiii(i32 %scope, i32 %1, i32 %id)
+    %3 = trunc i32 %2 to i1
+
+    ret i1 %3
+}
+
+; GLSL: bvec2 subgroupBroadcast(bvec2, uint)
+define spir_func <2 x i1> @_Z24GroupNonUniformBroadcastiDv2_bi(i32 %scope, <2 x i1> %value, i32 %id)
+{
+    %1 = zext <2 x i1> %value to <2 x i32>
+    %2 = call <2 x i32> @_Z24GroupNonUniformBroadcastiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id)
+    %3 = trunc <2 x i32> %2 to <2 x i1>
+
+    ret <2 x i1> %3
+}
+
+; GLSL: bvec3 subgroupBroadcast(bvec3, uint)
+define spir_func <3 x i1> @_Z24GroupNonUniformBroadcastiDv3_bi(i32 %scope, <3 x i1> %value, i32 %id)
+{
+    %1 = zext <3 x i1> %value to <3 x i32>
+    %2 = call <3 x i32> @_Z24GroupNonUniformBroadcastiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id)
+    %3 = trunc <3 x i32> %2 to <3 x i1>
+
+    ret <3 x i1> %3
+}
+
+; GLSL: bvec4 subgroupBroadcast(bvec4, uint)
+define spir_func <4 x i1> @_Z24GroupNonUniformBroadcastiDv4_bi(i32 %scope, <4 x i1> %value, i32 %id)
+{
+    %1 = zext <4 x i1> %value to <4 x i32>
+    %2 = call <4 x i32> @_Z24GroupNonUniformBroadcastiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id)
+    %3 = trunc <4 x i32> %2 to <4 x i1>
+
+    ret <4 x i1> %3
+}
+
+; GLSL: int/uint subgroupBroadcastFirst(int/uint)
+define spir_func i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %value)
+{
+    %1 = call i32 @_Z26SubgroupFirstInvocationKHRi(i32 %value)
+    ret i32 %1
+}
+
+; GLSL: ivec2/uvec2 subgroupBroadcastFirst(ivec2/uvec2)
+define spir_func <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %value)
+{
+    %1 = extractelement <2 x i32> %value, i32 0
+    %2 = extractelement <2 x i32> %value, i32 1
+
+    %3 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1)
+    %4 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %2)
+
+    %5 = insertelement <2 x i32> undef, i32 %3, i32 0
+    %6 = insertelement <2 x i32> %5, i32 %4, i32 1
+
+    ret <2 x i32> %6
+}
+
+; GLSL: ivec3/uvec3 subgroupBroadcastFirst(ivec3/uvec3)
+define spir_func <3 x i32> @_Z29GroupNonUniformBroadcastFirstiDv3_i(i32 %scope, 
<3 x i32> %value) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1) + %5 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %2) + %6 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %3) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupBroadcastFirst(ivec4/uvec4) +define spir_func <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %value) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1) + %6 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %2) + %7 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %3) + %8 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %4) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupBroadcastFirst(float) +define spir_func float @_Z29GroupNonUniformBroadcastFirstif(i32 %scope, float %value) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupBroadcastFirst(vec2) +define spir_func <2 x float> @_Z29GroupNonUniformBroadcastFirstiDv2_f(i32 %scope, <2 x float> %value) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %1) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupBroadcastFirst(vec3) +define spir_func <3 x float> @_Z29GroupNonUniformBroadcastFirstiDv3_f(i32 %scope, <3 x float> %value) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z29GroupNonUniformBroadcastFirstiDv3_i(i32 %scope, <3 x i32> %1) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupBroadcastFirst(vec4) +define spir_func <4 x float> @_Z29GroupNonUniformBroadcastFirstiDv4_f(i32 %scope, <4 x float> %value) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %1) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupBroadcastFirst(double) +define spir_func double @_Z29GroupNonUniformBroadcastFirstid(i32 %scope, double %value) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %1) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupBroadcastFirst(dvec2) +define spir_func <2 x double> @_Z29GroupNonUniformBroadcastFirstiDv2_d(i32 %scope, <2 x double> %value) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %1) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 subgroupBroadcastFirst(dvec3) +define spir_func <3 x 
double> @_Z29GroupNonUniformBroadcastFirstiDv3_d(i32 %scope, <3 x double> %value)
+{
+    %1 = bitcast <3 x double> %value to <6 x i32>
+    %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> <i32 4, i32 5>
+
+    %4 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %2)
+    %5 = call <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %3)
+    %6 = shufflevector <2 x i32> %5, <2 x i32> <i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+
+    %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+    %8 = bitcast <6 x i32> %7 to <3 x double>
+
+    ret <3 x double> %8
+}
+
+; GLSL: dvec4 subgroupBroadcastFirst(dvec4)
+define spir_func <4 x double> @_Z29GroupNonUniformBroadcastFirstiDv4_d(i32 %scope, <4 x double> %value)
+{
+    %1 = bitcast <4 x double> %value to <8 x i32>
+    %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+    %4 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %2)
+    %5 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %3)
+
+    %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    %7 = bitcast <8 x i32> %6 to <4 x double>
+
+    ret <4 x double> %7
+}
+
+; GLSL: bool subgroupBroadcastFirst(bool)
+define spir_func i1 @_Z29GroupNonUniformBroadcastFirstib(i32 %scope, i1 %value)
+{
+    %1 = zext i1 %value to i32
+    %2 = call i32 @_Z29GroupNonUniformBroadcastFirstii(i32 %scope, i32 %1)
+    %3 = trunc i32 %2 to i1
+
+    ret i1 %3
+}
+
+; GLSL: bvec2 subgroupBroadcastFirst(bvec2)
+define spir_func <2 x i1> @_Z29GroupNonUniformBroadcastFirstiDv2_b(i32 %scope, <2 x i1> %value)
+{
+    %1 = zext <2 x i1> %value to <2 x i32>
+    %2 = call <2 x i32> @_Z29GroupNonUniformBroadcastFirstiDv2_i(i32 %scope, <2 x i32> %1)
+    %3 = trunc <2 x i32> %2 to <2 x i1>
+
+    ret <2 x i1> %3
+}
+
+; GLSL: bvec3 subgroupBroadcastFirst(bvec3)
+define spir_func <3 x i1> @_Z29GroupNonUniformBroadcastFirstiDv3_b(i32 %scope, <3 x i1> %value)
+{
+    %1 = zext <3 x i1> %value to <3 x i32>
+    %2 = call <3 x i32> @_Z29GroupNonUniformBroadcastFirstiDv3_i(i32 %scope, <3 x i32> %1)
+    %3 = trunc <3 x i32> %2 to <3 x i1>
+
+    ret <3 x i1> %3
+}
+
+; GLSL: bvec4 subgroupBroadcastFirst(bvec4)
+define spir_func <4 x i1> @_Z29GroupNonUniformBroadcastFirstiDv4_b(i32 %scope, <4 x i1> %value)
+{
+    %1 = zext <4 x i1> %value to <4 x i32>
+    %2 = call <4 x i32> @_Z29GroupNonUniformBroadcastFirstiDv4_i(i32 %scope, <4 x i32> %1)
+    %3 = trunc <4 x i32> %2 to <4 x i1>
+
+    ret <4 x i1> %3
+}
+
+; GLSL: uvec4 subgroupBallot(bool)
+define spir_func <4 x i32> @_Z21GroupNonUniformBallotib(i32 %scope, i1 %value)
+{
+    %1 = call <4 x i32> @_Z17SubgroupBallotKHRb(i1 %value)
+    ret <4 x i32> %1
+}
+
+; GLSL: bool subgroupInverseBallot(uvec4)
+define spir_func i1 @_Z28GroupNonUniformInverseBallotiDv4_i(i32 %scope, <4 x i32> %value)
+{
+    %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
+    %2 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1) #1
+    %3 = zext i32 %2 to i64
+    %4 = shl i64 1, %3
+
+    %5 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %6 = bitcast <2 x i32> %5 to i64
+    %7 = and i64 %4, %6
+    %8 = icmp ne i64 %7, 0
+
+    ret i1 %8
+}
+
+; GLSL: bool subgroupBallotBitExtract(uvec4, uint)
+define spir_func i1 @_Z31GroupNonUniformBallotBitExtractiDv4_ii(i32 %scope, <4 x i32> %value, i32 %index)
+{
+    %1 = zext i32 %index to i64
+    %2 = shl i64 1, %1
+
+    %3 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %4 = bitcast <2 x i32> %3 to i64
+    %5 = and i64 %2, %4
+    %6 = icmp ne i64 %5, 0
+
+    ret i1 %6
+}
+
+; GLSL: uint subgroupBallotBitCount(uvec4)
+; uint subgroupBallotInclusiveBitCount(uvec4)
+; uint subgroupBallotExclusiveBitCount(uvec4)
+define spir_func i32 @_Z29GroupNonUniformBallotBitCountiiDv4_i(i32 %scope, i32 %operation, <4 x i32> %value)
+{
+    %1 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %2 = bitcast <2 x i32> %1 to i64
+    %3 = extractelement <2 x i32> %1, i32 0
+    %4 = extractelement <2 x i32> %1, i32 1
+
+    switch i32 %operation, label %.default [ i32 0, label %.reduce
+                                             i32 1, label %.inclusive
+                                             i32 2, label %.exclusive ]
+
+.reduce:
+    %5 = call i64 @llvm.ctpop.i64(i64 %2)
+    %6 = trunc i64 %5 to i32
+    ret i32 %6
+
+.inclusive:
+    %7 = call i32 @llvm.amdgcn.mbcnt.lo(i32 %3, i32 0)
+    %8 = call i32 @llvm.amdgcn.mbcnt.hi(i32 %4, i32 %7)
+    %9 = add i32 %8, 1
+
+    %10 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+    %11 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %10)
+    %12 = zext i32 %11 to i64
+    %13 = shl i64 1, %12
+
+    %14 = and i64 %13, %2
+    %15 = icmp ne i64 %14, 0
+    %16 = select i1 %15, i32 %9, i32 %8
+
+    ret i32 %16
+
+.exclusive:
+    %17 = call i32 @llvm.amdgcn.mbcnt.lo(i32 %3, i32 0)
+    %18 = call i32 @llvm.amdgcn.mbcnt.hi(i32 %4, i32 %17)
+
+    ret i32 %18
+
+.default:
+    ret i32 0
+}
+
+; GLSL: uint subgroupBallotFindLSB(uvec4)
+define spir_func i32 @_Z28GroupNonUniformBallotFindLSBiDv4_i(i32 %scope, <4 x i32> %value)
+{
+    %1 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %2 = bitcast <2 x i32> %1 to i64
+
+    %3 = call i64 @llvm.cttz.i64(i64 %2, i1 true)
+    %4 = trunc i64 %3 to i32
+
+    ret i32 %4
+}
+
+; GLSL: uint subgroupBallotFindMSB(uvec4)
+define spir_func i32 @_Z28GroupNonUniformBallotFindMSBiDv4_i(i32 %scope, <4 x i32> %value)
+{
+    %1 = shufflevector <4 x i32> %value, <4 x i32> %value, <2 x i32> <i32 0, i32 1>
+    %2 = bitcast <2 x i32> %1 to i64
+
+    %3 = call i64 @llvm.ctlz.i64(i64 %2, i1 true)
+    %4 = trunc i64 %3 to i32
+    %5 = sub i32 63, %4
+
+    ret i32 %5
+}
+
+; GLSL: int/uint subgroupShuffle(int/uint, uint)
+define spir_func i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %value, i32 %id)
+{
+    %1 = mul i32 %id, 4
+    %2 = call i32 @llvm.amdgcn.ds.bpermute(i32 %1, i32 %value)
+
+    ret i32 %2
+}
+
+; GLSL: ivec2/uvec2 subgroupShuffle(ivec2/uvec2, uint)
+define spir_func <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %value, i32 %id)
+{
+    %1 = extractelement <2 x i32> %value, i32 0
+    %2 = extractelement <2 x i32> %value, i32 1
+
+    %3 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id)
+    %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %2, i32 %id)
+
+    %5 = insertelement <2 x i32> undef, i32 %3, i32 0
+    %6 = insertelement <2 x i32> %5, i32 %4, i32 1
+
+    ret <2 x i32> %6
+}
+
+; GLSL: ivec3/uvec3 subgroupShuffle(ivec3/uvec3, uint)
+define spir_func <3 x i32> @_Z22GroupNonUniformShuffleiDv3_ii(i32 %scope, <3 x i32> %value, i32 %id)
+{
+    %1 = extractelement <3 x i32> %value, i32 0
+    %2 = extractelement <3 x i32> %value, i32 1
+    %3 = extractelement <3 x i32> %value, i32 2
+
+    %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id)
+    %5 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %2, i32 %id)
+    %6 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %3, i32 %id)
+
+    %7 = insertelement <3 x i32> undef, i32 %4, i32 0
+    %8 = insertelement <3 x i32> %7, i32 %5, i32 1
+    %9 = insertelement <3 x i32> %8, i32 %6, i32 2
+
+    ret <3 x i32> %9
+}
+
+; GLSL: ivec4/uvec4 subgroupShuffle(ivec4/uvec4, uint)
+define spir_func <4 x i32> 
@_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupShuffle(float, uint) +define spir_func float @_Z22GroupNonUniformShuffleifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupShuffle(vec2, uint) +define spir_func <2 x float> @_Z22GroupNonUniformShuffleiDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupShuffle(vec3, uint) +define spir_func <3 x float> @_Z22GroupNonUniformShuffleiDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z22GroupNonUniformShuffleiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupShuffle(vec4, uint) +define spir_func <4 x float> @_Z22GroupNonUniformShuffleiDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupShuffle(double, uint) +define spir_func double @_Z22GroupNonUniformShuffleidi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupShuffle(dvec2, uint) +define spir_func <2 x double> @_Z22GroupNonUniformShuffleiDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 subgroupShuffle(dvec3, uint) +define spir_func <3 x double> @_Z22GroupNonUniformShuffleiDv3_di(i32 %scope, <3 x double> %value, i32 %id) +{ + %1 = bitcast <3 x double> %value to <6 x i32> + %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> + %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> + + %4 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id) + %5 = call <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %3, i32 %id) + %6 = shufflevector <2 x i32> %5, <2 x i32> , <4 x i32> + + %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> + %8 = bitcast <6 x i32> %7 to <3 x 
double> + + ret <3 x double> %8 +} + +; GLSL: dvec4 subgroupShuffle(dvec4, uint) +define spir_func <4 x double> @_Z22GroupNonUniformShuffleiDv4_di(i32 %scope, <4 x double> %value, i32 %id) +{ + %1 = bitcast <4 x double> %value to <8 x i32> + %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + + %4 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id) + %5 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %3, i32 %id) + + %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> + %7 = bitcast <8 x i32> %6 to <4 x double> + + ret <4 x double> %7 +} + +; GLSL: bool subgroupShuffle(bool, uint) +define spir_func i1 @_Z22GroupNonUniformShuffleibi(i32 %scope, i1 %value, i32 %id) +{ + %1 = zext i1 %value to i32 + %2 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %1, i32 %id) + %3 = trunc i32 %2 to i1 + + ret i1 %3 +} + +; GLSL: bvec2 subgroupShuffle(bvec2, uint) +define spir_func <2 x i1> @_Z22GroupNonUniformShuffleiDv2_bi(i32 %scope, <2 x i1> %value, i32 %id) +{ + %1 = zext <2 x i1> %value to <2 x i32> + %2 = call <2 x i32> @_Z22GroupNonUniformShuffleiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = trunc <2 x i32> %2 to <2 x i1> + + ret <2 x i1> %3 +} + +; GLSL: bvec3 subgroupShuffle(bvec3, uint) +define spir_func <3 x i1> @_Z22GroupNonUniformShuffleiDv3_bi(i32 %scope, <3 x i1> %value, i32 %id) +{ + %1 = zext <3 x i1> %value to <3 x i32> + %2 = call <3 x i32> @_Z22GroupNonUniformShuffleiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = trunc <3 x i32> %2 to <3 x i1> + + ret <3 x i1> %3 +} + +; GLSL: bvec4 subgroupShuffle(bvec4, uint) +define spir_func <4 x i1> @_Z22GroupNonUniformShuffleiDv4_bi(i32 %scope, <4 x i1> %value, i32 %id) +{ + %1 = zext <4 x i1> %value to <4 x i32> + %2 = call <4 x i32> @_Z22GroupNonUniformShuffleiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = trunc <4 x i32> %2 to <4 x i1> + + ret <4 x i1> %3 +} + +; GLSL: int/uint subgroupShuffleXor(int/uint, uint) +define spir_func i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %value, i32 %mask) +{ + %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %2 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1) + %3 = xor i32 %2, %mask + %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %value, i32 %3) + + ret i32 %4 +} + +; GLSL: ivec2/uvec2 subgroupShuffleXor(ivec2/uvec2, uint) +define spir_func <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %value, i32 %id) +{ + %1 = extractelement <2 x i32> %value, i32 0 + %2 = extractelement <2 x i32> %value, i32 1 + + %3 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %4 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %2, i32 %id) + + %5 = insertelement <2 x i32> undef, i32 %3, i32 0 + %6 = insertelement <2 x i32> %5, i32 %4, i32 1 + + ret <2 x i32> %6 +} + +; GLSL: ivec3/uvec3 subgroupShuffleXor(ivec3/uvec3, uint) +define spir_func <3 x i32> @_Z25GroupNonUniformShuffleXoriDv3_ii(i32 %scope, <3 x i32> %value, i32 %id) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %5 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %2, i32 %id) + %6 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %3, i32 %id) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, 
i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupShuffleXor(ivec4/uvec4, uint) +define spir_func <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupShuffleXor(float, uint) +define spir_func float @_Z25GroupNonUniformShuffleXorifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupShuffleXor(vec2, uint) +define spir_func <2 x float> @_Z25GroupNonUniformShuffleXoriDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupShuffleXor(vec3, uint) +define spir_func <3 x float> @_Z25GroupNonUniformShuffleXoriDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z25GroupNonUniformShuffleXoriDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupShuffleXor(vec4, uint) +define spir_func <4 x float> @_Z25GroupNonUniformShuffleXoriDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupShuffleXor(double, uint) +define spir_func double @_Z25GroupNonUniformShuffleXoridi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupShuffleXor(dvec2, uint) +define spir_func <2 x double> @_Z25GroupNonUniformShuffleXoriDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 subgroupShuffleXor(dvec3, uint) +define spir_func <3 x double> @_Z25GroupNonUniformShuffleXoriDv3_di(i32 %scope, <3 x double> %value, i32 %id) +{ + %1 = bitcast <3 x double> %value to <6 x i32> + %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> + %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> + + %4 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %2, i32 %id) + %5 
= call <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %3, i32 %id) + %6 = shufflevector <2 x i32> %5, <2 x i32> , <4 x i32> + + %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> + %8 = bitcast <6 x i32> %7 to <3 x double> + + ret <3 x double> %8 +} + +; GLSL: dvec4 subgroupShuffleXor(dvec4, uint) +define spir_func <4 x double> @_Z25GroupNonUniformShuffleXoriDv4_di(i32 %scope, <4 x double> %value, i32 %id) +{ + %1 = bitcast <4 x double> %value to <8 x i32> + %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> + + %4 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %2, i32 %id) + %5 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %3, i32 %id) + + %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> + %7 = bitcast <8 x i32> %6 to <4 x double> + + ret <4 x double> %7 +} + +; GLSL: bool subgroupShuffleXor(bool, uint) +define spir_func i1 @_Z25GroupNonUniformShuffleXoribi(i32 %scope, i1 %value, i32 %id) +{ + %1 = zext i1 %value to i32 + %2 = call i32 @_Z25GroupNonUniformShuffleXoriii(i32 %scope, i32 %1, i32 %id) + %3 = trunc i32 %2 to i1 + + ret i1 %3 +} + +; GLSL: bvec2 subgroupShuffleXor(bvec2, uint) +define spir_func <2 x i1> @_Z25GroupNonUniformShuffleXoriDv2_bi(i32 %scope, <2 x i1> %value, i32 %id) +{ + %1 = zext <2 x i1> %value to <2 x i32> + %2 = call <2 x i32> @_Z25GroupNonUniformShuffleXoriDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = trunc <2 x i32> %2 to <2 x i1> + + ret <2 x i1> %3 +} + +; GLSL: bvec3 subgroupShuffleXor(bvec3, uint) +define spir_func <3 x i1> @_Z25GroupNonUniformShuffleXoriDv3_bi(i32 %scope, <3 x i1> %value, i32 %id) +{ + %1 = zext <3 x i1> %value to <3 x i32> + %2 = call <3 x i32> @_Z25GroupNonUniformShuffleXoriDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = trunc <3 x i32> %2 to <3 x i1> + + ret <3 x i1> %3 +} + +; GLSL: bvec4 subgroupShuffleXor(bvec4, uint) +define spir_func <4 x i1> @_Z25GroupNonUniformShuffleXoriDv4_bi(i32 %scope, <4 x i1> %value, i32 %id) +{ + %1 = zext <4 x i1> %value to <4 x i32> + %2 = call <4 x i32> @_Z25GroupNonUniformShuffleXoriDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = trunc <4 x i32> %2 to <4 x i1> + + ret <4 x i1> %3 +} + +; GLSL: int/uint subgroupShuffleUp(int/uint, uint) +define spir_func i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %value, i32 %delta) +{ + %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %2 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1) + %3 = sub i32 %2, %delta + %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %value, i32 %3) + + ret i32 %4 +} + +; GLSL: ivec2/uvec2 subgroupShuffleUp(ivec2/uvec2, uint) +define spir_func <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %value, i32 %id) +{ + %1 = extractelement <2 x i32> %value, i32 0 + %2 = extractelement <2 x i32> %value, i32 1 + + %3 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id) + %4 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %2, i32 %id) + + %5 = insertelement <2 x i32> undef, i32 %3, i32 0 + %6 = insertelement <2 x i32> %5, i32 %4, i32 1 + + ret <2 x i32> %6 +} + +; GLSL: ivec3/uvec3 subgroupShuffleUp(ivec3/uvec3, uint) +define spir_func <3 x i32> @_Z24GroupNonUniformShuffleUpiDv3_ii(i32 %scope, <3 x i32> %value, i32 %id) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 
@_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id) + %5 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %2, i32 %id) + %6 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %3, i32 %id) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupShuffleUp(ivec4/uvec4, uint) +define spir_func <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupShuffleUp(float, uint) +define spir_func float @_Z24GroupNonUniformShuffleUpifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupShuffleUp(vec2, uint) +define spir_func <2 x float> @_Z24GroupNonUniformShuffleUpiDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupShuffleUp(vec3, uint) +define spir_func <3 x float> @_Z24GroupNonUniformShuffleUpiDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z24GroupNonUniformShuffleUpiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupShuffleUp(vec4, uint) +define spir_func <4 x float> @_Z24GroupNonUniformShuffleUpiDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupShuffleUp(double, uint) +define spir_func double @_Z24GroupNonUniformShuffleUpidi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupShuffleUp(dvec2, uint) +define spir_func <2 x double> @_Z24GroupNonUniformShuffleUpiDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x i32> + %2 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <2 x double> + + ret <2 x double> %3 +} + +; GLSL: dvec3 subgroupShuffleUp(dvec3, uint) +define spir_func <3 x double> @_Z24GroupNonUniformShuffleUpiDv3_di(i32 %scope, <3 x double> 
%value, i32 %id)
+{
+    %1 = bitcast <3 x double> %value to <6 x i32>
+    %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> <i32 4, i32 5>
+
+    %4 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %3, i32 %id)
+    %6 = shufflevector <2 x i32> %5, <2 x i32> <i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+
+    %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+    %8 = bitcast <6 x i32> %7 to <3 x double>
+
+    ret <3 x double> %8
+}
+
+; GLSL: dvec4 subgroupShuffleUp(dvec4, uint)
+define spir_func <4 x double> @_Z24GroupNonUniformShuffleUpiDv4_di(i32 %scope, <4 x double> %value, i32 %id)
+{
+    %1 = bitcast <4 x double> %value to <8 x i32>
+    %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+    %4 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %3, i32 %id)
+
+    %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    %7 = bitcast <8 x i32> %6 to <4 x double>
+
+    ret <4 x double> %7
+}
+
+; GLSL: bool subgroupShuffleUp(bool, uint)
+define spir_func i1 @_Z24GroupNonUniformShuffleUpibi(i32 %scope, i1 %value, i32 %id)
+{
+    %1 = zext i1 %value to i32
+    %2 = call i32 @_Z24GroupNonUniformShuffleUpiii(i32 %scope, i32 %1, i32 %id)
+    %3 = trunc i32 %2 to i1
+
+    ret i1 %3
+}
+
+; GLSL: bvec2 subgroupShuffleUp(bvec2, uint)
+define spir_func <2 x i1> @_Z24GroupNonUniformShuffleUpiDv2_bi(i32 %scope, <2 x i1> %value, i32 %id)
+{
+    %1 = zext <2 x i1> %value to <2 x i32>
+    %2 = call <2 x i32> @_Z24GroupNonUniformShuffleUpiDv2_ii(i32 %scope, <2 x i32> %1, i32 %id)
+    %3 = trunc <2 x i32> %2 to <2 x i1>
+
+    ret <2 x i1> %3
+}
+
+; GLSL: bvec3 subgroupShuffleUp(bvec3, uint)
+define spir_func <3 x i1> @_Z24GroupNonUniformShuffleUpiDv3_bi(i32 %scope, <3 x i1> %value, i32 %id)
+{
+    %1 = zext <3 x i1> %value to <3 x i32>
+    %2 = call <3 x i32> @_Z24GroupNonUniformShuffleUpiDv3_ii(i32 %scope, <3 x i32> %1, i32 %id)
+    %3 = trunc <3 x i32> %2 to <3 x i1>
+
+    ret <3 x i1> %3
+}
+
+; GLSL: bvec4 subgroupShuffleUp(bvec4, uint)
+define spir_func <4 x i1> @_Z24GroupNonUniformShuffleUpiDv4_bi(i32 %scope, <4 x i1> %value, i32 %id)
+{
+    %1 = zext <4 x i1> %value to <4 x i32>
+    %2 = call <4 x i32> @_Z24GroupNonUniformShuffleUpiDv4_ii(i32 %scope, <4 x i32> %1, i32 %id)
+    %3 = trunc <4 x i32> %2 to <4 x i1>
+
+    ret <4 x i1> %3
+}
+
+; GLSL: int/uint subgroupShuffleDown(int/uint, uint)
+define spir_func i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %value, i32 %delta)
+{
+    %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+    %2 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %1)
+    %3 = add i32 %2, %delta
+    %4 = call i32 @_Z22GroupNonUniformShuffleiii(i32 %scope, i32 %value, i32 %3)
+
+    ret i32 %4
+}
+
+; GLSL: ivec2/uvec2 subgroupShuffleDown(ivec2/uvec2, uint)
+define spir_func <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %value, i32 %id)
+{
+    %1 = extractelement <2 x i32> %value, i32 0
+    %2 = extractelement <2 x i32> %value, i32 1
+
+    %3 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id)
+    %4 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %2, i32 %id)
+
+    %5 = insertelement <2 x i32> undef, i32 %3, i32 0
+    %6 = insertelement <2 x i32> %5, i32 %4, i32 1
+
+    ret <2 x i32> %6
+}
+
+; GLSL: ivec3/uvec3 subgroupShuffleDown(ivec3/uvec3, uint)
+define 
spir_func <3 x i32> @_Z26GroupNonUniformShuffleDowniDv3_ii(i32 %scope, <3 x i32> %value, i32 %id) +{ + %1 = extractelement <3 x i32> %value, i32 0 + %2 = extractelement <3 x i32> %value, i32 1 + %3 = extractelement <3 x i32> %value, i32 2 + + %4 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id) + %5 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %2, i32 %id) + %6 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %3, i32 %id) + + %7 = insertelement <3 x i32> undef, i32 %4, i32 0 + %8 = insertelement <3 x i32> %7, i32 %5, i32 1 + %9 = insertelement <3 x i32> %8, i32 %6, i32 2 + + ret <3 x i32> %9 +} + +; GLSL: ivec4/uvec4 subgroupShuffleDown(ivec4/uvec4, uint) +define spir_func <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %value, i32 %id) +{ + %1 = extractelement <4 x i32> %value, i32 0 + %2 = extractelement <4 x i32> %value, i32 1 + %3 = extractelement <4 x i32> %value, i32 2 + %4 = extractelement <4 x i32> %value, i32 3 + + %5 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id) + %6 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %2, i32 %id) + %7 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %3, i32 %id) + %8 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %4, i32 %id) + + %9 = insertelement <4 x i32> undef, i32 %5, i32 0 + %10 = insertelement <4 x i32> %9, i32 %6, i32 1 + %11 = insertelement <4 x i32> %10, i32 %7, i32 2 + %12 = insertelement <4 x i32> %11, i32 %8, i32 3 + + ret <4 x i32> %12 +} + +; GLSL: float subgroupShuffleDown(float, uint) +define spir_func float @_Z26GroupNonUniformShuffleDownifi(i32 %scope, float %value, i32 %id) +{ + %1 = bitcast float %value to i32 + %2 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id) + %3 = bitcast i32 %2 to float + + ret float %3 +} + +; GLSL: vec2 subgroupShuffleDown(vec2, uint) +define spir_func <2 x float> @_Z26GroupNonUniformShuffleDowniDv2_fi(i32 %scope, <2 x float> %value, i32 %id) +{ + %1 = bitcast <2 x float> %value to <2 x i32> + %2 = call <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to <2 x float> + + ret <2 x float> %3 +} + +; GLSL: vec3 subgroupShuffleDown(vec3, uint) +define spir_func <3 x float> @_Z26GroupNonUniformShuffleDowniDv3_fi(i32 %scope, <3 x float> %value, i32 %id) +{ + %1 = bitcast <3 x float> %value to <3 x i32> + %2 = call <3 x i32> @_Z26GroupNonUniformShuffleDowniDv3_ii(i32 %scope, <3 x i32> %1, i32 %id) + %3 = bitcast <3 x i32> %2 to <3 x float> + + ret <3 x float> %3 +} + +; GLSL: vec4 subgroupShuffleDown(vec4, uint) +define spir_func <4 x float> @_Z26GroupNonUniformShuffleDowniDv4_fi(i32 %scope, <4 x float> %value, i32 %id) +{ + %1 = bitcast <4 x float> %value to <4 x i32> + %2 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %1, i32 %id) + %3 = bitcast <4 x i32> %2 to <4 x float> + + ret <4 x float> %3 +} + +; GLSL: double subgroupShuffleDown(double, uint) +define spir_func double @_Z26GroupNonUniformShuffleDownidi(i32 %scope, double %value, i32 %id) +{ + %1 = bitcast double %value to <2 x i32> + %2 = call <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %1, i32 %id) + %3 = bitcast <2 x i32> %2 to double + + ret double %3 +} + +; GLSL: dvec2 subgroupShuffleDown(dvec2, uint) +define spir_func <2 x double> @_Z26GroupNonUniformShuffleDowniDv2_di(i32 %scope, <2 x double> %value, i32 %id) +{ + %1 = bitcast <2 x double> %value to <4 x 
i32>
+    %2 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %1, i32 %id)
+    %3 = bitcast <4 x i32> %2 to <2 x double>
+
+    ret <2 x double> %3
+}
+
+; GLSL: dvec3 subgroupShuffleDown(dvec3, uint)
+define spir_func <3 x double> @_Z26GroupNonUniformShuffleDowniDv3_di(i32 %scope, <3 x double> %value, i32 %id)
+{
+    %1 = bitcast <3 x double> %value to <6 x i32>
+    %2 = shufflevector <6 x i32> %1, <6 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <6 x i32> %1, <6 x i32> %1, <2 x i32> <i32 4, i32 5>
+
+    %4 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %3, i32 %id)
+    %6 = shufflevector <2 x i32> %5, <2 x i32> <i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+
+    %7 = shufflevector <4 x i32> %4, <4 x i32> %6, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+    %8 = bitcast <6 x i32> %7 to <3 x double>
+
+    ret <3 x double> %8
+}
+
+; GLSL: dvec4 subgroupShuffleDown(dvec4, uint)
+define spir_func <4 x double> @_Z26GroupNonUniformShuffleDowniDv4_di(i32 %scope, <4 x double> %value, i32 %id)
+{
+    %1 = bitcast <4 x double> %value to <8 x i32>
+    %2 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %3 = shufflevector <8 x i32> %1, <8 x i32> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+    %4 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %2, i32 %id)
+    %5 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %3, i32 %id)
+
+    %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    %7 = bitcast <8 x i32> %6 to <4 x double>
+
+    ret <4 x double> %7
+}
+
+; GLSL: bool subgroupShuffleDown(bool, uint)
+define spir_func i1 @_Z26GroupNonUniformShuffleDownibi(i32 %scope, i1 %value, i32 %id)
+{
+    %1 = zext i1 %value to i32
+    %2 = call i32 @_Z26GroupNonUniformShuffleDowniii(i32 %scope, i32 %1, i32 %id)
+    %3 = trunc i32 %2 to i1
+
+    ret i1 %3
+}
+
+; GLSL: bvec2 subgroupShuffleDown(bvec2, uint)
+define spir_func <2 x i1> @_Z26GroupNonUniformShuffleDowniDv2_bi(i32 %scope, <2 x i1> %value, i32 %id)
+{
+    %1 = zext <2 x i1> %value to <2 x i32>
+    %2 = call <2 x i32> @_Z26GroupNonUniformShuffleDowniDv2_ii(i32 %scope, <2 x i32> %1, i32 %id)
+    %3 = trunc <2 x i32> %2 to <2 x i1>
+
+    ret <2 x i1> %3
+}
+
+; GLSL: bvec3 subgroupShuffleDown(bvec3, uint)
+define spir_func <3 x i1> @_Z26GroupNonUniformShuffleDowniDv3_bi(i32 %scope, <3 x i1> %value, i32 %id)
+{
+    %1 = zext <3 x i1> %value to <3 x i32>
+    %2 = call <3 x i32> @_Z26GroupNonUniformShuffleDowniDv3_ii(i32 %scope, <3 x i32> %1, i32 %id)
+    %3 = trunc <3 x i32> %2 to <3 x i1>
+
+    ret <3 x i1> %3
+}
+
+; GLSL: bvec4 subgroupShuffleDown(bvec4, uint)
+define spir_func <4 x i1> @_Z26GroupNonUniformShuffleDowniDv4_bi(i32 %scope, <4 x i1> %value, i32 %id)
+{
+    %1 = zext <4 x i1> %value to <4 x i32>
+    %2 = call <4 x i32> @_Z26GroupNonUniformShuffleDowniDv4_ii(i32 %scope, <4 x i32> %1, i32 %id)
+    %3 = trunc <4 x i32> %2 to <4 x i1>
+
+    ret <4 x i1> %3
+}
+
+; GLSL: ivec/uvec subgroupAdd(ivec/uvec)
+; ivec/uvec subgroupInclusiveAdd(ivec/uvec)
+; ivec/uvec subgroupExclusiveAdd(ivec/uvec)
+
+; GLSL: vec subgroupAdd(vec)
+; vec subgroupInclusiveAdd(vec)
+; vec subgroupExclusiveAdd(vec)
+
+; GLSL: dvec subgroupAdd(dvec)
+; dvec subgroupInclusiveAdd(dvec)
+; dvec subgroupExclusiveAdd(dvec)
+
+; GLSL: ivec/uvec subgroupMul(ivec/uvec)
+; ivec/uvec subgroupInclusiveMul(ivec/uvec)
+; ivec/uvec subgroupExclusiveMul(ivec/uvec)
+
+; GLSL: vec subgroupMul(vec)
+; vec subgroupInclusiveMul(vec)
+; vec subgroupExclusiveMul(vec)
+
+; GLSL: dvec subgroupMul(dvec)
+; dvec subgroupInclusiveMul(dvec)
+; dvec 
subgroupExclusiveMul(dvec) + +; GLSL: ivec subgroupMin(ivec) +; ivec subgroupInclusiveMin(ivec) +; ivec subgroupExclusiveMin(ivec) + +; GLSL: uvec subgroupMin(uvec) +; uvec subgroupInclusiveMin(uvec) +; uvec subgroupExclusiveMin(uvec) + +; GLSL: vec subgroupMin(vec) +; vec subgroupInclusiveMin(vec) +; vec subgroupExclusiveMin(vec) + +; GLSL: dvec subgroupMin(dvec) +; dvec subgroupInclusiveMin(dvec) +; dvec subgroupExclusiveMin(dvec) + +; GLSL: ivec subgroupMax(ivec) +; ivec subgroupInclusiveMax(ivec) +; ivec subgroupExclusiveMax(ivec) + +; GLSL: uvec subgroupMax(uvec) +; uvec subgroupInclusiveMax(uvec) +; uvec subgroupExclusiveMax(uvec) + +; GLSL: vec subgroupMax(vec) +; vec subgroupInclusiveMax(vec) +; vec subgroupExclusiveMax(vec) + +; GLSL: dvec subgroupMax(dvec) +; dvec subgroupInclusiveMax(dvec) +; dvec subgroupExclusiveMax(dvec) + +; GLSL: ivec/uvec subgroupAnd(ivec/uvec) +; ivec/uvec subgroupInclusiveAnd(ivec/uvec) +; ivec/uvec subgroupExclusiveAnd(ivec/uvec) + +; GLSL: ivec/uvec subgroupOr(ivec/uvec) +; ivec/uvec subgroupInclusiveOr(ivec/uvec) +; ivec/uvec subgroupExclusiveOr(ivec/uvec) + +; GLSL: ivec/uvec subgroupXor(ivec/uvec) +; ivec/uvec subgroupInclusiveXor(ivec/uvec) +; ivec/uvec subgroupExclusiveXor(ivec/uvec) + +; GLSL: bvec subgroupAnd(bvec) +; bvec subgroupInclusiveAnd(bvec) +; bvec subgroupExclusiveAnd(bvec) + +; GLSL: bvec subgroupOr(bvec) +; bvec subgroupInclusiveOr(bvec) +; bvec subgroupExclusiveOr(bvec) + +; GLSL: bvec subgroupXor(bvec) +; bvec subgroupInclusiveXor(bvec) +; bvec subgroupExclusiveXor(bvec) + +; GLSL: gvec subgroupQuadBroadcast(gvec, uint) + +; GLSL: gvec subgroupQuadSwapHorizontal(gvec) +; gvec subgroupQuadSwapVertical(gvec) +; gvec subgroupQuadSwapDiagonal(gvec) + ; ===================================================================================================================== ; >>> Interpolation Functions ; ===================================================================================================================== @@ -483,6 +2060,9 @@ declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #2 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 declare i64 @llvm.cttz.i64(i64, i1) #0 +declare i64 @llvm.ctlz.i64(i64, i1) #0 +declare i64 @llvm.ctpop.i64(i64) #0 +declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/icd/api/llpc/patch/generate/glslSpecialOpEmuF16.ll b/icd/api/llpc/patch/generate/glslSpecialOpEmuF16.ll index 6eaaac95..9849eeb4 100644 --- a/icd/api/llpc/patch/generate/glslSpecialOpEmuF16.ll +++ b/icd/api/llpc/patch/generate/glslSpecialOpEmuF16.ll @@ -1,18 +1,27 @@ -;********************************************************************************************************************** -;* -;* Trade secret of Advanced Micro Devices, Inc. -;* Copyright (c) 2018, Advanced Micro Devices, Inc., (unpublished) -;* -;* All rights reserved. This notice is intended as a precaution against inadvertent publication and does not imply -;* publication or any waiver of confidentiality. The year included in the foregoing notice is the year of creation of -;* the work. 
-;* -;********************************************************************************************************************** - -;********************************************************************************************************************** -;* @file glslSpecialOpEmuF16.ll -;* @brief LLPC LLVM-IR file: contains emulation codes for GLSL special graphics-specific operations (float16). -;********************************************************************************************************************** +;; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; + ; Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + ; + ; Permission is hereby granted, free of charge, to any person obtaining a copy + ; of this software and associated documentation files (the "Software"), to deal + ; in the Software without restriction, including without limitation the rights + ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + ; copies of the Software, and to permit persons to whom the Software is + ; furnished to do so, subject to the following conditions: + ; + ; The above copyright notice and this permission notice shall be included in all + ; copies or substantial portions of the Software. + ; + ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + ; SOFTWARE. + ; + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" target triple = "spir64-unknown-unknown" diff --git a/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.py b/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.py index bd958a69..5ceb4471 100644 --- a/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.py +++ b/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.py @@ -40,7 +40,9 @@ LLVM_DECLS = {} # Image opcode traits are encoded in function name using these tokens -SPIRV_IMAGE_PREFIX = "llpc.image" +SPIRV_IMAGE_PREFIX = "llpc" +SPIRV_IMAGE_MODIFIER = "image" +SPIRV_IMAGE_SPARSE_MODIFIER = "sparse" SPIRV_IMAGE_OPERAND_DREF_MODIFIER = "dref" SPIRV_IMAGE_OPERAND_PROJ_MODIFIER = "proj" SPIRV_IMAGE_OPERAND_BIAS_MODIFIER = "bias" @@ -259,7 +261,8 @@ def copyFrom(self, other): self._cubeId = other._cubeId self._atomicData = other._atomicData - self._lzOptimization = other._lzOptimization + self._supportLzOptimization = other._supportLzOptimization + self._supportSparse = other._supportSparse self._mangledName = other._mangledName pass @@ -303,7 +306,8 @@ def __init__(self, mangledName, sampledType): # For zero-LOD optimization, will generate 2 version of function, a lz optimized version which uses # zero-LOD instruction, and a normal version uses lod instruction. 
- self._lzOptimization = (mangledName.find(SPIRV_IMAGE_OPERAND_LODLODZ_MODIFIER) != -1) + self._supportLzOptimization = (mangledName.find(SPIRV_IMAGE_OPERAND_LODLODZ_MODIFIER) != -1) + self._supportSparse = (mangledName.find(SPIRV_IMAGE_SPARSE_MODIFIER) != -1) self._mangledName = mangledName pass @@ -312,13 +316,13 @@ def parse(self): # Gets each image opcode trait token from function's mangled name tokens = self._mangledName.split('.') # Parses SpirvImageOpKind - opKind = tokens[1] + opKind = tokens[2] assert opKind in SPIRV_IMAGE_INST_KIND_DICT, "Error: " + self._mangledName self._opKind = SPIRV_IMAGE_INST_KIND_DICT[opKind] self._attr = SPIRV_IMAGE_INST_KIND_ATTR_DICT[self._opKind] # Parses dimension - dimName = tokens[2] + dimName = tokens[3] arrayed = False if dimName.find(SPIRV_IMAGE_ARRAY_MODIFIER) != -1: arrayed = True @@ -328,7 +332,7 @@ def parse(self): self._arrayed = arrayed # Parses other traits - for t in tokens[3:]: + for t in tokens[4:]: if t == SPIRV_IMAGE_OPERAND_DREF_MODIFIER: self._hasDref = True elif t == SPIRV_IMAGE_OPERAND_PROJ_MODIFIER: @@ -400,10 +404,29 @@ def __init__(self, funcDefBase, gfxLevel): self._gfxLevel = gfxLevel pass - # Generates image function implementation, will detect zero-LOD optimization and generate both normal - # and optimized version. + # Start code generation def gen(self, irOut): - if self._lzOptimization: + self._genWithSparse(irOut) + pass + + # Generate both normal and sparse version + def _genWithSparse(self, irOut): + if self._supportSparse: + # Generate sparse version + codeGen = CodeGen(self, self._gfxLevel) + codeGen._genWithLzOptimization(irOut) + + # Turn off sparse support + self._supportSparse = False + + # Generate normal version + codeGen = CodeGen(self, self._gfxLevel) + codeGen._genWithLzOptimization(irOut) + pass + + # Generate both normal and zero-LOD optimized version + def _genWithLzOptimization(self, irOut): + if self._supportLzOptimization: # Generate zero-LOD optimized version self._mangledName = self._mangledName.replace(SPIRV_IMAGE_OPERAND_LODLODZ_MODIFIER, SPIRV_IMAGE_OPERAND_LODZ_MODIFIER) @@ -411,21 +434,21 @@ def gen(self, irOut): codeGen._genInternal(irOut) # Turn off zero-LOD optimization - self._lzOptimization = False + self._supportLzOptimization = False self._mangledName = self._mangledName.replace(SPIRV_IMAGE_OPERAND_LODZ_MODIFIER, SPIRV_IMAGE_OPERAND_LOD_MODIFIER) - + # Generate normal version codeGen = CodeGen(self, self._gfxLevel) codeGen._genInternal(irOut) pass # Generates image function implementation. def _genInternal(self, irOut): - retType = self._opKind == SpirvImageOpKind.write and "void" or self.getReturnType() + retType = self._supportSparse and self.getSparseReturnType(self.getReturnType()) or self.getReturnType() irFuncDef = "define %s @%s(%s) %s\n" % (retType, - self.getFunctionName(), - self.getParamList(), - self._attr) + self.getFunctionName(), + self.getParamList(), + self._attr) irOut.write(irFuncDef) irOut.write('{\n') self.genLoadSamplerAndResource(irOut) @@ -478,31 +501,46 @@ def _genInternal(self, irOut): # Gets return type of image operation, which is type of texel. 
def getReturnType(self): - if self.isAtomicOp(): - return self._sampledType == SpirvSampledType.f32 and "float" or "i32" + ret = "void" + if self._opKind == SpirvImageOpKind.write: + pass + elif self.isAtomicOp(): + ret = self._sampledType == SpirvSampledType.f32 and "float" or "i32" elif self._opKind == SpirvImageOpKind.querylod: - return "<2 x float>" + ret = "<2 x float>" elif self._hasDref and self._opKind != SpirvImageOpKind.gather: assert self._sampledType == SpirvSampledType.f32 - return "float" + ret = "float" else: if self._sampledType == SpirvSampledType.f32: - return "<4 x float>" + ret = "<4 x float>" elif self._sampledType == SpirvSampledType.i32: - return "<4 x i32>" + ret = "<4 x i32>" elif self._sampledType == SpirvSampledType.u32: - return "<4 x i32>" + ret = "<4 x i32>" else: shouldNeverCall() + return ret + + def getSparseReturnType(self, dataReturnType): + assert self._supportSparse + return "{ i32, %s }" % (dataReturnType) + # Gets image function name. def getFunctionName(self): tokens = self._mangledName.split('.') - tokens[0] = SPIRV_IMAGE_PREFIX - assert tokens[2].startswith(SPIRV_IMAGE_DIM_PREFIX) - tokens[2] = tokens[2][len(SPIRV_IMAGE_DIM_PREFIX):] + assert tokens[0] == SPIRV_IMAGE_PREFIX + assert tokens[3].startswith(SPIRV_IMAGE_DIM_PREFIX) + + # Setup image sparse modifier in function name + tokens[1] = self._supportSparse and SPIRV_IMAGE_MODIFIER + SPIRV_IMAGE_SPARSE_MODIFIER \ + or SPIRV_IMAGE_MODIFIER + + # Remove dim prefix in function name + tokens[3] = tokens[3][len(SPIRV_IMAGE_DIM_PREFIX):] sampledTypeName = rFind(SPIRV_SAMPLED_TYPE_DICT, self._sampledType) - tokens.insert(2, sampledTypeName) + tokens.insert(3, sampledTypeName) funcName = '.'.join(tokens) # For atomic operations, atomic.xxx has been changed to atomic_xxx to ease python process, @@ -652,8 +690,21 @@ def processReturn(self, retVal, intrinGen, irOut): irOut.write(" %s = extractelement %s %s, i32 0\n" % (retVal, \ intrinGen.getBackendRetType(), \ oldRetVal)) - retType = self._opKind == SpirvImageOpKind.write and "void" or self.getReturnType() - irOut.write(" ret %s %s\n" % (retType, retVal)) + retType = self.getReturnType() + + if self._supportSparse: + # Return value of sparse instruction is struct + sparseRetType = self.getSparseReturnType(retType) + tempRetVal = self.acquireLocalVar() + irOut.write(" %s = insertvalue %s undef, i32 1, 0\n" % (tempRetVal, sparseRetType)) + dataRetVal = retVal + retVal = self.acquireLocalVar() + irOut.write(" %s = insertvalue %s %s, %s %s, 1\n" % (retVal, sparseRetType, tempRetVal, retType, dataRetVal)) + irOut.write(" ret %s %s\n" % (sparseRetType, retVal)) + pass + else: + irOut.write(" ret %s %s\n" % (retType, retVal)) + pass # Generates coordinate parameters. 
def genCoord(self, irOut): @@ -1154,7 +1205,7 @@ def genFillVAddrReg(self, constOffsetsIndex, isFetchingFromFmask, irOut): index += 1 irOut.write(ret[1]) - if self._hasLod and not self._lzOptimization: + if self._hasLod and not self._supportLzOptimization: ret = self.getInsertElement(vaddrReg, vaddrRegType, vaddrRegCompType, self._lod, index) vaddrReg = ret[0] index += 1 @@ -1168,7 +1219,7 @@ def getVAddrRegSize(self): size += 1 if self._hasBias: size += 1 - if self._hasLod and not self._lzOptimization: + if self._hasLod and not self._supportLzOptimization: size += 1 if self._hasGrad: size += self.getCoordNumComponents(False, False, False) * 2 @@ -1839,9 +1890,9 @@ def getFuncName(self): if self._hasBias: funcName += ".b" - elif self._hasLod and not self._lzOptimization: + elif self._hasLod and not self._supportLzOptimization: funcName += ".l" - elif self._hasLod and self._lzOptimization: + elif self._hasLod and self._supportLzOptimization: funcName += ".lz" elif self._hasGrad: funcName += ".d" @@ -1870,7 +1921,11 @@ def processLine(irOut, funcConfig, gfxLevel): # A mangled function configuration looks like: # llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.bias.constoffset # Supported configuration tokens: (All tokens must follow this order) - # 1. One of: (mandatory) + # 0. llpc (mandatory) + # 1. image or image|sparse (mandatory) + # image|sparse means sparse instruction is supported for this function, an additional sparse version + # will be generated. + # 2. One of: (mandatory) # sample # fetch # gather @@ -1879,7 +1934,7 @@ def processLine(irOut, funcConfig, gfxLevel): # write # atomic.exchange # atomic.compExchange - # atomic_iincrement + # atomic.iincrement # atomic.idecrement # atomic.iadd # atomic.isub @@ -1890,40 +1945,47 @@ def processLine(irOut, funcConfig, gfxLevel): # atomic.and # atomic.or # atomic.xor - # 2. Dimension string (mandatory, see below) - # 3. proj (optional) - # 4. dref (optional) - # 5. bias (optional) - # 6. lod (optional) - # 7. grad (optional) - # 8. constoffset (optional) - # 9. offset (optional) - # 10. constoffsets (optional) - # 11. sample (optional) - # 12. minlod (optional) - # 13. fmaskbased (optional) - # 14. fmaskonly (optional) + # 3. Dimension string (mandatory, see below) + # 4. proj (optional) + # 5. dref (optional) + # 6. bias (optional) + # 7. lod or lod|lodz (optional) + # lod|lodz means lz optimization is enabled for this function, besides normal lod version, an additional + # lodz version will also be generated, which leverages hardware lz instructions. + # 8. grad (optional) + # 9. constoffset (optional) + # 10. offset (optional) + # 11. constoffsets (optional) + # 12. sample (optional) + # 13. minlod (optional) + # 14. fmaskbased (optional) + # 15. fmaskonly (optional) # Dimension string: All supported dimensions are packed in a dimension string, as a configuration token. 
# Dimension string format: - # Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray|Rect|Buffer + # Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray|Rect|Buffer|SubpassData print(">>> %s" % (funcConfig)) - assert funcConfig.startswith(SPIRV_IMAGE_PREFIX), "Error: " + funcConfig # For atomic operations, replace atomic.xxx to atomic_xxx to ease python process funcConfig = funcConfig.replace('atomic.', 'atomic_') - funcConfig = funcConfig[(len(SPIRV_IMAGE_PREFIX)):] mangledTokens = funcConfig.split('.') + # check token 0 + assert funcConfig.startswith(SPIRV_IMAGE_PREFIX), "Error prefix: " + funcConfig + + # check token 1 + assert mangledTokens[1] in (SPIRV_IMAGE_MODIFIER, SPIRV_IMAGE_MODIFIER + '|' + SPIRV_IMAGE_SPARSE_MODIFIER), \ + "Error image modifier" + funcConfig + # Extract dimensions from dimension string - dimString = mangledTokens[2] + dimString = mangledTokens[3] assert dimString.startswith(SPIRV_IMAGE_DIM_PREFIX), "" + dimString dims = dimString[3:].split('|') - opKind = SPIRV_IMAGE_INST_KIND_DICT[mangledTokens[1]] + opKind = SPIRV_IMAGE_INST_KIND_DICT[mangledTokens[2]] # Generate function definition for each dimension for dim in dims: - mangledTokens[2] = SPIRV_IMAGE_DIM_PREFIX + dim + mangledTokens[3] = SPIRV_IMAGE_DIM_PREFIX + dim mangledName = '.'.join(mangledTokens) if opKind in (SpirvImageOpKind.sample, SpirvImageOpKind.fetch, SpirvImageOpKind.gather, \ SpirvImageOpKind.querylod, SpirvImageOpKind.read, SpirvImageOpKind.write, \ diff --git a/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.txt b/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.txt index 5726b7c6..4b345708 100644 --- a/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.txt +++ b/icd/api/llpc/patch/generate/script/genGlslImageOpEmuCode.txt @@ -1,6 +1,10 @@ # Configuration to generate LLVM IR implementation for all SPIR-V image instructions # Supported configuration tokens: (All tokens must follow this order) -# 1. One of: (mandatory) +# 0. llpc (mandatory) +# 1. image or image|sparse (mandatory) +# image|sparse means sparse instruction is supported for this function, an additional sparse version +# will be generated. +# 2. One of: (mandatory) # sample # fetch # gather @@ -20,21 +24,21 @@ # atomic.and # atomic.or # atomic.xor -# 2. Dimension string (mandatory, see below) -# 3. proj (optional) -# 4. dref (optional) -# 5. bias (optional) -# 6. lod or lod|lodz (optional) +# 3. Dimension string (mandatory, see below) +# 4. proj (optional) +# 5. dref (optional) +# 6. bias (optional) +# 7. lod or lod|lodz (optional) # lod|lodz means lz optimization is enabled for this function, besides normal lod version, an additional # lodz version will also be generated, which leverages hardware lz instructions. -# 7. grad (optional) -# 8. constoffset (optional) -# 9. offset (optional) -# 10. constoffsets (optional) -# 11. sample (optional) -# 12. minlod (optional) -# 13. fmaskbased (optional) -# 14. fmaskonly (optional) +# 8. grad (optional) +# 9. constoffset (optional) +# 10. offset (optional) +# 11. constoffsets (optional) +# 12. sample (optional) +# 13. minlod (optional) +# 14. fmaskbased (optional) +# 15. fmaskonly (optional) # Dimension string: All supported dimensions are packed in a dimension string, as a configuration token. 
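# Illustrative example (assumed, not an entry from the list below): a configuration line such as
#     llpc.image|sparse.fetch.Dim1D|2D.lod
# is expanded once per dimension, and the "image|sparse" modifier additionally produces a sparse
# variant of each expansion, giving emulation functions named roughly
#     llpc.image.fetch.f32.1D.lod        llpc.imagesparse.fetch.f32.1D.lod
#     llpc.image.fetch.f32.2D.lod        llpc.imagesparse.fetch.f32.2D.lod
# (likewise for the other sampled types), where the sparse variants return the residency struct
# described for getSparseReturnType() in the generator script.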
# Dimension string format: # Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray|Rect|Buffer|SubpassData @@ -42,66 +46,66 @@ # llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.bias.constoffset # Sampling instructions -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray.dref -llpc.image.sample.Dim1D|2D|3D|Rect.proj -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.lod|lodz -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.bias -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.dref.lod|lodz -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Cube.dref.bias -llpc.image.sample.Dim1D|2D|3D|Rect.proj.lod|lodz -llpc.image.sample.Dim1D|2D|3D|Rect.proj.bias -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.lod|lodz -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.bias +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray.dref +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.lod|lodz +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.bias +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Cube|CubeArray.dref.lod|lodz +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Cube.dref.bias +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.lod|lodz +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.bias +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref.lod|lodz +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref.bias -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray.grad -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube.dref.grad -llpc.image.sample.Dim1D|2D|3D|Rect.proj.grad -llpc.image.sample.Dim1D|2D|3D.proj.dref.grad +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube|CubeArray.grad +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect|Cube.dref.grad +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.grad +llpc.image|sparse.sample.Dim1D|2D|3D.proj.dref.grad -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.dref.constoffset -llpc.image.sample.Dim1D|2D|3D|Rect.proj.constoffset -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray.lod|lodz.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray.bias.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray.dref.lod|lodz.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray.dref.bias.constoffset -llpc.image.sample.Dim1D|2D|3D.proj.lod|lodz.constoffset -llpc.image.sample.Dim1D|2D|3D.proj.bias.constoffset -llpc.image.sample.Dim1D|2D|3D.proj.dref.lod|lodz.constoffset -llpc.image.sample.Dim1D|2D|3D.proj.dref.bias.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.dref.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray.lod|lodz.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray.bias.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray.dref.lod|lodz.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray.dref.bias.constoffset 
+llpc.image|sparse.sample.Dim1D|2D|3D.proj.lod|lodz.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D.proj.bias.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D.proj.dref.lod|lodz.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D.proj.dref.bias.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.grad.constoffset -llpc.image.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.dref.grad.constoffset -llpc.image.sample.Dim1D|2D|3D|Rect.proj.grad.constoffset -llpc.image.sample.Dim1D|2D|3D|Rect.proj.dref.grad.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.grad.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|1DArray|2DArray|Rect.dref.grad.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.grad.constoffset +llpc.image|sparse.sample.Dim1D|2D|3D|Rect.proj.dref.grad.constoffset # Fetch instructions -llpc.image.fetch.Dim1D|2D|3D|1DArray|2DArray.lod -llpc.image.fetch.Dim1D|2D|3D|1DArray|2DArray|Rect|Buffer -llpc.image.fetch.Dim1D|2D|3D|1DArray|2DArray.lod.constoffset -llpc.image.fetch.DimRect.constoffset -llpc.image.fetch.Dim2D|2DArray.sample -llpc.image.fetch.Dim2D|2DArray.sample.fmaskbased +llpc.image|sparse.fetch.Dim1D|2D|3D|1DArray|2DArray.lod +llpc.image|sparse.fetch.Dim1D|2D|3D|1DArray|2DArray|Rect|Buffer +llpc.image|sparse.fetch.Dim1D|2D|3D|1DArray|2DArray.lod.constoffset +llpc.image|sparse.fetch.DimRect.constoffset +llpc.image|sparse.fetch.Dim2D|2DArray.sample +llpc.image|sparse.fetch.Dim2D|2DArray.sample.fmaskbased llpc.image.fetch.Dim2D|2DArray.sample.fmaskonly # Gather instructions -llpc.image.gather.Dim2D|2DArray|Cube|CubeArray|Rect -llpc.image.gather.Dim2D|2DArray|Cube|CubeArray|Rect.dref -llpc.image.gather.Dim2D|2DArray|Rect.constoffset -llpc.image.gather.Dim2D|2DArray|Rect.dref.constoffset -llpc.image.gather.Dim2D|2DArray|Rect.offset -llpc.image.gather.Dim2D|2DArray|Rect.dref.offset -llpc.image.gather.Dim2D|2DArray|Rect.constoffsets -llpc.image.gather.Dim2D|2DArray|Rect.dref.constoffsets +llpc.image|sparse.gather.Dim2D|2DArray|Cube|CubeArray|Rect +llpc.image|sparse.gather.Dim2D|2DArray|Cube|CubeArray|Rect.dref +llpc.image|sparse.gather.Dim2D|2DArray|Rect.constoffset +llpc.image|sparse.gather.Dim2D|2DArray|Rect.dref.constoffset +llpc.image|sparse.gather.Dim2D|2DArray|Rect.offset +llpc.image|sparse.gather.Dim2D|2DArray|Rect.dref.offset +llpc.image|sparse.gather.Dim2D|2DArray|Rect.constoffsets +llpc.image|sparse.gather.Dim2D|2DArray|Rect.dref.constoffsets # Image read and write instructions -llpc.image.read.Dim1D|2D|3D|Rect|Cube|Buffer|1DArray|2DArray|CubeArray|SubpassData|SubpassDataArray -llpc.image.read.Dim2D|2DArray|SubpassData.sample -llpc.image.read.DimSubpassData.sample.fmaskbased -llpc.image.read.Dim1D|2D|3D|Cube|1DArray|2DArray|CubeArray.lod +llpc.image|sparse.read.Dim1D|2D|3D|Rect|Cube|Buffer|1DArray|2DArray|CubeArray|SubpassData|SubpassDataArray +llpc.image|sparse.read.Dim2D|2DArray|SubpassData.sample +llpc.image|sparse.read.DimSubpassData.sample.fmaskbased +llpc.image|sparse.read.Dim1D|2D|3D|Cube|1DArray|2DArray|CubeArray.lod llpc.image.write.Dim1D|2D|3D|Rect|Cube|Buffer|1DArray|2DArray|CubeArray llpc.image.write.Dim2D|2DArray.sample llpc.image.write.Dim1D|2D|3D|Cube|1DArray|2DArray|CubeArray.lod diff --git a/icd/api/llpc/patch/llpcCodeGenManager.cpp b/icd/api/llpc/patch/llpcCodeGenManager.cpp index 3415eb7e..03e70b7f 100644 --- a/icd/api/llpc/patch/llpcCodeGenManager.cpp +++ b/icd/api/llpc/patch/llpcCodeGenManager.cpp @@ -45,13 +45,10 @@ #include "spirv.hpp" #include "llpcCodeGenManager.h" -#include "llpcGfx6ConfigBuilder.h" -#ifdef 
LLPC_BUILD_GFX9 -#include "llpcGfx9ConfigBuilder.h" -#endif #include "llpcContext.h" #include "llpcElf.h" -#include "llpcGfx6Chip.h" +#include "llpcGfx6ConfigBuilder.h" +#include "llpcGfx9ConfigBuilder.h" #include "llpcInternal.h" namespace llvm @@ -193,7 +190,7 @@ Result CodeGenManager::GenerateCode( if (cl::EmitLlvm) { - WriteBitcodeToFile(pModule, outStream); + WriteBitcodeToFile(*pModule, outStream); return result; } @@ -316,11 +313,7 @@ Result CodeGenManager::BuildGraphicsPipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineVsFsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } } else if (hasTs && (hasGs == false)) @@ -332,11 +325,7 @@ Result CodeGenManager::BuildGraphicsPipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineVsTsFsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } } else if ((hasTs == false) && hasGs) @@ -348,11 +337,7 @@ Result CodeGenManager::BuildGraphicsPipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineVsGsFsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } } else @@ -364,11 +349,7 @@ Result CodeGenManager::BuildGraphicsPipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineVsTsGsFsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } } @@ -393,11 +374,7 @@ Result CodeGenManager::BuildComputePipelineRegConfig( } else { -#ifdef LLPC_BUILD_GFX9 result = Gfx9::ConfigBuilder::BuildPipelineCsRegConfig(pContext, ppConfig, pConfigSize); -#else - result = Result::Unsupported; -#endif } return result; diff --git a/icd/api/llpc/patch/llpcIntrinsDefs.h b/icd/api/llpc/patch/llpcIntrinsDefs.h index 9f3453ee..3baaf467 100644 --- a/icd/api/llpc/patch/llpcIntrinsDefs.h +++ b/icd/api/llpc/patch/llpcIntrinsDefs.h @@ -49,8 +49,8 @@ static const uint32_t GS_EMIT_STREAM0 = 0x22; // [3:0] = 2 (GS), [5:4] = 2 (emit enum AddrSpace { ADDR_SPACE_GLOBAL = 1, // Global memory - ADDR_SPACE_CONST = 2, // Constant memory ADDR_SPACE_LOCAL = 3, // Local memory + ADDR_SPACE_CONST = 4, // Constant memory }; // Enumerates the target for "export" instruction. diff --git a/icd/api/llpc/patch/llpcPatchAddrSpaceMutate.cpp b/icd/api/llpc/patch/llpcPatchAddrSpaceMutate.cpp index b7d12cac..596b155c 100644 --- a/icd/api/llpc/patch/llpcPatchAddrSpaceMutate.cpp +++ b/icd/api/llpc/patch/llpcPatchAddrSpaceMutate.cpp @@ -83,20 +83,48 @@ bool PatchAddrSpaceMutate::runOnModule( m_addrSpaceMap[SPIRAS_Constant] = ADDR_SPACE_CONST; m_addrSpaceMap[SPIRAS_Local] = ADDR_SPACE_LOCAL; - // We are not expecting any global variables that need their types mutating, other than unused ones - // left behind by previous passes. -#ifndef NDEBUG + // Gather the globals and then process them. We do not want to reprocess globals that we create + // here. Ignore unused globals left behind by lowering passes. + SmallVector globalVars; for (auto globalIt = module.global_begin(), globalItEnd = module.global_end(); globalIt != globalItEnd; ++globalIt) { auto pGlobalVar = dyn_cast(&*globalIt); if ((pGlobalVar != nullptr) && (pGlobalVar->use_empty() == false)) { - auto pGlobalType = globalIt->getType(); - LLPC_ASSERT(pGlobalType == MapType(pGlobalType)); + globalVars.push_back(pGlobalVar); + } + } + + // For any global variable whose type needs to change, create a new one. 
We only cope with the + // case where the top level address space changes, so we do not need to worry about modifying + // any initializer. + for (uint32_t globalVarIdx = 0; globalVarIdx != globalVars.size(); ++globalVarIdx) + { + auto pOldGlobalVar = globalVars[globalVarIdx]; + auto pOldGlobalVarType = cast(pOldGlobalVar->getType()); + auto pNewGlobalVarType = cast(MapType(pOldGlobalVarType)); + + if (pOldGlobalVarType != pNewGlobalVarType) + { + LLPC_ASSERT(pOldGlobalVarType->getElementType() == pNewGlobalVarType->getElementType()); + + auto pNewGlobalVar = new GlobalVariable(module, + pOldGlobalVarType->getElementType(), + pOldGlobalVar->isConstant(), + pOldGlobalVar->getLinkage(), + pOldGlobalVar->hasInitializer() ? + pOldGlobalVar->getInitializer() : nullptr, + "", + nullptr, + pOldGlobalVar->getThreadLocalMode(), + pNewGlobalVarType->getAddressSpace(), + pOldGlobalVar->isExternallyInitialized()); + + pNewGlobalVar->takeName(pOldGlobalVar); + m_globalMap[pOldGlobalVar] = pNewGlobalVar; } } -#endif // NDEBUG // Gather the functions and then process them. We do not want to reprocess functions that we create here. SmallVector funcs; diff --git a/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp b/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp index 8b3833fd..ac219b26 100644 --- a/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp +++ b/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp @@ -38,9 +38,7 @@ #include "SPIRVInternal.h" #include "llpcContext.h" #include "llpcGfx6Chip.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcGfx9Chip.h" -#endif #include "llpcIntrinsDefs.h" #include "llpcPatchEntryPointMutate.h" diff --git a/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp b/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp index b024b462..b2ef00bd 100644 --- a/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp +++ b/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp @@ -65,10 +65,8 @@ PatchInOutImportExport::PatchInOutImportExport() m_pFragDepth(nullptr), m_pFragStencilRef(nullptr), m_pSampleMask(nullptr), -#ifdef LLPC_BUILD_GFX9 m_pViewportIndex(nullptr), m_pLayer(nullptr), -#endif m_hasTs(false), m_hasGs(false), m_pLds(nullptr), @@ -1290,7 +1288,6 @@ void PatchInOutImportExport::visitReturnInst( // Export gl_Layer and gl_ViewportIndex before entry-point returns if ((m_gfxIp.major >= 9) && (useLayer || useViewportIndex)) { -#ifdef LLPC_BUILD_GFX9 Value* pViewportIndexAndLayer = ConstantInt::get(m_pContext->Int32Ty(), 0); if (useViewportIndex) @@ -1425,7 +1422,6 @@ void PatchInOutImportExport::visitReturnInst( ++inOutUsage.expCount; } } -#endif } // NOTE: If no generic outputs are present in this shader, we have to export a dummy one @@ -2174,11 +2170,6 @@ Value* PatchInOutImportExport::PatchVsBuiltInInputImport( pInput = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.viewIndex); break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -2281,11 +2272,6 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( pInput = inoutUsage.tcs.pInvocationId; break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -2447,11 +2433,6 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( pInput = 
GetFunctionArgument(m_pEntryPoint, entryArgIdxs.viewIndex); break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -2542,11 +2523,6 @@ Value* PatchInOutImportExport::PatchGsBuiltInInputImport( pInput = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.viewIndex); break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -2846,11 +2822,6 @@ Value* PatchInOutImportExport::PatchFsBuiltInInputImport( args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 4)); pInput = EmitCall(m_pModule, "llvm.amdgcn.ubfe.i32", pInputTy, args, NoAttrib, pInsertPos); - break; - } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); break; } case BuiltInDeviceIndex: @@ -2969,11 +2940,6 @@ Value* PatchInOutImportExport::PatchCsBuiltInInputImport( pInput = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.localInvocationId); break; } - case BuiltInSubgroupSize: - { - pInput = ConstantInt::get(m_pContext->Int32Ty(), m_pContext->GetGpuProperty()->waveSize); - break; - } case BuiltInDeviceIndex: { auto pPipelineInfo = reinterpret_cast(m_pContext->GetPipelineBuildInfo()); @@ -3368,10 +3334,8 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_Layer is delayed and is done before entry-point returns. m_pLayer = pOutput; -#endif } } @@ -3393,10 +3357,8 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_ViewportIndex is delayed and is done before entry-point returns. m_pViewportIndex = pOutput; -#endif } } @@ -3820,10 +3782,8 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_Layer is delayed and is done before entry-point returns. m_pLayer = pOutput; -#endif } } @@ -3845,10 +3805,8 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_ViewportIndex is delayed and is done before entry-point returns. m_pViewportIndex = pOutput; -#endif } } @@ -4137,10 +4095,8 @@ void PatchInOutImportExport::PatchCopyShaderBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_Layer is delayed and is done before entry-point returns. m_pLayer = pOutput; -#endif } break; @@ -4153,10 +4109,8 @@ void PatchInOutImportExport::PatchCopyShaderBuiltInOutputExport( } else { -#ifdef LLPC_BUILD_GFX9 // NOTE: The export of gl_ViewportIndex is delayed and is done before entry-point returns. m_pViewportIndex = pOutput; -#endif } break; @@ -5445,12 +5399,9 @@ uint32_t PatchInOutImportExport::CalcPatchCountPerThreadGroup( // NOTE: Performance analysis shows that 16 patches per thread group is an optimal upper-bound. The value is only // an experimental number. For GFX9. 64 is an optimal number instead. -#ifdef LLPC_BUILD_GFX9 const auto gfxIp = m_pContext->GetGfxIpVersion(); const uint32_t optimalPatchCountPerThreadGroup = (gfxIp.major >= 9) ? 
64 : 16; -#else - const uint32_t optimalPatchCountPerThreadGroup = 16; -#endif + patchCountPerThreadGroup = std::min(patchCountPerThreadGroup, optimalPatchCountPerThreadGroup); if (m_pContext->IsTessOffChip()) diff --git a/icd/api/llpc/patch/llpcPatchInOutImportExport.h b/icd/api/llpc/patch/llpcPatchInOutImportExport.h index 6e688637..a5c120ac 100644 --- a/icd/api/llpc/patch/llpcPatchInOutImportExport.h +++ b/icd/api/llpc/patch/llpcPatchInOutImportExport.h @@ -264,12 +264,10 @@ class PatchInOutImportExport: llvm::Value* m_pFragDepth; // Correspond to "out float gl_FragDepth" llvm::Value* m_pFragStencilRef; // Correspond to "out int gl_FragStencilRef" llvm::Value* m_pSampleMask; // Correspond to "out int gl_SampleMask[]" -#ifdef LLPC_BUILD_GFX9 // NOTE: For GFX9, gl_ViewportIndex and gl_Layer are packed with one channel (gl_ViewpoertInex is 16-bit high part // and gl_Layer is 16-bit low part). Thus, the export is delayed with them merged together. llvm::Value* m_pViewportIndex; // Correspond to "out int gl_ViewportIndex" llvm::Value* m_pLayer; // Correspond to "out int gl_Layer" -#endif bool m_hasTs; // Whether the pipeline has tessellation shaders diff --git a/icd/api/llpc/patch/llpcPatchResourceCollect.cpp b/icd/api/llpc/patch/llpcPatchResourceCollect.cpp index 2e20c4f4..7211a203 100644 --- a/icd/api/llpc/patch/llpcPatchResourceCollect.cpp +++ b/icd/api/llpc/patch/llpcPatchResourceCollect.cpp @@ -167,8 +167,6 @@ void PatchResourceCollect::visitCallInst( else if (mangledName.startswith(LlpcName::ImageCallPrefix)) { // Image operations - auto opName = mangledName.substr(strlen(LlpcName::ImageCallPrefix)); - ShaderImageCallMetadata imageCallMeta = {}; LLPC_ASSERT(callInst.getNumArgOperands() >= 2); uint32_t metaOperandIndex = callInst.getNumArgOperands() - 1; @@ -187,18 +185,11 @@ void PatchResourceCollect::visitCallInst( DescriptorPair descPair = { descSet, binding }; m_pResUsage->descPairs.insert(descPair.u64All); - std::string imageSampleName; - std::string imageGatherName; - std::string imageQueryLodName; - SPIRV::SPIRVImageOpKindNameMap::find(ImageOpSample, &imageSampleName); - SPIRV::SPIRVImageOpKindNameMap::find(ImageOpGather, &imageGatherName); - SPIRV::SPIRVImageOpKindNameMap::find(ImageOpQueryLod, &imageQueryLodName); - // NOTE: For image sampling operations, we have to add both resource descriptor and sampler descriptor info // to descriptor usages, operand 0 and 1 are sampler descriptor, 3 and 4 are resource descriptor - if (opName.startswith(imageSampleName) || - opName.startswith(imageGatherName) || - opName.startswith(imageQueryLodName)) + if ((imageOp == ImageOpSample) || + (imageOp == ImageOpGather) || + (imageOp == ImageOpQueryLod)) { uint32_t descSet = cast(callInst.getOperand(3))->getZExtValue(); uint32_t binding = cast(callInst.getOperand(4))->getZExtValue(); diff --git a/icd/api/llpc/tool/amdllpc.cpp b/icd/api/llpc/tool/amdllpc.cpp index bb394b4d..8999e22c 100644 --- a/icd/api/llpc/tool/amdllpc.cpp +++ b/icd/api/llpc/tool/amdllpc.cpp @@ -1149,8 +1149,7 @@ int32_t main( // Translate LLVM module to LLVM bitcode llvm::SmallString<1024> bitcodeBuf; raw_svector_ostream bitcodeStream(bitcodeBuf); - WriteBitcodeToFile(pModule.get(), bitcodeStream); - + WriteBitcodeToFile(*pModule.get(), bitcodeStream); void* pCode = new uint8_t[bitcodeBuf.size()]; memcpy(pCode, bitcodeBuf.data(), bitcodeBuf.size()); compileInfo.spirvBin[shaderStage].codeSize = bitcodeBuf.size(); diff --git a/icd/api/llpc/translator/SPIRVInternal.h b/icd/api/llpc/translator/SPIRVInternal.h index 
056c5565..e1930c7a 100644 --- a/icd/api/llpc/translator/SPIRVInternal.h +++ b/icd/api/llpc/translator/SPIRVInternal.h @@ -346,7 +346,8 @@ namespace kSPIRVName { } namespace gSPIRVName { - const static char ImageCallPrefix[] = "spirv.image."; + const static char ImageCallPrefix[] = "spirv.image"; + const static char ImageCallModSparse[] = "sparse"; const static char ImageCallModDref[] = ".dref"; const static char ImageCallModProj[] = ".proj"; const static char ImageCallModBias[] = ".bias"; diff --git a/icd/api/llpc/translator/SPIRVReader.cpp b/icd/api/llpc/translator/SPIRVReader.cpp index 8f545d0b..fe298ce8 100644 --- a/icd/api/llpc/translator/SPIRVReader.cpp +++ b/icd/api/llpc/translator/SPIRVReader.cpp @@ -1771,8 +1771,6 @@ SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F, // load/store when we visit this bool variable. This issue exists in // DOOM4 released version, we have to keep the workaround. if (Dst->getType()->getPointerElementType() != Src->getType()) { - assert(Src->getType()->isAggregateType() && - Dst->getType()->getPointerElementType()->isAggregateType()); SI = transSPIRVBuiltinFromInst(BS, BB); } else { // NOTE: For those storage classes that will not involve memory @@ -2305,7 +2303,19 @@ SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F, case OpImageQueryLevels: case OpImageQuerySamples: case OpImageRead: - case OpImageWrite: { + case OpImageWrite: + case OpImageSparseSampleImplicitLod: + case OpImageSparseSampleExplicitLod: + case OpImageSparseSampleDrefImplicitLod: + case OpImageSparseSampleDrefExplicitLod: + case OpImageSparseSampleProjImplicitLod: + case OpImageSparseSampleProjExplicitLod: + case OpImageSparseSampleProjDrefImplicitLod: + case OpImageSparseSampleProjDrefExplicitLod: + case OpImageSparseFetch: + case OpImageSparseGather: + case OpImageSparseDrefGather: + case OpImageSparseRead: { return mapValue(BV, transSPIRVImageOpFromInst( static_cast(BV), @@ -2347,6 +2357,27 @@ SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *BV, Function *F, false, 0, BB); return mapValue(BV, LI); } + case OpImageSparseTexelsResident: { + SPIRVImageSparseTexelsResident *BI = static_cast(BV); + auto ResidentCode = transValue(BI->getResidentCode(), F, BB); + + std::string FuncName("llpc.imagesparse.texel.resident"); + SmallVector Arg; + Arg.push_back(ResidentCode); + + Function *Func = M->getFunction(FuncName); + if (!Func) { + SmallVector ArgTy; + ArgTy.push_back(Type::getInt32Ty(*Context)); + FunctionType *FuncTy = FunctionType::get(Type::getInt1Ty(*Context), ArgTy, false); + Func = Function::Create(FuncTy, GlobalValue::ExternalLinkage, FuncName, M); + Func->setCallingConv(CallingConv::SPIR_FUNC); + if (isFuncNoUnwind()) + Func->addFnAttr(Attribute::NoUnwind); + } + + return mapValue(BV, CallInst::Create(Func, Arg, "", BB)); + } default: { auto OC = BV->getOpCode(); if (isSPIRVCmpInstTransToLLVMInst(static_cast(BV))) { @@ -2676,13 +2707,19 @@ SPIRVToLLVM::transSPIRVImageOpFromInst(SPIRVInstruction *BI, BasicBlock*BB) if (Info.OpKind != ImageOpQueryNonLod) { // Generate name strings for image calls: - // Format: prefix.op.[f32|i32|u32].dim[.proj][.dref][.bias][.lod][.grad] - // [.constoffset][.offset] - // [.constoffsets][.sample][.minlod] + // Format: prefix.image[sparse].op.[f32|i32|u32].dim[.proj][.dref][.bias][.lod][.grad] + // [.constoffset][.offset] + // [.constoffsets][.sample][.minlod] // Add call prefix SS << gSPIRVName::ImageCallPrefix; + // Add sparse modifier + if (Info.IsSparse) + SS << gSPIRVName::ImageCallModSparse; + + SS << 
"."; + // Add image operation kind std::string S; SPIRVImageOpKindNameMap::find(Info.OpKind, &S); @@ -2853,7 +2890,7 @@ SPIRVToLLVM::transSPIRVImageOpFromInst(SPIRVInstruction *BI, BasicBlock*BB) // Format: prefix.query.op.dim[.cubearray][.buffer].returntype // Add call prefix - SS << gSPIRVName::ImageCallPrefix; + SS << gSPIRVName::ImageCallPrefix << "."; // Add image operation kind: query std::string S; diff --git a/icd/api/llpc/translator/SPIRVUtil.cpp b/icd/api/llpc/translator/SPIRVUtil.cpp index 3cc64c75..af887599 100644 --- a/icd/api/llpc/translator/SPIRVUtil.cpp +++ b/icd/api/llpc/translator/SPIRVUtil.cpp @@ -108,7 +108,7 @@ saveLLVMModule(Module *M, const std::string &OutputFile) { return; } - WriteBitcodeToFile(M, Out.os()); + WriteBitcodeToFile(*M, Out.os()); Out.keep(); } diff --git a/icd/api/llpc/translator/libSPIRV/SPIRVEntry.h b/icd/api/llpc/translator/libSPIRV/SPIRVEntry.h index 4d33adca..950bc9f9 100644 --- a/icd/api/llpc/translator/libSPIRV/SPIRVEntry.h +++ b/icd/api/llpc/translator/libSPIRV/SPIRVEntry.h @@ -730,19 +730,6 @@ T* bcast(SPIRVEntry *E) { // Each time a new class is implemented, remove the corresponding typedef. // This is also an indication of how much work is left. #define _SPIRV_OP(x, ...) typedef SPIRVEntryOpCodeOnly SPIRV##x; -_SPIRV_OP(ImageSparseSampleImplicitLod, 305) -_SPIRV_OP(ImageSparseSampleExplicitLod, 306) -_SPIRV_OP(ImageSparseSampleDrefImplicitLod, 307) -_SPIRV_OP(ImageSparseSampleDrefExplicitLod, 308) -_SPIRV_OP(ImageSparseSampleProjImplicitLod, 309) -_SPIRV_OP(ImageSparseSampleProjExplicitLod, 310) -_SPIRV_OP(ImageSparseSampleProjDrefImplicitLod, 311) -_SPIRV_OP(ImageSparseSampleProjDrefExplicitLod, 312) -_SPIRV_OP(ImageSparseFetch, 313) -_SPIRV_OP(ImageSparseGather, 314) -_SPIRV_OP(ImageSparseDrefGather, 315) -_SPIRV_OP(ImageSparseTexelsResident, 316) -_SPIRV_OP(ImageSparseRead, 320) _SPIRV_OP(TypeNamedBarrier) _SPIRV_OP(NamedBarrierInitialize) _SPIRV_OP(MemoryNamedBarrier) @@ -756,41 +743,6 @@ _SPIRV_OP(NamedBarrierInitialize) _SPIRV_OP(MemoryNamedBarrier) _SPIRV_OP(ExecutionModeId) _SPIRV_OP(DecorateId) -#ifdef ICD_VULKAN_1_1 -_SPIRV_OP(GroupNonUniformAll) -_SPIRV_OP(GroupNonUniformAny) -_SPIRV_OP(GroupNonUniformAllEqual) -_SPIRV_OP(GroupNonUniformBroadcast) -_SPIRV_OP(GroupNonUniformBroadcastFirst) -_SPIRV_OP(GroupNonUniformBallot) -_SPIRV_OP(GroupNonUniformInverseBallot) -_SPIRV_OP(GroupNonUniformBallotBitExtract) -_SPIRV_OP(GroupNonUniformBallotBitCount) -_SPIRV_OP(GroupNonUniformBallotFindLSB) -_SPIRV_OP(GroupNonUniformBallotFindMSB) -_SPIRV_OP(GroupNonUniformShuffle) -_SPIRV_OP(GroupNonUniformShuffleXor) -_SPIRV_OP(GroupNonUniformShuffleUp) -_SPIRV_OP(GroupNonUniformShuffleDown) -_SPIRV_OP(GroupNonUniformIAdd) -_SPIRV_OP(GroupNonUniformFAdd) -_SPIRV_OP(GroupNonUniformIMul) -_SPIRV_OP(GroupNonUniformFMul) -_SPIRV_OP(GroupNonUniformSMin) -_SPIRV_OP(GroupNonUniformUMin) -_SPIRV_OP(GroupNonUniformFMin) -_SPIRV_OP(GroupNonUniformSMax) -_SPIRV_OP(GroupNonUniformUMax) -_SPIRV_OP(GroupNonUniformFMax) -_SPIRV_OP(GroupNonUniformBitwiseAnd) -_SPIRV_OP(GroupNonUniformBitwiseOr) -_SPIRV_OP(GroupNonUniformBitwiseXor) -_SPIRV_OP(GroupNonUniformLogicalAnd) -_SPIRV_OP(GroupNonUniformLogicalOr) -_SPIRV_OP(GroupNonUniformLogicalXor) -_SPIRV_OP(GroupNonUniformQuadBroadcast) -_SPIRV_OP(GroupNonUniformQuadSwap) -#endif _SPIRV_OP(GroupIAddNonUniformAMD) _SPIRV_OP(GroupFAddNonUniformAMD) _SPIRV_OP(GroupFMinNonUniformAMD) diff --git a/icd/api/llpc/translator/libSPIRV/SPIRVEnum.h b/icd/api/llpc/translator/libSPIRV/SPIRVEnum.h index 89e586af..fdbcc245 100644 
--- a/icd/api/llpc/translator/libSPIRV/SPIRVEnum.h +++ b/icd/api/llpc/translator/libSPIRV/SPIRVEnum.h @@ -190,6 +190,15 @@ SPIRVMap::init() { ADD_VEC_INIT(CapabilityStencilExportEXT, { CapabilityShader }); ADD_VEC_INIT(CapabilityShaderViewportIndexLayerEXT, { CapabilityMultiViewport }); ADD_VEC_INIT(CapabilityUniformAndStorageBuffer16BitAccess, { CapabilityStorageBuffer16BitAccess }); +#ifdef ICD_VULKAN_1_1 + ADD_VEC_INIT(CapabilityGroupNonUniformVote, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformArithmetic, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformBallot, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformShuffle, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformShuffleRelative, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformClustered, { CapabilityGroupNonUniform }); + ADD_VEC_INIT(CapabilityGroupNonUniformQuad, { CapabilityGroupNonUniform }); +#endif } template<> inline void diff --git a/icd/api/llpc/translator/libSPIRV/SPIRVInstruction.h b/icd/api/llpc/translator/libSPIRV/SPIRVInstruction.h index 010debfc..7ade919d 100644 --- a/icd/api/llpc/translator/libSPIRV/SPIRVInstruction.h +++ b/icd/api/llpc/translator/libSPIRV/SPIRVInstruction.h @@ -553,6 +553,28 @@ class SPIRVImageTexelPointer : public SPIRVInstruction { SPIRVId Sample; }; +class SPIRVImageSparseTexelsResident : public SPIRVInstruction { +public: + const static Op OC = OpImageSparseTexelsResident; + + // Incomplete constructor + SPIRVImageSparseTexelsResident() :SPIRVInstruction(OC), ResidentCode(SPIRVID_INVALID) + {} + + SPIRVValue *getResidentCode() { return getValue(ResidentCode); } +protected: + _SPIRV_DEF_ENCDEC3(Type, Id, ResidentCode) + + void validate()const { + assert(Type->isTypeBool() && Type->isTypeScalar()); + + auto ResidentCodeTy = getValueType(ResidentCode); + assert(ResidentCodeTy->isTypeInt() && ResidentCodeTy->isTypeScalar()); + } + + SPIRVId ResidentCode; +}; + class SPIRVStore:public SPIRVInstruction, public SPIRVMemoryAccess { public: const static SPIRVWord FixedWords = 3; @@ -2212,6 +2234,39 @@ _SPIRV_OP(GroupCommitReadPipe, false, 6) _SPIRV_OP(GroupCommitWritePipe, false, 6) #ifdef ICD_VULKAN_1_1 _SPIRV_OP(GroupNonUniformElect, true, 4) +_SPIRV_OP(GroupNonUniformAll, true, 5) +_SPIRV_OP(GroupNonUniformAny, true, 5) +_SPIRV_OP(GroupNonUniformAllEqual, true, 5) +_SPIRV_OP(GroupNonUniformBroadcast, true, 6) +_SPIRV_OP(GroupNonUniformBroadcastFirst, true, 5) +_SPIRV_OP(GroupNonUniformBallot, true, 5) +_SPIRV_OP(GroupNonUniformInverseBallot, true, 5) +_SPIRV_OP(GroupNonUniformBallotBitExtract, true, 6) +_SPIRV_OP(GroupNonUniformBallotBitCount, true, 6, false, 1) +_SPIRV_OP(GroupNonUniformBallotFindLSB, true, 5) +_SPIRV_OP(GroupNonUniformBallotFindMSB, true, 5) +_SPIRV_OP(GroupNonUniformShuffle, true, 6) +_SPIRV_OP(GroupNonUniformShuffleXor, true, 6) +_SPIRV_OP(GroupNonUniformShuffleUp, true, 6) +_SPIRV_OP(GroupNonUniformShuffleDown, true, 6) +_SPIRV_OP(GroupNonUniformIAdd, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformFAdd, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformIMul, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformFMul, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformSMin, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformUMin, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformFMin, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformSMax, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformUMax, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformFMax, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformBitwiseAnd, true, 6, true, 
1) +_SPIRV_OP(GroupNonUniformBitwiseOr, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformBitwiseXor, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformLogicalAnd, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformLogicalOr, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformLogicalXor, true, 6, true, 1) +_SPIRV_OP(GroupNonUniformQuadBroadcast, true, 6) +_SPIRV_OP(GroupNonUniformQuadSwap, true, 6) #endif #undef _SPIRV_OP @@ -2280,6 +2335,18 @@ _SPIRV_OP(ImageQuerySize, true, 4) _SPIRV_OP(ImageQueryLod, true, 5) _SPIRV_OP(ImageQueryLevels, true, 4) _SPIRV_OP(ImageQuerySamples, true, 4) +_SPIRV_OP(ImageSparseSampleImplicitLod, true, 5, true) +_SPIRV_OP(ImageSparseSampleExplicitLod, true, 7, true, 2) +_SPIRV_OP(ImageSparseSampleDrefImplicitLod, true, 6, true, 3) +_SPIRV_OP(ImageSparseSampleDrefExplicitLod, true, 7, true, 3) +_SPIRV_OP(ImageSparseSampleProjImplicitLod, true, 5, true, 2) +_SPIRV_OP(ImageSparseSampleProjExplicitLod, true, 7, true, 2) +_SPIRV_OP(ImageSparseSampleProjDrefImplicitLod, true, 6, true, 3) +_SPIRV_OP(ImageSparseSampleProjDrefExplicitLod, true, 7, true, 3) +_SPIRV_OP(ImageSparseFetch, true, 4, true, 2) +_SPIRV_OP(ImageSparseGather, true, 6, true, 3) +_SPIRV_OP(ImageSparseDrefGather, true, 6, true, 3) +_SPIRV_OP(ImageSparseRead, true, 5, true, 2) #undef _SPIRV_OP // SpecConstantOp instruction diff --git a/icd/api/llpc/util/llpcDebug.cpp b/icd/api/llpc/util/llpcDebug.cpp index 90f20810..87668521 100644 --- a/icd/api/llpc/util/llpcDebug.cpp +++ b/icd/api/llpc/util/llpcDebug.cpp @@ -41,9 +41,7 @@ #include "llpcDebug.h" #include "llpcElf.h" #include "llpcGfx6Chip.h" -#ifdef LLPC_BUILD_GFX9 #include "llpcGfx9Chip.h" -#endif #include "llpcInternal.h" #include "llpcMetroHash.h" diff --git a/icd/api/llpc/util/llpcInternal.h b/icd/api/llpc/util/llpcInternal.h index 0a2a197d..facf932a 100644 --- a/icd/api/llpc/util/llpcInternal.h +++ b/icd/api/llpc/util/llpcInternal.h @@ -99,7 +99,7 @@ namespace LlpcName const static char DescriptorLoadSpillTable[] = "llpc.descriptor.load.spilltable"; const static char DescriptorLoadGsVsRingBuffer[] = "llpc.descriptor.load.gsvsringbuffer"; - const static char ImageCallPrefix[] = "llpc.image."; + const static char ImageCallPrefix[] = "llpc.image"; const static char GlobalProxyPrefix[] = "__llpc_global_proxy_"; const static char InputProxyPrefix[] = "__llpc_input_proxy_"; diff --git a/icd/api/llpc/util/llpcPipelineDumper.cpp b/icd/api/llpc/util/llpcPipelineDumper.cpp index 996ff94b..b8d503b5 100644 --- a/icd/api/llpc/util/llpcPipelineDumper.cpp +++ b/icd/api/llpc/util/llpcPipelineDumper.cpp @@ -136,7 +136,7 @@ void VKAPI_CALL IPipelineDumper::DumpPipelineBinary( // ===================================================================================================================== // Calculates graphics pipeline hash code. -uint64_t VKAPI_CALL IPipelineDumper::GetGraphicsPipelineHash( +uint64_t VKAPI_CALL IPipelineDumper::GetPipelineHash( const GraphicsPipelineBuildInfo* pPipelineInfo) // [in] Info to build this graphics pipeline { return PipelineDumper::GetGraphicsPipelineHash(pPipelineInfo); @@ -144,7 +144,7 @@ uint64_t VKAPI_CALL IPipelineDumper::GetGraphicsPipelineHash( // ===================================================================================================================== // Calculates compute pipeline hash code. 
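// Aside (illustrative, not part of this patch): with both renames the graphics and compute hash
// queries become a single overloaded GetPipelineHash, selected by the build-info pointer type, e.g.
//     uint64_t gfxHash  = Llpc::IPipelineDumper::GetPipelineHash(&gfxPipelineBuildInfo);
//     uint64_t compHash = Llpc::IPipelineDumper::GetPipelineHash(&computePipelineBuildInfo);
// where gfxPipelineBuildInfo and computePipelineBuildInfo are hypothetical
// Llpc::GraphicsPipelineBuildInfo and Llpc::ComputePipelineBuildInfo instances.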
-uint64_t VKAPI_CALL IPipelineDumper::GetComputePipelineHash( +uint64_t VKAPI_CALL IPipelineDumper::GetPipelineHash( const ComputePipelineBuildInfo* pPipelineInfo) // [in] Info to build this compute pipeline { return PipelineDumper::GetComputePipelineHash(pPipelineInfo); @@ -655,7 +655,7 @@ MetroHash::Hash PipelineDumper::GenerateHashForComputePipeline( MetroHash64 hasher; UpdateHashForPipelineShaderInfo(ShaderStageCompute, &pPipeline->cs, &hasher); - + hasher.Update(pPipeline->deviceIndex); MetroHash::Hash hash = {}; hasher.Finalize(hash.bytes); @@ -924,11 +924,7 @@ OStream& operator<<( } else { -#ifdef LLPC_BUILD_GFX9 pRegName = Gfx9::GetRegisterNameString(gfxIp, pConfig[i].key * 4); -#else - pRegName = "UNKNOWN"; -#endif } auto length = snprintf(formatBuf, sizeof(formatBuf), @@ -981,11 +977,7 @@ OStream& operator<<( } else { -#ifdef LLPC_BUILD_GFX9 pRegName = Gfx9::GetRegisterNameString(gfxIp, pConfig[2 * i]); -#else - pRegName = "UNKNOWN"; -#endif } auto length = snprintf(formatBuf, sizeof(formatBuf), " %-45s = 0x%08X\n", pRegName, pConfig[2 * i + 1]); out << formatBuf; diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp new file mode 100644 index 00000000..7293b0b8 --- /dev/null +++ b/icd/api/pipeline_compiler.cpp @@ -0,0 +1,766 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file pipeline_compiler.cpp + * @brief Contains implementation of Vulkan pipeline compiler + *********************************************************************************************************************** + */ + +#include "include/pipeline_compiler.h" +#include "include/vk_device.h" +#include "include/vk_shader.h" +#include "include/vk_pipeline_cache.h" +#include "include/vk_pipeline_layout.h" +#include "include/vk_render_pass.h" + +namespace vk +{ + +extern bool IsSrcAlphaUsedInBlend(VkBlendFactor blend); + +// ===================================================================================================================== +PipelineCompiler::PipelineCompiler(PhysicalDevice* pPhysicalDevice) + : + m_pPhysicalDevice(pPhysicalDevice) + , m_pLlpc(nullptr) +{ + +} + +// ===================================================================================================================== +PipelineCompiler::~PipelineCompiler() +{ + VK_ASSERT(m_pLlpc == nullptr); +} + +// ===================================================================================================================== +// Initializes pipeline compiler. +VkResult PipelineCompiler::Initialize() +{ + Pal::IDevice* pPalDevice = m_pPhysicalDevice->PalDevice(); + + // Initialzie GfxIp informations per PAL device properties + Pal::DeviceProperties info; + pPalDevice->GetProperties(&info); + m_gfxIpLevel = info.gfxLevel; + + switch (info.gfxLevel) + { + case Pal::GfxIpLevel::GfxIp6: + m_gfxIp.major = 6; + m_gfxIp.minor = 0; + break; + case Pal::GfxIpLevel::GfxIp7: + m_gfxIp.major = 7; + m_gfxIp.minor = 0; + break; + case Pal::GfxIpLevel::GfxIp8: + m_gfxIp.major = 8; + m_gfxIp.minor = 0; + break; + case Pal::GfxIpLevel::GfxIp8_1: + m_gfxIp.major = 8; + m_gfxIp.minor = 1; + break; + case Pal::GfxIpLevel::GfxIp9: + m_gfxIp.major = 9; + m_gfxIp.minor = 0; + break; + default: + VK_NEVER_CALLED(); + break; + } + + m_gfxIp.stepping = info.gfxStepping; + + // Create compiler objects + VkResult result = VK_SUCCESS; + result = CreateLlpcCompiler(); + + return result; +} + +// ===================================================================================================================== +// Destroies all compiler instance. +void PipelineCompiler::Destroy() +{ + if (m_pLlpc) + { + m_pLlpc->Destroy(); + m_pLlpc = nullptr; + } + +} + +// ===================================================================================================================== +// Creates LLPC compiler instance. 
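// Illustrative usage of the lifecycle above (a minimal sketch, not code from this patch;
// error handling elided):
//     PipelineCompiler compiler(pPhysicalDevice);
//     if (compiler.Initialize() == VK_SUCCESS)
//     {
//         // ... CreateGraphicsPipelineBinary / CreateComputePipelineBinary calls ...
//         compiler.Destroy();
//     }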
+VkResult PipelineCompiler::CreateLlpcCompiler() +{ + const uint32_t OptionBufferSize = 4096; + const uint32_t MaxLlpcOptions = 32; + Llpc::ICompiler* pCompiler = nullptr; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); +#ifdef ICD_BUILD_APPPROFILE + AppProfile appProfile = m_pPhysicalDevice->GetAppProfile(); +#endif + // Get the executable name and path + char executableNameBuffer[PATH_MAX]; + char* pExecutablePtr; + Pal::Result palResult = Util::GetExecutableName(&executableNameBuffer[0], + &pExecutablePtr, + sizeof(executableNameBuffer)); + VK_ASSERT(palResult == Pal::Result::Success); + + // Initialize LLPC options according to runtime settings + const char* llpcOptions[MaxLlpcOptions] = {}; + char optionBuffers[OptionBufferSize] = {}; + + char* pOptionBuffer = &optionBuffers[0]; + size_t bufSize = OptionBufferSize; + int optionLength = 0; + uint32_t numOptions = 0; + // Identify for Icd and stanalone compiler + llpcOptions[numOptions++] = Llpc::VkIcdName; + + // LLPC log options + llpcOptions[numOptions++] = (settings.enableLog & 1) ? "-enable-errs=1" : "-enable-errs=0"; + llpcOptions[numOptions++] = (settings.enableLog & 2) ? "-enable-outs=1" : "-enable-outs=0"; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-log-file-outs=%s", settings.logFileName); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-log-file-dbgs=%s", settings.debugLogFileName); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + // Generate ELF binary, not assembly text + llpcOptions[numOptions++] = "-filetype=obj"; + + // LLPC debug options + if (settings.enableDebug) + { + llpcOptions[numOptions++] = "-debug"; + } + + if (settings.llpcOptions[0] != '\0') + { + const char* pOptions = &settings.llpcOptions[0]; + VK_ASSERT(pOptions[0] == '-'); + + // Split options + while (pOptions) + { + const char* pNext = strchr(pOptions, ' '); + if (pNext) + { + // Copy options to option buffer + optionLength = static_cast(pNext - pOptions); + memcpy(pOptionBuffer, pOptions, optionLength); + pOptionBuffer[optionLength] = 0; + + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += (optionLength + 1); + + bufSize -= (optionLength + 1); + pOptions = strchr(pOptions + optionLength, '-'); + } + else + { + // Use pOptions directly for last option + llpcOptions[numOptions++] = pOptions; + pOptions = nullptr; + } + } + } + + // LLPC pipeline dump options + if (settings.enablePipelineDump) + { + llpcOptions[numOptions++] = "-enable-pipeline-dump"; + } + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-pipeline-dump-dir=%s", settings.pipelineDumpDir); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + if (settings.enableLlpc == LlpcModeAutoFallback) + { + llpcOptions[numOptions++] = "-disable-WIP-features=1"; + } + + // NOTE: For testing consistency, these options should be kept the same as those of + // "amdllpc" (Init()). 
+ llpcOptions[numOptions++] = "-pragma-unroll-threshold=4096"; + llpcOptions[numOptions++] = "-unroll-allow-partial"; + llpcOptions[numOptions++] = "-lower-dyn-index"; + llpcOptions[numOptions++] = "-simplifycfg-sink-common=false"; + llpcOptions[numOptions++] = "-amdgpu-vgpr-index-mode"; // force VGPR indexing on GFX8 + + ShaderCacheMode shaderCacheMode = settings.shaderCacheMode; +#ifdef ICD_BUILD_APPPROFILE + if ((appProfile == AppProfile::Talos) || + (appProfile == AppProfile::MadMax) || + (appProfile == AppProfile::SeriousSamFusion)) + { + llpcOptions[numOptions++] = "-enable-si-scheduler"; + } + + // Force enable cache to disk to improve user experience + if ((shaderCacheMode == ShaderCacheEnableRuntimeOnly) && + ((appProfile == AppProfile::MadMax) || + (appProfile == AppProfile::SeriousSamFusion) || + (appProfile == AppProfile::F1_2017))) + { + // Force to use internal disk cache. + shaderCacheMode = ShaderCacheForceInternalCacheOnDisk; + } +#endif + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-executable-name=%s", pExecutablePtr); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-cache-mode=%d", shaderCacheMode); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + if (settings.shaderReplaceMode != 0) + { + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-mode=%d", settings.shaderReplaceMode); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-dir=%s", settings.shaderReplaceDir); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-pipeline-hashes=%s", settings.shaderReplacePipelineHashes); + ++optionLength; + llpcOptions[numOptions++] = pOptionBuffer; + pOptionBuffer += optionLength; + bufSize -= optionLength; + } + + VK_ASSERT(numOptions <= MaxLlpcOptions); + + // Create LLPC compiler + Llpc::Result llpcResult = Llpc::ICompiler::Create(m_gfxIp, numOptions, llpcOptions, &pCompiler); + VK_ASSERT(llpcResult == Llpc::Result::Success); + + m_pLlpc = pCompiler; + + return (llpcResult == Llpc::Result::Success) ? VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED; +} + +// ===================================================================================================================== +// Creates graphics pipeline binary. 
+VkResult PipelineCompiler::CreateGraphicsPipelineBinary( + Device* pDevice, + uint32_t deviceIdx, + PipelineCache* pPipelineCache, + GraphicsPipelineCreateInfo* pCreateInfo, + size_t* pPipelineBinarySize, + const void** ppPipelineBinary) +{ + VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + // Build the LLPC pipeline + Llpc::GraphicsPipelineBuildOut pipelineOut = {}; + void* pLlpcPipelineBuffer = nullptr; + + { + // Fill pipeline create info for LLPC + auto pPipelineBuildInfo = &pCreateInfo->pipelineInfo; + pPipelineBuildInfo->pInstance = pInstance; + pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; + pPipelineBuildInfo->pUserData = &pLlpcPipelineBuffer; + pPipelineBuildInfo->iaState.deviceIndex = deviceIdx; + + if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCacheType() == PipelineCacheTypeLlpc)) + { + pPipelineBuildInfo->pShaderCache = pPipelineCache->GetShaderCache(deviceIdx).pLlpcShaderCache; + } + + auto llpcResult = m_pLlpc->BuildGraphicsPipeline(pPipelineBuildInfo, &pipelineOut); + if (llpcResult != Llpc::Result::Success) + { + // There shouldn't be anything to free for the failure case + VK_ASSERT(pLlpcPipelineBuffer == nullptr); + + { + result = VK_ERROR_INITIALIZATION_FAILED; + } + } + else + { + *ppPipelineBinary = pipelineOut.pipelineBin.pCode; + *pPipelineBinarySize = pipelineOut.pipelineBin.codeSize; + } + } + + return result; +} + +// ===================================================================================================================== +// Creates compute pipeline binary. +VkResult PipelineCompiler::CreateComputePipelineBinary( + Device* pDevice, + uint32_t deviceIdx, + PipelineCache* pPipelineCache, + ComputePipelineCreateInfo* pCreateInfo, + size_t* pPipelineBinarySize, + const void** ppPipelineBinary) +{ + VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + const ShaderModule* pShader = ShaderModule::ObjectFromHandle(pCreateInfo->pStage->module); + + // Build the LLPC pipeline + Llpc::ComputePipelineBuildOut pipelineOut = {}; + void* pLlpcPipelineBuffer = nullptr; + + { + // Fill pipeline create info for LLPC + Llpc::ComputePipelineBuildInfo* pPipelineBuildInfo = &pCreateInfo->pipelineInfo; + + pPipelineBuildInfo->pInstance = pInstance; + pPipelineBuildInfo->pfnOutputAlloc = AllocateShaderOutput; + pPipelineBuildInfo->pUserData = &pLlpcPipelineBuffer; + pPipelineBuildInfo->deviceIndex = deviceIdx; + + if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCacheType() == PipelineCacheTypeLlpc)) + { + pPipelineBuildInfo->pShaderCache = pPipelineCache->GetShaderCache(deviceIdx).pLlpcShaderCache; + } + + // Build pipline binary + auto llpcResult = m_pLlpc->BuildComputePipeline(pPipelineBuildInfo, &pipelineOut); + if (llpcResult != Llpc::Result::Success) + { + // There shouldn't be anything to free for the failure case + VK_ASSERT(pLlpcPipelineBuffer == nullptr); + + { + result = VK_ERROR_INITIALIZATION_FAILED; + } + } + else + { + *ppPipelineBinary = pipelineOut.pipelineBin.pCode; + *pPipelineBinarySize = pipelineOut.pipelineBin.codeSize; + } + VK_ASSERT(*ppPipelineBinary == pLlpcPipelineBuffer); + } + + return VK_SUCCESS; +} + +// ===================================================================================================================== +// Converts Vulkan graphics pipeline 
parameters to an internal structure +VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( + Device* pDevice, + const VkGraphicsPipelineCreateInfo* pIn, + GraphicsPipelineCreateInfo* pCreateInfo, + VbBindingInfo* pVbInfo) +{ + VkResult result = VK_SUCCESS; + const RuntimeSettings& settings = m_pPhysicalDevice->GetRuntimeSettings(); + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + EXTRACT_VK_STRUCTURES_0( + gfxPipeline, + GraphicsPipelineCreateInfo, + pIn, + GRAPHICS_PIPELINE_CREATE_INFO) + + // Fill in necessary non-zero defaults in case some information is missing + const RenderPass* pRenderPass = nullptr; + + if (pGraphicsPipelineCreateInfo != nullptr) + { + for (uint32_t i = 0; i < pGraphicsPipelineCreateInfo->stageCount; ++i) + { + ShaderStage stage = ShaderFlagBitToStage(pGraphicsPipelineCreateInfo->pStages[i].stage); + VK_ASSERT(stage < ShaderGfxStageCount); + pCreateInfo->pStages[stage] = &pGraphicsPipelineCreateInfo->pStages[i]; + } + + VK_IGNORE(pGraphicsPipelineCreateInfo->flags & VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT); + + pRenderPass = RenderPass::ObjectFromHandle(pGraphicsPipelineCreateInfo->renderPass); + + if (pGraphicsPipelineCreateInfo->layout != VK_NULL_HANDLE) + { + pCreateInfo->pLayout = PipelineLayout::ObjectFromHandle(pGraphicsPipelineCreateInfo->layout); + } + + pCreateInfo->pipelineInfo.pVertexInput = pGraphicsPipelineCreateInfo->pVertexInputState; + + const VkPipelineInputAssemblyStateCreateInfo* pIa = pGraphicsPipelineCreateInfo->pInputAssemblyState; + // According to the spec this should never be null + VK_ASSERT(pIa != nullptr); + + pCreateInfo->pipelineInfo.iaState.enableMultiView = pRenderPass->IsMultiviewEnabled(); + pCreateInfo->pipelineInfo.iaState.topology = pIa->topology; + pCreateInfo->pipelineInfo.iaState.disableVertexReuse = false; + + EXTRACT_VK_STRUCTURES_1( + Tess, + PipelineTessellationStateCreateInfo, + PipelineTessellationDomainOriginStateCreateInfoKHR, + pGraphicsPipelineCreateInfo->pTessellationState, + PIPELINE_TESSELLATION_STATE_CREATE_INFO, + PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO_KHR) + + if (pPipelineTessellationStateCreateInfo != nullptr) + { + pCreateInfo->pipelineInfo.iaState.patchControlPoints = pPipelineTessellationStateCreateInfo->patchControlPoints; + } + + if (pPipelineTessellationDomainOriginStateCreateInfoKHR) + { + // Vulkan 1.0 incorrectly specified the tessellation u,v coordinate origin as lower left even though + // framebuffer and image coordinate origins are in the upper left. This has since been fixed, but + // an extension exists to use the previous behavior. Doing so with flat shading would likely appear + // incorrect, but Vulkan specifies that the provoking vertex is undefined when tessellation is active. 
+ if (pPipelineTessellationDomainOriginStateCreateInfoKHR->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT_KHR) + { + pCreateInfo->pipelineInfo.iaState.switchWinding = true; + } + } + + const VkPipelineRasterizationStateCreateInfo* pRs = pGraphicsPipelineCreateInfo->pRasterizationState; + // By default rasterization is disabled, unless rasterization creation info is present + pCreateInfo->pipelineInfo.rsState.rasterizerDiscardEnable = true; + if (pRs != nullptr) + { + pCreateInfo->pipelineInfo.vpState.depthClipEnable = (pRs->depthClampEnable == VK_FALSE); + pCreateInfo->pipelineInfo.rsState.rasterizerDiscardEnable = (pRs->rasterizerDiscardEnable != VK_FALSE); + } + + bool multisampleEnable = false; + uint32_t rasterizationSampleCount = 0; + const VkPipelineMultisampleStateCreateInfo* pMs = pGraphicsPipelineCreateInfo->pMultisampleState; + + pCreateInfo->pipelineInfo.rsState.numSamples = 1; + if (pMs != nullptr) + { + multisampleEnable = (pMs->rasterizationSamples != 1); + + if (multisampleEnable) + { + VK_ASSERT(pRenderPass != nullptr); + + rasterizationSampleCount = pMs->rasterizationSamples; + uint32_t subpassCoverageSampleCount = pRenderPass->GetSubpassMaxSampleCount(pGraphicsPipelineCreateInfo->subpass); + uint32_t subpassColorSampleCount = pRenderPass->GetSubpassColorSampleCount(pGraphicsPipelineCreateInfo->subpass); + + // subpassCoverageSampleCount would be equal to zero if there are zero attachments. + subpassCoverageSampleCount = subpassCoverageSampleCount == 0 ? rasterizationSampleCount : subpassCoverageSampleCount; + + subpassColorSampleCount = subpassColorSampleCount == 0 ? subpassCoverageSampleCount : subpassColorSampleCount; + + if (pMs->sampleShadingEnable && (pMs->minSampleShading > 0.0f)) + { + pCreateInfo->pipelineInfo.rsState.perSampleShading =((subpassColorSampleCount * pMs->minSampleShading) > 1.0f); + } + else + { + pCreateInfo->pipelineInfo.rsState.perSampleShading = false; + } + + pCreateInfo->pipelineInfo.rsState.numSamples = rasterizationSampleCount; + + // NOTE: The sample pattern index here is actually the offset of sample position pair. This is + // different from the field of creation info of image view. For image view, the sample pattern + // index is really table index of the sample pattern. + pCreateInfo->pipelineInfo.rsState.samplePatternIdx = + Device::GetDefaultSamplePatternIndex(subpassCoverageSampleCount) * Pal::MaxMsaaRasterizerSamples; + } + pCreateInfo->pipelineInfo.cbState.alphaToCoverageEnable = (pMs->alphaToCoverageEnable == VK_TRUE); + } + + const VkPipelineColorBlendStateCreateInfo* pCb = pGraphicsPipelineCreateInfo->pColorBlendState; + bool dualSourceBlend = false; + + if (pCb != nullptr) + { + const uint32_t numColorTargets = Util::Min(pCb->attachmentCount, Pal::MaxColorTargets); + + for (uint32_t i = 0; i < numColorTargets; ++i) + { + const VkPipelineColorBlendAttachmentState& src = pCb->pAttachments[i]; + auto pLlpcCbDst = &pCreateInfo->pipelineInfo.cbState.target[i]; + if (pRenderPass) + { + auto cbFormat = pRenderPass->GetColorAttachmentFormat(pGraphicsPipelineCreateInfo->subpass, i); + // If the sub pass attachment format is UNDEFINED, then it means that that subpass does not + // want to write to any attachment for that output (VK_ATTACHMENT_UNUSED). Under such cases, + // disable shader writes through that target. 
+                    if (cbFormat != VK_FORMAT_UNDEFINED)
+                    {
+                        pLlpcCbDst->format               = cbFormat;
+                        pLlpcCbDst->blendEnable          = (src.blendEnable == VK_TRUE);
+                        pLlpcCbDst->blendSrcAlphaToColor = IsSrcAlphaUsedInBlend(src.srcAlphaBlendFactor) ||
+                                                           IsSrcAlphaUsedInBlend(src.dstAlphaBlendFactor) ||
+                                                           IsSrcAlphaUsedInBlend(src.srcColorBlendFactor) ||
+                                                           IsSrcAlphaUsedInBlend(src.dstColorBlendFactor);
+                        pLlpcCbDst->channelWriteMask     = src.colorWriteMask;
+                    }
+                }
+
+                dualSourceBlend |= IsDualSourceBlend(src.srcAlphaBlendFactor);
+                dualSourceBlend |= IsDualSourceBlend(src.dstAlphaBlendFactor);
+                dualSourceBlend |= IsDualSourceBlend(src.srcColorBlendFactor);
+                dualSourceBlend |= IsDualSourceBlend(src.dstColorBlendFactor);
+            }
+        }
+        pCreateInfo->pipelineInfo.cbState.dualSourceBlendEnable = dualSourceBlend;
+
+        VkFormat dbFormat = { };
+        if (pRenderPass != nullptr)
+        {
+            dbFormat = pRenderPass->GetDepthStencilAttachmentFormat(pGraphicsPipelineCreateInfo->subpass);
+            pCreateInfo->dbFormat = dbFormat;
+        }
+    }
+
+    // Allocate space to create the LLPC/SCPC pipeline resource mappings
+    if (pCreateInfo->pLayout != nullptr)
+    {
+        size_t tempBufferSize = pCreateInfo->pLayout->GetPipelineInfo()->tempBufferSize;
+
+        // Allocate the temp buffer
+        if (tempBufferSize > 0)
+        {
+            pCreateInfo->pMappingBuffer = pInstance->AllocMem(
+                tempBufferSize,
+                VK_DEFAULT_MEM_ALIGN,
+                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+
+            if (pCreateInfo->pMappingBuffer == nullptr)
+            {
+                result = VK_ERROR_OUT_OF_HOST_MEMORY;
+            }
+        }
+    }
+
+    // Build the LLPC pipeline
+    Llpc::PipelineShaderInfo* shaderInfos[] =
+    {
+        &pCreateInfo->pipelineInfo.vs,
+        &pCreateInfo->pipelineInfo.tcs,
+        &pCreateInfo->pipelineInfo.tes,
+        &pCreateInfo->pipelineInfo.gs,
+        &pCreateInfo->pipelineInfo.fs
+    };
+
+    // Apply patches
+    pCreateInfo->pipelineInfo.pInstance      = pInstance;
+    pCreateInfo->pipelineInfo.pfnOutputAlloc = AllocateShaderOutput;
+
+    for (uint32_t stage = 0; stage < ShaderGfxStageCount; ++stage)
+    {
+        auto pStage = pCreateInfo->pStages[stage];
+        if (pStage == nullptr) continue;
+        auto pScpcShader = ShaderModule::ObjectFromHandle(pStage->module);
+        auto pShaderInfo = shaderInfos[stage];
+
+        pShaderInfo->pModuleData        = pScpcShader->GetShaderData(true);
+        pShaderInfo->pSpecializatonInfo = pStage->pSpecializationInfo;
+        pShaderInfo->pEntryTarget       = pStage->pName;
+
+        // Build the resource mapping description for LLPC. This data contains things about how shader
+        // inputs like descriptor set bindings are communicated to this pipeline in a form that LLPC can
+        // understand.
+        if (pCreateInfo->pLayout != nullptr)
+        {
+            const bool vertexShader = (stage == ShaderStageVertex);
+            result = pCreateInfo->pLayout->BuildLlpcPipelineMapping(
+                static_cast<ShaderStage>(stage),
+                pCreateInfo->pMappingBuffer,
+                vertexShader ? pCreateInfo->pipelineInfo.pVertexInput : nullptr,
+                pShaderInfo,
+                vertexShader ? pVbInfo : nullptr);
+        }
+    }
+
+    return result;
+}
+
+// =====================================================================================================================
+// Checks whether dual source blend is needed.
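// For context: the SRC1 blend factors matched below consume the fragment shader's second color
// output for attachment 0 (layout(location = 0, index = 1) in GLSL), which is why the pipeline
// is flagged up front through cbState.dualSourceBlendEnable above.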
+bool PipelineCompiler::IsDualSourceBlend( + VkBlendFactor blend) +{ + bool result = false; + switch(blend) + { + case VK_BLEND_FACTOR_SRC1_COLOR: + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR: + case VK_BLEND_FACTOR_SRC1_ALPHA: + case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA: + result = true; + break; + default: + result = false; + break; + } + return result; +} + +// ===================================================================================================================== +// Converts Vulkan compute pipeline parameters to an internal structure +VkResult PipelineCompiler::ConvertComputePipelineInfo( + const VkComputePipelineCreateInfo* pIn, + ComputePipelineCreateInfo* pCreateInfo) +{ + VkResult result = VK_SUCCESS; + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + VK_ASSERT(pIn->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO); + + if (pIn->layout != VK_NULL_HANDLE) + { + pCreateInfo->pLayout = PipelineLayout::ObjectFromHandle(pIn->layout); + } + + pCreateInfo->flags = pIn->flags; + pCreateInfo->pStage = &pIn->stage; + // Allocate space to create the LLPC/SCPC pipeline resource mappings + if (pCreateInfo->pLayout != nullptr) + { + size_t tempBufferSize = pCreateInfo->pLayout->GetPipelineInfo()->tempBufferSize; + + // Allocate the temp buffer + if (tempBufferSize > 0) + { + pCreateInfo->pMappingBuffer = pInstance->AllocMem( + tempBufferSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + if (pCreateInfo->pMappingBuffer == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + } + + const ShaderModule* pScpcShader = ShaderModule::ObjectFromHandle(pCreateInfo->pStage->module); + pCreateInfo->pipelineInfo.cs.pModuleData = pScpcShader->GetShaderData(true); + pCreateInfo->pipelineInfo.cs.pSpecializatonInfo = pCreateInfo->pStage->pSpecializationInfo; + pCreateInfo->pipelineInfo.cs.pEntryTarget = pCreateInfo->pStage->pName; + + // Build the resource mapping description for LLPC. This data contains things about how shader + // inputs like descriptor set bindings interact with this pipeline in a form that LLPC can + // understand. 
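// (Assumed detail for context: on the LLPC path this call is expected to fill the shader info's
//  pUserDataNodes/userDataNodeCount with Llpc::ResourceMappingNode entries, one per descriptor
//  set/binding used by the layout, so the exact node layout is owned by PipelineLayout rather
//  than by this compiler wrapper.)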
+ if (pCreateInfo->pLayout != nullptr) + { + result = pCreateInfo->pLayout->BuildLlpcPipelineMapping( + ShaderStageCompute, + pCreateInfo->pMappingBuffer, + nullptr, + &pCreateInfo->pipelineInfo.cs, + nullptr); + } + + return result; +} + +// ===================================================================================================================== +// Free compute pipeline binary +void PipelineCompiler::FreeComputePipelineBinary( + ComputePipelineCreateInfo* pCreateInfo, + const void* pPipelineBinary, + size_t binarySize) +{ + { + m_pPhysicalDevice->Manager()->VkInstance()->FreeMem(const_cast(pPipelineBinary)); + } +} + +// ===================================================================================================================== +// Free graphics pipeline binary +void PipelineCompiler::FreeGraphicsPipelineBinary( + GraphicsPipelineCreateInfo* pCreateInfo, + const void* pPipelineBinary, + size_t binarySize) +{ + { + m_pPhysicalDevice->Manager()->VkInstance()->FreeMem(const_cast(pPipelineBinary)); + } +} + +// ===================================================================================================================== +// Free the temp memories in compute pipeline create info +void PipelineCompiler::FreeComputePipelineCreateInfo( + ComputePipelineCreateInfo* pCreateInfo) +{ + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + if (pCreateInfo->pMappingBuffer != nullptr) + { + pInstance->FreeMem(pCreateInfo->pMappingBuffer); + pCreateInfo->pMappingBuffer = nullptr; + } +} + +// ===================================================================================================================== +// Free the temp memories in graphics pipeline create info +void PipelineCompiler::FreeGraphicsPipelineCreateInfo( + GraphicsPipelineCreateInfo* pCreateInfo) +{ + auto pInstance = m_pPhysicalDevice->Manager()->VkInstance(); + + if (pCreateInfo->pMappingBuffer != nullptr) + { + pInstance->FreeMem(pCreateInfo->pMappingBuffer); + pCreateInfo->pMappingBuffer = nullptr; + } +} + +} + diff --git a/icd/api/strings/base_entry_points.txt b/icd/api/strings/base_entry_points.txt index e82f7c4a..b936eb16 100644 --- a/icd/api/strings/base_entry_points.txt +++ b/icd/api/strings/base_entry_points.txt @@ -1,298 +1,299 @@ ############################################################################### -# Entry point name type version/extension - -vkGetDeviceProcAddr @none -vkGetInstanceProcAddr @none - -vkCreateInstance @none -vkEnumerateInstanceExtensionProperties @none -vkEnumerateInstanceLayerProperties @none - -vkAllocateCommandBuffers @core 1.0 -vkAllocateDescriptorSets @core 1.0 -vkAllocateMemory @core 1.0 -vkBeginCommandBuffer @core 1.0 -vkBindBufferMemory @core 1.0 -vkBindImageMemory @core 1.0 -vkCmdBeginRenderPass @core 1.0 -vkCmdBeginQuery @core 1.0 -vkCmdBindDescriptorSets @core 1.0 -vkCmdBindIndexBuffer @core 1.0 -vkCmdBindPipeline @core 1.0 -vkCmdBindVertexBuffers @core 1.0 -vkCmdBlitImage @core 1.0 -vkCmdClearAttachments @core 1.0 -vkCmdClearColorImage @core 1.0 -vkCmdClearDepthStencilImage @core 1.0 -vkCmdCopyBuffer @core 1.0 -vkCmdCopyBufferToImage @core 1.0 -vkCmdCopyImage @core 1.0 -vkCmdCopyImageToBuffer @core 1.0 -vkCmdCopyQueryPoolResults @core 1.0 -vkCmdDraw @core 1.0 -vkCmdDrawIndexed @core 1.0 -vkCmdDrawIndexedIndirect @core 1.0 -vkCmdDrawIndirect @core 1.0 -vkCmdDispatch @core 1.0 -vkCmdDispatchIndirect @core 1.0 -vkCmdEndRenderPass @core 1.0 -vkCmdEndQuery @core 1.0 -vkCmdExecuteCommands @core 1.0 -vkCmdFillBuffer @core 1.0 -vkCmdNextSubpass @core 1.0 
-vkCmdPipelineBarrier @core 1.0 -vkCmdPushConstants @core 1.0 -vkCmdResetEvent @core 1.0 -vkCmdResetQueryPool @core 1.0 -vkCmdResolveImage @core 1.0 -vkCmdSetBlendConstants @core 1.0 -vkCmdSetDepthBias @core 1.0 -vkCmdSetDepthBounds @core 1.0 -vkCmdSetEvent @core 1.0 -vkCmdSetLineWidth @core 1.0 -vkCmdSetScissor @core 1.0 -vkCmdSetStencilCompareMask @core 1.0 -vkCmdSetStencilReference @core 1.0 -vkCmdSetStencilWriteMask @core 1.0 -vkCmdSetViewport @core 1.0 -vkCmdUpdateBuffer @core 1.0 -vkCmdWaitEvents @core 1.0 -vkCmdWriteTimestamp @core 1.0 -vkCreateBuffer @core 1.0 -vkCreateBufferView @core 1.0 -vkCreateCommandPool @core 1.0 -vkCreateComputePipelines @core 1.0 -vkCreateDescriptorPool @core 1.0 -vkCreateDescriptorSetLayout @core 1.0 -vkCreateDevice @core 1.0 -vkCreateEvent @core 1.0 -vkCreateFence @core 1.0 -vkCreateFramebuffer @core 1.0 -vkCreateGraphicsPipelines @core 1.0 -vkCreateImage @core 1.0 -vkCreateImageView @core 1.0 -vkCreatePipelineCache @core 1.0 -vkCreatePipelineLayout @core 1.0 -vkCreateQueryPool @core 1.0 -vkCreateRenderPass @core 1.0 -vkCreateSampler @core 1.0 -vkCreateSemaphore @core 1.0 -vkCreateShaderModule @core 1.0 -vkDestroyBuffer @core 1.0 -vkDestroyBufferView @core 1.0 -vkDestroyCommandPool @core 1.0 -vkDestroyDescriptorPool @core 1.0 -vkDestroyDescriptorSetLayout @core 1.0 -vkDestroyDevice @core 1.0 -vkDestroyEvent @core 1.0 -vkDestroyFence @core 1.0 -vkDestroyFramebuffer @core 1.0 -vkDestroyImage @core 1.0 -vkDestroyImageView @core 1.0 -vkDestroyInstance @core 1.0 -vkDestroyPipeline @core 1.0 -vkDestroyPipelineCache @core 1.0 -vkDestroyPipelineLayout @core 1.0 -vkDestroyQueryPool @core 1.0 -vkDestroyRenderPass @core 1.0 -vkDestroySampler @core 1.0 -vkDestroySemaphore @core 1.0 -vkDestroyShaderModule @core 1.0 -vkDeviceWaitIdle @core 1.0 -vkEndCommandBuffer @core 1.0 -vkEnumerateDeviceExtensionProperties @core 1.0 -vkEnumerateDeviceLayerProperties @core 1.0 -vkEnumeratePhysicalDevices @core 1.0 -vkFlushMappedMemoryRanges @core 1.0 -vkFreeCommandBuffers @core 1.0 -vkFreeDescriptorSets @core 1.0 -vkFreeMemory @core 1.0 -vkGetBufferMemoryRequirements @core 1.0 -vkGetDeviceMemoryCommitment @core 1.0 -vkGetDeviceQueue @core 1.0 -vkGetEventStatus @core 1.0 -vkGetFenceStatus @core 1.0 -vkGetImageMemoryRequirements @core 1.0 -vkGetImageSparseMemoryRequirements @core 1.0 -vkGetImageSubresourceLayout @core 1.0 -vkGetPhysicalDeviceFeatures @core 1.0 -vkGetPhysicalDeviceFormatProperties @core 1.0 -vkGetPhysicalDeviceImageFormatProperties @core 1.0 -vkGetPhysicalDeviceMemoryProperties @core 1.0 -vkGetPhysicalDeviceProperties @core 1.0 -vkGetPhysicalDeviceQueueFamilyProperties @core 1.0 -vkGetPhysicalDeviceSparseImageFormatProperties @core 1.0 -vkGetPipelineCacheData @core 1.0 -vkGetQueryPoolResults @core 1.0 -vkGetRenderAreaGranularity @core 1.0 -vkInvalidateMappedMemoryRanges @core 1.0 -vkMapMemory @core 1.0 -vkMergePipelineCaches @core 1.0 -vkQueueBindSparse @core 1.0 -vkQueueSubmit @core 1.0 -vkQueueWaitIdle @core 1.0 -vkResetCommandBuffer @core 1.0 -vkResetCommandPool @core 1.0 -vkResetDescriptorPool @core 1.0 -vkResetEvent @core 1.0 -vkResetFences @core 1.0 -vkSetEvent @core 1.0 -vkUnmapMemory @core 1.0 -vkUpdateDescriptorSets @core 1.0 -vkWaitForFences @core 1.0 - -vkEnumerateInstanceVersion @none 1.1 - -vkBindBufferMemory2 @core 1.1 -vkBindImageMemory2 @core 1.1 -vkCmdSetDeviceMask @core 1.1 -vkCmdDispatchBase @core 1.1 -vkCreateDescriptorUpdateTemplate @core 1.1 -vkCreateSamplerYcbcrConversion @core 1.1 -vkDestroyDescriptorUpdateTemplate @core 1.1 
-vkDestroySamplerYcbcrConversion @core 1.1 -vkEnumeratePhysicalDeviceGroups @core 1.1 -vkGetBufferMemoryRequirements2 @core 1.1 -vkGetDescriptorSetLayoutSupport @core 1.1 -vkGetDeviceGroupPeerMemoryFeatures @core 1.1 -vkGetDeviceQueue2 @core 1.1 -vkGetImageMemoryRequirements2 @core 1.1 -vkGetImageSparseMemoryRequirements2 @core 1.1 -vkGetPhysicalDeviceExternalBufferProperties @core 1.1 -vkGetPhysicalDeviceExternalFenceProperties @core 1.1 -vkGetPhysicalDeviceExternalSemaphoreProperties @core 1.1 -vkGetPhysicalDeviceFeatures2 @core 1.1 -vkGetPhysicalDeviceFormatProperties2 @core 1.1 -vkGetPhysicalDeviceImageFormatProperties2 @core 1.1 -vkGetPhysicalDeviceMemoryProperties2 @core 1.1 -vkGetPhysicalDeviceProperties2 @core 1.1 -vkGetPhysicalDeviceQueueFamilyProperties2 @core 1.1 -vkGetPhysicalDeviceSparseImageFormatProperties2 @core 1.1 -vkTrimCommandPool @core 1.1 -vkUpdateDescriptorSetWithTemplate @core 1.1 - -vkGetPhysicalDeviceFeatures2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceProperties2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceFormatProperties2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceImageFormatProperties2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceQueueFamilyProperties2KHR @iext KHR_get_physical_device_properties2 -vkGetPhysicalDeviceMemoryProperties2KHR @iext KHR_get_physical_device_properties2 - -vkGetPhysicalDeviceSparseImageFormatProperties2KHR @iext KHR_get_physical_device_properties2 - -vkEnumeratePhysicalDeviceGroupsKHR @iext KHR_device_group_creation - -vkGetDescriptorSetLayoutSupportKHR @dext KHR_maintenance3 - -vkGetPhysicalDevicePresentRectanglesKHR @dext KHR_device_group -vkGetDeviceGroupPeerMemoryFeaturesKHR @dext KHR_device_group -vkCmdSetDeviceMaskKHR @dext KHR_device_group -vkGetDeviceGroupPresentCapabilitiesKHR @dext KHR_device_group -vkGetDeviceGroupSurfacePresentModesKHR @dext KHR_device_group -vkAcquireNextImage2KHR @dext KHR_device_group -vkCmdDispatchBaseKHR @dext KHR_device_group - -vkBindBufferMemory2KHR @dext KHR_bind_memory2 -vkBindImageMemory2KHR @dext KHR_bind_memory2 - -vkCreateDescriptorUpdateTemplateKHR @dext KHR_descriptor_update_template -vkDestroyDescriptorUpdateTemplateKHR @dext KHR_descriptor_update_template -vkUpdateDescriptorSetWithTemplateKHR @dext KHR_descriptor_update_template - -vkEnumeratePhysicalDeviceGroupsKHX @iext KHX_device_group_creation -vkGetPhysicalDevicePresentRectanglesKHX @dext KHX_device_group -vkGetDeviceGroupPeerMemoryFeaturesKHX @dext KHX_device_group -vkCmdSetDeviceMaskKHX @dext KHX_device_group -vkGetDeviceGroupPresentCapabilitiesKHX @dext KHX_device_group -vkGetDeviceGroupSurfacePresentModesKHX @dext KHX_device_group -vkAcquireNextImage2KHX @dext KHX_device_group -vkCmdDispatchBaseKHX @dext KHX_device_group - -vkGetPhysicalDeviceExternalBufferPropertiesKHR @iext KHR_external_memory_capabilities - -vkGetMemoryFdPropertiesKHR @dext KHR_external_memory_fd -vkGetMemoryFdKHR @dext KHR_external_memory_fd - -vkGetMemoryWin32HandleKHR @dext KHR_external_memory_win32 -vkGetMemoryWin32HandlePropertiesKHR @dext KHR_external_memory_win32 - -vkGetPhysicalDeviceExternalSemaphorePropertiesKHR @iext KHR_external_semaphore_capabilities -vkImportSemaphoreFdKHR @dext KHR_external_semaphore_fd -vkGetSemaphoreFdKHR @dext KHR_external_semaphore_fd - -vkImportSemaphoreWin32HandleKHR @dext KHR_external_semaphore_win32 -vkGetSemaphoreWin32HandleKHR @dext KHR_external_semaphore_win32 - -vkTrimCommandPoolKHR @dext KHR_maintenance1 - -vkDestroySurfaceKHR @iext 
KHR_surface -vkGetPhysicalDeviceSurfaceCapabilitiesKHR @iext KHR_surface -vkGetPhysicalDeviceSurfaceFormatsKHR @iext KHR_surface -vkGetPhysicalDeviceSurfacePresentModesKHR @iext KHR_surface -vkGetPhysicalDeviceSurfaceSupportKHR @iext KHR_surface - -vkGetPhysicalDeviceSurfaceCapabilities2KHR @iext KHR_get_surface_capabilities2 -vkGetPhysicalDeviceSurfaceFormats2KHR @iext KHR_get_surface_capabilities2 - -vkCreateXcbSurfaceKHR @iext KHR_xcb_surface -vkGetPhysicalDeviceXcbPresentationSupportKHR @iext KHR_xcb_surface - -vkCreateXlibSurfaceKHR @iext KHR_xlib_surface -vkGetPhysicalDeviceXlibPresentationSupportKHR @iext KHR_xlib_surface - -vkAcquireNextImageKHR @dext KHR_swapchain -vkCreateSwapchainKHR @dext KHR_swapchain -vkDestroySwapchainKHR @dext KHR_swapchain -vkGetSwapchainImagesKHR @dext KHR_swapchain -vkQueuePresentKHR @dext KHR_swapchain - -vkCmdDrawIndexedIndirectCountAMD @dext AMD_draw_indirect_count -vkCmdDrawIndirectCountAMD @dext AMD_draw_indirect_count - -vkGetMultiDevicePropertiesAMDInternal @none $win32_only -vkOpenWin32BufferAMDInternal @none $win32_only -vkOpenWin32ImageAMDInternal @none $win32_only -vkOpenWin32SemaphoreAMDInternal @none $win32_only - -vkGetShaderInfoAMD @dext AMD_shader_info - -vkCmdDebugMarkerBeginEXT @dext EXT_debug_marker -vkCmdDebugMarkerEndEXT @dext EXT_debug_marker -vkCmdDebugMarkerInsertEXT @dext EXT_debug_marker -vkDebugMarkerSetObjectTagEXT @dext EXT_debug_marker -vkDebugMarkerSetObjectNameEXT @dext EXT_debug_marker - -vkCreateGpaSessionAMD @dext AMD_gpa_interface -vkDestroyGpaSessionAMD @dext AMD_gpa_interface -vkSetGpaDeviceClockModeAMD @dext AMD_gpa_interface -vkCmdBeginGpaSessionAMD @dext AMD_gpa_interface -vkCmdEndGpaSessionAMD @dext AMD_gpa_interface -vkCmdBeginGpaSampleAMD @dext AMD_gpa_interface -vkCmdEndGpaSampleAMD @dext AMD_gpa_interface -vkGetGpaSessionStatusAMD @dext AMD_gpa_interface -vkGetGpaSessionResultsAMD @dext AMD_gpa_interface -vkResetGpaSessionAMD @dext AMD_gpa_interface -vkCmdCopyGpaSessionResultsAMD @dext AMD_gpa_interface +# Entry point name type version/extension + +vkGetDeviceProcAddr @none +vkGetInstanceProcAddr @none + +vkCreateInstance @none +vkEnumerateInstanceExtensionProperties @none +vkEnumerateInstanceLayerProperties @none + +vkCreateDevice @icore 1.0 +vkDestroyInstance @icore 1.0 +vkEnumerateDeviceExtensionProperties @icore 1.0 +vkEnumerateDeviceLayerProperties @icore 1.0 +vkEnumeratePhysicalDevices @icore 1.0 +vkGetPhysicalDeviceFeatures @icore 1.0 +vkGetPhysicalDeviceFormatProperties @icore 1.0 +vkGetPhysicalDeviceImageFormatProperties @icore 1.0 +vkGetPhysicalDeviceMemoryProperties @icore 1.0 +vkGetPhysicalDeviceProperties @icore 1.0 +vkGetPhysicalDeviceQueueFamilyProperties @icore 1.0 +vkGetPhysicalDeviceSparseImageFormatProperties @icore 1.0 + +vkAllocateCommandBuffers @dcore 1.0 +vkAllocateDescriptorSets @dcore 1.0 +vkAllocateMemory @dcore 1.0 +vkBeginCommandBuffer @dcore 1.0 +vkBindBufferMemory @dcore 1.0 +vkBindImageMemory @dcore 1.0 +vkCmdBeginQuery @dcore 1.0 +vkCmdBeginRenderPass @dcore 1.0 +vkCmdBindDescriptorSets @dcore 1.0 +vkCmdBindIndexBuffer @dcore 1.0 +vkCmdBindPipeline @dcore 1.0 +vkCmdBindVertexBuffers @dcore 1.0 +vkCmdBlitImage @dcore 1.0 +vkCmdClearAttachments @dcore 1.0 +vkCmdClearColorImage @dcore 1.0 +vkCmdClearDepthStencilImage @dcore 1.0 +vkCmdCopyBuffer @dcore 1.0 +vkCmdCopyBufferToImage @dcore 1.0 +vkCmdCopyImage @dcore 1.0 +vkCmdCopyImageToBuffer @dcore 1.0 +vkCmdCopyQueryPoolResults @dcore 1.0 +vkCmdDispatch @dcore 1.0 +vkCmdDispatchIndirect @dcore 1.0 +vkCmdDraw @dcore 1.0 
+vkCmdDrawIndexed @dcore 1.0 +vkCmdDrawIndexedIndirect @dcore 1.0 +vkCmdDrawIndirect @dcore 1.0 +vkCmdEndQuery @dcore 1.0 +vkCmdEndRenderPass @dcore 1.0 +vkCmdExecuteCommands @dcore 1.0 +vkCmdFillBuffer @dcore 1.0 +vkCmdNextSubpass @dcore 1.0 +vkCmdPipelineBarrier @dcore 1.0 +vkCmdPushConstants @dcore 1.0 +vkCmdResetEvent @dcore 1.0 +vkCmdResetQueryPool @dcore 1.0 +vkCmdResolveImage @dcore 1.0 +vkCmdSetBlendConstants @dcore 1.0 +vkCmdSetDepthBias @dcore 1.0 +vkCmdSetDepthBounds @dcore 1.0 +vkCmdSetEvent @dcore 1.0 +vkCmdSetLineWidth @dcore 1.0 +vkCmdSetScissor @dcore 1.0 +vkCmdSetStencilCompareMask @dcore 1.0 +vkCmdSetStencilReference @dcore 1.0 +vkCmdSetStencilWriteMask @dcore 1.0 +vkCmdSetViewport @dcore 1.0 +vkCmdUpdateBuffer @dcore 1.0 +vkCmdWaitEvents @dcore 1.0 +vkCmdWriteTimestamp @dcore 1.0 +vkCreateBuffer @dcore 1.0 +vkCreateBufferView @dcore 1.0 +vkCreateCommandPool @dcore 1.0 +vkCreateComputePipelines @dcore 1.0 +vkCreateDescriptorPool @dcore 1.0 +vkCreateDescriptorSetLayout @dcore 1.0 +vkCreateEvent @dcore 1.0 +vkCreateFence @dcore 1.0 +vkCreateFramebuffer @dcore 1.0 +vkCreateGraphicsPipelines @dcore 1.0 +vkCreateImage @dcore 1.0 +vkCreateImageView @dcore 1.0 +vkCreatePipelineCache @dcore 1.0 +vkCreatePipelineLayout @dcore 1.0 +vkCreateQueryPool @dcore 1.0 +vkCreateRenderPass @dcore 1.0 +vkCreateSampler @dcore 1.0 +vkCreateSemaphore @dcore 1.0 +vkCreateShaderModule @dcore 1.0 +vkDestroyBuffer @dcore 1.0 +vkDestroyBufferView @dcore 1.0 +vkDestroyCommandPool @dcore 1.0 +vkDestroyDescriptorPool @dcore 1.0 +vkDestroyDescriptorSetLayout @dcore 1.0 +vkDestroyDevice @dcore 1.0 +vkDestroyEvent @dcore 1.0 +vkDestroyFence @dcore 1.0 +vkDestroyFramebuffer @dcore 1.0 +vkDestroyImage @dcore 1.0 +vkDestroyImageView @dcore 1.0 +vkDestroyPipeline @dcore 1.0 +vkDestroyPipelineCache @dcore 1.0 +vkDestroyPipelineLayout @dcore 1.0 +vkDestroyQueryPool @dcore 1.0 +vkDestroyRenderPass @dcore 1.0 +vkDestroySampler @dcore 1.0 +vkDestroySemaphore @dcore 1.0 +vkDestroyShaderModule @dcore 1.0 +vkDeviceWaitIdle @dcore 1.0 +vkEndCommandBuffer @dcore 1.0 +vkFlushMappedMemoryRanges @dcore 1.0 +vkFreeCommandBuffers @dcore 1.0 +vkFreeDescriptorSets @dcore 1.0 +vkFreeMemory @dcore 1.0 +vkGetBufferMemoryRequirements @dcore 1.0 +vkGetDeviceMemoryCommitment @dcore 1.0 +vkGetDeviceQueue @dcore 1.0 +vkGetEventStatus @dcore 1.0 +vkGetFenceStatus @dcore 1.0 +vkGetImageMemoryRequirements @dcore 1.0 +vkGetImageSparseMemoryRequirements @dcore 1.0 +vkGetImageSubresourceLayout @dcore 1.0 +vkGetPipelineCacheData @dcore 1.0 +vkGetQueryPoolResults @dcore 1.0 +vkGetRenderAreaGranularity @dcore 1.0 +vkInvalidateMappedMemoryRanges @dcore 1.0 +vkMapMemory @dcore 1.0 +vkMergePipelineCaches @dcore 1.0 +vkQueueBindSparse @dcore 1.0 +vkQueueSubmit @dcore 1.0 +vkQueueWaitIdle @dcore 1.0 +vkResetCommandBuffer @dcore 1.0 +vkResetCommandPool @dcore 1.0 +vkResetDescriptorPool @dcore 1.0 +vkResetEvent @dcore 1.0 +vkResetFences @dcore 1.0 +vkSetEvent @dcore 1.0 +vkUnmapMemory @dcore 1.0 +vkUpdateDescriptorSets @dcore 1.0 +vkWaitForFences @dcore 1.0 + +vkEnumerateInstanceVersion @none 1.1 + +vkEnumeratePhysicalDeviceGroups @icore 1.1 +vkGetPhysicalDeviceExternalBufferProperties @icore 1.1 +vkGetPhysicalDeviceExternalFenceProperties @icore 1.1 +vkGetPhysicalDeviceExternalSemaphoreProperties @icore 1.1 +vkGetPhysicalDeviceFeatures2 @icore 1.1 +vkGetPhysicalDeviceFormatProperties2 @icore 1.1 +vkGetPhysicalDeviceImageFormatProperties2 @icore 1.1 +vkGetPhysicalDeviceMemoryProperties2 @icore 1.1 +vkGetPhysicalDeviceProperties2 @icore 1.1 
+vkGetPhysicalDeviceQueueFamilyProperties2 @icore 1.1 +vkGetPhysicalDeviceSparseImageFormatProperties2 @icore 1.1 + +vkBindBufferMemory2 @dcore 1.1 +vkBindImageMemory2 @dcore 1.1 +vkCmdDispatchBase @dcore 1.1 +vkCmdSetDeviceMask @dcore 1.1 +vkCreateDescriptorUpdateTemplate @dcore 1.1 +vkCreateSamplerYcbcrConversion @dcore 1.1 +vkDestroyDescriptorUpdateTemplate @dcore 1.1 +vkDestroySamplerYcbcrConversion @dcore 1.1 +vkGetBufferMemoryRequirements2 @dcore 1.1 +vkGetDescriptorSetLayoutSupport @dcore 1.1 +vkGetDeviceGroupPeerMemoryFeatures @dcore 1.1 +vkGetDeviceQueue2 @dcore 1.1 +vkGetImageMemoryRequirements2 @dcore 1.1 +vkGetImageSparseMemoryRequirements2 @dcore 1.1 +vkTrimCommandPool @dcore 1.1 +vkUpdateDescriptorSetWithTemplate @dcore 1.1 + +vkGetPhysicalDeviceFeatures2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceFormatProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceImageFormatProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceQueueFamilyProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceMemoryProperties2KHR @iext KHR_get_physical_device_properties2 +vkGetPhysicalDeviceSparseImageFormatProperties2KHR @iext KHR_get_physical_device_properties2 + +vkEnumeratePhysicalDeviceGroupsKHR @iext KHR_device_group_creation +vkGetPhysicalDevicePresentRectanglesKHR @dext KHR_device_group +vkGetDeviceGroupPeerMemoryFeaturesKHR @dext KHR_device_group +vkCmdSetDeviceMaskKHR @dext KHR_device_group +vkGetDeviceGroupPresentCapabilitiesKHR @dext KHR_device_group +vkGetDeviceGroupSurfacePresentModesKHR @dext KHR_device_group +vkAcquireNextImage2KHR @dext KHR_device_group +vkCmdDispatchBaseKHR @dext KHR_device_group + +vkBindBufferMemory2KHR @dext KHR_bind_memory2 +vkBindImageMemory2KHR @dext KHR_bind_memory2 + +vkCreateDescriptorUpdateTemplateKHR @dext KHR_descriptor_update_template +vkDestroyDescriptorUpdateTemplateKHR @dext KHR_descriptor_update_template +vkUpdateDescriptorSetWithTemplateKHR @dext KHR_descriptor_update_template + +vkEnumeratePhysicalDeviceGroupsKHX @iext KHX_device_group_creation +vkGetPhysicalDevicePresentRectanglesKHX @dext KHX_device_group +vkGetDeviceGroupPeerMemoryFeaturesKHX @dext KHX_device_group +vkCmdSetDeviceMaskKHX @dext KHX_device_group +vkGetDeviceGroupPresentCapabilitiesKHX @dext KHX_device_group +vkGetDeviceGroupSurfacePresentModesKHX @dext KHX_device_group +vkAcquireNextImage2KHX @dext KHX_device_group +vkCmdDispatchBaseKHX @dext KHX_device_group + +vkGetPhysicalDeviceExternalBufferPropertiesKHR @iext KHR_external_memory_capabilities + +vkGetMemoryFdPropertiesKHR @dext KHR_external_memory_fd +vkGetMemoryFdKHR @dext KHR_external_memory_fd + +vkGetMemoryWin32HandleKHR @dext KHR_external_memory_win32 +vkGetMemoryWin32HandlePropertiesKHR @dext KHR_external_memory_win32 + +vkGetPhysicalDeviceExternalSemaphorePropertiesKHR @iext KHR_external_semaphore_capabilities + +vkImportSemaphoreFdKHR @dext KHR_external_semaphore_fd +vkGetSemaphoreFdKHR @dext KHR_external_semaphore_fd + +vkImportSemaphoreWin32HandleKHR @dext KHR_external_semaphore_win32 +vkGetSemaphoreWin32HandleKHR @dext KHR_external_semaphore_win32 + +vkTrimCommandPoolKHR @dext KHR_maintenance1 + +vkGetDescriptorSetLayoutSupportKHR @dext KHR_maintenance3 + +vkDestroySurfaceKHR @iext KHR_surface +vkGetPhysicalDeviceSurfaceCapabilitiesKHR @iext KHR_surface +vkGetPhysicalDeviceSurfaceFormatsKHR @iext KHR_surface +vkGetPhysicalDeviceSurfacePresentModesKHR 
@iext KHR_surface +vkGetPhysicalDeviceSurfaceSupportKHR @iext KHR_surface + +vkGetPhysicalDeviceSurfaceCapabilities2KHR @iext KHR_get_surface_capabilities2 +vkGetPhysicalDeviceSurfaceFormats2KHR @iext KHR_get_surface_capabilities2 + +vkCreateXcbSurfaceKHR @iext KHR_xcb_surface +vkGetPhysicalDeviceXcbPresentationSupportKHR @iext KHR_xcb_surface + +vkCreateXlibSurfaceKHR @iext KHR_xlib_surface +vkGetPhysicalDeviceXlibPresentationSupportKHR @iext KHR_xlib_surface + +vkAcquireNextImageKHR @dext KHR_swapchain +vkCreateSwapchainKHR @dext KHR_swapchain +vkDestroySwapchainKHR @dext KHR_swapchain +vkGetSwapchainImagesKHR @dext KHR_swapchain +vkQueuePresentKHR @dext KHR_swapchain + +vkCmdDrawIndexedIndirectCountAMD @dext AMD_draw_indirect_count +vkCmdDrawIndirectCountAMD @dext AMD_draw_indirect_count + +vkGetMultiDevicePropertiesAMDInternal @none $win32_only +vkOpenWin32BufferAMDInternal @none $win32_only +vkOpenWin32ImageAMDInternal @none $win32_only +vkOpenWin32SemaphoreAMDInternal @none $win32_only + +vkGetShaderInfoAMD @dext AMD_shader_info + +vkCmdDebugMarkerBeginEXT @dext EXT_debug_marker +vkCmdDebugMarkerEndEXT @dext EXT_debug_marker +vkCmdDebugMarkerInsertEXT @dext EXT_debug_marker +vkDebugMarkerSetObjectTagEXT @dext EXT_debug_marker +vkDebugMarkerSetObjectNameEXT @dext EXT_debug_marker + +vkCreateGpaSessionAMD @dext AMD_gpa_interface +vkDestroyGpaSessionAMD @dext AMD_gpa_interface +vkSetGpaDeviceClockModeAMD @dext AMD_gpa_interface +vkCmdBeginGpaSessionAMD @dext AMD_gpa_interface +vkCmdEndGpaSessionAMD @dext AMD_gpa_interface +vkCmdBeginGpaSampleAMD @dext AMD_gpa_interface +vkCmdEndGpaSampleAMD @dext AMD_gpa_interface +vkGetGpaSessionStatusAMD @dext AMD_gpa_interface +vkGetGpaSessionResultsAMD @dext AMD_gpa_interface +vkResetGpaSessionAMD @dext AMD_gpa_interface +vkCmdCopyGpaSessionResultsAMD @dext AMD_gpa_interface -vkGetImageMemoryRequirements2KHR @dext KHR_get_memory_requirements2 -vkGetBufferMemoryRequirements2KHR @dext KHR_get_memory_requirements2 -vkGetImageSparseMemoryRequirements2KHR @dext KHR_get_memory_requirements2 +vkGetImageMemoryRequirements2KHR @dext KHR_get_memory_requirements2 +vkGetBufferMemoryRequirements2KHR @dext KHR_get_memory_requirements2 +vkGetImageSparseMemoryRequirements2KHR @dext KHR_get_memory_requirements2 -vkCmdSetSampleLocationsEXT @dext EXT_sample_locations -vkGetPhysicalDeviceMultisamplePropertiesEXT @dext EXT_sample_locations +vkCmdSetSampleLocationsEXT @dext EXT_sample_locations +vkGetPhysicalDeviceMultisamplePropertiesEXT @dext EXT_sample_locations -vkGetPhysicalDeviceExternalFencePropertiesKHR @iext KHR_external_fence_capabilities +vkGetPhysicalDeviceExternalFencePropertiesKHR @iext KHR_external_fence_capabilities -vkImportFenceFdKHR @dext KHR_external_fence_fd -vkGetFenceFdKHR @dext KHR_external_fence_fd +vkImportFenceFdKHR @dext KHR_external_fence_fd +vkGetFenceFdKHR @dext KHR_external_fence_fd -vkImportFenceWin32HandleKHR @dext KHR_external_fence_win32 -vkGetFenceWin32HandleKHR @dext KHR_external_fence_win32 +vkImportFenceWin32HandleKHR @dext KHR_external_fence_win32 +vkGetFenceWin32HandleKHR @dext KHR_external_fence_win32 -vkCmdWriteBufferMarkerAMD @dext AMD_buffer_marker +vkCmdWriteBufferMarkerAMD @dext AMD_buffer_marker -vkCreateDebugReportCallbackEXT @iext EXT_debug_report -vkDestroyDebugReportCallbackEXT @iext EXT_debug_report -vkDebugReportMessageEXT @iext EXT_debug_report +vkCreateDebugReportCallbackEXT @iext EXT_debug_report +vkDestroyDebugReportCallbackEXT @iext EXT_debug_report +vkDebugReportMessageEXT @iext EXT_debug_report 
-vkGetMemoryHostPointerPropertiesEXT @dext EXT_external_memory_host +vkGetMemoryHostPointerPropertiesEXT @dext EXT_external_memory_host diff --git a/icd/api/strings/base_extensions.txt b/icd/api/strings/base_extensions.txt index 863772c9..b19a50ee 100644 --- a/icd/api/strings/base_extensions.txt +++ b/icd/api/strings/base_extensions.txt @@ -48,6 +48,7 @@ VK_AMD_texture_gather_bias_lod VK_AMD_mixed_attachment_samples VK_EXT_debug_marker VK_AMD_gpu_shader_int16 +VK_EXT_shader_subgroup_ballot VK_EXT_shader_subgroup_vote VK_KHR_16bit_storage VK_KHR_storage_buffer_storage_class diff --git a/icd/api/strings/generate_strings.py b/icd/api/strings/generate_strings.py index c5ca9166..26b5782c 100644 --- a/icd/api/strings/generate_strings.py +++ b/icd/api/strings/generate_strings.py @@ -148,8 +148,10 @@ def generate_string_file_pass(string_file_prefix, header_file_prefix, gentype): if gentype == 'decl': if type == 'none': generate_entry_point_condition(f, name, "NONE", 0) - elif type == 'core': - generate_entry_point_condition(f, name, "CORE", make_version(value)) + elif type == 'icore': + generate_entry_point_condition(f, name, "CORE_INSTANCE", make_version(value)) + elif type == 'dcore': + generate_entry_point_condition(f, name, "CORE_DEVICE", make_version(value)) elif type == 'iext': generate_entry_point_condition(f, name, "INSTANCE_EXTENSION", "vk::InstanceExtensions::%s" % value.upper()) elif type == 'dext': diff --git a/icd/api/strings/strings.h b/icd/api/strings/strings.h index 1c462817..a1406af3 100644 --- a/icd/api/strings/strings.h +++ b/icd/api/strings/strings.h @@ -47,7 +47,8 @@ namespace secure enum EntryPointCondition : uint32_t { ENTRY_POINT_NONE, // First-class entry point without any condition - ENTRY_POINT_CORE, // Core entry point specific to a core Vulkan version + ENTRY_POINT_CORE_INSTANCE, // Core instance entry point specific to a core Vulkan version + ENTRY_POINT_CORE_DEVICE, // Core device entry point specific to a core Vulkan version ENTRY_POINT_INSTANCE_EXTENSION, // Instance extension specific entry point ENTRY_POINT_DEVICE_EXTENSION, // Device extension specific entry point }; diff --git a/icd/api/vk_buffer.cpp b/icd/api/vk_buffer.cpp index 5be28d13..f9464ef9 100644 --- a/icd/api/vk_buffer.cpp +++ b/icd/api/vk_buffer.cpp @@ -469,7 +469,8 @@ VKAPI_ATTR void VKAPI_CALL vkGetBufferMemoryRequirements2KHR( VkMemoryRequirements2KHR* pMemoryRequirements) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - VK_ASSERT(pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); + VK_ASSERT((pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); union { diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index 6c3ce94e..20ceb2fd 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -297,6 +297,46 @@ Pal::Result CreateClearSubresRanges( return palResult; } +// ===================================================================================================================== +// Returns attachment's PAL subresource ranges defined by clearInfo for LoadOp Clear. +// When multiview is enabled, layer ranges are modified according active views during a renderpass. +Util::Vector +LoadOpClearSubresRanges( + const Framebuffer::Attachment& attachment, + const RPLoadOpClearInfo& clearInfo, + const RenderPass& renderPass) +{ + // Note that no allocation will be performed, so Util::Vector allocator is nullptr. 
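// Worked example (view mask chosen for illustration): an active view mask of 0b1011 makes
// RangesOfOnesInBitMask() return the layer ranges {offset 0, extent 2} and {offset 3, extent 1};
// each attachment subresource range is then pushed once per layer range below, with its
// arraySlice offset and numSlices rewritten so the clear touches only the active views.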
+ Util::Vector clearSubresRanges { nullptr }; + + const auto attachmentSubresRanges = attachment.FindSubresRanges(clearInfo.aspect); + + if (renderPass.IsMultiviewEnabled()) + { + const auto activeViews = renderPass.GetActiveViewsBitMask(); + const auto layerRanges = RangesOfOnesInBitMask(activeViews); + + for (uint32_t rangeIndex = 0; rangeIndex < attachmentSubresRanges.NumElements(); ++rangeIndex) + { + for (auto layerRangeIt = layerRanges.Begin(); layerRangeIt.IsValid(); layerRangeIt.Next()) + { + clearSubresRanges.PushBack(attachmentSubresRanges.At(rangeIndex)); + clearSubresRanges.Back().startSubres.arraySlice += layerRangeIt.Get().offset; + clearSubresRanges.Back().numSlices = layerRangeIt.Get().extent; + } + } + } + else + { + for (uint32_t rangeIndex = 0; rangeIndex < attachmentSubresRanges.NumElements(); ++rangeIndex) + { + clearSubresRanges.PushBack(attachmentSubresRanges.At(rangeIndex)); + } + } + + return clearSubresRanges; +} + // ===================================================================================================================== // Populate a vector with PAL rects created from Vulkan clear rects. // Returns Pal::Result::Success if completed successfully. @@ -1826,22 +1866,30 @@ void CmdBuffer::CopyBuffer( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)); + auto regionBatch = Util::Min(regionCount, maxRegions); + // Allocate space to store memory copy regions - Pal::MemoryCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionCount); + Pal::MemoryCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { Buffer* pSrcBuffer = Buffer::ObjectFromHandle(srcBuffer); Buffer* pDstBuffer = Buffer::ObjectFromHandle(destBuffer); - for (uint32_t i = 0; i < regionCount; ++i) + for (uint32_t regionIdx = 0; regionIdx < regionCount; regionIdx += regionBatch) { - pPalRegions[i].srcOffset = pSrcBuffer->MemOffset() + pRegions[i].srcOffset; - pPalRegions[i].dstOffset = pDstBuffer->MemOffset() + pRegions[i].dstOffset; - pPalRegions[i].copySize = pRegions[i].size; - } + regionBatch = Util::Min(regionCount - regionIdx, maxRegions); - PalCmdCopyBuffer(pSrcBuffer, pDstBuffer, regionCount, pPalRegions); + for (uint32_t i = 0; i < regionBatch; ++i) + { + pPalRegions[i].srcOffset = pSrcBuffer->MemOffset() + pRegions[regionIdx + i].srcOffset; + pPalRegions[i].dstOffset = pDstBuffer->MemOffset() + pRegions[regionIdx + i].dstOffset; + pPalRegions[i].copySize = pRegions[regionIdx + i].size; + } + + PalCmdCopyBuffer(pSrcBuffer, pDstBuffer, regionBatch, pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -1866,8 +1914,11 @@ void CmdBuffer::CopyImage( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)), MaxPalAspectsPerMask); + auto regionBatch = Util::Min(regionCount * MaxPalAspectsPerMask, maxRegions); + Pal::ImageCopyRegion* pPalRegions = - virtStackFrame.AllocArray(regionCount * MaxPalAspectsPerMask); + virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { @@ -1880,14 +1931,21 @@ void CmdBuffer::CopyImage( const Pal::ImageLayout palSrcImgLayout = pSrcImage->GetTransferLayout(srcImageLayout, this); const Pal::ImageLayout palDstImgLayout = pDstImage->GetTransferLayout(destImageLayout, this); - uint32_t palRegionCount = 0; - - for (uint32_t i = 0; i < regionCount; ++i) + for (uint32_t regionIdx = 0; regionIdx < regionCount;) { - 
VkToPalImageCopyRegion(pRegions[i], srcFormat.format, dstFormat.format, pPalRegions, palRegionCount); - } + uint32_t palRegionCount = 0; - PalCmdCopyImage(pSrcImage, palSrcImgLayout, pDstImage, palDstImgLayout, palRegionCount, pPalRegions); + while ((regionIdx < regionCount) && + (palRegionCount <= (regionBatch - MaxPalAspectsPerMask))) + { + VkToPalImageCopyRegion(pRegions[regionIdx], srcFormat.format, dstFormat.format, + pPalRegions, palRegionCount); + + ++regionIdx; + } + + PalCmdCopyImage(pSrcImage, palSrcImgLayout, pDstImage, palDstImgLayout, palRegionCount, pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -1911,39 +1969,50 @@ void CmdBuffer::BlitImage( { DbgBarrierPreCmd(DbgBarrierCopyImage); - const Image* const pSrcImage = Image::ObjectFromHandle(srcImage); - const Image* const pDstImage = Image::ObjectFromHandle(destImage); - - const Pal::SwizzledFormat srcFormat = VkToPalFormat(pSrcImage->GetFormat()); - const Pal::SwizzledFormat dstFormat = VkToPalFormat(pDstImage->GetFormat()); - - Pal::ScaledCopyInfo palCopyInfo = {}; - - palCopyInfo.srcImageLayout = pSrcImage->GetTransferLayout(srcImageLayout, this); - palCopyInfo.dstImageLayout = pDstImage->GetTransferLayout(destImageLayout, this); - VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)), MaxPalAspectsPerMask); + auto regionBatch = Util::Min(regionCount * MaxPalAspectsPerMask, maxRegions); + // Allocate space to store scaled image copy regions (we need a separate region per PAL aspect) Pal::ImageScaledCopyRegion* pPalRegions = - virtStackFrame.AllocArray(regionCount * MaxPalAspectsPerMask); + virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { - for (uint32_t i = 0; i < regionCount; ++i) - { - VkToPalImageScaledCopyRegion(pRegions[i], srcFormat.format, dstFormat.format, pPalRegions, palCopyInfo.regionCount); - } + const Image* const pSrcImage = Image::ObjectFromHandle(srcImage); + const Image* const pDstImage = Image::ObjectFromHandle(destImage); - palCopyInfo.pRegions = pPalRegions; + const Pal::SwizzledFormat srcFormat = VkToPalFormat(pSrcImage->GetFormat()); + const Pal::SwizzledFormat dstFormat = VkToPalFormat(pDstImage->GetFormat()); - // Maps blit filters to their PAL equivalent - palCopyInfo.filter = VkToPalTexFilter(VK_FALSE, filter, filter, VK_SAMPLER_MIPMAP_MODE_NEAREST); + Pal::ScaledCopyInfo palCopyInfo = {}; + + palCopyInfo.srcImageLayout = pSrcImage->GetTransferLayout(srcImageLayout, this); + palCopyInfo.dstImageLayout = pDstImage->GetTransferLayout(destImageLayout, this); + // Maps blit filters to their PAL equivalent + palCopyInfo.filter = VkToPalTexFilter(VK_FALSE, filter, filter, VK_SAMPLER_MIPMAP_MODE_NEAREST); palCopyInfo.rotation = Pal::ImageRotation::Ccw0; - // This will do a scaled blit - PalCmdScaledCopyImage(pSrcImage, pDstImage, palCopyInfo); + palCopyInfo.pRegions = pPalRegions; + + for (uint32_t regionIdx = 0; regionIdx < regionCount;) + { + palCopyInfo.regionCount = 0; + + while ((regionIdx < regionCount) && + (palCopyInfo.regionCount <= (regionBatch - MaxPalAspectsPerMask))) + { + VkToPalImageScaledCopyRegion(pRegions[regionIdx], srcFormat.format, dstFormat.format, + pPalRegions, palCopyInfo.regionCount); + + ++regionIdx; + } + + // This will do a scaled blit + PalCmdScaledCopyImage(pSrcImage, pDstImage, palCopyInfo); + } virtStackFrame.FreeArray(pPalRegions); } @@ -1955,6 +2024,9 @@ void CmdBuffer::BlitImage( DbgBarrierPostCmd(DbgBarrierCopyImage); } +// PAL version 391.1 adds 
support for mis-aligned buffer-image/image-buffer copies +#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 391) || \ + ((PAL_CLIENT_INTERFACE_MAJOR_VERSION == 391) && (PAL_CLIENT_INTERFACE_MINOR_VERSION < 1)) // ===================================================================================================================== // Align memory to image copy region void CmdBuffer::AlignMemoryImageCopyRegion( @@ -1981,6 +2053,7 @@ void CmdBuffer::AlignMemoryImageCopyRegion( const uint32_t copySizePixelsHeight = copySizeBytesHeight / bytesPerPixel; pRegion->imageExtent.height = copySizePixelsHeight; } +#endif // ===================================================================================================================== // Copies from a buffer of linear data to a region of an image (vkCopyBufferToImage) @@ -1995,8 +2068,11 @@ void CmdBuffer::CopyBufferToImage( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)); + auto regionBatch = Util::Min(regionCount, maxRegions); + // Allocate space to store memory image copy regions - Pal::MemoryImageCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionCount); + Pal::MemoryImageCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { @@ -2006,26 +2082,35 @@ void CmdBuffer::CopyBufferToImage( const Pal::ImageLayout layout = pDstImage->GetTransferLayout(destImageLayout, this); - for (uint32_t i = 0; i < regionCount; ++i) + for (uint32_t regionIdx = 0; regionIdx < regionCount; regionIdx += regionBatch) { - // For image-buffer copies we have to override the format for depth-only and stencil-only copies - Pal::SwizzledFormat dstFormat = VkToPalFormat(Formats::GetAspectFormat( - pDstImage->GetFormat(), pRegions[i].imageSubresource.aspectMask)); - - pPalRegions[i] = VkToPalMemoryImageCopyRegion(pRegions[i], dstFormat.format, srcMemOffset); - - if (!GpuUtil::ValidateMemoryImageRegion( - m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties(), - m_palEngineType, - *pDstImage->PalImage(), - *pSrcBuffer->PalMemory(), - pPalRegions[i])) + regionBatch = Util::Min(regionCount - regionIdx, maxRegions); + + for (uint32_t i = 0; i < regionBatch; ++i) { - AlignMemoryImageCopyRegion(pDstImage->PalImage(), &pPalRegions[i]); + // For image-buffer copies we have to override the format for depth-only and stencil-only copies + Pal::SwizzledFormat dstFormat = VkToPalFormat(Formats::GetAspectFormat( + pDstImage->GetFormat(), pRegions[regionIdx + i].imageSubresource.aspectMask)); + + pPalRegions[i] = VkToPalMemoryImageCopyRegion(pRegions[regionIdx + i], dstFormat.format, srcMemOffset); + + // PAL version 391.1 adds support for mis-aligned buffer-image/image-buffer copies +#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION < 391) || \ + ((PAL_CLIENT_INTERFACE_MAJOR_VERSION == 391) && (PAL_CLIENT_INTERFACE_MINOR_VERSION < 1)) + if (!GpuUtil::ValidateMemoryImageRegion( + m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties(), + m_palEngineType, + *pDstImage->PalImage(), + *pSrcBuffer->PalMemory(), + pPalRegions[i])) + { + AlignMemoryImageCopyRegion(pDstImage->PalImage(), &pPalRegions[i]); + } +#endif } - } - PalCmdCopyMemoryToImage(pSrcBuffer, pDstImage, layout, regionCount, pPalRegions); + PalCmdCopyMemoryToImage(pSrcBuffer, pDstImage, layout, regionBatch, pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -2050,8 +2135,11 @@ void CmdBuffer::CopyImageToBuffer( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRegions = 
EstimateMaxObjectsOnVirtualStack(sizeof(*pRegions)); + auto regionBatch = Util::Min(regionCount, maxRegions); + // Allocate space to store memory image copy regions - Pal::MemoryImageCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionCount); + Pal::MemoryImageCopyRegion* pPalRegions = virtStackFrame.AllocArray(regionBatch); if (pPalRegions != nullptr) { @@ -2063,18 +2151,21 @@ void CmdBuffer::CopyImageToBuffer( const Pal::ImageLayout layout = pSrcImage->GetTransferLayout(srcImageLayout, this); - uint32_t engineCopyCount = 0; - - for (uint32_t i = 0; i < regionCount; ++i) + for (uint32_t regionIdx = 0; regionIdx < regionCount; regionIdx += regionBatch) { - // For image-buffer copies we have to override the format for depth-only and stencil-only copies - Pal::SwizzledFormat srcFormat = VkToPalFormat(Formats::GetAspectFormat(pSrcImage->GetFormat(), - pRegions[i].imageSubresource.aspectMask)); + regionBatch = Util::Min(regionCount - regionIdx, maxRegions); - pPalRegions[engineCopyCount++] = VkToPalMemoryImageCopyRegion(pRegions[i], srcFormat.format, dstMemOffset); - } + for (uint32_t i = 0; i < regionBatch; ++i) + { + // For image-buffer copies we have to override the format for depth-only and stencil-only copies + Pal::SwizzledFormat srcFormat = VkToPalFormat(Formats::GetAspectFormat(pSrcImage->GetFormat(), + pRegions[regionIdx + i].imageSubresource.aspectMask)); - PalCmdCopyImageToMemory(pSrcImage, pDstBuffer, layout, regionCount, pPalRegions); + pPalRegions[i] = VkToPalMemoryImageCopyRegion(pRegions[regionIdx + i], srcFormat.format, dstMemOffset); + } + + PalCmdCopyImageToMemory(pSrcImage, pDstBuffer, layout, regionBatch, pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -2147,37 +2238,46 @@ void CmdBuffer::ClearColorImage( VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRanges = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRanges)), MaxPalColorAspectsPerMask); + auto rangeBatch = Util::Min(rangeCount * MaxPalColorAspectsPerMask, maxRanges); + // Allocate space to store image subresource ranges - Pal::SubresRange* pPalRanges = virtStackFrame.AllocArray(rangeCount * MaxPalColorAspectsPerMask); + Pal::SubresRange* pPalRanges = virtStackFrame.AllocArray(rangeBatch); if (pPalRanges != nullptr) { - uint32_t palRangeCount = 0; - const Pal::ImageLayout layout = pImage->GetTransferLayout(imageLayout, this); - for (uint32_t i = 0; i < rangeCount; ++i) + for (uint32_t rangeIdx = 0; rangeIdx < rangeCount;) { - // Only color aspect is allowed here - VK_ASSERT(pRanges[i].aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); - - VkToPalSubresRange(palFormat.format, - pRanges[i], - pImage->GetMipLevels(), - pImage->GetArraySize(), - pPalRanges, - palRangeCount); - } + uint32_t palRangeCount = 0; - PalCmdClearColorImage( - *pImage, - layout, - VkToPalClearColor(pColor, palFormat.format), - palRangeCount, - pPalRanges, - 0, - nullptr, - 0); + while ((rangeIdx < rangeCount) && + (palRangeCount <= (rangeBatch - MaxPalColorAspectsPerMask))) + { + // Only color aspect is allowed here + VK_ASSERT(pRanges[rangeIdx].aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); + + VkToPalSubresRange(palFormat.format, + pRanges[rangeIdx], + pImage->GetMipLevels(), + pImage->GetArraySize(), + pPalRanges, + palRangeCount); + + ++rangeIdx; + } + + PalCmdClearColorImage( + *pImage, + layout, + VkToPalClearColor(pColor, palFormat.format), + palRangeCount, + pPalRanges, + 0, + nullptr, + 0); + } virtStackFrame.FreeArray(pPalRanges); } @@ -2236,40 +2336,49 @@ void CmdBuffer::ClearDepthStencilImage( { 
VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRanges = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRanges)), MaxPalDepthAspectsPerMask); + auto rangeBatch = Util::Min(rangeCount * MaxPalDepthAspectsPerMask, maxRanges); + // Allocate space to store image subresource ranges (we need a separate region per PAL aspect) - Pal::SubresRange* pPalRanges = virtStackFrame.AllocArray(rangeCount * MaxPalDepthAspectsPerMask); + Pal::SubresRange* pPalRanges = virtStackFrame.AllocArray(rangeBatch); if (pPalRanges != nullptr) { - uint32_t palRangeCount = 0; - const Image* pImage = Image::ObjectFromHandle(image); const Pal::ImageLayout layout = pImage->GetTransferLayout(imageLayout, this); - for (uint32_t i = 0; i < rangeCount; ++i) + for (uint32_t rangeIdx = 0; rangeIdx < rangeCount;) { - // Only depth or stencil aspect is allowed here - VK_ASSERT((pRanges[i].aspectMask & ~(VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) == 0); - - VkToPalSubresRange(VkToPalFormat(pImage->GetFormat()).format, - pRanges[i], - pImage->GetMipLevels(), - pImage->GetArraySize(), - pPalRanges, - palRangeCount); - } + uint32_t palRangeCount = 0; - PalCmdClearDepthStencil( - *pImage, - layout, - layout, - VkToPalClearDepth(depth), - stencil, - palRangeCount, - pPalRanges, - 0, - nullptr, - 0); + while ((rangeIdx < rangeCount) && + (palRangeCount <= (rangeBatch - MaxPalDepthAspectsPerMask))) + { + // Only depth or stencil aspect is allowed here + VK_ASSERT((pRanges[rangeIdx].aspectMask & ~(VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) == 0); + + VkToPalSubresRange(VkToPalFormat(pImage->GetFormat()).format, + pRanges[rangeIdx], + pImage->GetMipLevels(), + pImage->GetArraySize(), + pPalRanges, + palRangeCount); + + ++rangeIdx; + } + + PalCmdClearDepthStencil( + *pImage, + layout, + layout, + VkToPalClearDepth(depth), + stencil, + palRangeCount, + pPalRanges, + 0, + nullptr, + 0); + } virtStackFrame.FreeArray(pPalRanges); } @@ -2316,11 +2425,9 @@ void CmdBuffer::ClearBoundAttachments( Util::Vector clearRegions { &virtStackFrame }; Util::Vector colorTargets { &virtStackFrame }; - const auto palResult1 = CreateClearRegions( - rectCount, pRects, - *pRenderPass, subpass, - &clearRegions); - + const auto maxRects = EstimateMaxObjectsOnVirtualStack(sizeof(*pRects)); + auto rectBatch = Util::Min(rectCount, maxRects); + const auto palResult1 = clearRegions.Reserve(rectBatch); const auto palResult2 = colorTargets.Reserve(attachmentCount); if ((palResult1 != Pal::Result::Success) || @@ -2373,15 +2480,25 @@ void CmdBuffer::ClearBoundAttachments( DbgBarrierPreCmd(DbgBarrierClearDepth); - // Clear the bound depth stencil target immediately - PalCmdBuffer(DefaultDeviceIndex)->CmdClearBoundDepthStencilTargets( - VkToPalClearDepth(clearInfo.clearValue.depthStencil.depth), - clearInfo.clearValue.depthStencil.stencil, - pRenderPass->GetDepthStencilAttachmentSamples(subpass), - pRenderPass->GetDepthStencilAttachmentSamples(subpass), - selectFlags, - clearRegions.NumElements(), - clearRegions.Data()); + for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch) + { + rectBatch = Util::Min(rectCount - rectIdx, maxRects); + + CreateClearRegions( + rectBatch, pRects + rectIdx, + *pRenderPass, subpass, + &clearRegions); + + // Clear the bound depth stencil target immediately + PalCmdBuffer(DefaultDeviceIndex)->CmdClearBoundDepthStencilTargets( + VkToPalClearDepth(clearInfo.clearValue.depthStencil.depth), + clearInfo.clearValue.depthStencil.stencil, + 
pRenderPass->GetDepthStencilAttachmentSamples(subpass), + pRenderPass->GetDepthStencilAttachmentSamples(subpass), + selectFlags, + clearRegions.NumElements(), + clearRegions.Data()); + } DbgBarrierPostCmd(DbgBarrierClearDepth); } @@ -2392,12 +2509,22 @@ void CmdBuffer::ClearBoundAttachments( { DbgBarrierPreCmd(DbgBarrierClearColor); - // Clear the bound color targets - PalCmdBuffer(DefaultDeviceIndex)->CmdClearBoundColorTargets( - colorTargets.NumElements(), - colorTargets.Data(), - clearRegions.NumElements(), - clearRegions.Data()); + for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch) + { + rectBatch = Util::Min(rectCount - rectIdx, maxRects); + + CreateClearRegions( + rectBatch, pRects + rectIdx, + *pRenderPass, subpass, + &clearRegions); + + // Clear the bound color targets + PalCmdBuffer(DefaultDeviceIndex)->CmdClearBoundColorTargets( + colorTargets.NumElements(), + colorTargets.Data(), + clearRegions.NumElements(), + clearRegions.Data()); + } DbgBarrierPostCmd(DbgBarrierClearColor); } @@ -2578,7 +2705,8 @@ void CmdBuffer::ClearImageAttachments( // Get the current renderpass and subpass const RenderPass* pRenderPass = m_state.allGpuState.pRenderPass; - const uint32_t subpass = m_renderPassInstance.subpass; + const uint32_t subpass = m_renderPassInstance.subpass; + const auto maxRects = EstimateMaxObjectsOnVirtualStack(sizeof(*pRects)); // Go through each of the clear attachment infos for (uint32_t idx = 0; idx < attachmentCount; ++idx) @@ -2609,29 +2737,38 @@ void CmdBuffer::ClearImageAttachments( Util::Vector clearBoxes { &virtStackFrame }; Util::Vector clearSubresRanges { &virtStackFrame }; - const auto palResult1 = CreateClearRegions( - rectCount, pRects, - *pRenderPass, subpass, - &clearBoxes); - - const auto palResult2 = CreateClearSubresRanges( - attachment, clearInfo, - rectCount, pRects, - *pRenderPass, subpass, - &clearSubresRanges); + auto rectBatch = Util::Min(rectCount, maxRects); + const auto palResult1 = clearBoxes.Reserve(rectBatch); + const auto palResult2 = clearSubresRanges.Reserve(rectBatch); if ((palResult1 == Pal::Result::Success) && (palResult2 == Pal::Result::Success)) { - PalCmdClearColorImage( - *attachment.pImage, - targetLayout, - VkToPalClearColor(&clearInfo.clearValue.color, attachment.viewFormat.format), - clearSubresRanges.NumElements(), - clearSubresRanges.Data(), - clearBoxes.NumElements(), - clearBoxes.Data(), - Pal::ClearColorImageFlags::ColorClearAutoSync); + for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch) + { + rectBatch = Util::Min(rectCount - rectIdx, maxRects); + + CreateClearRegions( + rectCount, pRects + rectIdx, + *pRenderPass, subpass, + &clearBoxes); + + CreateClearSubresRanges( + attachment, clearInfo, + rectCount, pRects + rectIdx, + *pRenderPass, subpass, + &clearSubresRanges); + + PalCmdClearColorImage( + *attachment.pImage, + targetLayout, + VkToPalClearColor(&clearInfo.clearValue.color, attachment.viewFormat.format), + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), + clearBoxes.NumElements(), + clearBoxes.Data(), + Pal::ClearColorImageFlags::ColorClearAutoSync); + } } else { @@ -2660,30 +2797,39 @@ void CmdBuffer::ClearImageAttachments( Util::Vector clearRects { &virtStackFrame }; Util::Vector clearSubresRanges { &virtStackFrame }; - const auto palResult1 = CreateClearRects( - rectCount, pRects, - &clearRects); - - const auto palResult2 = CreateClearSubresRanges( - attachment, clearInfo, - rectCount, pRects, - *pRenderPass, subpass, - &clearSubresRanges); + auto rectBatch = 
Util::Min(rectCount, maxRects); + const auto palResult1 = clearRects.Reserve(rectBatch); + const auto palResult2 = clearSubresRanges.Reserve(rectBatch); if ((palResult1 == Pal::Result::Success) && (palResult2 == Pal::Result::Success)) { - PalCmdClearDepthStencil( - *attachment.pImage, - depthLayout, - stencilLayout, - VkToPalClearDepth(clearInfo.clearValue.depthStencil.depth), - clearInfo.clearValue.depthStencil.stencil, - clearSubresRanges.NumElements(), - clearSubresRanges.Data(), - clearRects.NumElements(), - clearRects.Data(), - Pal::ClearDepthStencilFlags::DsClearAutoSync); + for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch) + { + rectBatch = Util::Min(rectCount - rectIdx, maxRects); + + CreateClearRects( + rectCount, pRects + rectIdx, + &clearRects); + + CreateClearSubresRanges( + attachment, clearInfo, + rectCount, pRects + rectIdx, + *pRenderPass, subpass, + &clearSubresRanges); + + PalCmdClearDepthStencil( + *attachment.pImage, + depthLayout, + stencilLayout, + VkToPalClearDepth(clearInfo.clearValue.depthStencil.depth), + clearInfo.clearValue.depthStencil.stencil, + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), + clearRects.NumElements(), + clearRects.Data(), + Pal::ClearDepthStencilFlags::DsClearAutoSync); + } } else { @@ -2705,9 +2851,12 @@ void CmdBuffer::ResolveImage( { VirtualStackFrame virtStackFrame(m_pStackAllocator); + const auto maxRects = Util::Max(EstimateMaxObjectsOnVirtualStack(sizeof(*pRects)), MaxPalAspectsPerMask); + auto rectBatch = Util::Min(rectCount * MaxPalAspectsPerMask, maxRects); + // Allocate space to store image resolve regions (we need a separate region per PAL aspect) Pal::ImageResolveRegion* pPalRegions = - virtStackFrame.AllocArray(rectCount * MaxPalAspectsPerMask); + virtStackFrame.AllocArray(rectBatch); if (pPalRegions != nullptr) { @@ -2718,23 +2867,29 @@ void CmdBuffer::ResolveImage( const Pal::SwizzledFormat srcFormat = VkToPalFormat(pSrcImage->GetFormat()); const Pal::SwizzledFormat dstFormat = VkToPalFormat(pDstImage->GetFormat()); - uint32_t palRegionCount = 0; - - for (uint32_t i = 0; i < rectCount; ++i) + for (uint32_t rectIdx = 0; rectIdx < rectCount;) { - // We expect MSAA images to never have mipmaps - VK_ASSERT(pRects[i].srcSubresource.mipLevel == 0); + uint32_t palRegionCount = 0; - VkToPalImageResolveRegion(pRects[i], srcFormat.format, dstFormat.format, pPalRegions, palRegionCount); - } + while ((rectIdx < rectCount) && + (palRegionCount <= (rectBatch - MaxPalAspectsPerMask))) + { + // We expect MSAA images to never have mipmaps + VK_ASSERT(pRects[rectIdx].srcSubresource.mipLevel == 0); + + VkToPalImageResolveRegion(pRects[rectIdx], srcFormat.format, dstFormat.format, pPalRegions, palRegionCount); - PalCmdResolveImage( - *pSrcImage, - palSrcImageLayout, - *pDstImage, - palDestImageLayout, - palRegionCount, - pPalRegions); + ++rectIdx; + } + + PalCmdResolveImage( + *pSrcImage, + palSrcImageLayout, + *pDstImage, + palDestImageLayout, + palRegionCount, + pPalRegions); + } virtStackFrame.FreeArray(pPalRegions); } @@ -4622,6 +4777,10 @@ void CmdBuffer::RPLoadOpClearColor( &m_renderPassInstance.pAttachments[clear.attachment].clearValue.color, attachment.viewFormat.format); + const auto clearSubresRanges = LoadOpClearSubresRanges( + attachment, clear, + *m_state.allGpuState.pRenderPass); + utils::IterateMask deviceGroup(GetDeviceMask()); while (deviceGroup.Iterate()) @@ -4634,8 +4793,8 @@ void CmdBuffer::RPLoadOpClearColor( *attachment.pImage->PalImage(deviceIdx), clearLayout, clearColor, - 
attachment.subresRangeCount, - attachment.subresRange, + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), 1, &clearBox, Pal::ColorClearAutoSync); } @@ -4676,24 +4835,9 @@ void CmdBuffer::RPLoadOpClearDepthStencil( float clearDepth = VkToPalClearDepth(clearValue.depthStencil.depth); Pal::uint8 clearStencil = clearValue.depthStencil.stencil; - Pal::SubresRange clearRanges[2]; - uint32_t clearRangeCount = 0; - - for (uint32_t sr = 0; sr < attachment.subresRangeCount; ++sr) - { - VK_ASSERT(clearRangeCount < 2); - - if ((clear.aspect & VK_IMAGE_ASPECT_DEPTH_BIT) && - (attachment.subresRange[sr].startSubres.aspect == Pal::ImageAspect::Depth)) - { - clearRanges[clearRangeCount++] = attachment.subresRange[sr]; - } - else if ((clear.aspect & VK_IMAGE_ASPECT_STENCIL_BIT) && - (attachment.subresRange[sr].startSubres.aspect == Pal::ImageAspect::Stencil)) - { - clearRanges[clearRangeCount++] = attachment.subresRange[sr]; - } - } + const auto clearSubresRanges = LoadOpClearSubresRanges( + attachment, clear, + *m_state.allGpuState.pRenderPass); utils::IterateMask deviceGroup(GetDeviceMask()); @@ -4709,8 +4853,8 @@ void CmdBuffer::RPLoadOpClearDepthStencil( stencilLayout, clearDepth, clearStencil, - clearRangeCount, - clearRanges, + clearSubresRanges.NumElements(), + clearSubresRanges.Data(), 1, &clearRect, Pal::DsClearAutoSync); } @@ -5099,7 +5243,8 @@ void CmdBuffer::SetViewport( DbgBarrierPreCmd(DbgBarrierSetDynamicPipelineState); - const bool khrMaintenance1 = m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1); + const bool khrMaintenance1 = ((m_pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1)); for (uint32_t i = 0; i < viewportCount; ++i) { @@ -5356,6 +5501,14 @@ void CmdBuffer::SetStencilReference( DbgBarrierPostCmd(DbgBarrierSetDynamicPipelineState); } +// ===================================================================================================================== +// Get a safe number of objects that can be allocated by the virtual stack frame allocator without risking OOM error. +uint32_t CmdBuffer::EstimateMaxObjectsOnVirtualStack(size_t objectSize) const +{ + // Return at least 1 and use only 50% of the remaining space. 
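The batching introduced throughout the clear and resolve paths all hangs off this helper, so a rough, self-contained sketch of the pattern may help: estimate how many objects safely fit on the virtual stack, reserve one batch up front, then walk the rect list in chunks of that size. This is illustrative only; remainingBytes and the batch body are stand-ins, not the driver's API.

#include <algorithm>
#include <cstdint>
#include <vector>

// Half of the remaining space, but never less than one object.
static uint32_t EstimateMaxObjects(size_t remainingBytes, size_t objectSize)
{
    return 1 + static_cast<uint32_t>((remainingBytes / objectSize) >> 1);
}

static void ClearInBatches(const std::vector<uint32_t>& rects, size_t remainingBytes)
{
    const uint32_t rectCount = static_cast<uint32_t>(rects.size());
    const uint32_t maxRects  = EstimateMaxObjects(remainingBytes, sizeof(rects[0]));
    uint32_t       rectBatch = std::min(rectCount, maxRects);

    for (uint32_t rectIdx = 0; rectIdx < rectCount; rectIdx += rectBatch)
    {
        rectBatch = std::min(rectCount - rectIdx, maxRects);
        // Build the per-batch clear regions from rects[rectIdx .. rectIdx + rectBatch)
        // and issue a single PAL clear for them, exactly as the loops above do.
    }
}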
+ return 1 + static_cast((m_pStackAllocator->Remaining() / objectSize) >> 1); +} + #if VK_ENABLE_DEBUG_BARRIERS // ===================================================================================================================== // This function inserts a command before or after a particular Vulkan command if the given runtime settings are asking diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index dce956ab..6e298627 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -55,137 +55,6 @@ void ComputePipeline::ConvertComputePipelineInfo( pOutInfo->pLayout = PipelineLayout::ObjectFromHandle(pIn->layout); } - pOutInfo->flags = pIn->flags; - pOutInfo->pStage = &pIn->stage; - -} - -// ===================================================================================================================== -// Creates a compute pipeline binary for each PAL device -VkResult ComputePipeline::CreateComputePipelineBinaries( - Device* pDevice, - PipelineCache* pPipelineCache, - CreateInfo* pCreateInfo, - size_t pipelineBinarySizes[MaxPalDevices], - void* pPipelineBinaries[MaxPalDevices]) -{ - VkResult result = VK_SUCCESS; - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - const ShaderModule* pShader = ShaderModule::ObjectFromHandle(pCreateInfo->pStage->module); - - // Allocate space to create the LLPC/SCPC pipeline resource mappings - void* pMappingBuffer = nullptr; - - if (pCreateInfo->pLayout != nullptr) - { - size_t tempBufferSize = pCreateInfo->pLayout->GetPipelineInfo()->tempBufferSize; - - // Allocate the temp buffer - if (tempBufferSize > 0) - { - pMappingBuffer = pDevice->VkInstance()->AllocMem( - tempBufferSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - - if (pMappingBuffer == nullptr) - { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - } - } - } - - // Build the LLPC pipeline - Llpc::ComputePipelineBuildInfo pipelineBuildInfo = {}; - Llpc::ComputePipelineBuildOut pipelineOut = {}; - void* pLlpcPipelineBuffer = nullptr; - - if ((result == VK_SUCCESS) - ) - { - // Fill pipeline create info for LLPC - pipelineBuildInfo.pInstance = pDevice->VkPhysicalDevice()->VkInstance(); - pipelineBuildInfo.pfnOutputAlloc = AllocateShaderOutput; - pipelineBuildInfo.pUserData = &pLlpcPipelineBuffer; - auto pShaderInfo = &pipelineBuildInfo.cs; - - pShaderInfo->pModuleData = pShader->GetShaderData(true); - pShaderInfo->pSpecializatonInfo = pCreateInfo->pStage->pSpecializationInfo; - pShaderInfo->pEntryTarget = pCreateInfo->pStage->pName; - - // Build the resource mapping description for LLPC. This data contains things about how shader - // inputs like descriptor set bindings interact with this pipeline in a form that LLPC can - // understand. 
- if (pCreateInfo->pLayout != nullptr) - { - result = pCreateInfo->pLayout->BuildLlpcPipelineMapping( - ShaderStageCompute, - pMappingBuffer, - nullptr, - pShaderInfo, - nullptr); - } - } - - uint64_t pipeHash = 0; - - bool enableLlpc = false; - enableLlpc = true; - - if (result == VK_SUCCESS) - { - if (enableLlpc) - { - if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCacheType() == PipelineCacheTypeLlpc)) - { - pipelineBuildInfo.pShaderCache = pPipelineCache->GetShaderCache(DefaultDeviceIndex).pLlpcShaderCache; - } - - auto llpcResult = pDevice->GetLlpcCompiler()->BuildComputePipeline(&pipelineBuildInfo, &pipelineOut); - if (llpcResult != Llpc::Result::Success) - { - // There shouldn't be anything to free for the failure case - VK_ASSERT(pLlpcPipelineBuffer == nullptr); - - { - result = VK_ERROR_INITIALIZATION_FAILED; - } - } - } - else - if (settings.enablePipelineDump) - { - // LLPC isn't enabled but pipeline dump is required, call LLPC dump interface explicitly - void* pHandle = Llpc::IPipelineDumper::BeginPipelineDump(settings.pipelineDumpDir, &pipelineBuildInfo, nullptr); - Llpc::IPipelineDumper::EndPipelineDump(pHandle); - } - } - - // Update PAL pipeline create info with LLPC output - if (enableLlpc) - { - if (result == VK_SUCCESS) - { - - // Make sure that this is the same pointer we will free once the PAL pipeline is created - VK_ASSERT(pLlpcPipelineBuffer == pipelineOut.pipelineBin.pCode); - - pPipelineBinaries[DefaultDeviceIndex] = pLlpcPipelineBuffer; - pipelineBinarySizes[DefaultDeviceIndex] = pipelineOut.pipelineBin.codeSize; - } - } - else - { - result = VK_SUCCESS; - } - - // Free the memory for the LLPC/SCPC pipeline resource mappings - if (pMappingBuffer != nullptr) - { - pDevice->VkInstance()->FreeMem(pMappingBuffer); - } - - return result; } // ===================================================================================================================== @@ -242,22 +111,33 @@ VkResult ComputePipeline::Create( // Setup PAL create info from Vulkan inputs CreateInfo createInfo = {}; size_t pipelineBinarySizes[MaxPalDevices] = {}; - void* pPipelineBinaries[MaxPalDevices] = {}; - - ConvertComputePipelineInfo(pDevice, pCreateInfo, &createInfo); + const void* pPipelineBinaries[MaxPalDevices] = {}; + PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(); + PipelineCompiler::ComputePipelineCreateInfo binaryCreateInfo = {}; + VkResult result = pDefaultCompiler->ConvertComputePipelineInfo(pCreateInfo, &binaryCreateInfo); - VkResult result = CreateComputePipelineBinaries( - pDevice, - pPipelineCache, - &createInfo, - pipelineBinarySizes, - pPipelineBinaries); + for (uint32_t deviceIdx = 0; (result == VK_SUCCESS) && (deviceIdx < pDevice->NumPalDevices()); deviceIdx++) + { + result = pDevice->GetCompiler(deviceIdx)->CreateComputePipelineBinary( + pDevice, + deviceIdx, + pPipelineCache, + &binaryCreateInfo, + &pipelineBinarySizes[deviceIdx], + &pPipelineBinaries[deviceIdx]); + } if (result != VK_SUCCESS) { return result; } + if (result == VK_SUCCESS) + { + ConvertComputePipelineInfo(pDevice, pCreateInfo, &createInfo); + + } + size_t pipelineSize = 0; void* pSystemMem = nullptr; @@ -347,11 +227,11 @@ VkResult ComputePipeline::Create( { if (pPipelineBinaries[deviceIdx] != nullptr) { - { - pDevice->VkInstance()->FreeMem(pPipelineBinaries[deviceIdx]); - } + pDevice->GetCompiler(deviceIdx)->FreeComputePipelineBinary( + &binaryCreateInfo, pPipelineBinaries[deviceIdx], pipelineBinarySizes[deviceIdx]); } } + 
pDefaultCompiler->FreeComputePipelineCreateInfo(&binaryCreateInfo); // Something went wrong with creating the PAL object. Free memory and return error. if (result != VK_SUCCESS) diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index 78e6259d..93da913b 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp @@ -228,7 +228,7 @@ VkResult DescriptorPool::AllocDescriptorSets( DescriptorSet* pSet = DescriptorSet::StateFromHandle(pDescriptorSets[allocCount]); pSet->Reassign(pLayout, setGpuMemOffset, m_gpuAddressCached, m_pCpuAddressCached, m_pDevice->NumPalDevices(), - &m_internalMem, pSetAllocHandle, &pDescriptorSets[allocCount]); + pSetAllocHandle); } else { @@ -838,7 +838,6 @@ void* DescriptorGpuMemHeap::GetDescriptorSetMappedAddress( DescriptorSetHeap::DescriptorSetHeap() : m_nextFreeHandle(0), m_maxSets(0), -m_pHandles(nullptr), m_pFreeIndexStack(nullptr), m_freeIndexStackCount(0), m_pSetMemory(nullptr) @@ -859,7 +858,7 @@ VkResult DescriptorSetHeap::Init( m_maxSets = maxSets; // Allocate memory for all sets - size_t setSize = Util::Pow2Align(sizeof(DescriptorSet), VK_DEFAULT_MEM_ALIGN); + size_t setSize = SetSize(); bool oneShot = (poolUsage & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT) == 0; @@ -873,17 +872,6 @@ VkResult DescriptorSetHeap::Init( return VK_ERROR_OUT_OF_HOST_MEMORY; } - // Allocate memory for all handles - m_pHandles = reinterpret_cast(pDevice->VkInstance()->AllocMem( - sizeof(VkDescriptorSet) * maxSets, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)); - - if (m_pHandles == nullptr) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - // Allocate memory for the free index stack if (oneShot == false) //dynamic usage { @@ -908,8 +896,6 @@ VkResult DescriptorSetHeap::Init( flags.robustBufferAccess = pDevice->GetEnabledFeatures().robustBufferAccess ? 1 : 0; VK_PLACEMENT_NEW (pSetMem) DescriptorSet(pPool, index, flags); - - m_pHandles[index] = DescriptorSet::HandleFromVoidPointer(pSetMem); } return VK_SUCCESS; @@ -921,10 +907,19 @@ void DescriptorSetHeap::Destroy( Device* pDevice) { pDevice->VkInstance()->FreeMem(m_pSetMemory); - pDevice->VkInstance()->FreeMem(m_pHandles); pDevice->VkInstance()->FreeMem(m_pFreeIndexStack); } +// ===================================================================================================================== +// Compute a descriptor set handle from an index in the heap +VkDescriptorSet DescriptorSetHeap::DescriptorSetHandleFromIndex( + uint32_t idx) const +{ + void* pMem = Util::VoidPtrInc(m_pSetMemory, (SetSize() * idx)); + + return DescriptorSet::HandleFromVoidPointer(pMem); +} + // ===================================================================================================================== // Allocates a new VkDescriptorSet instance and returns a handle to it. 
bool DescriptorSetHeap::AllocSetState( @@ -933,7 +928,7 @@ bool DescriptorSetHeap::AllocSetState( // First try to allocate through free range start index since it is by far fastest if (m_nextFreeHandle < m_maxSets) { - *pSet = m_pHandles[m_nextFreeHandle++]; + *pSet = DescriptorSetHandleFromIndex(m_nextFreeHandle++); return true; } @@ -943,7 +938,7 @@ bool DescriptorSetHeap::AllocSetState( { --m_freeIndexStackCount; - *pSet = m_pHandles[m_pFreeIndexStack[m_freeIndexStackCount]]; + *pSet = DescriptorSetHandleFromIndex(m_pFreeIndexStack[m_freeIndexStackCount]); return true; } @@ -962,13 +957,14 @@ void DescriptorSetHeap::FreeSetState( { DescriptorSet* pSet = DescriptorSet::StateFromHandle(set); + // We can compute this, but a divide might be a bad idea. uint32_t heapIndex = pSet->HeapIndex(); - VK_ASSERT((heapIndex < m_maxSets) && DescriptorSet::StateFromHandle(m_pHandles[heapIndex]) == pSet); + VK_ASSERT(heapIndex < m_maxSets); #if DEBUG // Clear the descriptor set state for debugging purposes - pSet->Reassign(nullptr, 0, 0, 0, MaxPalDevices, nullptr, nullptr, nullptr); + pSet->Reset(); #endif m_pFreeIndexStack[m_freeIndexStackCount++] = heapIndex; @@ -987,14 +983,14 @@ void DescriptorSetHeap::Reset() #if DEBUG // Clear the descriptor set states for debugging purposes - size_t setSize = Util::Pow2Align(sizeof(DescriptorSet), VK_DEFAULT_MEM_ALIGN); + size_t setSize = SetSize(); for (uint32_t index = 0; index < m_maxSets; ++index) { VkDescriptorSet setHandle = DescriptorSet::HandleFromVoidPointer(Util::VoidPtrInc(m_pSetMemory, index * setSize)); - DescriptorSet::ObjectFromHandle(setHandle)->Reassign(nullptr, 0, 0, 0, MaxPalDevices, nullptr, nullptr, nullptr); + DescriptorSet::ObjectFromHandle(setHandle)->Reset(); } #endif } diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index d00dc160..5be1f8c0 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp @@ -77,51 +77,36 @@ void DescriptorSet::Reassign( Pal::gpusize* gpuBaseAddress, uint32_t** cpuBaseAddress, uint32_t numPalDevices, - const InternalMemory* const pInternalMem, - void* pAllocHandle, - VkDescriptorSet* pHandle) + void* pAllocHandle) { m_pLayout = pLayout; m_pAllocHandle = pAllocHandle; - if (pInternalMem != nullptr) + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) { - for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) - { - m_gpuAddress[deviceIdx] = gpuBaseAddress[deviceIdx] + gpuMemOffset; - } + m_gpuAddress[deviceIdx] = gpuBaseAddress[deviceIdx] + gpuMemOffset; + + // When memory is assigned to this descriptor set let's cache its mapped CPU address as we anyways use + // persistent mapped memory for descriptor pools. + m_pCpuAddress[deviceIdx] = + static_cast(Util::VoidPtrInc(cpuBaseAddress[deviceIdx], static_cast(gpuMemOffset))); + VK_ASSERT(Util::IsPow2Aligned(reinterpret_cast(m_pCpuAddress[deviceIdx]), sizeof(uint32_t))); } - if (pHandle != nullptr) - { - if (pInternalMem != nullptr) - { - // When memory is assigned to this descriptor set let's cache its mapped CPU address as we anyways use - // persistent mapped memory for descriptor pools. 
- for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) - { - m_pCpuAddress[deviceIdx] = static_cast(Util::VoidPtrInc(cpuBaseAddress[deviceIdx], static_cast(gpuMemOffset))); - VK_ASSERT(Util::IsPow2Aligned(reinterpret_cast(m_pCpuAddress[deviceIdx]), sizeof(uint32_t))); - } + // In this case we also have to copy the immutable sampler data from the descriptor set layout to the + // descriptor set's appropriate memory locations. + InitImmutableDescriptors(pLayout, numPalDevices); - // In this case we also have to copy the immutable sampler data from the descriptor set layout to the - // descriptor set's appropriate memory locations. - InitImmutableDescriptors(pLayout, numPalDevices); - } - else - { - // This path can only be hit if the set doesn't need GPU memory - // i.e. it doesn't have static section and fmask section data - VK_ASSERT((pLayout->Info().sta.dwSize + pLayout->Info().fmask.dwSize) == 0); +} - memset(m_pCpuAddress, 0, sizeof(m_pCpuAddress[0]) * numPalDevices); - } - } - else - { - memset(m_pCpuAddress, 0, sizeof(m_pCpuAddress[0]) * numPalDevices); - } +// ===================================================================================================================== +// Resets a DescriptorSet to an intial state +void DescriptorSet::Reset() +{ + m_pLayout = nullptr; + m_pAllocHandle = nullptr; + memset(m_pCpuAddress, 0, sizeof(m_pCpuAddress)); } // ===================================================================================================================== @@ -132,57 +117,62 @@ void DescriptorSet::InitImmutableDescriptors( { VK_ASSERT(m_pLayout == pLayout); - const size_t imageDescDwSize = pLayout->VkDevice()->GetProperties().descriptorSizes.imageView / sizeof(uint32_t); - const size_t samplerDescSize = pLayout->VkDevice()->GetProperties().descriptorSizes.sampler; - uint32_t immutableSamplersLeft = pLayout->Info().imm.numImmutableSamplers; - uint32_t binding = 0; - - uint32_t* pSrcData = pLayout->Info().imm.pImmutableSamplerData; - while (immutableSamplersLeft > 0) + if (immutableSamplersLeft > 0) { - const DescriptorSetLayout::BindingInfo& bindingInfo = pLayout->Binding(binding); - uint32_t desCount = bindingInfo.info.descriptorCount; + const size_t imageDescDwSize = pLayout->VkDevice()->GetProperties().descriptorSizes.imageView / sizeof(uint32_t); + const size_t samplerDescSize = pLayout->VkDevice()->GetProperties().descriptorSizes.sampler; + + uint32_t binding = 0; + + uint32_t* pSrcData = pLayout->Info().imm.pImmutableSamplerData; - if (bindingInfo.imm.dwSize > 0) + do { - for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) + const DescriptorSetLayout::BindingInfo& bindingInfo = pLayout->Binding(binding); + uint32_t desCount = bindingInfo.info.descriptorCount; + + if (bindingInfo.imm.dwSize > 0) { - if (bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) - { - // If it's a pure immutable sampler descriptor binding then we can copy all descriptors in one shot. - memcpy(m_pCpuAddress[deviceIdx] + bindingInfo.sta.dwOffset, - pSrcData + bindingInfo.imm.dwOffset, - bindingInfo.imm.dwSize * sizeof(uint32_t)); - } - else + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) { - // Otherwise, if it's a combined image sampler descriptor with immutable sampler then we have to - // copy each element individually because the source and destination strides don't match. 
- VK_ASSERT(bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); - - for (uint32_t i = 0; i < desCount; ++i) + if (bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) + { + // If it's a pure immutable sampler descriptor binding then we can copy all descriptors in one shot. + memcpy(m_pCpuAddress[deviceIdx] + bindingInfo.sta.dwOffset, + pSrcData + bindingInfo.imm.dwOffset, + bindingInfo.imm.dwSize * sizeof(uint32_t)); + } + else { - memcpy(m_pCpuAddress[deviceIdx] + bindingInfo.sta.dwOffset + - (i * bindingInfo.sta.dwArrayStride) + imageDescDwSize, - pSrcData + bindingInfo.imm.dwOffset + (i * bindingInfo.imm.dwArrayStride), - samplerDescSize); + // Otherwise, if it's a combined image sampler descriptor with immutable sampler then we have to + // copy each element individually because the source and destination strides don't match. + VK_ASSERT(bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); + + for (uint32_t i = 0; i < desCount; ++i) + { + memcpy(m_pCpuAddress[deviceIdx] + bindingInfo.sta.dwOffset + + (i * bindingInfo.sta.dwArrayStride) + imageDescDwSize, + pSrcData + bindingInfo.imm.dwOffset + (i * bindingInfo.imm.dwArrayStride), + samplerDescSize); + } } } + // Update the remaining number of immutable samplers to copy. + immutableSamplersLeft -= desCount; } - // Update the remaining number of immutable samplers to copy. - immutableSamplersLeft -= desCount; - } - binding++; + binding++; + } + while (immutableSamplersLeft > 0); } } // ===================================================================================================================== // Write sampler descriptors -VK_INLINE void DescriptorSet::WriteSamplerDescriptors( - const Device::Properties& deviceProperties, +template +void DescriptorSet::WriteSamplerDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t* pDestAddr, uint32_t count, @@ -192,7 +182,6 @@ VK_INLINE void DescriptorSet::WriteSamplerDescriptors( const VkDescriptorImageInfo* pImageInfo = pDescriptors; const size_t imageInfoStride = (descriptorStrideInBytes != 0) ? descriptorStrideInBytes : sizeof(VkDescriptorImageInfo); - const size_t samplerDescSize = deviceProperties.descriptorSizes.sampler; for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { @@ -206,8 +195,8 @@ VK_INLINE void DescriptorSet::WriteSamplerDescriptors( // ===================================================================================================================== // Write combined image-sampler descriptors -VK_INLINE void DescriptorSet::WriteImageSamplerDescriptors( - const Device::Properties& deviceProperties, +template +void DescriptorSet::WriteImageSamplerDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -218,8 +207,6 @@ VK_INLINE void DescriptorSet::WriteImageSamplerDescriptors( const VkDescriptorImageInfo* pImageInfo = pDescriptors; const size_t imageInfoStride = (descriptorStrideInBytes != 0) ? 
descriptorStrideInBytes : sizeof(VkDescriptorImageInfo); - const size_t imageDescSize = deviceProperties.descriptorSizes.imageView; - const size_t samplerDescSize = deviceProperties.descriptorSizes.sampler; for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { @@ -236,9 +223,8 @@ VK_INLINE void DescriptorSet::WriteImageSamplerDescriptors( // ===================================================================================================================== // Write image view descriptors (including input attachments) -VK_INLINE void DescriptorSet::WriteImageDescriptors( - VkDescriptorType descType, - const Device::Properties& deviceProperties, +template +void DescriptorSet::WriteImageDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -249,7 +235,6 @@ VK_INLINE void DescriptorSet::WriteImageDescriptors( const VkDescriptorImageInfo* pImageInfo = pDescriptors; const size_t imageInfoStride = (descriptorStrideInBytes != 0) ? descriptorStrideInBytes : sizeof(VkDescriptorImageInfo); - const size_t imageDescSize = deviceProperties.descriptorSizes.imageView; for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { @@ -264,8 +249,8 @@ VK_INLINE void DescriptorSet::WriteImageDescriptors( // ===================================================================================================================== // Write fmask descriptors -VK_INLINE void DescriptorSet::WriteFmaskDescriptors( - const Device* pDevice, +template +void DescriptorSet::WriteFmaskDescriptors( const VkDescriptorImageInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -276,16 +261,12 @@ VK_INLINE void DescriptorSet::WriteFmaskDescriptors( const VkDescriptorImageInfo* pImageInfo = pDescriptors; const size_t imageInfoStride = (descriptorStrideInBytes != 0) ? descriptorStrideInBytes : sizeof(VkDescriptorImageInfo); - const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; - VK_ASSERT((pDevice->GetProperties().descriptorSizes.fmaskView / sizeof(uint32_t)) == dwStride); for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { const ImageView* const pImageView = ImageView::ObjectFromHandle(pImageInfo->imageView); const void* pImageDesc = pImageView->Descriptor(pImageInfo->imageLayout, deviceIdx, 0); - VK_ASSERT(FmaskBasedMsaaReadEnabled() == true); - if (pImageView->NeedsFmaskViewSrds()) { // Copy over FMASK descriptor @@ -306,9 +287,8 @@ VK_INLINE void DescriptorSet::WriteFmaskDescriptors( // ===================================================================================================================== // Write buffer descriptors -VK_INLINE void DescriptorSet::WriteBufferDescriptors( - const Device::Properties& deviceProperties, - VkDescriptorType type, +template +void DescriptorSet::WriteBufferDescriptors( const VkBufferView* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -319,7 +299,6 @@ VK_INLINE void DescriptorSet::WriteBufferDescriptors( const VkBufferView* pBufferView = pDescriptors; const size_t bufferViewStride = (descriptorStrideInBytes != 0) ? 
descriptorStrideInBytes : sizeof(VkBufferView); - const size_t bufferDescSize = deviceProperties.descriptorSizes.bufferView; for (uint32_t arrayElem = 0; arrayElem < count; ++arrayElem, pDestAddr += dwStride) { @@ -333,9 +312,9 @@ VK_INLINE void DescriptorSet::WriteBufferDescriptors( // ===================================================================================================================== // Write buffer descriptors using bufferInfo field used with uniform and storage buffers -VK_INLINE void DescriptorSet::WriteBufferInfoDescriptors( +template +void DescriptorSet::WriteBufferInfoDescriptors( const Device* pDevice, - VkDescriptorType type, const VkDescriptorBufferInfo* pDescriptors, uint32_t deviceIdx, uint32_t* pDestAddr, @@ -349,20 +328,14 @@ VK_INLINE void DescriptorSet::WriteBufferInfoDescriptors( Pal::BufferViewInfo info = {}; - switch (type) - { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - // Setup and create SRD for storage buffer case - info.swizzledFormat = Pal::UndefinedSwizzledFormat; - info.stride = 0; // Raw buffers have a zero byte stride - break; - default: - VK_NEVER_CALLED(); - break; - } + VK_ASSERT((type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) || + (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) || + (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER) || + (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)); + + // Setup and create SRD for storage buffer case + info.swizzledFormat = Pal::UndefinedSwizzledFormat; + info.stride = 0; // Raw buffers have a zero byte stride Pal::IDevice* pPalDevice = pDevice->PalDevice(deviceIdx); @@ -401,10 +374,10 @@ VK_INLINE void DescriptorSet::WriteBufferInfoDescriptors( // // NOTE: descriptorStrideInBytes is used for VK_KHR_descriptor_update_template's sparsely packed imageInfo, bufferInfo, // or bufferView array elements and defaults to 0, i.e. vkUpdateDescriptorSets behavior +template void DescriptorSet::WriteDescriptorSets( const Device* pDevice, uint32_t deviceIdx, - const Device::Properties& deviceProperties, uint32_t descriptorWriteCount, const VkWriteDescriptorSet* pDescriptorWrites, size_t descriptorStrideInBytes) @@ -418,11 +391,11 @@ void DescriptorSet::WriteDescriptorSets( DescriptorSet* pDestSet = DescriptorSet::ObjectFromHandle(params.dstSet); const DescriptorSetLayout::BindingInfo& destBinding = pDestSet->Layout()->Binding(params.dstBinding); - uint32_t* pDestAddr = pDestSet->CpuAddress(deviceIdx) + destBinding.sta.dwOffset - + (params.dstArrayElement * destBinding.sta.dwArrayStride); + uint32_t* pDestAddr = pDestSet->CpuAddress(deviceIdx) + + pDestSet->Layout()->GetDstStaOffset(destBinding, params.dstArrayElement); - uint32_t* pDestFmaskAddr = pDestSet->CpuAddress(deviceIdx) + pDestSet->Layout()->Info().sta.dwSize - + destBinding.fmask.dwOffset + (params.dstArrayElement * destBinding.fmask.dwArrayStride); + uint32_t* pDestFmaskAddr = pDestSet->CpuAddress(deviceIdx) + + pDestSet->Layout()->GetDstFmaskOffset(destBinding, params.dstArrayElement); // Determine whether the binding has immutable sampler descriptors. 
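The write path now leans on layout helpers (GetDstStaOffset / GetDstFmaskOffset / GetDstDynOffset) instead of repeating the offset arithmetic at every call site. Judging from the inline expressions they replace in this hunk, they presumably reduce to something like the sketch below; the struct and function names here are illustrative, not the driver's.

#include <cstdint>

struct BindingSectionSketch
{
    uint32_t dwOffset;        // binding start within its section, in DWORDs
    uint32_t dwArrayStride;   // per-array-element stride, in DWORDs
};

// Static-section destination: binding offset plus the element offset within the array.
static uint32_t DstStaOffsetSketch(const BindingSectionSketch& sta, uint32_t dstArrayElement)
{
    return sta.dwOffset + (dstArrayElement * sta.dwArrayStride);
}

// Fmask data lives after the whole static section, so its size is added on top.
static uint32_t DstFmaskOffsetSketch(uint32_t staSectionDwSize,
                                     const BindingSectionSketch& fmask,
                                     uint32_t dstArrayElement)
{
    return staSectionDwSize + fmask.dwOffset + (dstArrayElement * fmask.dwArrayStride);
}

Centralizing this arithmetic is also what lets the descriptor update templates later in the patch precompute the same offsets once at template-creation time.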
bool hasImmutableSampler = (destBinding.imm.dwSize != 0); @@ -436,8 +409,7 @@ void DescriptorSet::WriteDescriptorSets( } else { - pDestSet->WriteSamplerDescriptors( - deviceProperties, + WriteSamplerDescriptors( params.pImageInfo, pDestAddr, params.descriptorCount, @@ -451,9 +423,7 @@ void DescriptorSet::WriteDescriptorSets( { // If the sampler part of the combined image sampler is immutable then we should only update the image // descriptors, but have to make sure to still use the appropriate stride. - pDestSet->WriteImageDescriptors( - params.descriptorType, - deviceProperties, + WriteImageDescriptors( params.pImageInfo, deviceIdx, pDestAddr, @@ -463,8 +433,7 @@ void DescriptorSet::WriteDescriptorSets( } else { - pDestSet->WriteImageSamplerDescriptors( - deviceProperties, + WriteImageSamplerDescriptors( params.pImageInfo, deviceIdx, pDestAddr, @@ -475,8 +444,7 @@ void DescriptorSet::WriteDescriptorSets( if (pDestSet->FmaskBasedMsaaReadEnabled() && (destBinding.fmask.dwSize > 0)) { - pDestSet->WriteFmaskDescriptors( - pDevice, + WriteFmaskDescriptors( params.pImageInfo, deviceIdx, pDestFmaskAddr, @@ -490,9 +458,7 @@ void DescriptorSet::WriteDescriptorSets( case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - pDestSet->WriteImageDescriptors( - params.descriptorType, - deviceProperties, + WriteImageDescriptors( params.pImageInfo, deviceIdx, pDestAddr, @@ -502,8 +468,7 @@ void DescriptorSet::WriteDescriptorSets( if (pDestSet->FmaskBasedMsaaReadEnabled() && (destBinding.fmask.dwSize > 0)) { - pDestSet->WriteFmaskDescriptors( - pDevice, + pDestSet->WriteFmaskDescriptors( params.pImageInfo, deviceIdx, pDestFmaskAddr, @@ -514,51 +479,79 @@ void DescriptorSet::WriteDescriptorSets( break; case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - pDestSet->WriteBufferDescriptors( - deviceProperties, - params.descriptorType, + pDestSet->WriteBufferDescriptors( params.pTexelBufferView, deviceIdx, pDestAddr, params.descriptorCount, destBinding.sta.dwArrayStride, descriptorStrideInBytes); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + WriteBufferDescriptors( + params.pTexelBufferView, + deviceIdx, + pDestAddr, + params.descriptorCount, + destBinding.sta.dwArrayStride, + descriptorStrideInBytes); break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - pDestSet->WriteBufferInfoDescriptors( + WriteBufferInfoDescriptors( pDevice, - params.descriptorType, params.pBufferInfo, deviceIdx, pDestAddr, params.descriptorCount, destBinding.sta.dwArrayStride, descriptorStrideInBytes); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + WriteBufferInfoDescriptors( + pDevice, + params.pBufferInfo, + deviceIdx, + pDestAddr, + params.descriptorCount, + destBinding.sta.dwArrayStride, + descriptorStrideInBytes); break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: // We need to treat dynamic buffer descriptors specially as we store the base buffer SRDs in // client memory. // NOTE: Nuke this once we have proper support for dynamic descriptors in SC. 
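A pattern visible throughout this switch: cases that used to share a label (uniform/storage, texel/non-texel, dynamic/non-dynamic) are split so the descriptor type can travel as a compile-time template argument rather than a runtime parameter. A minimal sketch of that pattern, with made-up names:

#include <cstdint>

enum class DescTypeSketch { UniformBuffer, StorageBuffer, UniformBufferDynamic, StorageBufferDynamic };

// With the type known at compile time, the old runtime switch over it collapses
// to an assertion and straight-line SRD setup.
template <DescTypeSketch type>
static void WriteBufferInfoSketch(uint32_t* pDestAddr, uint32_t count, uint32_t dwStride)
{
    static_assert((type == DescTypeSketch::UniformBuffer)        ||
                  (type == DescTypeSketch::StorageBuffer)        ||
                  (type == DescTypeSketch::UniformBufferDynamic) ||
                  (type == DescTypeSketch::StorageBufferDynamic),
                  "unexpected descriptor type");

    for (uint32_t i = 0; i < count; ++i, pDestAddr += dwStride)
    {
        // Build a raw-buffer SRD (undefined format, zero byte stride) into pDestAddr here.
    }
}

// Example instantiation, mirroring the split case labels:
//     WriteBufferInfoSketch<DescTypeSketch::StorageBuffer>(pDestAddr, count, stride);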
- pDestAddr = pDestSet->DynamicDescriptorData() + destBinding.dyn.dwOffset - + params.dstArrayElement * destBinding.dyn.dwArrayStride; + pDestAddr = pDestSet->DynamicDescriptorData() + + pDestSet->Layout()->GetDstDynOffset(destBinding, params.dstArrayElement); - pDestSet->WriteBufferInfoDescriptors( + WriteBufferInfoDescriptors( pDevice, - params.descriptorType, params.pBufferInfo, deviceIdx, pDestAddr, params.descriptorCount, destBinding.dyn.dwArrayStride, descriptorStrideInBytes); + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + // We need to treat dynamic buffer descriptors specially as we store the base buffer SRDs in + // client memory. + // NOTE: Nuke this once we have proper support for dynamic descriptors in SC. + pDestAddr = pDestSet->DynamicDescriptorData() + + pDestSet->Layout()->GetDstDynOffset(destBinding, params.dstArrayElement); + + WriteBufferInfoDescriptors( + pDevice, + params.pBufferInfo, + deviceIdx, + pDestAddr, + params.descriptorCount, + destBinding.dyn.dwArrayStride, + descriptorStrideInBytes); break; default: @@ -570,10 +563,10 @@ void DescriptorSet::WriteDescriptorSets( // ===================================================================================================================== // Copy from one descriptor set to another +template void DescriptorSet::CopyDescriptorSets( const Device* pDevice, uint32_t deviceIdx, - const Device::Properties& deviceProperties, uint32_t descriptorCopyCount, const VkCopyDescriptorSet* pDescriptorCopies) { @@ -636,7 +629,6 @@ void DescriptorSet::CopyDescriptorSets( { // If we have immutable samplers inline with the image data to copy then we have to do a per array // element copy to ensure we don't overwrite the immutable sampler data - const size_t imageDescSize = deviceProperties.descriptorSizes.imageView; for (uint32_t j = 0; j < count; ++j) { @@ -669,6 +661,85 @@ void DescriptorSet::CopyDescriptorSets( } } +// ===================================================================================================================== +template +VKAPI_ATTR void VKAPI_CALL DescriptorSet::UpdateDescriptorSets( + VkDevice device, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites, + uint32_t descriptorCopyCount, + const VkCopyDescriptorSet* pDescriptorCopies) +{ + const Device* pDevice = ApiDevice::ObjectFromHandle(device); + + for (uint32_t deviceIdx = 0; deviceIdx < numPalDevices; deviceIdx++) + { + WriteDescriptorSets( + pDevice, + deviceIdx, + descriptorWriteCount, + pDescriptorWrites); + + CopyDescriptorSets(pDevice, + deviceIdx, + descriptorCopyCount, + pDescriptorCopies); + } +} + +// ===================================================================================================================== +PFN_vkUpdateDescriptorSets DescriptorSet::GetUpdateDescriptorSetsFunc( + const Device* pDevice) +{ + PFN_vkUpdateDescriptorSets pFunc = nullptr; + + switch (pDevice->NumPalDevices()) + { + case 1: + pFunc = GetUpdateDescriptorSetsFunc<1>(pDevice); + break; + case 2: + pFunc = GetUpdateDescriptorSetsFunc<2>(pDevice); + break; + case 3: + pFunc = GetUpdateDescriptorSetsFunc<3>(pDevice); + break; + case 4: + pFunc = GetUpdateDescriptorSetsFunc<4>(pDevice); + break; + default: + break; + } + + return pFunc; +} + +// ===================================================================================================================== +template +PFN_vkUpdateDescriptorSets DescriptorSet::GetUpdateDescriptorSetsFunc( + const Device* pDevice) +{ + const size_t imageDescSize = 
pDevice->GetProperties().descriptorSizes.imageView; + const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; + const size_t bufferDescSize = pDevice->GetProperties().descriptorSizes.bufferView; + + PFN_vkUpdateDescriptorSets pFunc = nullptr; + + if ((imageDescSize == 32) && + (samplerDescSize == 16) && + (bufferDescSize == 16)) + { + pFunc = &UpdateDescriptorSets<32, 16, 16, numPalDevices>; + } + else + { + VK_NEVER_CALLED(); + pFunc = nullptr; + } + + return pFunc; +} + namespace entry { @@ -681,24 +752,108 @@ VKAPI_ATTR void VKAPI_CALL vkUpdateDescriptorSets( const VkCopyDescriptorSet* pDescriptorCopies) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - const Device::Properties& deviceProperties = pDevice->GetProperties(); - for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) - { - DescriptorSet::WriteDescriptorSets(pDevice, - deviceIdx, - deviceProperties, - descriptorWriteCount, - pDescriptorWrites); - - DescriptorSet::CopyDescriptorSets(pDevice, - deviceIdx, - deviceProperties, - descriptorCopyCount, - pDescriptorCopies); - } + PFN_vkUpdateDescriptorSets pFunc = pDevice->GetUpdateDescriptorSetsFunc(); + + (*pFunc)(device, descriptorWriteCount, pDescriptorWrites, descriptorCopyCount, pDescriptorCopies); } } // namespace entry +// ===================================================================================================================== +// Template instantiation needed for references in other files. Linux complains if we don't do this. + +template +void DescriptorSet::WriteFmaskDescriptors<32>( + const VkDescriptorImageInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteSamplerDescriptors<16>( + const VkDescriptorImageInfo* pDescriptors, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteImageSamplerDescriptors<32, 16>( + const VkDescriptorImageInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteImageDescriptors<32>( + const VkDescriptorImageInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferDescriptors<16, VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER>( + const VkBufferView* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferDescriptors<16, VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER>( + const VkBufferView* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferInfoDescriptors( + const Device* pDevice, + const VkDescriptorBufferInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferInfoDescriptors( + const Device* pDevice, + const VkDescriptorBufferInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferInfoDescriptors( + const Device* pDevice, + const 
VkDescriptorBufferInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + +template +void DescriptorSet::WriteBufferInfoDescriptors( + const Device* pDevice, + const VkDescriptorBufferInfo* pDescriptors, + uint32_t deviceIdx, + uint32_t* pDestAddr, + uint32_t count, + uint32_t dwStride, + size_t descriptorStrideInBytes); + } // namespace vk diff --git a/icd/api/vk_descriptor_update_template.cpp b/icd/api/vk_descriptor_update_template.cpp index 3596ec5f..5177813f 100644 --- a/icd/api/vk_descriptor_update_template.cpp +++ b/icd/api/vk_descriptor_update_template.cpp @@ -39,14 +39,17 @@ namespace vk // ===================================================================================================================== VkResult DescriptorUpdateTemplate::Create( + const Device* pDevice, const VkDescriptorUpdateTemplateCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDescriptorUpdateTemplateKHR* pDescriptorUpdateTemplate) { - VkResult result = VK_SUCCESS; - const size_t apiSize = sizeof(DescriptorUpdateTemplate); - const size_t entriesSize = pCreateInfo->descriptorUpdateEntryCount * sizeof(VkDescriptorUpdateTemplateEntryKHR); - const size_t objSize = apiSize + entriesSize; + VkResult result = VK_SUCCESS; + const uint32_t numEntries = pCreateInfo->descriptorUpdateEntryCount; + const DescriptorSetLayout* pLayout = DescriptorSetLayout::ObjectFromHandle(pCreateInfo->descriptorSetLayout); + const size_t apiSize = sizeof(DescriptorUpdateTemplate); + const size_t entriesSize = numEntries * sizeof(TemplateUpdateInfo); + const size_t objSize = apiSize + entriesSize; void* pSysMem = pAllocator->pfnAllocation(pAllocator->pUserData, objSize, @@ -64,12 +67,30 @@ VkResult DescriptorUpdateTemplate::Create( // we don't support VK_KHR_push_descriptors. 
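The rework below front-loads the expensive part of template updates into Create(): each VkDescriptorUpdateTemplateEntryKHR is baked into a small per-entry record holding the resolved destination offsets, strides, and a pointer to a specialized write routine, so Update() becomes a tight loop with no per-call layout lookups. A rough sketch of that shape (field and type names are illustrative, not the driver's):

#include <cstddef>
#include <cstdint>
#include <vector>

struct TemplateEntrySketch
{
    uint32_t descriptorCount;
    size_t   srcOffset;       // where this entry's source data starts inside pData
    size_t   srcStride;       // stride between source array elements
    uint32_t dstStaOffset;    // destination offset resolved at template-creation time
    void   (*pfnUpdate)(const TemplateEntrySketch& entry, const void* pSrcData);
};

static void UpdateWithTemplateSketch(const std::vector<TemplateEntrySketch>& entries, const void* pData)
{
    for (const TemplateEntrySketch& entry : entries)
    {
        const void* pSrcData = static_cast<const uint8_t*>(pData) + entry.srcOffset;
        entry.pfnUpdate(entry, pSrcData);   // specialized per descriptor type and descriptor size
    }
}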
VK_ASSERT(pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR); - VkDescriptorUpdateTemplateEntryKHR* pEntries = static_cast( - Util::VoidPtrInc(pSysMem, apiSize)); + TemplateUpdateInfo* pEntries = static_cast(Util::VoidPtrInc(pSysMem, apiSize)); - memcpy(pEntries, pCreateInfo->pDescriptorUpdateEntries, entriesSize); + for (uint32_t ii = 0; ii < numEntries; ii++) + { + const VkDescriptorUpdateTemplateEntryKHR& srcEntry = pCreateInfo->pDescriptorUpdateEntries[ii]; + const DescriptorSetLayout::BindingInfo& dstBinding = pLayout->Binding(srcEntry.dstBinding); - VK_PLACEMENT_NEW(pSysMem) DescriptorUpdateTemplate(pEntries, pCreateInfo->descriptorUpdateEntryCount); + pEntries[ii].descriptorCount = srcEntry.descriptorCount; + pEntries[ii].srcOffset = srcEntry.offset; + pEntries[ii].srcStride = srcEntry.stride; + pEntries[ii].dstBindStaDwArrayStride = dstBinding.sta.dwArrayStride; + pEntries[ii].dstBindFmaskDwArrayStride = dstBinding.fmask.dwArrayStride; + pEntries[ii].dstBindDynDataDwArrayStride = dstBinding.dyn.dwArrayStride; + pEntries[ii].dstStaOffset = + pLayout->GetDstStaOffset(dstBinding, srcEntry.dstArrayElement); + pEntries[ii].dstFmaskOffset = + pLayout->GetDstFmaskOffset(dstBinding, srcEntry.dstArrayElement); + pEntries[ii].dstDynOffset = + pLayout->GetDstDynOffset(dstBinding, srcEntry.dstArrayElement); + pEntries[ii].pFunc = + GetUpdateEntryFunc(pDevice, srcEntry.descriptorType, dstBinding); + } + + VK_PLACEMENT_NEW(pSysMem) DescriptorUpdateTemplate(pCreateInfo->descriptorUpdateEntryCount); *pDescriptorUpdateTemplate = DescriptorUpdateTemplate::HandleFromVoidPointer(pSysMem); } @@ -77,12 +98,113 @@ VkResult DescriptorUpdateTemplate::Create( return result; } +// ===================================================================================================================== +template +DescriptorUpdateTemplate::PfnUpdateEntry DescriptorUpdateTemplate::GetUpdateEntryFunc( + const Device* pDevice, + VkDescriptorType descriptorType, + const DescriptorSetLayout::BindingInfo& dstBinding) +{ + PfnUpdateEntry pFunc = NULL; + + switch (descriptorType) + { + case VK_DESCRIPTOR_TYPE_SAMPLER: + pFunc = &UpdateEntrySampler; + break; + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + if (pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead && (dstBinding.fmask.dwSize > 0)) + { + if (dstBinding.imm.dwSize != 0) + { + pFunc = &UpdateEntryCombinedImageSampler; + } + else + { + pFunc = &UpdateEntryCombinedImageSampler; + } + } + else + { + if (dstBinding.imm.dwSize != 0) + { + pFunc = &UpdateEntryCombinedImageSampler; + } + else + { + pFunc = &UpdateEntryCombinedImageSampler; + } + } + break; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + if (pDevice->GetRuntimeSettings().enableFmaskBasedMsaaRead && (dstBinding.fmask.dwSize > 0)) + { + pFunc = &UpdateEntrySampledImage; + } + else + { + pFunc = &UpdateEntrySampledImage; + } + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + pFunc = &UpdateEntryTexelBuffer; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + pFunc = &UpdateEntryTexelBuffer; + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + pFunc = &UpdateEntryBuffer; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + pFunc = &UpdateEntryBuffer; + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + pFunc = &UpdateEntryBuffer; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + pFunc = &UpdateEntryBuffer; + break; + default: + VK_ASSERT(!"Unexpected 
descriptor type"); + break; + } + + return pFunc; +} + +// ===================================================================================================================== +DescriptorUpdateTemplate::PfnUpdateEntry DescriptorUpdateTemplate::GetUpdateEntryFunc( + const Device* pDevice, + VkDescriptorType descriptorType, + const DescriptorSetLayout::BindingInfo& dstBinding) +{ + const size_t imageDescSize = pDevice->GetProperties().descriptorSizes.imageView; + const size_t samplerDescSize = pDevice->GetProperties().descriptorSizes.sampler; + const size_t bufferDescSize = pDevice->GetProperties().descriptorSizes.bufferView; + + DescriptorUpdateTemplate::PfnUpdateEntry pFunc = nullptr; + + if ((imageDescSize == 32) && + (samplerDescSize == 16) && + (bufferDescSize == 16)) + { + pFunc = GetUpdateEntryFunc<32, 16, 16>(pDevice, descriptorType, dstBinding); + } + else + { + VK_NEVER_CALLED(); + pFunc = nullptr; + } + + return pFunc; +} + // ===================================================================================================================== DescriptorUpdateTemplate::DescriptorUpdateTemplate( - const VkDescriptorUpdateTemplateEntryKHR* pEntries, - uint32_t numEntries) + uint32_t numEntries) : - m_pEntries(pEntries), m_numEntries(numEntries) { } @@ -106,39 +228,195 @@ VkResult DescriptorUpdateTemplate::Destroy( // ===================================================================================================================== void DescriptorUpdateTemplate::Update( - Device* pDevice, + const Device* pDevice, uint32_t deviceIdx, VkDescriptorSet descriptorSet, const void* pData) { - const Device::Properties& deviceProperties = pDevice->GetProperties(); + auto pEntries = GetEntries(); - // Use descriptor write structure as params to share write code path with vkUpdateDescriptorSets. - VkWriteDescriptorSet descriptorWrite; + for (uint32_t i = 0; i < m_numEntries; ++i) + { + const void* pDescriptorInfo = Util::VoidPtrInc(pData, pEntries[i].srcOffset); - descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - descriptorWrite.pNext = nullptr; - descriptorWrite.dstSet = descriptorSet; + pEntries[i].pFunc(pDevice, descriptorSet, deviceIdx, pDescriptorInfo, pEntries[i]); + } +} - for (uint32_t i = 0; i < m_numEntries; ++i) +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntryCombinedImageSampler( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkDescriptorImageInfo* pImageInfo = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + + if (immutable) { - const void* pDescriptorInfo = Util::VoidPtrInc(pData, m_pEntries[i].offset); - - descriptorWrite.dstBinding = m_pEntries[i].dstBinding; - descriptorWrite.dstArrayElement = m_pEntries[i].dstArrayElement; - descriptorWrite.descriptorCount = m_pEntries[i].descriptorCount; - descriptorWrite.descriptorType = m_pEntries[i].descriptorType; - // Decide which descriptor info is relevant later using descriptorType. 
- descriptorWrite.pImageInfo = static_cast(pDescriptorInfo); - descriptorWrite.pBufferInfo = static_cast(pDescriptorInfo); - descriptorWrite.pTexelBufferView = static_cast(pDescriptorInfo); - - DescriptorSet::WriteDescriptorSets(pDevice, - deviceIdx, - deviceProperties, - 1, - &descriptorWrite, - m_pEntries[i].stride); + // If the sampler part of the combined image sampler is immutable then we should only update the image + // descriptors, but have to make sure to still use the appropriate stride. + DescriptorSet::WriteImageDescriptors( + pImageInfo, + deviceIdx, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); + } + else + { + DescriptorSet::WriteImageSamplerDescriptors( + pImageInfo, + deviceIdx, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); + } + + if (updateFmask) + { + uint32_t* pDestFmaskAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstFmaskOffset; + + DescriptorSet::WriteFmaskDescriptors( + pImageInfo, + deviceIdx, + pDestFmaskAddr, + entry.descriptorCount, + entry.dstBindFmaskDwArrayStride, + entry.srcStride); + } +} + +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntryTexelBuffer( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkBufferView* pTexelBufferView = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + + DescriptorSet::WriteBufferDescriptors( + pTexelBufferView, + deviceIdx, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); +} + +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntryBuffer( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkDescriptorBufferInfo* pBufferInfo = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr; + uint32_t stride; + + if ((descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) || + (descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)) + { + // We need to treat dynamic buffer descriptors specially as we store the base buffer SRDs in + // client memory. + // NOTE: Nuke this once we have proper support for dynamic descriptors in SC. 
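Worth noting: descriptorType is a template parameter in the new UpdateEntryBuffer, so the dynamic-versus-static branch below is decided at compile time and the untaken side is dead code the optimizer drops. On a C++17 compiler the same intent could be spelled out with if constexpr; this is a sketch only, the driver keeps a plain if:

#include <cstdint>
#include <vulkan/vulkan.h>

template <VkDescriptorType descriptorType>
static uint32_t* SelectDestSketch(uint32_t* pDynamicData, uint32_t* pStaticData,
                                  uint32_t dynOffset, uint32_t staOffset)
{
    if constexpr ((descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) ||
                  (descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC))
    {
        return pDynamicData + dynOffset;   // dynamic buffer SRDs live in client memory
    }
    else
    {
        return pStaticData + staOffset;    // everything else targets the set's static section
    }
}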
+ pDestAddr = pDstSet->DynamicDescriptorData() + entry.dstDynOffset; + stride = entry.dstBindDynDataDwArrayStride; + } + else + { + pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + stride = entry.dstBindStaDwArrayStride; + } + + DescriptorSet::WriteBufferInfoDescriptors( + pDevice, + pBufferInfo, + deviceIdx, + pDestAddr, + entry.descriptorCount, + stride, + entry.srcStride); +} + +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntrySampler( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkDescriptorImageInfo* pImageInfo = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + + DescriptorSet::WriteSamplerDescriptors( + pImageInfo, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); +} + +// ===================================================================================================================== +template +void DescriptorUpdateTemplate::UpdateEntrySampledImage( + const Device* pDevice, + VkDescriptorSet descriptorSet, + uint32_t deviceIdx, + const void* pDescriptorInfo, + const TemplateUpdateInfo& entry) +{ + DescriptorSet* pDstSet = DescriptorSet::ObjectFromHandle(descriptorSet); + + const VkDescriptorImageInfo* pImageInfo = static_cast(pDescriptorInfo); + + uint32_t* pDestAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstStaOffset; + + DescriptorSet::WriteImageDescriptors( + pImageInfo, + deviceIdx, + pDestAddr, + entry.descriptorCount, + entry.dstBindStaDwArrayStride, + entry.srcStride); + + if (updateFmask) + { + uint32_t* pDestFmaskAddr = pDstSet->CpuAddress(deviceIdx) + entry.dstFmaskOffset; + + DescriptorSet::WriteFmaskDescriptors( + pImageInfo, + deviceIdx, + pDestFmaskAddr, + entry.descriptorCount, + entry.dstBindFmaskDwArrayStride, + entry.srcStride); } } @@ -170,10 +448,13 @@ VKAPI_ATTR void VKAPI_CALL vkUpdateDescriptorSetWithTemplateKHR( Device* pDevice = ApiDevice::ObjectFromHandle(device); DescriptorUpdateTemplate* pTemplate = DescriptorUpdateTemplate::ObjectFromHandle(descriptorUpdateTemplate); - for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); ++deviceIdx) + uint32_t deviceIdx = 0; + do { pTemplate->Update(pDevice, deviceIdx, descriptorSet, pData); + deviceIdx++; } + while (deviceIdx < pDevice->NumPalDevices()); } } // namespace entry diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 6b03c379..fd487ddb 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -36,6 +36,7 @@ #include "include/vk_buffer.h" #include "include/vk_buffer_view.h" #include "include/vk_descriptor_pool.h" +#include "include/vk_descriptor_set.h" #include "include/vk_descriptor_set_layout.h" #include "include/vk_descriptor_update_template.h" #include "include/vk_device.h" @@ -178,7 +179,8 @@ Device::Device( m_renderStateCache(this), m_enabledExtensions(enabledExtensions), m_pSqttMgr(nullptr), - m_pipelineCacheCount(0) + m_pipelineCacheCount(0), + m_pfnUpdateDescriptorSets(nullptr) { memcpy(m_pPhysicalDevices, pPhysicalDevices, sizeof(pPhysicalDevices[DefaultDeviceIndex]) * palDeviceCount); memcpy(m_pPalDevices, pPalDevices, sizeof(pPalDevices[0]) * palDeviceCount); @@ -203,8 +205,6 @@ Device::Device( m_allocatedCount = 0; 
m_maxAllocations = pPhysicalDevices[DefaultDeviceIndex]->GetLimits().maxMemoryAllocationCount; - memset(m_pLlpcCompiler, 0, sizeof(m_pLlpcCompiler)); - } // ===================================================================================================================== @@ -268,30 +268,6 @@ VkResult Device::Create( DeviceExtensions::Enabled enabledDeviceExtensions; -#ifdef ICD_VULKAN_1_1 - // Implicitly enable device extensions that are core in the API version - if (pPhysicalDevice->VkInstance()->GetAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) - { - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_16BIT_STORAGE); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_BIND_MEMORY2); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_DEDICATED_ALLOCATION); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_DESCRIPTOR_UPDATE_TEMPLATE); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_DEVICE_GROUP); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_EXTERNAL_FENCE); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_EXTERNAL_MEMORY); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_EXTERNAL_SEMAPHORE); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_MAINTENANCE1); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_MAINTENANCE2); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_MAINTENANCE3); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_MULTIVIEW); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_RELAXED_BLOCK_LAYOUT); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_SHADER_DRAW_PARAMETERS); -// enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_SAMPLER_YCBCR_CONVERSION); - enabledDeviceExtensions.EnableExtension(DeviceExtensions::KHR_STORAGE_BUFFER_STORAGE_CLASS); - } -#endif - VK_ASSERT(pCreateInfo != nullptr); // Make sure the caller only requests extensions we actually support. @@ -444,7 +420,7 @@ VkResult Device::Create( vkResult = VK_ERROR_OUT_OF_HOST_MEMORY; - if (pMemory != nullptr) + if ((pCreateInfo != nullptr) && (pMemory != nullptr)) { vkResult = PalToVkResult(palResult); @@ -627,7 +603,7 @@ VkResult Device::Initialize( uint8_t* pPalQueueMemory) { // Initialize the internal memory manager - VkResult result = m_internalMemMgr.Init(); + VkResult result = m_internalMemMgr.Init(); // Initialize the render state cache if (result == VK_SUCCESS) @@ -737,19 +713,6 @@ VkResult Device::Initialize( } } - if (result == VK_SUCCESS) - { - for (uint32_t i = 0; i < m_palDeviceCount; ++i) - { - result = CreateLlpcCompiler(i); - - if (result != VK_SUCCESS) - { - break; - } - } - } - if (result == VK_SUCCESS) { result = CreateLlpcInternalPipelines(); @@ -798,9 +761,21 @@ VkResult Device::Initialize( } #endif + if (result == VK_SUCCESS) + { + InitEntryPointFuncs(); + } + return result; } +// ===================================================================================================================== +// Initialize the entry point functions for paths known at device init time +void Device::InitEntryPointFuncs() +{ + m_pfnUpdateDescriptorSets = DescriptorSet::GetUpdateDescriptorSetsFunc(this); +} + // ===================================================================================================================== // Initialize the specified sample pattern palette with default values. 
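InitEntryPointFuncs, added just above, resolves per-device specialization once at device creation: GetUpdateDescriptorSetsFunc switches on NumPalDevices() and on the hardware descriptor sizes, and the cached pointer is what vkUpdateDescriptorSets ultimately calls. A stripped-down sketch of that selection follows; the names are hypothetical and only the 32/16/16 descriptor-size layout is wired up, as in the patch.

#include <cstddef>
#include <cstdint>

using PfnUpdateSketch = void (*)();

template <size_t imageDescSize, size_t samplerDescSize, size_t bufferDescSize, uint32_t numPalDevices>
static void UpdateDescriptorSetsSketch() { /* specialized write/copy loops */ }

template <uint32_t numPalDevices>
static PfnUpdateSketch SelectBySizesSketch(size_t imageDescSize, size_t samplerDescSize, size_t bufferDescSize)
{
    // Only the descriptor-size combination the patch expects is supported.
    if ((imageDescSize == 32) && (samplerDescSize == 16) && (bufferDescSize == 16))
    {
        return &UpdateDescriptorSetsSketch<32, 16, 16, numPalDevices>;
    }
    return nullptr;
}

static PfnUpdateSketch SelectByDeviceCountSketch(uint32_t numPalDevices,
                                                 size_t imageDescSize,
                                                 size_t samplerDescSize,
                                                 size_t bufferDescSize)
{
    switch (numPalDevices)
    {
    case 1:  return SelectBySizesSketch<1>(imageDescSize, samplerDescSize, bufferDescSize);
    case 2:  return SelectBySizesSketch<2>(imageDescSize, samplerDescSize, bufferDescSize);
    case 3:  return SelectBySizesSketch<3>(imageDescSize, samplerDescSize, bufferDescSize);
    case 4:  return SelectBySizesSketch<4>(imageDescSize, samplerDescSize, bufferDescSize);
    default: return nullptr;
    }
}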
void Device::InitSamplePatternPalette( @@ -1012,15 +987,6 @@ VkResult Device::Destroy(const VkAllocationCallbacks* pAllocator) DestroyInternalPipelines(); - for (uint32_t i = 0; i < MaxPalDevices; ++i) - { - if (m_pLlpcCompiler[i] != nullptr) - { - m_pLlpcCompiler[i]->Destroy(); - m_pLlpcCompiler[i] = nullptr; - } - } - if (m_settings.useSharedCmdAllocator) { for (uint32_t deviceIdx = 0; deviceIdx < NumPalDevices(); deviceIdx++) @@ -1071,7 +1037,7 @@ VkResult Device::CreateLlpcInternalComputePipeline( shaderInfo.shaderBin.pCode = pCode; shaderInfo.shaderBin.codeSize = codeByteSize; - llpcResult = GetLlpcCompiler()->BuildShaderModule(&shaderInfo, &shaderOut); + llpcResult = GetCompiler()->GetLlpcCompiler()->BuildShaderModule(&shaderInfo, &shaderOut); if ((llpcResult != Llpc::Result::Success) && (llpcResult != Llpc::Result::Delayed)) { result = VK_ERROR_INITIALIZATION_FAILED; @@ -1091,7 +1057,7 @@ VkResult Device::CreateLlpcInternalComputePipeline( pShaderInfo->pEntryTarget = "main"; pShaderInfo->pUserDataNodes = pUserDataNodes; pShaderInfo->userDataNodeCount = numUserDataNodes; - llpcResult = GetLlpcCompiler()->BuildComputePipeline(&pipelineBuildInfo, &pipelineOut); + llpcResult = GetCompiler()->GetLlpcCompiler()->BuildComputePipeline(&pipelineBuildInfo, &pipelineOut); if (llpcResult != Llpc::Result::Success) { result = VK_ERROR_INITIALIZATION_FAILED; @@ -1413,7 +1379,7 @@ VkResult Device::CreateDescriptorUpdateTemplate( const VkAllocationCallbacks* pAllocator, VkDescriptorUpdateTemplateKHR* pDescriptorUpdateTemplate) { - return DescriptorUpdateTemplate::Create(pCreateInfo, pAllocator, pDescriptorUpdateTemplate); + return DescriptorUpdateTemplate::Create(this, pCreateInfo, pAllocator, pDescriptorUpdateTemplate); } // ===================================================================================================================== @@ -1830,6 +1796,8 @@ VkResult Device::BindImageMemory( return VK_SUCCESS; } +// ===================================================================================================================== + // ===================================================================================================================== VkResult Device::CreateSampler( const VkSamplerCreateInfo* pCreateInfo, @@ -1951,216 +1919,6 @@ VkDeviceSize Device::GetMemoryBaseAddrAlignment( return minAlignment; } -// ===================================================================================================================== -// Create LLPC compiler handle -VkResult Device::CreateLlpcCompiler( - int32_t deviceIdx) // Device index -{ - const uint32_t OptionBufferSize = 4096; - const uint32_t MaxLlpcOptions = 32; - Llpc::GfxIpVersion gfxIp = {}; - Llpc::ICompiler* pCompiler = nullptr; - - // Initialzie GfxIpVersion according to PAL gfxLevel - Pal::DeviceProperties info; - PalDevice(deviceIdx)->GetProperties(&info); - - switch (info.gfxLevel) - { - case Pal::GfxIpLevel::GfxIp6: - gfxIp.major = 6; - gfxIp.minor = 0; - break; - case Pal::GfxIpLevel::GfxIp7: - gfxIp.major = 7; - gfxIp.minor = 0; - break; - case Pal::GfxIpLevel::GfxIp8: - gfxIp.major = 8; - gfxIp.minor = 0; - break; - case Pal::GfxIpLevel::GfxIp8_1: - gfxIp.major = 8; - gfxIp.minor = 1; - break; - case Pal::GfxIpLevel::GfxIp9: - gfxIp.major = 9; - gfxIp.minor = 0; - break; - default: - VK_NEVER_CALLED(); - break; - } - - gfxIp.stepping = info.gfxStepping; - - // Get the executable name and path - char executableNameBuffer[PATH_MAX]; - - char* pExecutablePtr; - Pal::Result palResult = 
Util::GetExecutableName(&executableNameBuffer[0], - &pExecutablePtr, - sizeof(executableNameBuffer)); - VK_ASSERT(palResult == Pal::Result::Success); - - // Initialize LLPC options according to runtime settings - auto settings = GetRuntimeSettings(); - const char* llpcOptions[MaxLlpcOptions] = {}; - char optionBuffers[OptionBufferSize] = {}; - - char* pOptionBuffer = &optionBuffers[0]; - size_t bufSize = OptionBufferSize; - int optionLength = 0; - uint32_t numOptions = 0; - // Identify for Icd and stanalone compiler - llpcOptions[numOptions++] = Llpc::VkIcdName; - - // Generate ELF binary, not assembly text - llpcOptions[numOptions++] = "-filetype=obj"; - - // LLPC log options - llpcOptions[numOptions++] = (settings.enableLog & 1) ? "-enable-errs=1" : "-enable-errs=0"; - llpcOptions[numOptions++] = (settings.enableLog & 2) ? "-enable-outs=1" : "-enable-outs=0"; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-log-file-outs=%s", settings.logFileName); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-log-file-dbgs=%s", settings.debugLogFileName); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - // LLPC debug options - if (settings.enableDebug) - { - llpcOptions[numOptions++] = "-debug"; - } - - if (settings.llpcOptions[0] != '\0') - { - const char* pOptions = &settings.llpcOptions[0]; - VK_ASSERT(pOptions[0] == '-'); - - // Split options - while (pOptions) - { - const char* pNext = strchr(pOptions, ' '); - if (pNext) - { - // Copy options to option buffer - optionLength = static_cast(pNext - pOptions); - memcpy(pOptionBuffer, pOptions, optionLength); - pOptionBuffer[optionLength] = 0; - - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += (optionLength + 1); - - bufSize -= (optionLength + 1); - pOptions = strchr(pOptions + optionLength, '-'); - } - else - { - // Use pOptions directly for last option - llpcOptions[numOptions++] = pOptions; - pOptions = nullptr; - } - } - } - - // LLPC pipeline dump options - if (settings.enablePipelineDump) - { - llpcOptions[numOptions++] = "-enable-pipeline-dump"; - } - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-pipeline-dump-dir=%s", settings.pipelineDumpDir); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - if (settings.enableLlpc == LlpcModeAutoFallback) - { - llpcOptions[numOptions++] = "-disable-WIP-features=1"; - } - - // NOTE: For testing consistency, these options should be kept the same as those of - // "amdllpc" (Init()). 
- llpcOptions[numOptions++] = "-pragma-unroll-threshold=4096"; - llpcOptions[numOptions++] = "-unroll-allow-partial"; - llpcOptions[numOptions++] = "-lower-dyn-index"; - llpcOptions[numOptions++] = "-simplifycfg-sink-common=false"; - llpcOptions[numOptions++] = "-amdgpu-vgpr-index-mode"; // force VGPR indexing on GFX8 - - ShaderCacheMode shaderCacheMode = m_settings.shaderCacheMode; -#ifdef ICD_BUILD_APPPROFILE - const AppProfile appProfile = GetAppProfile(); - if ((appProfile == AppProfile::Talos) || - (appProfile == AppProfile::MadMax) || - (appProfile == AppProfile::SeriousSamFusion)) - { - llpcOptions[numOptions++] = "-enable-si-scheduler"; - } - - // Force enable cache to disk to improve user experience - if ((shaderCacheMode == ShaderCacheEnableRuntimeOnly) && - ((appProfile == AppProfile::MadMax) || - (appProfile == AppProfile::SeriousSamFusion) || - (appProfile == AppProfile::F1_2017))) - { - // Force to use internal disk cache. - shaderCacheMode = ShaderCacheForceInternalCacheOnDisk; - } -#endif - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-executable-name=%s", pExecutablePtr); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-cache-mode=%d", shaderCacheMode); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - if (settings.shaderReplaceMode != 0) - { - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-mode=%d", settings.shaderReplaceMode); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-dir=%s", settings.shaderReplaceDir); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - - optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-shader-replace-pipeline-hashes=%s", settings.shaderReplacePipelineHashes); - ++optionLength; - llpcOptions[numOptions++] = pOptionBuffer; - pOptionBuffer += optionLength; - bufSize -= optionLength; - } - - VK_ASSERT(numOptions <= MaxLlpcOptions); - - // Create LLPC compiler - Llpc::Result llpcResult = Llpc::ICompiler::Create(gfxIp, numOptions, llpcOptions, &pCompiler); - VK_ASSERT(llpcResult == Llpc::Result::Success); - - m_pLlpcCompiler[deviceIdx] = pCompiler; - - return (llpcResult == Llpc::Result::Success) ? VK_SUCCESS : VK_ERROR_INITIALIZATION_FAILED; -} - // ===================================================================================================================== // Gets default pipeline cache expected entry count based on current existing pipeline cache count. 
uint32_t Device::GetPipelineCacheExpectedEntryCount() @@ -2820,6 +2578,8 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorSetLayoutSupportKHR( } #endif +// ===================================================================================================================== + // ===================================================================================================================== VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryHostPointerPropertiesEXT( VkDevice device, diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index c3fa0b15..54ff0dd6 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -103,7 +103,8 @@ void* GetIcdProcAddr( pFunc = pEntry->pFunc; break; } - case vk::secure::entry::ENTRY_POINT_CORE: + case vk::secure::entry::ENTRY_POINT_CORE_INSTANCE: + case vk::secure::entry::ENTRY_POINT_CORE_DEVICE: { // Check version requested against the required version. if ((pInstance != nullptr) && (pInstance->GetAPIVersion() >= pEntry->conditionValue)) @@ -591,7 +592,7 @@ VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetDeviceProcAddr( return vk::entry::vkGetDeviceProcAddr(device, pName); } -} // namespace vk +} // extern "C" struct VK_LAYER_DISPATCH_TABLE { diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 14094dfd..23b24877 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -101,7 +101,6 @@ void GraphicsPipeline::BuildRasterizationState( }; // By default rasterization is disabled, unless rasterization creation info is present - pInfo->pipelineLlpc.rsState.rasterizerDiscardEnable = true; const VkPhysicalDeviceLimits& limits = pDevice->VkPhysicalDevice()->GetLimits(); @@ -116,9 +115,6 @@ void GraphicsPipeline::BuildRasterizationState( { pInfo->pipeline.rsState.depthClampDisable = (pRs->depthClampEnable == VK_FALSE); // When depth clamping is enabled, depth clipping should be disabled, and vice versa - pInfo->pipelineLlpc.vpState.depthClipEnable = (pRs->depthClampEnable == VK_FALSE); - pInfo->pipelineLlpc.rsState.rasterizerDiscardEnable = (pRs->rasterizerDiscardEnable != VK_FALSE); - pInfo->immedInfo.triangleRasterState.fillMode = VkToPalFillMode(pRs->polygonMode); pInfo->immedInfo.triangleRasterState.cullMode = VkToPalCullMode(pRs->cullMode); pInfo->immedInfo.triangleRasterState.frontFace = VkToPalFaceOrientation(pRs->frontFace); @@ -208,22 +204,20 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( if (pGraphicsPipelineCreateInfo != nullptr) { - pInfo->activeStageCount = pGraphicsPipelineCreateInfo->stageCount; - pInfo->pActiveStages = pGraphicsPipelineCreateInfo->pStages; - + for (uint32_t i = 0; i < pGraphicsPipelineCreateInfo->stageCount; ++i) + { + pInfo->activeStages = static_cast<VkShaderStageFlagBits>( + pInfo->activeStages | pGraphicsPipelineCreateInfo->pStages[i].stage); + } VK_IGNORE(pGraphicsPipelineCreateInfo->flags & VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT); pRenderPass = RenderPass::ObjectFromHandle(pGraphicsPipelineCreateInfo->renderPass); - pInfo->isMultiviewEnabled = pRenderPass->IsMultiviewEnabled(); - if (pGraphicsPipelineCreateInfo->layout != VK_NULL_HANDLE) { pInfo->pLayout = PipelineLayout::ObjectFromHandle(pGraphicsPipelineCreateInfo->layout); } - pInfo->pVertexInput = pGraphicsPipelineCreateInfo->pVertexInputState; - const VkPipelineInputAssemblyStateCreateInfo* pIa = pGraphicsPipelineCreateInfo->pInputAssemblyState; // According to the spec this should never be null @@ -239,36 +233,17 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo(
&pInfo->pipeline.iaState.topologyInfo.primitiveType, &pInfo->pipeline.iaState.topologyInfo.adjacency); - pInfo->pipelineLlpc.iaState.topology = pIa->topology; - pInfo->pipelineLlpc.iaState.disableVertexReuse = false; - - EXTRACT_VK_STRUCTURES_1( + EXTRACT_VK_STRUCTURES_0( Tess, PipelineTessellationStateCreateInfo, - PipelineTessellationDomainOriginStateCreateInfoKHR, pGraphicsPipelineCreateInfo->pTessellationState, - PIPELINE_TESSELLATION_STATE_CREATE_INFO, - PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO_KHR) + PIPELINE_TESSELLATION_STATE_CREATE_INFO) if (pPipelineTessellationStateCreateInfo != nullptr) { pInfo->pipeline.iaState.topologyInfo.patchControlPoints = pPipelineTessellationStateCreateInfo->patchControlPoints; - - pInfo->pipelineLlpc.iaState.patchControlPoints = pInfo->pipeline.iaState.topologyInfo.patchControlPoints; } - if (pPipelineTessellationDomainOriginStateCreateInfoKHR) - { - // Vulkan 1.0 incorrectly specified the tessellation u,v coordinate origin as lower left even though - // framebuffer and image coordinate origins are in the upper left. This has since been fixed, but - // an extension exists to use the previous behavior. Doing so with flat shading would likely appear - // incorrect, but Vulkan specifies that the provoking vertex is undefined when tessellation is active. - if (pPipelineTessellationDomainOriginStateCreateInfoKHR->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT_KHR) - { - pInfo->pipelineLlpc.iaState.switchWinding = true; - } - } - pInfo->immedInfo.staticStateMask = 0; const VkPipelineDynamicStateCreateInfo* pDy = pGraphicsPipelineCreateInfo->pDynamicState; @@ -313,12 +288,16 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( { VK_ASSERT(pVp->pViewports != nullptr); + const bool khrMaintenance1 = + ((pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + pDevice->IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1)); + for (uint32_t i = 0; i < pVp->viewportCount; ++i) { VkToPalViewport( pVp->pViewports[i], i, - pDevice->IsExtensionEnabled(DeviceExtensions::KHR_MAINTENANCE1), + khrMaintenance1, &pInfo->immedInfo.viewportParams); } @@ -410,14 +389,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( pInfo->msaa.pixelShaderSamples = 1; } - pInfo->pipelineLlpc.rsState.numSamples = rasterizationSampleCount; - - // NOTE: The sample pattern index here is actually the offset of sample position pair. This is - // different from the field of creation info of image view. For image view, the sample pattern - // index is really table index of the sample pattern. 
- pInfo->pipelineLlpc.rsState.samplePatternIdx = - Device::GetDefaultSamplePatternIndex(subpassCoverageSampleCount) * Pal::MaxMsaaRasterizerSamples; - pInfo->msaa.depthStencilSamples = subpassDepthSampleCount; pInfo->msaa.shaderExportMaskSamples = subpassCoverageSampleCount; pInfo->msaa.sampleMask = (pMs->pSampleMask != nullptr) @@ -476,7 +447,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( } pInfo->pipeline.cbState.alphaToCoverageEnable = (pMs->alphaToCoverageEnable == VK_TRUE); - pInfo->pipelineLlpc.cbState.alphaToCoverageEnable = (pMs->alphaToCoverageEnable == VK_TRUE); } const VkPipelineColorBlendStateCreateInfo* pCb = pGraphicsPipelineCreateInfo->pColorBlendState; @@ -500,7 +470,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( const VkPipelineColorBlendAttachmentState& src = pCb->pAttachments[i]; auto pCbDst = &pInfo->pipeline.cbState.target[i]; - auto pLlpcCbDst = &pInfo->pipelineLlpc.cbState.target[i]; auto pBlendDst = &pInfo->blend.targets[i]; if (pRenderPass) @@ -515,20 +484,10 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( if (pCbDst->swizzledFormat.format != Pal::ChNumFormat::Undefined) { pCbDst->channelWriteMask = src.colorWriteMask; - pLlpcCbDst->format = cbFormat[i]; - pLlpcCbDst->blendEnable = (src.blendEnable == VK_TRUE); - pLlpcCbDst->blendSrcAlphaToColor = IsSrcAlphaUsedInBlend(src.srcAlphaBlendFactor) || - IsSrcAlphaUsedInBlend(src.dstAlphaBlendFactor) || - IsSrcAlphaUsedInBlend(src.srcColorBlendFactor) || - IsSrcAlphaUsedInBlend(src.dstColorBlendFactor); - blendingEnabled |= pLlpcCbDst->blendEnable; - } - else - { - pLlpcCbDst->blendEnable = false; + blendingEnabled |= (src.blendEnable == VK_TRUE); } - pBlendDst->blendEnable = pLlpcCbDst->blendEnable; + pBlendDst->blendEnable = (src.blendEnable == VK_TRUE); pBlendDst->srcBlendColor = VkToPalBlend(src.srcColorBlendFactor); pBlendDst->dstBlendColor = VkToPalBlend(src.dstColorBlendFactor); pBlendDst->blendFuncColor = VkToPalBlendFunc(src.colorBlendOp); @@ -544,7 +503,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( } pInfo->pipeline.cbState.dualSourceBlendEnable = dualSourceBlend; - pInfo->pipelineLlpc.cbState.dualSourceBlendEnable = dualSourceBlend; if (blendingEnabled == true && dynamicStateFlags[VK_DYNAMIC_STATE_BLEND_CONSTANTS] == false) { @@ -650,147 +608,6 @@ void GraphicsPipeline::ConvertGraphicsPipelineInfo( } -// ===================================================================================================================== -// Creates a graphics pipeline binary for each PAL device -VkResult GraphicsPipeline::CreateGraphicsPipelineBinaries( - Device* pDevice, - PipelineCache* pPipelineCache, - CreateInfo* pInfo, - VbBindingInfo* pVbInfo, - size_t pipelineBinarySizes[MaxPalDevices], - void* pPipelineBinaries[MaxPalDevices]) -{ - const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - - VkResult result = VK_SUCCESS; - void* pMappingBuffer = nullptr; - - // Allocate space to create the LLPC/SCPC pipeline resource mappings - if (pInfo->pLayout != nullptr) - { - size_t tempBufferSize = pInfo->pLayout->GetPipelineInfo()->tempBufferSize; - - // Allocate the temp buffer - if (tempBufferSize > 0) - { - pMappingBuffer = pDevice->VkInstance()->AllocMem( - tempBufferSize, - VK_DEFAULT_MEM_ALIGN, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - - if (pMappingBuffer == nullptr) - { - result = VK_ERROR_OUT_OF_HOST_MEMORY; - } - } - } - - bool enableLlpc = false; - - if (result == VK_SUCCESS) - { - // Build the LLPC pipeline - Llpc::GraphicsPipelineBuildOut pipelineOut = {}; - void* 
pLlpcPipelineBuffer = nullptr; - { - Llpc::PipelineShaderInfo* shaderInfos[] = - { - &pInfo->pipelineLlpc.vs, - &pInfo->pipelineLlpc.tcs, - &pInfo->pipelineLlpc.tes, - &pInfo->pipelineLlpc.gs, - &pInfo->pipelineLlpc.fs - }; - - // Apply patches - pInfo->pipelineLlpc.pInstance = pDevice->VkPhysicalDevice()->VkInstance(); - pInfo->pipelineLlpc.pfnOutputAlloc = AllocateShaderOutput; - pInfo->pipelineLlpc.pUserData = &pLlpcPipelineBuffer; - pInfo->pipelineLlpc.pVertexInput = pInfo->pVertexInput; - - pInfo->pipelineLlpc.iaState.enableMultiView = pInfo->pipeline.viewInstancingDesc.enableMasking; - pInfo->pipelineLlpc.rsState.perSampleShading = (pInfo->msaa.pixelShaderSamples > 1); - - for (uint32_t stage = 0; stage < pInfo->activeStageCount; ++stage) - { - auto pStage = &pInfo->pActiveStages[stage]; - auto pShader = ShaderModule::ObjectFromHandle(pStage->module); - auto shaderStage = ShaderFlagBitToStage(pStage->stage); - auto pShaderInfo = shaderInfos[shaderStage]; - - pShaderInfo->pModuleData = pShader->GetShaderData(true); - pShaderInfo->pSpecializatonInfo = pStage->pSpecializationInfo; - pShaderInfo->pEntryTarget = pStage->pName; - - // Build the resource mapping description for LLPC. This data contains things about how shader - // inputs like descriptor set bindings are communicated to this pipeline in a form that LLPC can - // understand. - if (pInfo->pLayout != nullptr) - { - const bool vertexShader = (shaderStage == ShaderStageVertex); - result = pInfo->pLayout->BuildLlpcPipelineMapping( - shaderStage, - pMappingBuffer, - vertexShader ? pInfo->pVertexInput : nullptr, - pShaderInfo, - vertexShader ? pVbInfo : nullptr); - } - } - } - - uint64_t pipeHash = 0; - enableLlpc = true; - - if (result == VK_SUCCESS) - { - if (enableLlpc) - { - if ((pPipelineCache != nullptr) && (pPipelineCache->GetPipelineCacheType() == PipelineCacheTypeLlpc)) - { - pInfo->pipelineLlpc.pShaderCache = pPipelineCache->GetShaderCache(DefaultDeviceIndex).pLlpcShaderCache; - } - Llpc::Result llpcResult = pDevice->GetLlpcCompiler()->BuildGraphicsPipeline(&pInfo->pipelineLlpc, &pipelineOut); - if (llpcResult != Llpc::Result::Success) - { - // There shouldn't be anything to free for the failure case - VK_ASSERT(pLlpcPipelineBuffer == nullptr); - - { - result = VK_ERROR_INITIALIZATION_FAILED; - } - } - } - else - if (settings.enablePipelineDump) - { - // LLPC isn't enabled but pipeline dump is required, call LLPC dump interface explicitly - void* pHandle = Llpc::IPipelineDumper::BeginPipelineDump(settings.pipelineDumpDir, nullptr, &pInfo->pipelineLlpc); - Llpc::IPipelineDumper::EndPipelineDump(pHandle); - } - - if (enableLlpc) - { - if (result == VK_SUCCESS) - { - // Make sure that this is the same pointer we will free once the PAL pipeline is created - VK_ASSERT(pLlpcPipelineBuffer == pipelineOut.pipelineBin.pCode); - - // Update pipeline create info with the pipeline binary - pPipelineBinaries[DefaultDeviceIndex] = pLlpcPipelineBuffer; - pipelineBinarySizes[DefaultDeviceIndex] = pipelineOut.pipelineBin.codeSize; - } - } - } - } - - if (pMappingBuffer != nullptr) - { - pDevice->VkInstance()->FreeMem(pMappingBuffer); - } - - return result; -} - // ===================================================================================================================== // Create a graphics pipeline object. 
VkResult GraphicsPipeline::Create( @@ -803,21 +620,30 @@ VkResult GraphicsPipeline::Create( // Parse the create info and build patched AMDIL shaders CreateInfo createInfo = {}; VbBindingInfo vbInfo = {}; + PipelineCompiler::GraphicsPipelineCreateInfo binaryCreateInfo = {}; size_t pipelineBinarySizes[MaxPalDevices] = {}; - void* pPipelineBinaries[MaxPalDevices] = {}; + const void* pPipelineBinaries[MaxPalDevices] = {}; Pal::Result palResult = Pal::Result::Success; + PipelineCompiler* pDefaultCompiler = pDevice->GetCompiler(); - ConvertGraphicsPipelineInfo(pDevice, pCreateInfo, &createInfo); - - VkResult result = CreateGraphicsPipelineBinaries( + VkResult result = pDefaultCompiler->ConvertGraphicsPipelineInfo(pDevice, pCreateInfo, &binaryCreateInfo, &vbInfo); + const uint32_t numPalDevices = pDevice->NumPalDevices(); + for (uint32_t i = 0; (result == VK_SUCCESS) && (i < numPalDevices); ++i) + { + result = pDevice->GetCompiler(i)->CreateGraphicsPipelineBinary( pDevice, + i, pPipelineCache, - &createInfo, - &vbInfo, - pipelineBinarySizes, - pPipelineBinaries); + &binaryCreateInfo, + &pipelineBinarySizes[i], + &pPipelineBinaries[i]); + } - const uint32_t numPalDevices = pDevice->NumPalDevices(); + if (result == VK_SUCCESS) + { + ConvertGraphicsPipelineInfo(pDevice, pCreateInfo, &createInfo); + + } RenderStateCache* pRSCache = pDevice->GetRenderStateCache(); @@ -934,7 +760,7 @@ VkResult GraphicsPipeline::Create( const bool viewIndexFromDeviceIndex = Util::TestAnyFlagSet( pCreateInfo->flags, - VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT); + VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHX); // On success, wrap it up in a Vulkan object. if (result == VK_SUCCESS) @@ -960,11 +786,11 @@ VkResult GraphicsPipeline::Create( { if (pPipelineBinaries[deviceIdx] != nullptr) { - { - pDevice->VkInstance()->FreeMem(pPipelineBinaries[deviceIdx]); - } + pDevice->GetCompiler(deviceIdx)->FreeGraphicsPipelineBinary( + &binaryCreateInfo, pPipelineBinaries[deviceIdx], pipelineBinarySizes[deviceIdx]); } } + pDefaultCompiler->FreeGraphicsPipelineCreateInfo(&binaryCreateInfo); if (result != VK_SUCCESS) { diff --git a/icd/api/vk_image.cpp b/icd/api/vk_image.cpp index d4ad84fe..91c0fdc2 100644 --- a/icd/api/vk_image.cpp +++ b/icd/api/vk_image.cpp @@ -1745,7 +1745,8 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageMemoryRequirements2KHR( VkMemoryRequirements2KHR* pMemoryRequirements) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - VK_ASSERT(pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); + VK_ASSERT((pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); union { @@ -1785,7 +1786,8 @@ VKAPI_ATTR void VKAPI_CALL vkGetImageSparseMemoryRequirements2KHR( VkSparseImageMemoryRequirements2KHR* pSparseMemoryRequirements) { const Device* pDevice = ApiDevice::ObjectFromHandle(device); - VK_ASSERT(pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); + VK_ASSERT((pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + pDevice->IsExtensionEnabled(DeviceExtensions::KHR_GET_MEMORY_REQUIREMENTS2)); union { diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp index 44b53b59..5227a5b3 100644 --- a/icd/api/vk_instance.cpp +++ b/icd/api/vk_instance.cpp @@ -104,9 +104,7 @@ VkResult Instance::EnumerateVersion( uint32_t* pApiVersion) { // Report 1.1 support - *pApiVersion = VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, - 
VULKAN_API_MINOR_VERSION, - VULKAN_API_BUILD_VERSION); + *pApiVersion = (VK_API_VERSION_1_1 | VK_HEADER_VERSION); return VK_SUCCESS; } @@ -162,9 +160,7 @@ VkResult Instance::Create( uint32_t apiVersion = VK_MAKE_VERSION(1,0,0); #else // Default to the highest supported API version - uint32_t apiVersion = VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, - VULKAN_API_MINOR_VERSION, - VULKAN_API_BUILD_VERSION); + uint32_t apiVersion = (VK_API_VERSION_1_0 | VK_HEADER_VERSION); #endif if ((pAppInfo != nullptr) && (pAppInfo->apiVersion != 0)) @@ -182,18 +178,6 @@ VkResult Instance::Create( apiVersion = pAppInfo->apiVersion; } -#ifdef ICD_VULKAN_1_1 - // Implicitly enable instance extensions that are core in the API version - if (apiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_DEVICE_GROUP_CREATION); - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_EXTERNAL_FENCE_CAPABILITIES); - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_EXTERNAL_MEMORY_CAPABILITIES); - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_EXTERNAL_SEMAPHORE_CAPABILITIES); - enabledInstanceExtensions.EnableExtension(InstanceExtensions::KHR_GET_PHYSICAL_DEVICE_PROPERTIES2); - }; -#endif - // pAllocCb is never NULL here because the entry point will fill it in if the // application doesn't. VK_ASSERT(pAllocCb != nullptr); diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp index 9bacecf2..059b9743 100644 --- a/icd/api/vk_memory.cpp +++ b/icd/api/vk_memory.cpp @@ -719,7 +719,8 @@ VkResult Memory::OpenExternalMemory( // Returns the external shared handle of the memory object. Pal::OsExternalHandle Memory::GetShareHandle(VkExternalMemoryHandleTypeFlagBitsKHR handleType) { - VK_ASSERT(m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_EXTERNAL_MEMORY_FD) || + VK_ASSERT((m_pDevice->VkPhysicalDevice()->GetEnabledAPIVersion() >= VK_MAKE_VERSION(1, 1, 0)) || + m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_EXTERNAL_MEMORY_FD) || m_pDevice->IsExtensionEnabled(DeviceExtensions::KHR_EXTERNAL_MEMORY_WIN32)); return PalMemory()->GetSharedExternalHandle(); diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index 49fe6502..44b7ad7b 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -256,14 +256,15 @@ PhysicalDevice::PhysicalDevice( #ifdef ICD_BUILD_APPPROFILE m_appProfile(appProfile), #endif - m_supportedExtensions() + m_supportedExtensions(), + m_compiler(this) { memset(&m_limits, 0, sizeof(m_limits)); memset(m_formatFeatureMsaaTarget, 0, sizeof(m_formatFeatureMsaaTarget)); memset(&m_queueFamilies, 0, sizeof(m_queueFamilies)); memset(&m_memoryProperties, 0, sizeof(m_memoryProperties)); memset(&m_gpaProps, 0, sizeof(m_gpaProps)); - for (uint32_t i = 0; i< VK_MEMORY_TYPE_NUM; i++) + for (uint32_t i = 0; i < VK_MEMORY_TYPE_NUM; i++) { m_memoryPalHeapToVkIndex[i] = VK_MEMORY_TYPE_NUM; // invalid index m_memoryVkIndexToPalHeap[i] = Pal::GpuHeapCount; // invalid index @@ -353,37 +354,18 @@ static void GetFormatFeatureFlags( VkFormatFeatureFlags depthFlags = PalToVkFormatFeatureFlags( formatProperties.features[depthFormatIdx][tilingIdx]); - if (depthFlags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) + if ((depthFlags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) != 0) { retFlags |= (depthFlags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT); } - } - - const uint32_t minMaxFeatureBits = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT; - - // Handle the various special cases 
for Min\Max Image Filtering support - if ((retFlags & minMaxFeatureBits) != 0) - { - const auto& info = Pal::Formats::FormatInfoTable[static_cast(swizzledFormat.format)]; - - // min/max filtering is supported only for single-component formats unless multiChannelMinMaxFilter == true - // Depth-stencil is considered a single-component format because stencil and depth are separate, single - // channel images and in Hw, you can only sample from one of them at a time. - bool supported = (info.componentCount == 1) || - Formats::IsDepthStencilFormat(format) || - (multiChannelMinMaxFilter == true); - if ((Formats::IsDepthStencilFormat(format) == false) && - ((info.numericSupport == Pal::Formats::NumericSupportFlags::Uint) || - (info.numericSupport == Pal::Formats::NumericSupportFlags::Sint))) + // According to the Vulkan Spec (section 32.2.0) + // Re: VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT - If the format is a depth / stencil format, + // this bit only indicates that the depth aspect(not the stencil aspect) of an image of this format + // supports min/max filtering. + if ((depthFlags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT) != 0) { - // TODO: Disable Uint and Sint via Pal. - supported = false; - } - - if (supported == false) - { - retFlags &= ~minMaxFeatureBits; + retFlags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT; } } @@ -586,7 +568,13 @@ VkResult PhysicalDevice::Initialize() PopulateGpaProperties(); } - return PalToVkResult(result); + VkResult vkResult = PalToVkResult(result); + if (vkResult == VK_SUCCESS) + { + vkResult = m_compiler.Initialize(); + } + + return vkResult; } // ===================================================================================================================== @@ -648,7 +636,7 @@ void PhysicalDevice::PopulateFormatProperties() Pal::MergedFormatPropertiesTable fmtProperties = {}; m_pPalDevice->GetFormatProperties(&fmtProperties); - const bool multiChannelMinMaxFilter = m_properties.gfxipProperties.flags.supportPerChannelMinMaxFilter != 0; + const bool multiChannelMinMaxFilter = IsPerChannelMinMaxFilteringSupported(); for (uint32_t i = 0; i < VK_SUPPORTED_FORMAT_COUNT; i++) { @@ -732,6 +720,7 @@ void PhysicalDevice::LateInitialize() // ===================================================================================================================== VkResult PhysicalDevice::Destroy(void) { + m_compiler.Destroy(); this->~PhysicalDevice(); VkInstance()->FreeMem(ApiPhysicalDevice::FromObject(this)); @@ -1291,9 +1280,7 @@ void PhysicalDevice::GetSparseImageFormatProperties( uint32_t PhysicalDevice::GetSupportedAPIVersion() const { // Currently all of our HW supports Vulkan 1.1 - uint32_t apiVersion = VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, - VULKAN_API_MINOR_VERSION, - VULKAN_API_BUILD_VERSION); + uint32_t apiVersion = (VK_API_VERSION_1_1 | VK_HEADER_VERSION); // For sanity check we do at least want to make sure that all the necessary extensions are supported and exposed. 
// The spec does not require Vulkan 1.1 implementations to expose the corresponding 1.0 extensions, but we'll @@ -1343,9 +1330,7 @@ VkResult PhysicalDevice::GetDeviceProperties( #ifdef ICD_VULKAN_1_1 pProperties->apiVersion = GetSupportedAPIVersion(); #else - pProperties->apiVersion = VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, - VULKAN_API_MINOR_VERSION, - VULKAN_API_BUILD_VERSION); + pProperties->apiVersion = (VK_API_VERSION_1_0 | VK_HEADER_VERSION); #endif // Radeon Settings UI diplays driverVersion using sizes 10.10.12 like apiVersion, but our driverVersion uses 10.22. @@ -2461,7 +2446,8 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( availableExtensions.AddExtension(VK_DEVICE_EXTENSION(AMD_RASTERIZATION_ORDER)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(AMD_DRAW_INDIRECT_COUNT)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(AMD_NEGATIVE_VIEWPORT_HEIGHT)); - + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_SUBGROUP_BALLOT)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_SUBGROUP_VOTE)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_STENCIL_EXPORT)); availableExtensions.AddExtension(VK_DEVICE_EXTENSION(EXT_SHADER_VIEWPORT_INDEX_LAYER)); @@ -2636,11 +2622,11 @@ void PhysicalDevice::PopulateQueueFamilies() } // Determine the queue family to PAL engine type mapping and populate its properties - for (uint32_t i = 0; i < Pal::EngineTypeCount; ++i) + for (uint32_t engineType = 0; engineType < Pal::EngineTypeCount; ++engineType) { // Only add queue families for PAL engine types that have at least one queue present and that supports some // functionality exposed in Vulkan. - const auto& engineProps = m_properties.engineProperties[i]; + const auto& engineProps = m_properties.engineProperties[engineType]; // Update supportedQueueFlags based on what is enabled, as well as specific engine properties. // In particular, sparse binding support requires the engine to support virtual memory remap. 
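Several hunks in this change replace VK_MAKE_VERSION(VULKAN_API_MAJOR_VERSION, VULKAN_API_MINOR_VERSION, VULKAN_API_BUILD_VERSION) with the packed value (VK_API_VERSION_1_1 | VK_HEADER_VERSION) (or the 1.0 variant). This works because VK_MAKE_VERSION stores the patch level in the low 12 bits, so OR-ing in VK_HEADER_VERSION (currently well below 4096) yields the same value as VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION). A small stand-alone sketch of the encoding, mirroring the layout documented in vulkan_core.h (the value 70 is only a stand-in for VK_HEADER_VERSION):

#include <cstdint>
#include <cstdio>

// Same bit layout as VK_MAKE_VERSION: [31:22] major, [21:12] minor, [11:0] patch.
constexpr uint32_t MakeVersion(uint32_t major, uint32_t minor, uint32_t patch)
{
    return (major << 22) | (minor << 12) | patch;
}

int main()
{
    constexpr uint32_t apiVersion1_1 = MakeVersion(1, 1, 0); // VK_API_VERSION_1_1
    constexpr uint32_t headerVersion = 70;                   // stand-in for VK_HEADER_VERSION

    // Patch field is the low 12 bits, so OR-ing is equivalent to MakeVersion(1, 1, headerVersion).
    constexpr uint32_t reported = apiVersion1_1 | headerVersion;

    std::printf("%u.%u.%u\n",
                static_cast<unsigned>(reported >> 22),
                static_cast<unsigned>((reported >> 12) & 0x3FF),
                static_cast<unsigned>(reported & 0xFFF));
    return 0;
}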
@@ -2650,35 +2636,52 @@ void PhysicalDevice::PopulateQueueFamilies() supportedQueueFlags &= ~VK_QUEUE_SPARSE_BINDING_BIT; } - if ((engineProps.engineCount != 0) && ((vkQueueFlags[i] & supportedQueueFlags) != 0)) + if ((engineProps.engineCount != 0) && ((vkQueueFlags[engineType] & supportedQueueFlags) != 0)) { - m_queueFamilies[m_queueFamilyCount].palEngineType = static_cast<Pal::EngineType>(i); + m_queueFamilies[m_queueFamilyCount].palEngineType = static_cast<Pal::EngineType>(engineType); const Pal::QueueType primaryQueueType = palQueueTypes[GetQueueFamilyPalEngineType(m_queueFamilyCount)]; VK_ASSERT((engineProps.queueSupport & (1 << primaryQueueType)) != 0); m_queueFamilies[m_queueFamilyCount].palQueueType = primaryQueueType; uint32_t palImageLayoutFlag = 0; - switch (i) + uint32_t transferGranularityOverride = 0; + + switch (engineType) { - case Pal::EngineTypeUniversal: palImageLayoutFlag = Pal::LayoutUniversalEngine; break; - case Pal::EngineTypeCompute: palImageLayoutFlag = Pal::LayoutComputeEngine; break; - case Pal::EngineTypeExclusiveCompute: palImageLayoutFlag = Pal::LayoutComputeEngine; break; - case Pal::EngineTypeDma: palImageLayoutFlag = Pal::LayoutDmaEngine; break; - default: break; // no-op + case Pal::EngineTypeUniversal: + palImageLayoutFlag = Pal::LayoutUniversalEngine; + transferGranularityOverride = m_settings.transferGranularityUniversalOverride; + break; + case Pal::EngineTypeCompute: + case Pal::EngineTypeExclusiveCompute: + palImageLayoutFlag = Pal::LayoutComputeEngine; + transferGranularityOverride = m_settings.transferGranularityComputeOverride; + break; + case Pal::EngineTypeDma: + palImageLayoutFlag = Pal::LayoutDmaEngine; + transferGranularityOverride = m_settings.transferGranularityDmaOverride; + break; + default: + break; // no-op } m_queueFamilies[m_queueFamilyCount].palImageLayoutFlag = palImageLayoutFlag; - VkQueueFamilyProperties& queueFamilyProps = m_queueFamilies[m_queueFamilyCount].properties; - - queueFamilyProps.queueFlags = (vkQueueFlags[i] & supportedQueueFlags); - - queueFamilyProps.queueCount = engineProps.engineCount; + VkQueueFamilyProperties* pQueueFamilyProps = &m_queueFamilies[m_queueFamilyCount].properties; - queueFamilyProps.timestampValidBits = (engineProps.flags.supportsTimestamps != 0) ? 64 : 0; + pQueueFamilyProps->queueFlags = (vkQueueFlags[engineType] & supportedQueueFlags); + pQueueFamilyProps->queueCount = engineProps.engineCount; + pQueueFamilyProps->timestampValidBits = (engineProps.flags.supportsTimestamps != 0) ?
64 : 0; + pQueueFamilyProps->minImageTransferGranularity = PalToVkExtent3d(engineProps.minTiledImageCopyAlignment); - queueFamilyProps.minImageTransferGranularity = PalToVkExtent3d(engineProps.minTiledImageCopyAlignment); + // Override reported transfer granularity via panel setting + if ((transferGranularityOverride & 0xf0000000) != 0) + { + pQueueFamilyProps->minImageTransferGranularity.width = ((transferGranularityOverride >> 0) & 0xff); + pQueueFamilyProps->minImageTransferGranularity.height = ((transferGranularityOverride >> 8) & 0xff); + pQueueFamilyProps->minImageTransferGranularity.depth = ((transferGranularityOverride >> 16) & 0xff); + } m_queueFamilyCount++; } @@ -3055,7 +3058,7 @@ void PhysicalDevice::GetDeviceProperties2( VkPhysicalDeviceSubgroupProperties* pSubgroupProperties; #endif VkPhysicalDeviceExternalMemoryHostPropertiesEXT* pExternalMemoryHostProperties; - VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT* pSamplerFilterMinmaxPropertiesEXT; + VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT* pMinMaxProperties; VkPhysicalDeviceShaderCorePropertiesAMD* pShaderCoreProperties; }; @@ -3137,7 +3140,9 @@ void PhysicalDevice::GetDeviceProperties2( VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT; - pSubgroupProperties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT; + pSubgroupProperties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | + VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_BALLOT_BIT; pSubgroupProperties->quadOperationsInAllStages = VK_TRUE; break; @@ -3146,8 +3151,8 @@ void PhysicalDevice::GetDeviceProperties2( case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT: { - pSamplerFilterMinmaxPropertiesEXT->filterMinmaxImageComponentMapping = VK_FALSE; - pSamplerFilterMinmaxPropertiesEXT->filterMinmaxSingleComponentFormats = VK_TRUE; + pMinMaxProperties->filterMinmaxImageComponentMapping = IsPerChannelMinMaxFilteringSupported(); + pMinMaxProperties->filterMinmaxSingleComponentFormats = VK_TRUE; break; } diff --git a/icd/api/vk_pipeline_cache.cpp b/icd/api/vk_pipeline_cache.cpp index bcce0e5e..f692b24d 100644 --- a/icd/api/vk_pipeline_cache.cpp +++ b/icd/api/vk_pipeline_cache.cpp @@ -151,7 +151,7 @@ VkResult PipelineCache::Create( Llpc::ShaderCacheCreateInfo createInfo = {}; for (uint32_t i = 0; i < numPalDevices; i++) { - auto pCompiler = pDevice->GetLlpcCompiler(); + auto pCompiler = pDevice->GetCompiler()->GetLlpcCompiler(); if (useInitialData) { diff --git a/icd/api/vk_shader.cpp b/icd/api/vk_shader.cpp index a2272ca5..a881649e 100644 --- a/icd/api/vk_shader.cpp +++ b/icd/api/vk_shader.cpp @@ -174,7 +174,7 @@ VkResult ShaderModule::Init(const Device* pDevice) moduleInfo.shaderBin.pCode = m_pCode; moduleInfo.shaderBin.codeSize = m_codeSize; - Llpc::Result llpcResult = pDevice->GetLlpcCompiler()->BuildShaderModule(&moduleInfo, &m_llpcConvertOut); + Llpc::Result llpcResult = pDevice->GetCompiler()->GetLlpcCompiler()->BuildShaderModule(&moduleInfo, &m_llpcConvertOut); if ((llpcResult != Llpc::Result::Success) && (llpcResult != Llpc::Result::Delayed)) { diff --git a/icd/make/importdefs b/icd/make/importdefs index bdf63ef2..e94b91fe 100644 --- a/icd/make/importdefs +++ b/icd/make/importdefs @@ -1,13 +1,13 @@ # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. 
It must # be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -ICD_PAL_CLIENT_MAJOR_VERSION = 387 +ICD_PAL_CLIENT_MAJOR_VERSION = 388 # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. It describes # the interface version of the gpuopen shared module (part of PAL) that the ICD supports. ICD_GPUOPEN_CLIENT_MAJOR_VERSION = 26 ICD_GPUOPEN_CLIENT_MINOR_VERSION = 0 -# This will become the value of SCPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_SCPC=1. It describes the verson of the +# This will become the value of SCPC_CLIENT_INTERFACE_MAJOR_VERSION if ICD_BUILD_SCPC=1. It describes the version of the # interface version of SCPC (currently part of PAL) that the ICD supports. ICD_SCPC_CLIENT_MAJOR_VERSION = 2 diff --git a/icd/res/ver.h b/icd/res/ver.h index 058c2559..52b792e4 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -29,14 +29,6 @@ #define MKSTR(x) #x #define MAKE_VERSION_STRING(x) MKSTR(x) -#define VULKAN_API_MAJOR_VERSION 1 -#ifdef ICD_VULKAN_1_1 -#define VULKAN_API_MINOR_VERSION 1 -#else -#define VULKAN_API_MINOR_VERSION 0 -#endif -#define VULKAN_API_BUILD_VERSION 70 - // This value is used for the VkPhysicalDeviceProperties uint32 driverVersion which is OS agnostic #define VULKAN_ICD_MAJOR_VERSION 2 @@ -44,7 +36,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 18 +#define VULKAN_ICD_BUILD_VERSION 19 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION diff --git a/icd/settings/settings.cfg b/icd/settings/settings.cfg index 9b4a9169..cf05f6a8 100644 --- a/icd/settings/settings.cfg +++ b/icd/settings/settings.cfg @@ -1243,6 +1243,47 @@ Node = "Memory" VariableDefault = "false"; SettingScope = "PrivateDriverKey"; } + + Leaf + { + SettingName = "TransferGranularityUniversalOverride"; + SettingType = "HEX_STR"; + Description = "Override reported minImageTransferGranularity field for graphics queue families. This\r\n + is encoded as a hex string of the form 0xb0zzyyxx, where 'xx', 'yy', and 'zz' are the\r\n + reported transfer granularities in the X, Y and Z extents respectively, and 'b' is\r\n + a control flag: if 'b' is non-zero, this override is applied; otherwise the standard\r\n + transfer granularity is used.\r\n"; + VariableName = "transferGranularityUniversalOverride"; + VariableType = "uint32_t"; + VariableDefault = "0"; + SettingScope = "PrivateDriverKey"; + } + + Leaf + { + SettingName = "TransferGranularityComputeOverride"; + SettingType = "HEX_STR"; + Description = "Override reported minImageTransferGranularity field for compute queue families. For how\r\n + this value is interpreted by the driver, see the description for\r\n + TransferGranularityUniversalOverride.\r\n"; + VariableName = "transferGranularityComputeOverride"; + VariableType = "uint32_t"; + VariableDefault = "0"; + SettingScope = "PrivateDriverKey"; + } + + Leaf + { + SettingName = "TransferGranularityDmaOverride"; + SettingType = "HEX_STR"; + Description = "Override reported minImageTransferGranularity field for DMA (i.e.
SDMA) queue families.\r\n + For how this value is interpreted by the driver, see the description for\r\n + TransferGranularityUniversalOverride.\r\n"; + VariableName = "transferGranularityDmaOverride"; + VariableType = "uint32_t"; + VariableDefault = "0"; + SettingScope = "PrivateDriverKey"; + } } Node = "Optimization"
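The three TransferGranularity*Override settings added above all use the packed encoding that the PopulateQueueFamilies() change decodes: the top nibble acts as the enable flag, and the X, Y and Z granularities live in bits [7:0], [15:8] and [23:16]. A small sketch of that decoding, assuming only the layout shown in the hunk above (the struct and function names here are illustrative, not the driver's):

#include <cstdint>
#include <cstdio>

struct Extent3d { uint32_t width, height, depth; };

// Returns true and fills pOut when the override is enabled (top nibble non-zero).
static bool DecodeTransferGranularityOverride(uint32_t value, Extent3d* pOut)
{
    if ((value & 0xf0000000) == 0)
    {
        return false; // override disabled; keep the PAL-reported granularity
    }

    pOut->width  = (value >>  0) & 0xff;
    pOut->height = (value >>  8) & 0xff;
    pOut->depth  = (value >> 16) & 0xff;
    return true;
}

int main()
{
    Extent3d granularity = {};

    // Example: 0x80010204 -> enabled, X = 4, Y = 2, Z = 1.
    if (DecodeTransferGranularityOverride(0x80010204, &granularity))
    {
        std::printf("%ux%ux%u\n",
                    static_cast<unsigned>(granularity.width),
                    static_cast<unsigned>(granularity.height),
                    static_cast<unsigned>(granularity.depth));
    }

    return 0;
}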