From 3e2d125e523ad0b748f52907165054b63ad21425 Mon Sep 17 00:00:00 2001
From: Jacob He
Date: Mon, 22 Jan 2018 16:40:24 +0800
Subject: [PATCH] Update XGL from commit: 2072cab

1. Implement the VK_AMD_buffer_marker extension
2. Implement the VK_EXT_debug_report extension
3. Pass the layout to InitImmutableDescriptors(); this removes 80% of the time spent in DescriptorSet::Reassign()
4. Calculate the location of bindings for the descriptor set layout to avoid a memory lookup
5. Disable depth clamping when enableDepthClamp is set to false
6. Fix the CTS dEQP-VK.tessellation.shader_input_output.barrier failure and simplify the TessFactorToBuffer offset calculation
7. Fix failures in the CTS dEQP-VK.glsl.440.linkage.varying.component test group
---
 CMakeLists.txt                                 |   6 +
 icd/CMakeLists.txt                             |   1 +
 icd/api/include/internal_mem_mgr.h             |   9 +-
 .../khronos/devext/vk_amd_buffer_marker.h      |  54 ++
 icd/api/include/khronos/vulkan.h               |   1 +
 icd/api/include/vk_buffer.h                    |   1 +
 icd/api/include/vk_cmdbuffer.h                 |  19 +-
 icd/api/include/vk_conv.h                      |  31 +
 icd/api/include/vk_debug_report.h              | 102 ++
 icd/api/include/vk_descriptor_set.h            |   4 +-
 icd/api/include/vk_descriptor_set_layout.h     |  11 +-
 icd/api/include/vk_extensions.h                |   3 +
 icd/api/include/vk_instance.h                  |  39 +
 icd/api/internal_mem_mgr.cpp                   |  22 +-
 icd/api/llpc/CMakeLists.txt                    |   4 +
 icd/api/llpc/context/llpcCopyShader.cpp        | 119 ++-
 icd/api/llpc/context/llpcCopyShader.h          |  13 +-
 icd/api/llpc/context/llpcGraphicsContext.cpp   |  41 +-
 icd/api/llpc/context/llpcPipelineContext.h     |  10 +-
 icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp    | 136 ++-
 .../lower/llpcSpirvLowerResourceCollect.cpp    |  15 -
 .../patch/generate/gfx6/glslArithOpEmu.ll      |   0
 .../patch/generate/gfx9/glslArithOpEmu.ll      |   0
 icd/api/llpc/patch/generate/glslNullFsEmu.ll   |  14 +-
 .../llpc/patch/llpcPatchEntryPointMutate.cpp   |  17 +-
 .../llpc/patch/llpcPatchEntryPointMutate.h     |   5 +-
 .../llpc/patch/llpcPatchInOutImportExport.cpp  | 877 ++++++++++++------
 .../llpc/patch/llpcPatchInOutImportExport.h    |  80 +-
 .../llpc/patch/llpcPatchResourceCollect.cpp    | 117 +--
 icd/api/llpc/patch/llpcVertexFetch.cpp         | 143 +--
 icd/api/llpc/patch/llpcVertexFetch.h           |   2 +-
 icd/api/llpc/translator/SPIRVInternal.h        |   5 +-
 icd/api/llpc/translator/SPIRVReader.cpp        |  17 +-
 icd/api/open_strings/entry_points.txt          |   6 +
 icd/api/open_strings/extensions.txt            |   2 +
 icd/api/open_strings/g_entry_points_decl.h     |  16 +
 icd/api/open_strings/g_entry_points_impl.h     |   4 +
 icd/api/open_strings/g_extensions_decl.h       |   4 +
 icd/api/open_strings/g_extensions_impl.h       |   2 +
 icd/api/open_strings/g_func_table.cpp          |  20 +
 icd/api/open_strings/g_func_table.h            |   8 +
 icd/api/vert_buf_binding_mgr.cpp               |   4 +-
 icd/api/vk_buffer.cpp                          |  16 +-
 icd/api/vk_cmdbuffer.cpp                       |  81 +-
 icd/api/vk_conv.cpp                            | 515 +++++++++-
 icd/api/vk_debug_report.cpp                    | 175 ++++
 icd/api/vk_descriptor_set.cpp                  |  18 +-
 icd/api/vk_descriptor_set_layout.cpp           |  14 +-
 icd/api/vk_dispatch.cpp                        |   6 +
 icd/api/vk_instance.cpp                        | 179 +++-
 icd/api/vk_memory.cpp                          |  19 -
 icd/api/vk_physical_device.cpp                 |   2 +
 icd/api/vk_pipeline.cpp                        |   6 -
 icd/api/vk_query.cpp                           |  11 +-
 icd/make/importdefs                            |   2 +-
 icd/res/ver.h                                  |   2 +-
 icd/settings/settings.cpp                      |  10 +-
 57 files changed, 2392 insertions(+), 648 deletions(-)
 create mode 100644 icd/api/include/khronos/devext/vk_amd_buffer_marker.h
 create mode 100644 icd/api/include/vk_debug_report.h
 mode change 100644 => 100755 icd/api/llpc/patch/generate/gfx6/glslArithOpEmu.ll
 mode change 100644 => 100755 icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll
 create mode 100644 icd/api/vk_debug_report.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index acfc0f05..e5659abe
100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,12 @@ option(USE_NEXT_SDK "Use next SDK?" OFF) option(ICD_BUILD_VIDEO "Build Video Support?" OFF) +option(ICD_UPSTREAM_LLVM "Build with upstream LLVM?" OFF) + +if(NOT ICD_BUILD_LLPC) + set(ICD_UPSTREAM_LLVM OFF CACHE BOOL "ICD_UPSTREAM_LLVM is overrided to false." FORCE) +endif() + option(ICD_GPUOPEN_DEVMODE_BUILD "Build ${PROJECT_NAME} with GPU Open Developer Mode driver support?" ON) option(ICD_MEMTRACK "Turn on memory tracking?" ${CMAKE_BUILD_TYPE_DEBUG}) diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index 554eaf02..1bd513af 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -169,6 +169,7 @@ target_sources(xgl PRIVATE api/vk_cmd_pool.cpp api/vk_compute_pipeline.cpp api/vk_conv.cpp + api/vk_debug_report.cpp api/vk_descriptor_set.cpp api/vk_descriptor_set_layout.cpp api/vk_descriptor_pool.cpp diff --git a/icd/api/include/internal_mem_mgr.h b/icd/api/include/internal_mem_mgr.h index 6a30880a..c745c706 100644 --- a/icd/api/include/internal_mem_mgr.h +++ b/icd/api/include/internal_mem_mgr.h @@ -146,6 +146,12 @@ class InternalMemory return m_gpuVA[idx]; } + void* CpuAddr(int32_t idx = DefaultDeviceIndex) const + { + VK_ASSERT((idx >= 0) && (idx < static_cast(MaxPalDevices))); + return m_memoryPool.groupMemory.CpuAddr(idx); + } + Pal::gpusize Offset() const { return m_offset; } @@ -210,7 +216,8 @@ class InternalMemMgr Pal::IGpuMemoryBindable* pBindable, bool readOnly, InternalMemory* pInternalMemory, - bool removeInvisibleHeap = false); + bool removeInvisibleHeap = false, + bool persistentMapped = false); void FreeGpuMem( const InternalMemory* pInternalMemory); diff --git a/icd/api/include/khronos/devext/vk_amd_buffer_marker.h b/icd/api/include/khronos/devext/vk_amd_buffer_marker.h new file mode 100644 index 00000000..97373e85 --- /dev/null +++ b/icd/api/include/khronos/devext/vk_amd_buffer_marker.h @@ -0,0 +1,54 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
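Returning to the internal_mem_mgr.h change above: a minimal driver-internal sketch of how the new persistentMapped flag and the CpuAddr() accessor could be used together. This assumes only the signatures shown in this patch; the function name and parameters are illustrative, not part of the change.

    // Sketch only: allocate GPU memory that stays CPU-mapped for its whole
    // lifetime, then read it back through the cached pointer with no
    // Map()/Unmap() pair per access (the pattern marker readback wants).
    VkResult AllocPersistentlyMapped(
        vk::InternalMemMgr*      pMemMgr,    // illustrative parameter
        Pal::IGpuMemoryBindable* pBindable,  // illustrative parameter
        vk::InternalMemory*      pMemory)
    {
        VkResult result = pMemMgr->AllocAndBindGpuMem(
            pBindable,
            false,      // readOnly
            pMemory,
            false,      // removeInvisibleHeap
            true);      // persistentMapped

        if (result == VK_SUCCESS)
        {
            // Valid for the lifetime of the allocation; no explicit map call.
            volatile const uint32_t* pData =
                static_cast<volatile const uint32_t*>(pMemory->CpuAddr());
            VK_ASSERT(pData != nullptr);
        }

        return result;
    }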
+ * + **********************************************************************************************************************/ +/** + ********************************************************************************************************************** + * @file vk_amd_buffer_marker.h + * @brief Header for VK_AMD_buffer marker extension. + ********************************************************************************************************************** + */ +#ifndef VK_AMD_BUFFER_MARKER_H_ +#define VK_AMD_BUFFER_MARKER_H_ + +#define VK_AMD_buffer_marker 1 +#define VK_AMD_BUFFER_MARKER_SPEC_VERSION 1 +#define VK_AMD_BUFFER_MARKER_EXTENSION_NUMBER 180 + +#define VK_AMD_BUFFER_MARKER_EXTENSION_NAME "VK_AMD_buffer_marker" + +typedef void (VKAPI_PTR *PFN_vkCmdWriteBufferMarkerAMD)( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker); + +VKAPI_ATTR void VKAPI_CALL vkCmdWriteBufferMarkerAMD( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker); + +#endif /* VK_AMD_BUFFER_MARKER_H_ */ diff --git a/icd/api/include/khronos/vulkan.h b/icd/api/include/khronos/vulkan.h index c730e525..708937ae 100644 --- a/icd/api/include/khronos/vulkan.h +++ b/icd/api/include/khronos/vulkan.h @@ -55,6 +55,7 @@ // Internal (under development) extension definitions #include "devext/vk_amd_gpa_interface.h" +#include "devext/vk_amd_buffer_marker.h" enum class DynamicStatesInternal : uint32_t { VIEWPORT = 0, diff --git a/icd/api/include/vk_buffer.h b/icd/api/include/vk_buffer.h index 0d4fbef8..62523c7e 100644 --- a/icd/api/include/vk_buffer.h +++ b/icd/api/include/vk_buffer.h @@ -126,6 +126,7 @@ class Buffer : public NonDispatchable BufferFlags internalFlags); void CalcBarrierUsage( + const Device* pDevice, VkBufferUsageFlags usage); Pal::IGpuMemory* m_pGpuMemory[MaxPalDevices]; diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 8a06f86a..93983e2c 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -499,6 +499,12 @@ class CmdBuffer uint32_t length, const void* values); + void WriteBufferMarker( + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker); + VK_INLINE void SetDeviceMask(uint32_t deviceMask) { // Ensure we are enabling valid devices within the group @@ -541,9 +547,10 @@ class CmdBuffer return m_pPalCmdBuffers[idx]; } - static Pal::uint32 ConvertBarrierSrcAccessFlags(VkAccessFlags accessMask); - static Pal::uint32 ConvertBarrierDstAccessFlags(VkAccessFlags accessMask); + static Pal::uint32 ConvertBarrierSrcAccessFlags(const Device* pDevice, VkAccessFlags accessMask); + static Pal::uint32 ConvertBarrierDstAccessFlags(const Device* pDevice, VkAccessFlags accessMask); static void ConvertBarrierCacheFlags( + const Device* pDevice, VkAccessFlags srcAccess, VkAccessFlags dstAccess, uint32_t supportInputCacheMask, @@ -1337,6 +1344,14 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerInsertEXT( VKAPI_ATTR void VKAPI_CALL vkCmdSetSampleLocationsEXT( VkCommandBuffer commandBuffer, const VkSampleLocationsInfoEXT* pSampleLocationsInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdWriteBufferMarkerAMD( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker); + } // namespace entry } // namespace vk diff --git a/icd/api/include/vk_conv.h b/icd/api/include/vk_conv.h 
index c11eee4e..a2326e95 100644 --- a/icd/api/include/vk_conv.h +++ b/icd/api/include/vk_conv.h @@ -1510,6 +1510,37 @@ VK_INLINE Pal::HwPipePoint VkToPalSrcPipePointForTimestampWrite(VkPipelineStageF return srcPipePoint; } +// ===================================================================================================================== +// Converts Vulkan source pipeline stage flags to PAL buffer marker writes (top/bottom only) +VK_INLINE Pal::HwPipePoint VkToPalSrcPipePointForMarkers( + VkPipelineStageFlags flags, + Pal::EngineType engineType) +{ + // This function is written against the following three engine types. If you hit this assert then check if this + // new engine supports top of pipe writes at all (e.g. SDMA doesn't). + VK_ASSERT(engineType == Pal::EngineTypeDma || + engineType == Pal::EngineTypeUniversal || + engineType == Pal::EngineTypeCompute); + + // Flags that allow signaling at top-of-pipe (anything else maps to bottom) + constexpr VkPipelineStageFlags SrcTopOfPipeFlags = + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + Pal::HwPipePoint srcPipePoint; + + if (((flags & ~SrcTopOfPipeFlags) == 0) && + (engineType != Pal::EngineTypeDma)) // SDMA engines only support bottom of pipe writes + { + srcPipePoint = Pal::HwPipeTop; + } + else + { + srcPipePoint = Pal::HwPipeBottom; + } + + return srcPipePoint; +} + // Helper structure for mapping stage flag sets to PAL pipe points struct HwPipePointMappingEntry { diff --git a/icd/api/include/vk_debug_report.h b/icd/api/include/vk_debug_report.h new file mode 100644 index 00000000..9f09e68c --- /dev/null +++ b/icd/api/include/vk_debug_report.h @@ -0,0 +1,102 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
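For reference, a minimal sketch of how an application could drive the new vkCmdWriteBufferMarkerAMD entry point declared earlier in this patch. The function name, command buffer, and buffer are illustrative; markerBuf is assumed to be a host-visible buffer created with VK_BUFFER_USAGE_TRANSFER_DST_BIT.

    #include <vulkan/vulkan.h>

    // Sketch: bracket a workload with crash-breadcrumb markers. Per
    // VkToPalSrcPipePointForMarkers above, TOP_OF_PIPE is the only stage that
    // maps to a top-of-pipe write, and SDMA engines always write bottom-of-pipe.
    void RecordBreadcrumbs(VkCommandBuffer cmdBuf, VkBuffer markerBuf)
    {
        // Written as soon as the GPU front end reaches this point.
        vkCmdWriteBufferMarkerAMD(cmdBuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                                  markerBuf, 0 * sizeof(uint32_t), 0xA001);

        vkCmdDispatch(cmdBuf, 64, 1, 1);

        // Written only after all prior work has drained (bottom of pipe).
        vkCmdWriteBufferMarkerAMD(cmdBuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                                  markerBuf, 1 * sizeof(uint32_t), 0xA002);
    }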
+ * + **********************************************************************************************************************/ + +#ifndef __VK_DEBUG_REPORT_H__ +#define __VK_DEBUG_REPORT_H__ + +#pragma once + +#include "include/vk_dispatch.h" + +namespace vk +{ + +// ===================================================================================================================== +// Vulkan implementation of VK_EXT_debug_report extension +class DebugReportCallback : public NonDispatchable +{ +public: + static VkResult Create( + Instance* pInstance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback); + + void Destroy( + Instance* pInstance, + const VkAllocationCallbacks* pAllocator); + + void Message( + Instance* pInstance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage); + + VkDebugReportFlagsEXT GetFlags(); + + PFN_vkDebugReportCallbackEXT GetCallbackFunc(); + + void* GetUserData(); + +protected: + DebugReportCallback() + { + }; + +private: + VkDebugReportCallbackCreateInfoEXT m_createInfo; +}; + +namespace entry +{ +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDebugReportCallbackEXT( + VkInstance instance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback); + +VKAPI_ATTR void VKAPI_CALL vkDestroyDebugReportCallbackEXT( + VkInstance instance, + VkDebugReportCallbackEXT callback, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkDebugReportMessageEXT( + VkInstance instance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage); +} // namespace entry + +} // namespace vk + +#endif /* __VK_DEBUG_REPORT_H__ */ diff --git a/icd/api/include/vk_descriptor_set.h b/icd/api/include/vk_descriptor_set.h index 9ffc761f..73a04864 100644 --- a/icd/api/include/vk_descriptor_set.h +++ b/icd/api/include/vk_descriptor_set.h @@ -195,7 +195,9 @@ class DescriptorSet : public NonDispatchable void* pAllocHandle, VkDescriptorSet* pHandle); - void InitImmutableDescriptors(uint32_t numPalDevices); + void InitImmutableDescriptors( + const DescriptorSetLayout* pLayout, + uint32_t numPalDevices); void* AllocHandle() const { return m_pAllocHandle; } diff --git a/icd/api/include/vk_descriptor_set_layout.h b/icd/api/include/vk_descriptor_set_layout.h index 3be67b11..fb436351 100644 --- a/icd/api/include/vk_descriptor_set_layout.h +++ b/icd/api/include/vk_descriptor_set_layout.h @@ -89,7 +89,6 @@ class DescriptorSetLayout : public NonDispatchable(Util::VoidPtrInc(this, sizeof(*this))); + return pBindings[bindingIndex]; + } const CreateInfo& Info() const { return m_info; } @@ -133,7 +137,8 @@ class DescriptorSetLayout : public NonDispatchable KHX_DEVICE_GROUP_CREATION, KHR_EXTERNAL_SEMAPHORE_CAPABILITIES, KHR_EXTERNAL_FENCE_CAPABILITIES, + EXT_DEBUG_REPORT, Count }; }; @@ -237,6 +238,8 @@ class DeviceExtensions : public Extensions KHR_EXTERNAL_FENCE_WIN32, KHR_WIN32_KEYED_MUTEX, EXT_GLOBAL_PRIORITY, + AMD_BUFFER_MARKER, + Count }; }; diff --git a/icd/api/include/vk_instance.h b/icd/api/include/vk_instance.h index 897d9e4f..991ba268 100644 --- a/icd/api/include/vk_instance.h +++ b/icd/api/include/vk_instance.h @@ -42,11 +42,14 @@ #include "include/vk_dispatch.h" 
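As background for the new DebugReportCallback class, a minimal sketch of the client side of the entry points declared above (standard VK_EXT_debug_report usage; assumes the extension was enabled at instance creation, and the helper names are illustrative). The registered callback is reached both from PAL's log callback path and from explicit vkDebugReportMessageEXT calls.

    #include <vulkan/vulkan.h>
    #include <cstdio>

    static VKAPI_ATTR VkBool32 VKAPI_CALL DebugCb(
        VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objType,
        uint64_t object, size_t location, int32_t messageCode,
        const char* pLayerPrefix, const char* pMessage, void* pUserData)
    {
        std::fprintf(stderr, "[%s] %s\n", pLayerPrefix, pMessage);
        return VK_FALSE; // Do not abort the call that triggered the report
    }

    VkResult RegisterDebugReport(VkInstance instance, VkDebugReportCallbackEXT* pCb)
    {
        VkDebugReportCallbackCreateInfoEXT info = {};
        info.sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT;
        info.flags       = VK_DEBUG_REPORT_ERROR_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT;
        info.pfnCallback = DebugCb;

        auto pfn = reinterpret_cast<PFN_vkCreateDebugReportCallbackEXT>(
            vkGetInstanceProcAddr(instance, "vkCreateDebugReportCallbackEXT"));

        return (pfn != nullptr) ? pfn(instance, &info, nullptr, pCb)
                                : VK_ERROR_EXTENSION_NOT_PRESENT;
    }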
#include "include/vk_utils.h" #include "include/vk_extensions.h" +#include "include/vk_debug_report.h" #include "palDeveloperHooks.h" #include "palLib.h" #include "palScreen.h" #include "palSysMemory.h" +#include "palList.h" +#include "palMutex.h" namespace Pal { @@ -194,6 +197,26 @@ class Instance VkResult QueryApplicationProfile(RuntimeSettings* pRuntimeSettings = nullptr); + VkResult RegisterDebugCallback( + DebugReportCallback* pCallback); + + void UnregisterDebugCallback( + DebugReportCallback* pCallback); + + void LogMessage(uint32_t level, + uint64_t categoryMask, + const char* pFormat, + va_list args); + + void CallExternalCallbacks( + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage); + private: Instance( const VkAllocationCallbacks* pAllocCb, @@ -218,6 +241,13 @@ class Instance Pal::Developer::CallbackType type, void* pCbData); + static void PAL_STDCALL LogCallback( + void* pClientData, + Pal::uint32 level, + Pal::uint64 categoryMask, + const char* pFormat, + va_list args); + Pal::IPlatform* m_pPalPlatform; // Pal Platform object. VkAllocationCallbacks m_allocCallbacks; @@ -258,6 +288,15 @@ class Instance ChillSettings m_chillSettings; // Dynamic chill settings structure #endif + Util::List m_debugReportCallbacks; // List of registered Debug + // Report Callbacks + Util::Mutex m_logCallbackInternalOnlyMutex; // Serialize internal log + // message translation prior + // to calling external callbacks + Util::Mutex m_logCallbackInternalExternalMutex; // Serialize all calls to + // external callbacks from + // internal and external sources + #ifdef PAL_ENABLE_PRINTS_ASSERTS mutable uint32_t m_dispatchTableQueryCount; #endif diff --git a/icd/api/internal_mem_mgr.cpp b/icd/api/internal_mem_mgr.cpp index 996263cc..207178ce 100644 --- a/icd/api/internal_mem_mgr.cpp +++ b/icd/api/internal_mem_mgr.cpp @@ -595,7 +595,8 @@ VkResult InternalMemMgr::AllocAndBindGpuMem( Pal::IGpuMemoryBindable* pBindable, bool readOnly, InternalMemory* pInternalMemory, - bool removeInvisibleHeap) + bool removeInvisibleHeap, + bool persistentMapped) { VK_ASSERT(pBindable != nullptr); VK_ASSERT(pInternalMemory != nullptr); @@ -610,20 +611,21 @@ VkResult InternalMemMgr::AllocAndBindGpuMem( return VK_SUCCESS; } + // Fill in the GPU memory object creation info based on the memory requirements + InternalMemCreateInfo createInfo = {}; + if (removeInvisibleHeap) { FilterInvisibleHeap(&memReqs); } - // Fill in the GPU memory object creation info based on the memory requirements - InternalMemCreateInfo createInfo = {}; - - createInfo.pal.size = memReqs.size; - createInfo.pal.alignment = memReqs.alignment; - createInfo.pal.vaRange = Pal::VaRange::Default; - createInfo.pal.priority = Pal::GpuMemPriority::Normal; - createInfo.pal.heapCount = memReqs.heapCount; - createInfo.flags.readOnly = readOnly; + createInfo.pal.size = memReqs.size; + createInfo.pal.alignment = memReqs.alignment; + createInfo.pal.vaRange = Pal::VaRange::Default; + createInfo.pal.priority = Pal::GpuMemPriority::Normal; + createInfo.pal.heapCount = memReqs.heapCount; + createInfo.flags.readOnly = readOnly; + createInfo.flags.persistentMapped = persistentMapped ? 
1 : 0; for (uint32_t h = 0; h < memReqs.heapCount; ++h) { diff --git a/icd/api/llpc/CMakeLists.txt b/icd/api/llpc/CMakeLists.txt index 5a382c4f..f7546781 100644 --- a/icd/api/llpc/CMakeLists.txt +++ b/icd/api/llpc/CMakeLists.txt @@ -89,6 +89,10 @@ target_compile_definitions(llpc PRIVATE ${TARGET_ARCHITECTURE_ENDIANESS}ENDIAN_C target_compile_definitions(llpc PRIVATE _SPIRV_LLVM_API) target_compile_definitions(llpc PRIVATE LLPC_BUILD_GFX9) +if(ICD_UPSTREAM_LLVM) + target_compile_definitions(llpc PRIVATE LLVM_SOURCE_PROMOTION=1) +endif() + target_include_directories(llpc PUBLIC ${PROJECT_SOURCE_DIR}/include diff --git a/icd/api/llpc/context/llpcCopyShader.cpp b/icd/api/llpc/context/llpcCopyShader.cpp index fb9da3ed..43eb986c 100644 --- a/icd/api/llpc/context/llpcCopyShader.cpp +++ b/icd/api/llpc/context/llpcCopyShader.cpp @@ -95,6 +95,25 @@ Result CopyShader::Run( auto& inOutUsage = m_pContext->GetShaderResourceUsage(ShaderStageCopyShader)->inOutUsage; inOutUsage.gs.pGsVsRingBufDesc = pGsVsRingBufDesc; + if (m_pContext->IsGsOnChip()) + { + // Construct LDS type: [ldsSize * i32], address space 3 + auto ldsSize = m_pContext->GetGpuProperty()->ldsSizePerCu; + auto pLdsTy = ArrayType::get(m_pContext->Int32Ty(), ldsSize / sizeof(uint32_t)); + + m_pLds = new GlobalVariable(*m_pModule, + pLdsTy, + false, + GlobalValue::ExternalLinkage, + nullptr, + "lds", + nullptr, + GlobalValue::NotThreadLocal, + ADDR_SPACE_LOCAL); + LLPC_ASSERT(m_pLds != nullptr); + m_pLds->setAlignment(sizeof(uint32_t)); + } + // Export GS outputs to FS if (result == Result::Success) { @@ -175,9 +194,15 @@ void CopyShader::ExportOutput() for (auto& byteSizeMap : genericOutByteSizes) { + // > uint32_t loc = byteSizeMap.first; - uint32_t byteSize = byteSizeMap.second; + uint32_t byteSize = 0; + for (uint32_t i = 0; i < 4; ++i) + { + byteSize += byteSizeMap.second[i]; + } + LLPC_ASSERT(byteSize % 4 == 0); uint32_t dwordSize = byteSize / 4; auto pOutputTy = VectorType::get(m_pContext->FloatTy(), dwordSize); @@ -359,8 +384,8 @@ Result CopyShader::DoPatch() } // ===================================================================================================================== -// Calculates GS to VS buffer offset from input/output location -Value* CopyShader::CalcGsVsRingBufferOffsetForOutput( +// Calculates GS to VS ring offset from input location +Value* CopyShader::CalcGsVsRingOffsetForInput( uint32_t location, // Output location uint32_t compIdx, // Output component Instruction* pInsertPos) // [in] Where to insert the instruction @@ -369,22 +394,38 @@ Value* CopyShader::CalcGsVsRingBufferOffsetForOutput( auto pResUsage = m_pContext->GetShaderResourceUsage(ShaderStageGeometry); - uint32_t outputVertices = pResUsage->builtInUsage.gs.outputVertices; + Value* pRingOffset = nullptr; + if (m_pContext->IsGsOnChip()) + { + // ringOffset = esGsLdsSize + vertexOffset + location * 4 + compIdx + pRingOffset = ConstantInt::get(m_pContext->Int32Ty(), pResUsage->inOutUsage.gs.esGsLdsSize); - // byteOffset = vertexOffset * 4 + (location * 4 + compIdx) * 64 * maxVertices - Value* pRingBufOffset = BinaryOperator::CreateMul(pVertexOffset, - ConstantInt::get(m_pContext->Int32Ty(), 4), - "", - pInsertPos); + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, pVertexOffset, "", pInsertPos); - pRingBufOffset = BinaryOperator::CreateAdd(pRingBufOffset, - ConstantInt::get(m_pContext->Int32Ty(), - (location * 4 + compIdx) * 64 * - outputVertices), - "", - pInsertPos); + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, + 
ConstantInt::get(m_pContext->Int32Ty(), (location * 4) + compIdx), + "", + pInsertPos); + } + else + { + uint32_t outputVertices = pResUsage->builtInUsage.gs.outputVertices; + + // ringOffset = vertexOffset * 4 + (location * 4 + compIdx) * 64 * maxVertices + pRingOffset = BinaryOperator::CreateMul(pVertexOffset, + ConstantInt::get(m_pContext->Int32Ty(), 4), + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), + (location * 4 + compIdx) * 64 * + outputVertices), + "", + pInsertPos); + } - return pRingBufOffset; + return pRingOffset; } // ===================================================================================================================== @@ -394,21 +435,39 @@ Value* CopyShader::LoadValueFromGsVsRingBuffer( uint32_t compIdx, // Output component Instruction* pInsertPos) // [in] Where to insert the load instruction { - Value* pRingBufOffset = CalcGsVsRingBufferOffsetForOutput(location, compIdx, pInsertPos); - auto& inOutUsage = m_pContext->GetShaderResourceUsage(ShaderStageCopyShader)->inOutUsage; + Value* pLoadValue = nullptr; + Value* pRingOffset = CalcGsVsRingOffsetForInput(location, compIdx, pInsertPos); - std::vector args; - args.push_back(inOutUsage.gs.pGsVsRingBufDesc); - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); - args.push_back(pRingBufOffset); - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - return EmitCall(m_pModule, - "llvm.amdgcn.buffer.load.f32", - m_pContext->FloatTy(), - args, - NoAttrib, - pInsertPos); + if (m_pContext->IsGsOnChip()) + { + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + idxs.push_back(pRingOffset); + + Value* pLoadPtr = GetElementPtrInst::Create(nullptr, m_pLds, idxs, "", pInsertPos); + pLoadValue = new LoadInst(pLoadPtr, "", false, m_pLds->getAlignment(), pInsertPos); + + pLoadValue = BitCastInst::Create(Instruction::BitCast, pLoadValue, m_pContext->FloatTy(), "", pInsertPos); + } + else + { + auto& inOutUsage = m_pContext->GetShaderResourceUsage(ShaderStageCopyShader)->inOutUsage; + + std::vector args; + args.push_back(inOutUsage.gs.pGsVsRingBufDesc); + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + args.push_back(pRingOffset); + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc + pLoadValue = EmitCall(m_pModule, + "llvm.amdgcn.buffer.load.f32", + m_pContext->FloatTy(), + args, + NoAttrib, + pInsertPos); + } + + return pLoadValue; } // ===================================================================================================================== diff --git a/icd/api/llpc/context/llpcCopyShader.h b/icd/api/llpc/context/llpcCopyShader.h index 45992a45..b85c9b92 100644 --- a/icd/api/llpc/context/llpcCopyShader.h +++ b/icd/api/llpc/context/llpcCopyShader.h @@ -55,9 +55,9 @@ class CopyShader void ExportOutput(); Result DoPatch(); - llvm::Value* CalcGsVsRingBufferOffsetForOutput(uint32_t location, - uint32_t compIdx, - llvm::Instruction* pInsertPos); + llvm::Value* CalcGsVsRingOffsetForInput(uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); llvm::Value* LoadValueFromGsVsRingBuffer(uint32_t location, uint32_t compIdx, @@ -74,9 +74,10 @@ class CopyShader // Start offset of currently-processed vertex in GS-VS ring buffer static const uint32_t EntryArgIdxVertexOffset = 2; - llvm::Module* m_pModule; // LLVM 
module for copy shader - Context* m_pContext; // LLPC context - llvm::Function* m_pEntryPoint; // Entry point of copy shader module + llvm::Module* m_pModule; // LLVM module for copy shader + Context* m_pContext; // LLPC context + llvm::Function* m_pEntryPoint; // Entry point of copy shader module + llvm::GlobalVariable* m_pLds; // Global variable to model LDS }; } // Llpc diff --git a/icd/api/llpc/context/llpcGraphicsContext.cpp b/icd/api/llpc/context/llpcGraphicsContext.cpp index b38fdba8..df506384 100644 --- a/icd/api/llpc/context/llpcGraphicsContext.cpp +++ b/icd/api/llpc/context/llpcGraphicsContext.cpp @@ -72,7 +72,8 @@ GraphicsContext::GraphicsContext( m_pPipelineInfo(pPipelineInfo), m_stageMask(0), m_activeStageCount(0), - m_tessOffchip(cl::EnableTessOffChip) + m_tessOffchip(cl::EnableTessOffChip), + m_gsOnChip(false) { #ifdef LLPC_BUILD_GFX9 if (gfxIp.major >= 9) @@ -332,7 +333,7 @@ uint64_t GraphicsContext::GetShaderHashCode( // Determines whether or not GS on-chip mode is valid for this pipeline. bool GraphicsContext::CanGsOnChip() { - bool gsOnChip = false; + bool gsOnChip = true; uint32_t stageMask = GetShaderStageMask(); const bool hasTs = ((stageMask & (ShaderStageToMask(ShaderStageTessControl) | @@ -345,12 +346,12 @@ bool GraphicsContext::CanGsOnChip() { uint32_t gsPrimsPerSubgroup = m_pGpuProperty->gsOnChipDefaultPrimsPerSubgroup; - const uint32_t esGsItemSize = 4 * pEsResUsage->inOutUsage.outputMapLocCount; - const uint32_t gsInstanceCount = pGsResUsage->builtInUsage.gs.invocations; - const uint32_t gsVsItemSize = 4 * - pGsResUsage->inOutUsage.outputMapLocCount * - pGsResUsage->builtInUsage.gs.outputVertices * - gsInstanceCount; + const uint32_t esGsRingItemSize = 4 * pEsResUsage->inOutUsage.outputMapLocCount; + const uint32_t gsInstanceCount = pGsResUsage->builtInUsage.gs.invocations; + const uint32_t gsVsRingItemSize = 4 * + pGsResUsage->inOutUsage.outputMapLocCount * + pGsResUsage->builtInUsage.gs.outputVertices * + gsInstanceCount; uint32_t vertsPerPrim = 1; bool useAdjacency = false; @@ -393,11 +394,11 @@ bool GraphicsContext::CanGsOnChip() } // Compute GS-VS LDS size based on target GS primitives per subgroup - uint32_t gsVsLdsSize = (gsVsItemSize * gsPrimsPerSubgroup); + uint32_t gsVsLdsSize = (gsVsRingItemSize * gsPrimsPerSubgroup); // Compute ES-GS LDS size based on the worst case number of ES vertices needed to create the target number of // GS primitives per subgroup. - uint32_t esGsLdsSize = esGsItemSize * esMinVertsPerSubgroup * gsPrimsPerSubgroup; + uint32_t esGsLdsSize = esGsRingItemSize * esMinVertsPerSubgroup * gsPrimsPerSubgroup; // Total LDS use per subgroup aligned to the register granularity uint32_t gsOnChipLdsSize = Pow2Align((esGsLdsSize + gsVsLdsSize), m_pGpuProperty->ldsSizeDwordGranularity); @@ -412,18 +413,18 @@ bool GraphicsContext::CanGsOnChip() // If total LDS usage is too big, refactor partitions based on ratio of ES-GS and GS-VS item sizes. 
     if (gsOnChipLdsSize > maxLdsSize)
     {
-        const uint32_t esGsItemSizePerPrim = esGsItemSize * esMinVertsPerSubgroup;
-        const uint32_t itemSizeTotal       = esGsItemSizePerPrim + gsVsItemSize;
+        const uint32_t esGsItemSizePerPrim = esGsRingItemSize * esMinVertsPerSubgroup;
+        const uint32_t itemSizeTotal       = esGsItemSizePerPrim + gsVsRingItemSize;

         esGsLdsSize = RoundUpToMultiple((esGsItemSizePerPrim * maxLdsSize) / itemSizeTotal, esGsItemSizePerPrim);
-        gsVsLdsSize = RoundDownToMultiple(maxLdsSize - esGsLdsSize, gsVsItemSize);
+        gsVsLdsSize = RoundDownToMultiple(maxLdsSize - esGsLdsSize, gsVsRingItemSize);

         gsOnChipLdsSize = maxLdsSize;
     }

     // Based on the LDS space, calculate how many GS prims per subgroup and ES vertices per subgroup can be dispatched.
-    gsPrimsPerSubgroup          = (gsVsLdsSize / gsVsItemSize);
-    uint32_t esVertsPerSubgroup = (esGsLdsSize / esGsItemSize);
+    gsPrimsPerSubgroup          = (gsVsLdsSize / gsVsRingItemSize);
+    uint32_t esVertsPerSubgroup = (esGsLdsSize / esGsRingItemSize);

     LLPC_ASSERT(esVertsPerSubgroup >= esMinVertsPerSubgroup);

@@ -441,7 +442,15 @@ bool GraphicsContext::CanGsOnChip()
     esVertsPerSubgroup -= (esMinVertsPerSubgroup - 1);

     // TODO: Accept GsOffChipDefaultThreshold from panel option
-    constexpr uint32_t GsOffChipDefaultThreshold = 64;
+    // TODO: The value of GsOffChipDefaultThreshold should be 64. Due to a known issue, it is temporarily set to 32
+    // so that on-chip GS code generation can be tested before that issue is fixed. The issue is that unused
+    // built-in outputs are only removed when the final GS output stores are generated; when the on-chip/off-chip
+    // decision is made, unused built-in outputs such as PointSize and Clip/CullDistance are still counted in the
+    // LDS usage, which deactivates on-chip GS when GsOffChipDefaultThreshold is 64. To fix this, we will probably
+    // need to clear unused built-in outputs before determining the on-chip/off-chip GS mode.
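To make the sizing logic above concrete, the same arithmetic with purely illustrative numbers (a sketch; real values come from the ES/GS ResourceUsage structures, not from any actual pipeline):

    // Illustrative only -- none of these constants come from a real pipeline.
    constexpr uint32_t EsOutputLocs          = 8;   // ES outputMapLocCount
    constexpr uint32_t GsOutputLocs          = 4;   // GS outputMapLocCount
    constexpr uint32_t GsOutputVertices      = 16;
    constexpr uint32_t GsInstanceCount       = 1;
    constexpr uint32_t EsMinVertsPerSubgroup = 3;   // triangles, no adjacency
    constexpr uint32_t GsPrimsPerSubgroup    = 64;

    constexpr uint32_t esGsRingItemSize = 4 * EsOutputLocs;           // 32 dwords per ES vertex
    constexpr uint32_t gsVsRingItemSize = 4 * GsOutputLocs *
                                          GsOutputVertices *
                                          GsInstanceCount;            // 256 dwords per GS prim

    constexpr uint32_t esGsLdsSize = esGsRingItemSize *
                                     EsMinVertsPerSubgroup *
                                     GsPrimsPerSubgroup;              // 6144 dwords
    constexpr uint32_t gsVsLdsSize = gsVsRingItemSize *
                                     GsPrimsPerSubgroup;              // 16384 dwords

    // 6144 + 16384 = 22528 dwords. If that exceeds maxLdsSize, the partitions
    // are rebalanced in the ratio (esGsRingItemSize * EsMinVertsPerSubgroup) :
    // gsVsRingItemSize = 96 : 256, then prims/verts per subgroup are recomputed
    // as gsVsLdsSize / gsVsRingItemSize and esGsLdsSize / esGsRingItemSize.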
+ constexpr uint32_t GsOffChipDefaultThreshold = 32; + + pGsResUsage->inOutUsage.gs.esGsLdsSize = esGsLdsSize; if (((gsPrimsPerSubgroup * gsInstanceCount) < GsOffChipDefaultThreshold) || (esVertsPerSubgroup == 0)) { diff --git a/icd/api/llpc/context/llpcPipelineContext.h b/icd/api/llpc/context/llpcPipelineContext.h index 8187a3a1..76b64ba6 100644 --- a/icd/api/llpc/context/llpcPipelineContext.h +++ b/icd/api/llpc/context/llpcPipelineContext.h @@ -371,13 +371,17 @@ struct ResourceUsage // outputs to fragment shader, always from vertex stream 0) std::unordered_map builtInOutLocs; - // Map from tightly packed locations to byte sizes of generic outputs (used by copy shader to export - // generic outputs to fragment shader, always from vertex stream 0) - std::unordered_map genericOutByteSizes; + // Map from tightly packed locations to byte sizes of generic outputs (used by copy shader to + // export generic outputs to fragment shader, always from vertex stream 0): + // > + std::unordered_map genericOutByteSizes; llvm::Value* pEsGsOffsets; // ES -> GS offsets (GS in) llvm::Value* pGsVsRingBufDesc; // GS -> VS ring buffer descriptor (GS out); llvm::Value* pEmitCounterPtr; + + uint32_t esGsLdsSize; // ES -> GS ring LDS size (GS in) + uint32_t gsVsRingItemSize; // Size of each primitive written to the GSVS Ring ( in dwords) } gs; struct diff --git a/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp b/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp index e62e3154..fd338ec8 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerGlobal.cpp @@ -1400,23 +1400,64 @@ Value* SpirvLowerGlobal::AddCallInstForInOutImport( } } - if ((m_shaderStage == ShaderStageTessControl) || - (m_shaderStage == ShaderStageTessEval) || + if ((m_shaderStage == ShaderStageTessControl) || (m_shaderStage == ShaderStageTessEval) || (interpLoc != InterpLocUnknown)) { - // NOTE: For tessellation shader and fragment shader with interpolation functions, we add element indexing - // as an addition parameter to do addressing for the input/output. 
- if (pElemIdx == nullptr) + if (inOutMeta.IsBuiltIn) { - // When element indexing is not specified, we set it to don't-care value - pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), InvalidValue); + if (pElemIdx == nullptr) + { + // When element indexing is not specified, we set it to don't-care value + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), InvalidValue); + } } + else + { + LLPC_ASSERT(pInOutTy->isSingleValueType()); + + uint32_t elemIdx = inOutMeta.Component; + LLPC_ASSERT(inOutMeta.Component <= 3); + if (pInOutTy->getScalarSizeInBits() == 64) + { + LLPC_ASSERT(inOutMeta.Component % 2 == 0); // Must be even for 64-bit type + elemIdx = inOutMeta.Component / 2; + } + + if (pElemIdx != nullptr) + { + pElemIdx = BinaryOperator::CreateAdd(pElemIdx, + ConstantInt::get(m_pContext->Int32Ty(), elemIdx), + "", + pInsertPos); + } + else + { + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), elemIdx); + } + } + args.push_back(pElemIdx); } else { // Element indexing is not valid for other shader stages LLPC_ASSERT(pElemIdx == nullptr); + + if ((inOutMeta.IsBuiltIn == false) && (m_shaderStage != ShaderStageCompute)) + { + LLPC_ASSERT(pInOutTy->isSingleValueType()); + + uint32_t elemIdx = inOutMeta.Component; + LLPC_ASSERT(inOutMeta.Component <= 3); + if (pInOutTy->getScalarSizeInBits() == 64) + { + LLPC_ASSERT(inOutMeta.Component % 2 == 0); // Must be even for 64-bit type + elemIdx = inOutMeta.Component / 2; + } + + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), elemIdx); + args.push_back(pElemIdx); + } } if ((m_shaderStage == ShaderStageTessControl) || @@ -1452,7 +1493,7 @@ Value* SpirvLowerGlobal::AddCallInstForInOutImport( } // - // VS: @llpc.input.import.generic.%Type%(i32 location) + // VS: @llpc.input.import.generic.%Type%(i32 location, i32 elemIdx) // @llpc.input.import.builtin.%BuiltIn%(i32 builtInId) // // TCS: @llpc.input.import.generic.%Type%(i32 location, i32 locOffset, i32 elemIdx, i32 vertexIdx) @@ -1465,12 +1506,13 @@ Value* SpirvLowerGlobal::AddCallInstForInOutImport( // TES: @llpc.input.import.generic.%Type%(i32 location, i32 locOffset, i32 elemIdx, i32 vertexIdx) // @llpc.input.import.builtin.%BuiltIn%.%Type%(i32 builtInId, i32 elemIdx, i32 vertexIdx) - // GS: @llpc.input.import.generic.%Type%(i32 location, i32 vertexIdx) + // GS: @llpc.input.import.generic.%Type%(i32 location, i32 elemIdx, i32 vertexIdx) // @llpc.input.import.builtin.%BuiltIn%(i32 builtInId, i32 vertexIdx) // - // FS: @llpc.input.import.generic.%Type%(i32 location, i32 interpMode, i32 interpLoc) + // FS: @llpc.input.import.generic.%Type%(i32 location, i32 elemIdx, i32 interpMode, i32 interpLoc) // @llpc.input.import.builtin.%BuiltIn%(i32 builtInId) - // @llpc.input.import.interpolant.%Type%(i32 location, i32 locOffset, i32 elemIdx, i32 interpMode, <2 x float> ij) + // @llpc.input.import.interpolant.%Type%(i32 location, i32 locOffset, i32 elemIdx, + // i32 interpMode, <2 x float> ij) // // CS: @llpc.input.import.builtin.%BuiltIn%(i32 builtInId) // @@ -1681,15 +1723,65 @@ void SpirvLowerGlobal::AddCallInstForOutputExport( if (m_shaderStage == ShaderStageTessControl) { - // NOTE: For tessellation control shader, we add element indexing as an addition parameter to do addressing - // for the output. 
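The component-to-element mapping used in both branches above can be summarized by a small sketch (GetElemIdx is a hypothetical helper written for illustration, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Sketch: the SPIR-V "Component" decoration selects the starting channel
    // within a location; 64-bit values occupy two 32-bit channels, so the
    // element index is halved for them.
    uint32_t GetElemIdx(uint32_t component, uint32_t scalarSizeInBits)
    {
        assert(component <= 3);
        if (scalarSizeInBits == 64)
        {
            assert((component % 2) == 0); // Must be even for 64-bit types
            return component / 2;
        }
        return component;
    }

    // e.g. "layout(location = 1, component = 2) in float f"  -> elemIdx = 2
    //      "layout(location = 1, component = 2) in double d" -> elemIdx = 1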
- if (pElemIdx == nullptr) + if (outputMeta.IsBuiltIn) { - // When element indexing is not specified, we set it to don't-care value - pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), InvalidValue); + if (pElemIdx == nullptr) + { + // When element indexing is not specified, we set it to don't-care value + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), InvalidValue); + } + } + else + { + LLPC_ASSERT(pOutputTy->isSingleValueType()); + + uint32_t elemIdx = outputMeta.Component; + LLPC_ASSERT(outputMeta.Component <= 3); + if (pOutputTy->getScalarSizeInBits() == 64) + { + LLPC_ASSERT(outputMeta.Component % 2 == 0); // Must be even for 64-bit type + elemIdx = outputMeta.Component / 2; + } + + if (pElemIdx != nullptr) + { + pElemIdx = BinaryOperator::CreateAdd(pElemIdx, + ConstantInt::get(m_pContext->Int32Ty(), elemIdx), + "", + pInsertPos); + } + else + { + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), elemIdx); + } } + args.push_back(pElemIdx); + } + else + { + // Element indexing is not valid for other shader stages + LLPC_ASSERT(pElemIdx == nullptr); + + if ((outputMeta.IsBuiltIn == false) && (m_shaderStage != ShaderStageCompute)) + { + LLPC_ASSERT(pOutputTy->isSingleValueType()); + + uint32_t elemIdx = outputMeta.Component; + LLPC_ASSERT(outputMeta.Component <= 3); + if (pOutputTy->getScalarSizeInBits() == 64) + { + LLPC_ASSERT(outputMeta.Component % 2 == 0); // Must be even for 64-bit type + elemIdx = outputMeta.Component / 2; + } + pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), elemIdx); + args.push_back(pElemIdx); + } + } + + if (m_shaderStage == ShaderStageTessControl) + { // NOTE: For tessellation control shader, we add vertex indexing as an addition parameter to do addressing // for the output. if (pVertexIdx == nullptr) @@ -1701,8 +1793,8 @@ void SpirvLowerGlobal::AddCallInstForOutputExport( } else { - // Element and vertex indexing is not valid for other shader stages - LLPC_ASSERT((pElemIdx == nullptr) && (pVertexIdx == nullptr)); + // Vertex indexing is not valid for other shader stages + LLPC_ASSERT(pVertexIdx == nullptr); } if (m_shaderStage == ShaderStageGeometry) @@ -1720,7 +1812,7 @@ void SpirvLowerGlobal::AddCallInstForOutputExport( args.push_back(pOutputValue); // - // VS: @llpc.output.export.generic.%Type%(i32 location, %Type% outputValue) + // VS: @llpc.output.export.generic.%Type%(i32 location, i32 elemIdx, %Type% outputValue) // @llpc.output.export.builtin.%BuiltIn%(i32 builtInId, %Type% outputValue) // // TCS: @llpc.output.export.generic.%Type%(i32 location, i32 locOffset, i32 elemIdx, i32 vertexIdx, @@ -1728,13 +1820,13 @@ void SpirvLowerGlobal::AddCallInstForOutputExport( // @llpc.output.export.builtin.%BuiltIn%.%Type%(i32 builtInId, i32 elemIdx, i32 vertexIdx, // %Type% outputValue) // - // TES: @llpc.output.export.generic.%Type%(i32 location, %Type% outputValue) + // TES: @llpc.output.export.generic.%Type%(i32 location, i32 elemIdx, %Type% outputValue) // @llpc.output.export.builtin.%BuiltIn%.%Type%(i32 builtInId, %Type% outputValue) - // GS: @llpc.output.export.generic.%Type%(i32 location, i32 streamId, %Type% outputValue) + // GS: @llpc.output.export.generic.%Type%(i32 location, i32 elemIdx, i32 streamId, %Type% outputValue) // @llpc.output.export.builtin.%BuiltIn%(i32 builtInId, i32 streamId, %Type% outputValue) // - // FS: @llpc.output.export.generic.%Type%(i32 location, %Type% outputValue) + // FS: @llpc.output.export.generic.%Type%(i32 location, i32 elemIdx, %Type% outputValue) // @llpc.output.export.builtin.%BuiltIn%(i32 builtInId, %Type% 
outputValue) // EmitCall(m_pModule, instName, m_pContext->VoidTy(), args, NoAttrib, pInsertPos); diff --git a/icd/api/llpc/lower/llpcSpirvLowerResourceCollect.cpp b/icd/api/llpc/lower/llpcSpirvLowerResourceCollect.cpp index 22d2ff13..c1c32da0 100644 --- a/icd/api/llpc/lower/llpcSpirvLowerResourceCollect.cpp +++ b/icd/api/llpc/lower/llpcSpirvLowerResourceCollect.cpp @@ -945,21 +945,6 @@ void SpirvLowerResourceCollect::CollectInOutUsage( m_pResUsage->builtInUsage.fs.runAtSampleRate = true; } } - else - { - LLPC_ASSERT(addrSpace == SPIRAS_Output); - - // Collect CB shader mask - LLPC_ASSERT(pBaseTy->isSingleValueType()); - const uint32_t compCount = pBaseTy->isVectorTy() ? pBaseTy->getVectorNumElements() : 1; - const uint32_t channelMask = ((1 << compCount) - 1); - - LLPC_ASSERT(startLoc + locCount <= MaxColorTargets); - for (uint32_t i = 0; i < locCount; ++i) - { - m_pResUsage->inOutUsage.fs.cbShaderMask |= (channelMask << 4 * (startLoc + i)); - } - } } } } diff --git a/icd/api/llpc/patch/generate/gfx6/glslArithOpEmu.ll b/icd/api/llpc/patch/generate/gfx6/glslArithOpEmu.ll old mode 100644 new mode 100755 diff --git a/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll b/icd/api/llpc/patch/generate/gfx9/glslArithOpEmu.ll old mode 100644 new mode 100755 diff --git a/icd/api/llpc/patch/generate/glslNullFsEmu.ll b/icd/api/llpc/patch/generate/glslNullFsEmu.ll index dbd71739..46a902ef 100644 --- a/icd/api/llpc/patch/generate/glslNullFsEmu.ll +++ b/icd/api/llpc/patch/generate/glslNullFsEmu.ll @@ -22,8 +22,8 @@ target triple = "spir64-unknown-unknown" ; ; #version 450 ; -; layout (location = 0) in vec4 fragIn; -; layout (location = 0) out vec4 fragOut; +; layout (location = 0) in float fragIn; +; layout (location = 0) out float fragOut; ; ; void main() ; { @@ -31,16 +31,16 @@ target triple = "spir64-unknown-unknown" ; } ; -define dllexport void @main() #0 !spirv.ExecutionModel !5 +define dllexport amdgpu_ps void @main() #0 !spirv.ExecutionModel !5 { .entry: - %0 = tail call float @llpc.input.import.generic.f32(i32 0, i32 0, i32 1) #0 - tail call void @llpc.output.export.generic.f32(i32 0, float %0) #0 + %0 = tail call float @llpc.input.import.generic.f32(i32 0, i32 0, i32 0, i32 1) #0 + tail call void @llpc.output.export.generic.f32(i32 0, i32 0, float %0) #0 ret void } -declare float @llpc.input.import.generic.f32(i32, i32, i32) #0 -declare void @llpc.output.export.generic.f32(i32, float) #0 +declare float @llpc.input.import.generic.f32(i32, i32, i32, i32) #0 +declare void @llpc.output.export.generic.f32(i32, i32, float) #0 attributes #0 = { nounwind } diff --git a/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp b/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp index 3a4d8412..5b002f88 100644 --- a/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp +++ b/icd/api/llpc/patch/llpcPatchEntryPointMutate.cpp @@ -80,7 +80,7 @@ char PatchEntryPointMutate::ID = 0; PatchEntryPointMutate::PatchEntryPointMutate() : Patch(ID), - m_hasTes(false), + m_hasTs(false), m_hasGs(false) { initializePatchEntryPointMutatePass(*PassRegistry::getPassRegistry()); @@ -96,8 +96,9 @@ bool PatchEntryPointMutate::runOnModule( Patch::Init(&module); const uint32_t stageMask = m_pContext->GetShaderStageMask(); - m_hasTes = ((stageMask & ShaderStageToMask(ShaderStageTessEval)) != 0); - m_hasGs = ((stageMask & ShaderStageToMask(ShaderStageGeometry)) != 0); + m_hasTs = ((stageMask & (ShaderStageToMask(ShaderStageTessControl) | + ShaderStageToMask(ShaderStageTessEval))) != 0); + m_hasGs = ((stageMask & 
ShaderStageToMask(ShaderStageGeometry)) != 0); const auto& dataLayout = m_pModule->getDataLayout(); @@ -594,7 +595,7 @@ bool PatchEntryPointMutate::runOnModule( } // Setup ES-GS ring buffer descriptor - if (((m_shaderStage == ShaderStageVertex) && m_hasGs && (m_hasTes == false)) || + if (((m_shaderStage == ShaderStageVertex) && m_hasGs && (m_hasTs == false)) || ((m_shaderStage == ShaderStageTessEval) && m_hasGs)) { // Setup ES-GS ring buffer descriptor for VS or TES output @@ -630,9 +631,8 @@ bool PatchEntryPointMutate::runOnModule( switch (m_shaderStage) { case ShaderStageVertex: - callingConv = m_hasTes ? - CallingConv::AMDGPU_LS : - (m_hasGs ? CallingConv::AMDGPU_ES : CallingConv::AMDGPU_VS); + callingConv = m_hasTs ? CallingConv::AMDGPU_LS : + (m_hasGs ? CallingConv::AMDGPU_ES : CallingConv::AMDGPU_VS); break; case ShaderStageTessControl: callingConv = CallingConv::AMDGPU_HS; @@ -651,6 +651,7 @@ bool PatchEntryPointMutate::runOnModule( } pEntryPoint->setCallingConv(callingConv); pEntryPoint->setDLLStorageClass(GlobalValue::DefaultStorageClass); + // Set the entry name required by PAL ABI auto entryStage = Util::Abi::PipelineSymbolType::CsMainEntry; switch (callingConv) @@ -1165,7 +1166,7 @@ FunctionType* PatchEntryPointMutate::GenerateEntryPointType( { case ShaderStageVertex: { - if (m_hasGs && (m_hasTes == false)) + if (m_hasGs && (m_hasTs == false)) { argTys.push_back(m_pContext->Int32Ty()); // ES to GS offset entryArgIdxs.vs.esGsOffset = argIdx; diff --git a/icd/api/llpc/patch/llpcPatchEntryPointMutate.h b/icd/api/llpc/patch/llpcPatchEntryPointMutate.h index a5c2253d..4f45b095 100644 --- a/icd/api/llpc/patch/llpcPatchEntryPointMutate.h +++ b/icd/api/llpc/patch/llpcPatchEntryPointMutate.h @@ -76,8 +76,9 @@ class PatchEntryPointMutate: // Reserved argument count for single DWORD descriptor table pointer static const uint32_t TablePtrReservedArgCount = 2; - bool m_hasTes; // Whether the pipeline has tessllation evaluation shader - bool m_hasGs; // Whether the pipeline has geometry shader + + bool m_hasTs; // Whether the pipeline has tessllation shader + bool m_hasGs; // Whether the pipeline has geometry shader }; } // Llpc diff --git a/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp b/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp index 121158c2..90b7dd0b 100644 --- a/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp +++ b/icd/api/llpc/patch/llpcPatchInOutImportExport.cpp @@ -71,7 +71,8 @@ PatchInOutImportExport::PatchInOutImportExport() #endif m_hasTs(false), m_hasGs(false), - m_pLds(nullptr) + m_pLds(nullptr), + m_pThreadId(nullptr) { memset(&m_gfxIp, 0, sizeof(m_gfxIp)); @@ -264,8 +265,24 @@ bool PatchInOutImportExport::runOnModule( ShaderStageToMask(ShaderStageTessEval))) != 0); m_hasGs = ((stageMask & ShaderStageToMask(ShaderStageGeometry)) != 0); + // Calculate and store thread ID, it will be used in on-chip GS offset calculation + if (m_hasGs && m_pContext->IsGsOnChip()) + { + auto pInsertPos = m_pEntryPoint->begin()->getFirstInsertionPt(); + + std::vector args; + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), -1)); + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + m_pThreadId = EmitCall(m_pModule, "llvm.amdgcn.mbcnt.lo", m_pContext->Int32Ty(), args, NoAttrib, &*pInsertPos); + + args.clear(); + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), -1)); + args.push_back(m_pThreadId); + m_pThreadId = EmitCall(m_pModule, "llvm.amdgcn.mbcnt.hi", m_pContext->Int32Ty(), args, NoAttrib, &*pInsertPos); + } + // Create the global variable that is to model LDS 
- if (m_hasTs) + if (m_hasTs || (m_hasGs && m_pContext->IsGsOnChip())) { // Construct LDS type: [ldsSize * i32], address space 3 auto ldsSize = m_pContext->GetGpuProperty()->ldsSizePerCu; @@ -491,13 +508,18 @@ void PatchInOutImportExport::visitCallInst( { case ShaderStageVertex: { - pInput = PatchVsGenericInputImport(pInputTy, loc, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 2); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + pInput = PatchVsGenericInputImport(pInputTy, loc, compIdx, &callInst); break; } case ShaderStageTessControl: { LLPC_ASSERT(callInst.getNumArgOperands() == 4); - auto pElemIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + + auto pElemIdx = callInst.getOperand(2); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); + auto pVertexIdx = callInst.getOperand(3); LLPC_ASSERT(IsDontCareValue(pVertexIdx) == false); @@ -507,7 +529,10 @@ void PatchInOutImportExport::visitCallInst( case ShaderStageTessEval: { LLPC_ASSERT(callInst.getNumArgOperands() == 4); - auto pElemIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + + auto pElemIdx = callInst.getOperand(2); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); + auto pVertexIdx = IsDontCareValue(callInst.getOperand(3)) ? nullptr : callInst.getOperand(3); pInput = PatchTesGenericInputImport(pInputTy, loc, pLocOffset, pElemIdx, pVertexIdx, &callInst); @@ -515,40 +540,48 @@ void PatchInOutImportExport::visitCallInst( } case ShaderStageGeometry: { - LLPC_ASSERT(callInst.getNumArgOperands() == 2); - Value* pVertexIdx = callInst.getOperand(1); + LLPC_ASSERT(callInst.getNumArgOperands() == 3); + + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + + Value* pVertexIdx = callInst.getOperand(2); LLPC_ASSERT(IsDontCareValue(pVertexIdx) == false); - pInput = PatchGsGenericInputImport(pInputTy, loc, pVertexIdx, &callInst); + + pInput = PatchGsGenericInputImport(pInputTy, loc, compIdx, pVertexIdx, &callInst); break; } case ShaderStageFragment: { uint32_t interpMode = InterpModeSmooth; uint32_t interpLoc = InterpLocCenter; - Value* pLocOffset = nullptr; - Value* pCompIdx = nullptr; + + Value* pElemIdx = callInst.getOperand(1); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); + Value* pIJ = nullptr; if (isGenericInputImport) { - LLPC_ASSERT(callInst.getNumArgOperands() == 3); - interpMode = cast(callInst.getOperand(1))->getZExtValue(); - interpLoc = cast(callInst.getOperand(2))->getZExtValue(); + LLPC_ASSERT(callInst.getNumArgOperands() == 4); + + interpMode = cast(callInst.getOperand(2))->getZExtValue(); + interpLoc = cast(callInst.getOperand(3))->getZExtValue(); } else { LLPC_ASSERT(isInterpolantInputImport); LLPC_ASSERT(callInst.getNumArgOperands() == 5); + interpMode = cast(callInst.getOperand(3))->getZExtValue(); interpLoc = InterpLocUnknown; - pCompIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + pIJ = callInst.getOperand(4); } pInput = PatchFsGenericInputImport(pInputTy, loc, pLocOffset, - pCompIdx, + pElemIdx, pIJ, interpMode, interpLoc, @@ -630,7 +663,8 @@ void PatchInOutImportExport::visitCallInst( LLPC_ASSERT(loc != InvalidValue); LLPC_ASSERT(callInst.getNumArgOperands() == 4); - auto pElemIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + auto pElemIdx = callInst.getOperand(2); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); auto pVertexIdx = IsDontCareValue(callInst.getOperand(3)) ? 
nullptr : callInst.getOperand(3); pOutput = PatchTcsGenericOutputImport(pOutputTy, loc, pLocOffset, pElemIdx, pVertexIdx, &callInst); @@ -771,13 +805,18 @@ void PatchInOutImportExport::visitCallInst( { case ShaderStageVertex: { - PatchVsGenericOutputExport(pOutput, loc, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 3); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + PatchVsGenericOutputExport(pOutput, loc, compIdx, &callInst); break; } case ShaderStageTessControl: { LLPC_ASSERT(callInst.getNumArgOperands() == 5); - auto pElemIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + + auto pElemIdx = callInst.getOperand(2); + LLPC_ASSERT(IsDontCareValue(pElemIdx) == false); + auto pVertexIdx = IsDontCareValue(callInst.getOperand(3)) ? nullptr : callInst.getOperand(3); PatchTcsGenericOutputExport(pOutput, loc, pLocOffset, pElemIdx, pVertexIdx, &callInst); @@ -785,19 +824,24 @@ void PatchInOutImportExport::visitCallInst( } case ShaderStageTessEval: { - PatchTesGenericOutputExport(pOutput, loc, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 3); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + PatchTesGenericOutputExport(pOutput, loc, compIdx, &callInst); break; } case ShaderStageGeometry: { - LLPC_ASSERT(callInst.getNumArgOperands() == 3); - uint32_t streamId = cast(callInst.getOperand(1))->getZExtValue(); - PatchGsGenericOutputExport(pOutput, loc, streamId, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 4); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + const uint32_t streamId = cast(callInst.getOperand(2))->getZExtValue(); + PatchGsGenericOutputExport(pOutput, loc, compIdx, streamId, &callInst); break; } case ShaderStageFragment: { - PatchFsGenericOutputExport(pOutput, loc, &callInst); + LLPC_ASSERT(callInst.getNumArgOperands() == 3); + const uint32_t compIdx = cast(callInst.getOperand(1))->getZExtValue(); + PatchFsGenericOutputExport(pOutput, loc, compIdx, &callInst); break; } case ShaderStageCopyShader: @@ -1412,6 +1456,48 @@ void PatchInOutImportExport::visitReturnInst( EmitCall(m_pModule, "llvm.amdgcn.exp.f32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos)); } + // Export fragment colors + for (uint32_t location = 0; location < MaxColorTargets; ++location) + { + auto& expFragColor = m_expFragColors[location]; + if (expFragColor.size() > 0) + { + Value* pOutput = nullptr; + uint32_t compCount = expFragColor.size(); + LLPC_ASSERT(compCount <= 4); + + // Set CB shader mask + auto pResUsage = m_pContext->GetShaderResourceUsage(ShaderStageFragment); + const uint32_t channelMask = ((1 << compCount) - 1); + pResUsage->inOutUsage.fs.cbShaderMask |= (channelMask << (4 * location)); + + // Construct exported fragment colors + if (compCount == 1) + { + pOutput = expFragColor[0]; + } + else + { + pOutput = UndefValue::get(VectorType::get(m_pContext->Int32Ty(), compCount)); + for (uint32_t i = 0; i < compCount; ++i) + { + pOutput = InsertElementInst::Create(pOutput, + expFragColor[i], + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + } + + // Do fragment color exporting + auto pExport = m_pFragColorExport->Run(pOutput, location, pInsertPos); + if (pExport != nullptr) + { + m_pLastExport = cast(pExport); + } + } + } + // NOTE: If outputs are present in fragment shader, we have to export a dummy one if (m_pLastExport == nullptr) { @@ -1451,13 +1537,14 @@ void PatchInOutImportExport::visitReturnInst( Value* 
PatchInOutImportExport::PatchVsGenericInputImport( Type* pInputTy, // [in] Type of input value uint32_t location, // Location of the input + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { Value* pInput = UndefValue::get(pInputTy); // Do vertex fetch operations (returns ) LLPC_ASSERT(m_pVertexFetch != nullptr); - auto pVertex = m_pVertexFetch->Run(pInputTy, location, pInsertPos); + auto pVertex = m_pVertexFetch->Run(pInputTy, location, compIdx, pInsertPos); // Cast vertex fetch results if necessary const Type* pVertexTy = pVertex->getType(); @@ -1480,14 +1567,14 @@ Value* PatchInOutImportExport::PatchTcsGenericInputImport( Type* pInputTy, // [in] Type of input value uint32_t location, // Base location of the input Value* pLocOffset, // [in] Relative location offset - Value* pCompIdx, // [in] Index used for vector element indexing (could be null) + Value* pCompIdx, // [in] Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { - LLPC_ASSERT(pVertexIdx != nullptr); + LLPC_ASSERT((pCompIdx != nullptr) && (pVertexIdx != nullptr)); auto pLdsOffset = CalcLdsOffsetForTcsInput(pInputTy, location, pLocOffset, pCompIdx, pVertexIdx, pInsertPos); - return ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + return ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } // ===================================================================================================================== @@ -1496,12 +1583,14 @@ Value* PatchInOutImportExport::PatchTesGenericInputImport( Type* pInputTy, // [in] Type of input value uint32_t location, // Base location of the input Value* pLocOffset, // [in] Relative location offset - Value* pCompIdx, // [in] Index used for vector element indexing (could be null) + Value* pCompIdx, // [in] Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing (could be null) Instruction* pInsertPos) // [in] Where to insert the patch instruction { + LLPC_ASSERT(pCompIdx != nullptr); + auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, location, pLocOffset, pCompIdx, pVertexIdx, pInsertPos); - return ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + return ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } // ===================================================================================================================== @@ -1509,6 +1598,7 @@ Value* PatchInOutImportExport::PatchTesGenericInputImport( Value* PatchInOutImportExport::PatchGsGenericInputImport( Type* pInputTy, // [in] Type of input value uint32_t location, // Location of the input + uint32_t compIdx, // Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { @@ -1520,6 +1610,9 @@ Value* PatchInOutImportExport::PatchGsGenericInputImport( const uint32_t bitWidth = pInputTy->getScalarSizeInBits(); if (bitWidth == 64) { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + if (pInputTy->isVectorTy()) { pInputTy = VectorType::get(m_pContext->FloatTy(), pInputTy->getVectorNumElements() * 2); @@ -1534,7 +1627,7 @@ Value* PatchInOutImportExport::PatchGsGenericInputImport( LLPC_ASSERT(bitWidth == 32); } - Value* pInput = LoadValueFromEsGsRingBuffer(pInputTy, 
location, 0, pVertexIdx, pInsertPos); + Value* pInput = LoadValueFromEsGsRing(pInputTy, location, compIdx, pVertexIdx, pInsertPos); if (pInputTy != pOrigInputTy) { @@ -1641,7 +1734,7 @@ Value* PatchInOutImportExport::PatchFsGenericInputImport( const uint32_t numChannels = (bitWidth * compCout) / 32; Type* pInterpTy = (numChannels > 1) ? VectorType::get(m_pContext->FloatTy(), numChannels) : m_pContext->FloatTy(); - Value* pInterp = nullptr; + Value* pInterp = UndefValue::get(pInterpTy); uint32_t startChannel = 0; if (pCompIdx != nullptr) @@ -1665,10 +1758,10 @@ Value* PatchInOutImportExport::PatchFsGenericInputImport( LLPC_ASSERT((pBasicTy->isFloatTy()) && (numChannels <= 4)); args.clear(); - args.push_back(pI); // i - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); // attr_chan - args.push_back(pLoc); // attr - args.push_back(pPrimMask); // m0 + args.push_back(pI); // i + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); // attr_chan + args.push_back(pLoc); // attr + args.push_back(pPrimMask); // m0 pCompValue = EmitCall(m_pModule, "llvm.amdgcn.interp.p1", @@ -1678,11 +1771,11 @@ Value* PatchInOutImportExport::PatchFsGenericInputImport( pInsertPos); args.clear(); - args.push_back(pCompValue); // p1 - args.push_back(pJ); // j - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); // attr_chan - args.push_back(pLoc); // attr - args.push_back(pPrimMask); // m0 + args.push_back(pCompValue); // p1 + args.push_back(pJ); // j + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); // attr_chan + args.push_back(pLoc); // attr + args.push_back(pPrimMask); // m0 pCompValue = EmitCall(m_pModule, "llvm.amdgcn.interp.p2", @@ -1716,8 +1809,7 @@ Value* PatchInOutImportExport::PatchFsGenericInputImport( } else { - auto pVec = (i == 0) ? 
UndefValue::get(pInterpTy) : pInterp; - pInterp = InsertElementInst::Create(pVec, + pInterp = InsertElementInst::Create(pInterp, pCompValue, ConstantInt::get(m_pContext->Int32Ty(), i - startChannel), "", @@ -1745,12 +1837,14 @@ Value* PatchInOutImportExport::PatchTcsGenericOutputImport( Type* pOutputTy, // [in] Type of output value uint32_t location, // Base location of the output Value* pLocOffset, // [in] Relative location offset - Value* pCompIdx, // [in] Index used for vector element indexing (could be null) + Value* pCompIdx, // [in] Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing (could be null) Instruction* pInsertPos) // [in] Where to insert the patch instruction { + LLPC_ASSERT(pCompIdx != nullptr); + auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, location, pLocOffset, pCompIdx, pVertexIdx, pInsertPos); - return ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + return ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); } // ===================================================================================================================== @@ -1758,23 +1852,28 @@ Value* PatchInOutImportExport::PatchTcsGenericOutputImport( void PatchInOutImportExport::PatchVsGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { + auto pOutputTy = pOutput->getType(); + if (m_hasTs) { - auto pLdsOffset = CalcLdsOffsetForVsOutput(location, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy, location, compIdx, pInsertPos); WriteValueToLds(pOutput, pLdsOffset, pInsertPos); } else { if (m_hasGs) { - auto pOutputTy = pOutput->getType(); LLPC_ASSERT(pOutputTy->isIntOrIntVectorTy() || pOutputTy->isFPOrFPVectorTy()); const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); if (bitWidth == 64) { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + uint32_t compCount = pOutputTy->isVectorTy() ? 
pOutputTy->getVectorNumElements() * 2 : 2; pOutputTy = VectorType::get(m_pContext->FloatTy(), compCount); pOutput = BitCastInst::Create(Instruction::BitCast, pOutput, pOutputTy, "", pInsertPos); @@ -1784,11 +1883,11 @@ void PatchInOutImportExport::PatchVsGenericOutputExport( LLPC_ASSERT(bitWidth == 32); } - StoreValueToEsGsRingBuffer(pOutput, location, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, location, compIdx, pInsertPos); } else { - AddExportInstForGenericOutput(pOutput, location, pInsertPos); + AddExportInstForGenericOutput(pOutput, location, compIdx, pInsertPos); } } } @@ -1799,10 +1898,12 @@ void PatchInOutImportExport::PatchTcsGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Base location of the output Value* pLocOffset, // [in] Relative location offset - Value* pCompIdx, // [in] Index used for vector element indexing (could be null) + Value* pCompIdx, // [in] Index used for vector element indexing Value* pVertexIdx, // [in] Input array outermost index used for vertex indexing (could be null) Instruction* pInsertPos) // [in] Where to insert the patch instruction { + LLPC_ASSERT(pCompIdx != nullptr); + Type* pOutputTy = pOutput->getType(); auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, location, pLocOffset, pCompIdx, pVertexIdx, pInsertPos); WriteValueToLds(pOutput, pLdsOffset, pInsertPos); @@ -1813,6 +1914,7 @@ void PatchInOutImportExport::PatchTcsGenericOutputExport( void PatchInOutImportExport::PatchTesGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { if (m_hasGs) @@ -1823,6 +1925,9 @@ void PatchInOutImportExport::PatchTesGenericOutputExport( const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); if (bitWidth == 64) { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + uint32_t compCount = pOutputTy->isVectorTy() ? pOutputTy->getVectorNumElements() * 2 : 2; pOutputTy = VectorType::get(m_pContext->FloatTy(), compCount); pOutput = BitCastInst::Create(Instruction::BitCast, pOutput, pOutputTy, "", pInsertPos); @@ -1832,11 +1937,11 @@ void PatchInOutImportExport::PatchTesGenericOutputExport( LLPC_ASSERT(bitWidth == 32); } - StoreValueToEsGsRingBuffer(pOutput, location, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, location, compIdx, pInsertPos); } else { - AddExportInstForGenericOutput(pOutput, location, pInsertPos); + AddExportInstForGenericOutput(pOutput, location, compIdx, pInsertPos); } } @@ -1845,6 +1950,7 @@ void PatchInOutImportExport::PatchTesGenericOutputExport( void PatchInOutImportExport::PatchGsGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing uint32_t streamId, // ID of output vertex stream Instruction* pInsertPos) // [in] Where to insert the patch instruction { @@ -1856,6 +1962,9 @@ void PatchInOutImportExport::PatchGsGenericOutputExport( const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); if (bitWidth == 64) { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + if (pOutputTy->isVectorTy()) { pOutputTy = VectorType::get(m_pContext->FloatTy(), pOutputTy->getVectorNumElements() * 2); @@ -1871,13 +1980,14 @@ void PatchInOutImportExport::PatchGsGenericOutputExport( const uint32_t compCount = pOutputTy->isVectorTy() ? 
pOutputTy->getVectorNumElements() : 1; const uint32_t byteSize = pCompTy->getScalarSizeInBits() / 8 * compCount; + LLPC_ASSERT(compIdx <= 4); auto& genericOutByteSizes = m_pContext->GetShaderResourceUsage(ShaderStageGeometry)->inOutUsage.gs.genericOutByteSizes; - genericOutByteSizes[location] = byteSize; + genericOutByteSizes[location][compIdx] = byteSize; if (compCount == 1) { - StoreValueToGsVsRingBuffer(pOutput, location, 0, pInsertPos); + StoreValueToGsVsRingBuffer(pOutput, location, compIdx, pInsertPos); } else { @@ -1887,7 +1997,7 @@ void PatchInOutImportExport::PatchGsGenericOutputExport( ConstantInt::get(m_pContext->Int32Ty(), i), "", pInsertPos); - StoreValueToGsVsRingBuffer(pComp, location + (i / 4), i % 4, pInsertPos); + StoreValueToGsVsRingBuffer(pComp, location + ((compIdx + i) / 4), (compIdx + i) % 4, pInsertPos); } } @@ -1898,13 +2008,73 @@ void PatchInOutImportExport::PatchGsGenericOutputExport( void PatchInOutImportExport::PatchFsGenericOutputExport( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the patch instruction { - // "Done" flag is valid for exporting MRT - auto pExport = m_pFragColorExport->Run(pOutput, location, pInsertPos); - if (pExport != nullptr) + Type* pOutputTy = pOutput->getType(); + + const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 16) || (bitWidth == 32)); + + auto pCompTy = pOutputTy->isVectorTy() ? pOutputTy->getVectorElementType() : pOutputTy; + uint32_t compCount = pOutputTy->isVectorTy() ? pOutputTy->getVectorNumElements() : 1; + + std::vector outputComps; + for (uint32_t i = 0; i < compCount; ++i) + { + Value* pOutputComp = nullptr; + if (compCount == 1) + { + pOutputComp = pOutput; + } + else + { + pOutputComp = ExtractElementInst::Create(pOutput, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + + // Translate components of exported output values to i32 values + if (pCompTy->isFloatingPointTy()) + { + if (bitWidth == 16) + { + pOutputComp = new BitCastInst(pOutputComp, m_pContext->Int16Ty(), "", pInsertPos); + pOutputComp = new ZExtInst(pOutputComp, m_pContext->Int32Ty(), "", pInsertPos); + } + else + { + LLPC_ASSERT(bitWidth == 32); + pOutputComp = new BitCastInst(pOutputComp, m_pContext->Int32Ty(), "", pInsertPos); + } + } + else if (pCompTy->isIntegerTy()) + { + if (bitWidth == 16) + { + pOutputComp = new ZExtInst(pOutputComp, m_pContext->Int32Ty(), "", pInsertPos); + } + else + { + LLPC_ASSERT(bitWidth == 32); + } + } + + outputComps.push_back(pOutputComp); + } + + LLPC_ASSERT(location < MaxColorTargets); + auto& expFragColor = m_expFragColors[location]; + + while (compIdx + compCount > expFragColor.size()) + { + expFragColor.push_back(UndefValue::get(m_pContext->Int32Ty())); + } + + for (uint32_t i = 0; i < compCount; ++i) { - m_pLastExport = cast(pExport); + expFragColor[compIdx + i] = outputComps[i]; } } @@ -1980,7 +2150,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( const uint32_t loc = builtInInLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTcsInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); break; } @@ -1991,7 +2161,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( const uint32_t loc = builtInInLocMap[builtInId]; auto pLdsOffset = 
CalcLdsOffsetForTcsInput(pInputTy, loc, nullptr, nullptr, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); break; } @@ -2012,7 +2182,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTcsInput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(false, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2022,7 +2192,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInInputImport( else { auto pLdsOffset = CalcLdsOffsetForTcsInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } break; @@ -2079,7 +2249,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( const uint32_t loc = builtInInLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); break; } @@ -2090,7 +2260,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( const uint32_t loc = builtInInLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, loc, nullptr, nullptr, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); break; } @@ -2111,7 +2281,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTesInput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(false, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2121,7 +2291,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( else { auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } break; @@ -2179,7 +2349,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTesInput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(false, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); pInput = InsertValueInst::Create(pInput, pElem, idxs, "", pInsertPos); @@ -2188,7 +2358,7 @@ Value* PatchInOutImportExport::PatchTesBuiltInInputImport( else { auto pLdsOffset = CalcLdsOffsetForTesInput(pInputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pInput = ReadValueFromLds(pInputTy, pLdsOffset, pInsertPos); + pInput = ReadValueFromLds(false, pInputTy, pLdsOffset, pInsertPos); } break; @@ -2225,11 +2395,11 @@ Value* PatchInOutImportExport::PatchGsBuiltInInputImport( case BuiltInPosition: case BuiltInPointSize: { - pInput = LoadValueFromEsGsRingBuffer(pInputTy, - loc, - 0, - pVertexIdx, - pInsertPos); + pInput = LoadValueFromEsGsRing(pInputTy, 
+ loc, + 0, + pVertexIdx, + pInsertPos); break; } case BuiltInClipDistance: @@ -2237,11 +2407,11 @@ Value* PatchInOutImportExport::PatchGsBuiltInInputImport( pInput = UndefValue::get(pInputTy); for (uint32_t i = 0; i < builtInUsage.clipDistanceIn; ++i) { - auto pComp = LoadValueFromEsGsRingBuffer(pInputTy->getArrayElementType(), - loc + i / 4, - i % 4, - pVertexIdx, - pInsertPos); + auto pComp = LoadValueFromEsGsRing(pInputTy->getArrayElementType(), + loc + i / 4, + i % 4, + pVertexIdx, + pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2254,11 +2424,11 @@ Value* PatchInOutImportExport::PatchGsBuiltInInputImport( pInput = UndefValue::get(pInputTy); for (uint32_t i = 0; i < builtInUsage.cullDistanceIn; ++i) { - auto pComp = LoadValueFromEsGsRingBuffer(pInputTy->getArrayElementType(), - loc + i / 4, - i % 4, - pVertexIdx, - pInsertPos); + auto pComp = LoadValueFromEsGsRing(pInputTy->getArrayElementType(), + loc + i / 4, + i % 4, + pVertexIdx, + pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2704,7 +2874,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( uint32_t loc = builtInOutLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pOutput = ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + pOutput = ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); break; } @@ -2717,7 +2887,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( uint32_t loc = builtInOutLocMap[builtInId]; auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, loc, nullptr, nullptr, pVertexIdx, pInsertPos); - pOutput = ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + pOutput = ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); break; } @@ -2748,7 +2918,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTcsOutput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(true, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2758,7 +2928,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( else { auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pOutput = ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + pOutput = ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); } break; @@ -2791,7 +2961,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( auto pElemIdx = ConstantInt::get(m_pContext->Int32Ty(), i); auto pLdsOffset = CalcLdsOffsetForTcsOutput(pElemTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - auto pElem = ReadValueFromLds(pElemTy, pLdsOffset, pInsertPos); + auto pElem = ReadValueFromLds(true, pElemTy, pLdsOffset, pInsertPos); std::vector idxs; idxs.push_back(i); @@ -2801,7 +2971,7 @@ Value* PatchInOutImportExport::PatchTcsBuiltInOutputImport( else { auto pLdsOffset = CalcLdsOffsetForTcsOutput(pOutputTy, loc, nullptr, pElemIdx, pVertexIdx, pInsertPos); - pOutput = ReadValueFromLds(pOutputTy, pLdsOffset, pInsertPos); + pOutput = ReadValueFromLds(true, pOutputTy, pLdsOffset, pInsertPos); } break; @@ -2829,8 +2999,6 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( auto& builtInUsage = pResUsage->builtInUsage.vs; auto& builtInOutLocMap = pResUsage->inOutUsage.builtInOutputLocMap; - const auto pUndef = UndefValue::get(m_pContext->FloatTy()); - std::vector args; 
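Note on the ring-addressing pattern used throughout these hunks: a location holds four 32-bit channels, element i of a multi-component value lands at location + (compIdx + i) / 4, channel (compIdx + i) % 4, and a 64-bit component doubles compIdx first. A sketch of that mapping (helper name is illustrative):

#include <cassert>
#include <cstdint>
#include <utility>

// Maps (location, compIdx, element i) to a (location, channel) slot -- the
// arithmetic behind "loc + i / 4, i % 4" and "compIdx *= 2" in the hunks above.
static std::pair<uint32_t, uint32_t> ChannelSlot(uint32_t location,
                                                 uint32_t compIdx,
                                                 uint32_t elemIdx,
                                                 uint32_t bitWidth)
{
    assert((bitWidth == 32) || (bitWidth == 64));
    if (bitWidth == 64)
    {
        compIdx *= 2; // A 64-bit component occupies two consecutive 32-bit channels
    }
    const uint32_t flat = compIdx + elemIdx;
    return { location + flat / 4, flat % 4 }; // Four channels per location
}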
switch (builtInId) @@ -2846,7 +3014,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( if (m_hasTs) { uint32_t loc = builtInOutLocMap[builtInId]; - auto pLdsOffset = CalcLdsOffsetForVsOutput(loc, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy, loc, 0, pInsertPos); WriteValueToLds(pOutput, pLdsOffset, pInsertPos); } else @@ -2856,7 +3024,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( LLPC_ASSERT(builtInOutLocMap.find(builtInId) != builtInOutLocMap.end()); uint32_t loc = builtInOutLocMap[builtInId]; - StoreValueToEsGsRingBuffer(pOutput, loc, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, loc, 0, pInsertPos); } else { @@ -2884,7 +3052,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( if (m_hasTs) { uint32_t loc = builtInOutLocMap[builtInId]; - auto pLdsOffset = CalcLdsOffsetForVsOutput(loc, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy, loc, 0, pInsertPos); WriteValueToLds(pOutput, pLdsOffset, pInsertPos); } else @@ -2894,7 +3062,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( LLPC_ASSERT(builtInOutLocMap.find(builtInId) != builtInOutLocMap.end()); uint32_t loc = builtInOutLocMap[builtInId]; - StoreValueToEsGsRingBuffer(pOutput, loc, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, loc, 0, pInsertPos); } else { @@ -2924,7 +3092,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( LLPC_ASSERT(pOutputTy->isArrayTy()); uint32_t loc = builtInOutLocMap[builtInId]; - auto pLdsOffset = CalcLdsOffsetForVsOutput(loc, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy->getArrayElementType(), loc, 0, pInsertPos); for (int i = 0; i < pOutputTy->getArrayNumElements(); ++i) { @@ -2952,7 +3120,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( std::vector idxs; idxs.push_back(i); auto pElem = ExtractValueInst::Create(pOutput, idxs, "", pInsertPos); - StoreValueToEsGsRingBuffer(pElem, loc + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pElem, loc + i / 4, i % 4, pInsertPos); } } else @@ -2984,7 +3152,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( LLPC_ASSERT(pOutputTy->isArrayTy()); uint32_t loc = builtInOutLocMap[builtInId]; - auto pLdsOffset = CalcLdsOffsetForVsOutput(loc, pInsertPos); + auto pLdsOffset = CalcLdsOffsetForVsOutput(pOutputTy->getArrayElementType(), loc, 0, pInsertPos); for (int i = 0; i < pOutputTy->getArrayNumElements(); ++i) { @@ -3012,7 +3180,7 @@ void PatchInOutImportExport::PatchVsBuiltInOutputExport( std::vector idxs; idxs.push_back(i); auto pElem = ExtractValueInst::Create(pOutput, idxs, "", pInsertPos); - StoreValueToEsGsRingBuffer(pElem, loc + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pElem, loc + i / 4, i % 4, pInsertPos); } } else @@ -3367,7 +3535,7 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( LLPC_ASSERT(builtInOutLocMap.find(builtInId) != builtInOutLocMap.end()); uint32_t loc = builtInOutLocMap[builtInId]; - StoreValueToEsGsRingBuffer(pOutput, loc, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, loc, 0, pInsertPos); } else { @@ -3396,7 +3564,7 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( LLPC_ASSERT(builtInOutLocMap.find(builtInId) != builtInOutLocMap.end()); uint32_t loc = builtInOutLocMap[builtInId]; - StoreValueToEsGsRingBuffer(pOutput, loc, 0, pInsertPos); + StoreValueToEsGsRing(pOutput, loc, 0, pInsertPos); } else { @@ -3430,7 +3598,7 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( std::vector idxs; idxs.push_back(i); auto pElem = ExtractValueInst::Create(pOutput, idxs, "", 
pInsertPos); - StoreValueToEsGsRingBuffer(pElem, loc + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pElem, loc + i / 4, i % 4, pInsertPos); } } else @@ -3466,7 +3634,7 @@ void PatchInOutImportExport::PatchTesBuiltInOutputExport( std::vector idxs; idxs.push_back(i); auto pElem = ExtractValueInst::Create(pOutput, idxs, "", pInsertPos); - StoreValueToEsGsRingBuffer(pElem, loc + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pElem, loc + i / 4, i % 4, pInsertPos); } } else @@ -3773,7 +3941,7 @@ void PatchInOutImportExport::PatchCopyShaderGenericOutputExport( uint32_t location, // Location of the output Instruction* pInsertPos) // [in] Where to insert the patch instruction { - AddExportInstForGenericOutput(pOutput, location, pInsertPos); + AddExportInstForGenericOutput(pOutput, location, 0, pInsertPos); } // ===================================================================================================================== @@ -3850,8 +4018,8 @@ void PatchInOutImportExport::PatchCopyShaderBuiltInOutputExport( } // ===================================================================================================================== -// Stores value to ES-GS ring buffer. -void PatchInOutImportExport::StoreValueToEsGsRingBuffer( +// Stores value to ES-GS ring (buffer or LDS). +void PatchInOutImportExport::StoreValueToEsGsRing( Value* pStoreValue, // [in] Value to store uint32_t location, // Output location uint32_t compIdx, // Output component index @@ -3871,7 +4039,7 @@ void PatchInOutImportExport::StoreValueToEsGsRingBuffer( "", pInsertPos); - StoreValueToEsGsRingBuffer(pStoreComp, location + i / 4, i % 4, pInsertPos); + StoreValueToEsGsRing(pStoreComp, location + i / 4, i % 4, pInsertPos); } } else @@ -3886,7 +4054,7 @@ void PatchInOutImportExport::StoreValueToEsGsRingBuffer( LLPC_ASSERT(pStoreTy->isIntegerTy()); } - // Call buffer store intrinsic + // Call buffer store intrinsic or LDS store const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage; LLPC_ASSERT(inOutUsage.pEsGsRingBufDesc != nullptr); @@ -3905,28 +4073,40 @@ void PatchInOutImportExport::StoreValueToEsGsRingBuffer( pRingBufDesc = inOutUsage.pEsGsRingBufDesc; } - auto pRingBufOffset = CalcEsGsRingBufferOffsetForOutput(location, compIdx, pInsertPos); + auto pRingOffset = CalcEsGsRingOffsetForOutput(location, compIdx, pEsGsOffset, pInsertPos); - // NOTE: Here we use tbuffer_store instruction instead of buffer_store because we have to do explicit control - // of soffset. This is required by swizzle enabled mode when address range checking should be complied with. 
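Note: the hunk continuing below splits StoreValueToEsGsRing into a GS-on-chip path (a plain LDS store through a GEP into m_pLds) and the existing off-chip tbuffer_store path, which keeps explicit soffset control for swizzled addressing. A compact model of that branch (LDS modeled as a plain array; this is not the LLVM-emitting code):

#include <cstdint>

// Models the store-side branch: on-chip rings live in LDS and take a direct
// dword store; off-chip rings go through the tbuffer_store intrinsic.
static void StoreToEsGsRingModel(uint32_t value, uint32_t ringOffset,
                                 bool gsOnChip, uint32_t* pLdsModel)
{
    if (gsOnChip)
    {
        pLdsModel[ringOffset] = value; // GEP into the LDS global + StoreInst
    }
    else
    {
        // llvm.amdgcn.tbuffer.store.i32 with glc/slc set and the ES-GS offset
        // passed in soffset -- see the operand list in the hunk below.
        (void)value;
        (void)ringOffset;
    }
}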
- std::vector args; - args.push_back(pStoreValue); // vdata - args.push_back(pRingBufDesc); // rsrc - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // voffset - args.push_back(pEsGsOffset); // soffset - args.push_back(pRingBufOffset); // offset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_UINT)); // nfmt - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.i32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + if (m_pContext->IsGsOnChip()) + { + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + idxs.push_back(pRingOffset); + + Value* pStorePtr = GetElementPtrInst::Create(nullptr, m_pLds, idxs, "", pInsertPos); + new StoreInst(pStoreValue, pStorePtr, false, m_pLds->getAlignment(), pInsertPos); + } + else + { + // NOTE: Here we use tbuffer_store instruction instead of buffer_store because we have to do explicit control + // of soffset. This is required by swizzle enabled mode when address range checking should be complied with. + std::vector args; + args.push_back(pStoreValue); // vdata + args.push_back(pRingBufDesc); // rsrc + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // voffset + args.push_back(pEsGsOffset); // soffset + args.push_back(pRingOffset); // offset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_UINT)); // nfmt + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc + EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.i32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + } } } // ===================================================================================================================== -// Loads value from ES-GS ring buffer. -Value* PatchInOutImportExport::LoadValueFromEsGsRingBuffer( +// Loads value from ES-GS ring (buffer or LDS). 
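Note on the load side (next hunk): the on-chip path reads one dword from LDS and bitcasts it back to the requested type; vectors recurse per component. A plain model of the scalar case:

#include <cstdint>
#include <cstring>

// Models the on-chip scalar load: LDS holds raw 32-bit lanes, and a float
// result is recovered with a bitcast (memcpy stands in for BitCastInst).
static float LoadF32FromLdsModel(const uint32_t* pLdsModel, uint32_t ringOffset)
{
    const uint32_t bits = pLdsModel[ringOffset];
    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}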
+Value* PatchInOutImportExport::LoadValueFromEsGsRing( Type* pLoadTy, // [in] Load value type uint32_t location, // Input location uint32_t compIdx, // Input component index @@ -3951,56 +4131,55 @@ Value* PatchInOutImportExport::LoadValueFromEsGsRingBuffer( for (uint32_t i = compIdx; i < compCount; ++i) { - auto pRingBufOffset = CalcEsGsRingBufferOffsetForInput(location + i / 4, - i % 4, - pVertexIdx, - pInsertPos); + auto pLoadCompValue = LoadValueFromEsGsRing(pCompTy, + location + i / 4, + i % 4, + pVertexIdx, + pInsertPos); + pLoadValue = InsertElementInst::Create(pLoadValue, + pLoadCompValue, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + } + else + { + Value* pRingOffset = CalcEsGsRingOffsetForInput(location, compIdx, pVertexIdx, pInsertPos); + if (m_pContext->IsGsOnChip()) + { + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + idxs.push_back(pRingOffset); + + Value* pLoadPtr = GetElementPtrInst::Create(nullptr, m_pLds, idxs, "", pInsertPos); + pLoadValue = new LoadInst(pLoadPtr, "", false, m_pLds->getAlignment(), pInsertPos); + + if (pLoadTy->isFloatTy()) + { + pLoadValue = BitCastInst::Create(Instruction::BitCast, pLoadValue, pLoadTy, "", pInsertPos); + } + } + else + { std::vector args; args.push_back(inOutUsage.pEsGsRingBufDesc); args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); - args.push_back(pRingBufOffset); - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(pRingOffset); + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - auto pComp = EmitCall(m_pModule, + pLoadValue = EmitCall(m_pModule, "llvm.amdgcn.buffer.load.f32", m_pContext->FloatTy(), args, NoAttrib, pInsertPos); - if (pCompTy->isIntegerTy()) + if (pLoadTy->isIntegerTy()) { - pComp = BitCastInst::Create(Instruction::BitCast, pComp, pCompTy, "", pInsertPos); + pLoadValue = BitCastInst::Create(Instruction::BitCast, pLoadValue, pLoadTy, "", pInsertPos); } - - pLoadValue = InsertElementInst::Create(pLoadValue, - pComp, - ConstantInt::get(m_pContext->Int32Ty(), i), - "", - pInsertPos); - } - } - else - { - auto pRingBufOffset = CalcEsGsRingBufferOffsetForInput(location, compIdx, pVertexIdx, pInsertPos); - - std::vector args; - args.push_back(inOutUsage.pEsGsRingBufDesc); - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); - args.push_back(pRingBufOffset); - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - pLoadValue = EmitCall(m_pModule, - "llvm.amdgcn.buffer.load.f32", - m_pContext->FloatTy(), - args, - NoAttrib, - pInsertPos); - - if (pLoadTy->isIntegerTy()) - { - pLoadValue = BitCastInst::Create(Instruction::BitCast, pLoadValue, pLoadTy, "", pInsertPos); } } @@ -4039,96 +4218,209 @@ void PatchInOutImportExport::StoreValueToGsVsRingBuffer( auto pEmitCounter = new LoadInst(inOutUsage.gs.pEmitCounterPtr, "", pInsertPos); - auto pRingBufOffset = CalcGsVsRingBufferOffsetForOutput(location, compIdx, pEmitCounter, pInsertPos); + auto pRingOffset = CalcGsVsRingOffsetForOutput(location, compIdx, pEmitCounter, pGsVsOffset, pInsertPos); - // NOTE: Here we use tbuffer_store instruction instead of buffer_store because we have to do explicit - // control of soffset. This is required by swizzle enabled mode when address range checking should be - // complied with. 
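Note: StoreValueToGsVsRingBuffer above addresses the ring with the current EMIT counter as the vertex index, so successive EmitVertex() calls (which presumably bump the counter elsewhere) land in successive per-vertex slots. A minimal model of that assumed relationship (names illustrative):

#include <cstdint>

// Models the emit-counter/vertex-slot relationship assumed by the store above.
struct GsEmitModel
{
    uint32_t emitCounter = 0; // Stands in for the value loaded from pEmitCounterPtr

    // Vertex index used when storing an output of the vertex being built
    uint32_t VertexIdxForStore() const { return emitCounter; }

    // Each emitted vertex advances to the next per-vertex slot
    void EmitVertex() { ++emitCounter; }
};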
- std::vector args; - args.push_back(pStoreValue); // vdata - args.push_back(inOutUsage.gs.pGsVsRingBufDesc); // rsrc - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex - args.push_back(pRingBufOffset); // voffset - args.push_back(pGsVsOffset); // soffset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // offset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_UINT)); // nfmt - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc - EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.i32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + if (m_pContext->IsGsOnChip()) + { + std::vector idxs; + idxs.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); + idxs.push_back(pRingOffset); + + Value* pStorePtr = GetElementPtrInst::Create(nullptr, m_pLds, idxs, "", pInsertPos); + new StoreInst(pStoreValue, pStorePtr, false, m_pLds->getAlignment(), pInsertPos); + } + else + { + // NOTE: Here we use tbuffer_store instruction instead of buffer_store because we have to do explicit + // control of soffset. This is required by swizzle enabled mode when address range checking should be + // complied with. + std::vector args; + args.push_back(pStoreValue); // vdata + args.push_back(inOutUsage.gs.pGsVsRingBufDesc); // rsrc + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex + args.push_back(pRingOffset); // voffset + args.push_back(pGsVsOffset); // soffset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // offset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_UINT)); // nfmt + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // slc + EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.i32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + } } // ===================================================================================================================== -// Calculates the byte offset to store the output value to ES-GS ring buffer based on the specified output info. -Value* PatchInOutImportExport::CalcEsGsRingBufferOffsetForOutput( +// Calculates the byte offset to store the output value to ES-GS ring based on the specified output info. 
+Value* PatchInOutImportExport::CalcEsGsRingOffsetForOutput( uint32_t location, // Output location uint32_t compIdx, // Output component index + Value* pEsGsOffset, // [in] ES-GS ring offset in bytes Instruction* pInsertPos) // [in] Where to insert the instruction { - return ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx) * 4); + Value* pRingOffset = nullptr; + if (m_pContext->IsGsOnChip()) + { + // ringOffset = esGsOffset + threadId * esGsRingItemSize + location * 4 + compIdx + const auto pResUsage = m_pContext->GetShaderResourceUsage(m_shaderStage); + const auto& inOutUsage = pResUsage->inOutUsage; + uint32_t esGsRingItemSize = inOutUsage.outputMapLocCount * 4; + pEsGsOffset = BinaryOperator::CreateExact(Instruction::LShr, + pEsGsOffset, + ConstantInt::get(m_pContext->Int32Ty(), 2), + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateMul(m_pThreadId, + ConstantInt::get(m_pContext->Int32Ty(), esGsRingItemSize), + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, pEsGsOffset, "", pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx)), + "", + pInsertPos); + } + else + { + // ringOffset = (location * 4 + compIdx) * 4 + pRingOffset = ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx) * 4); + } + return pRingOffset; } // ===================================================================================================================== -// Calculates the byte offset to load the input value from ES-GS ring buffer based on the specified input info. -Value* PatchInOutImportExport::CalcEsGsRingBufferOffsetForInput( +// Calculates the byte offset to load the input value from ES-GS ring based on the specified input info. 
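Note the unit change in CalcEsGsRingOffsetForOutput above: the on-chip result is a dword index into LDS (esGsOffset arrives in bytes, hence the exact shift right by 2), while the off-chip result stays a byte offset for the buffer intrinsic. The two formulas in plain arithmetic:

#include <cstdint>

// On-chip: dword offset = esGsOffset / 4 + threadId * itemSize + location * 4 + compIdx
static uint32_t EsGsOutOffsetOnChip(uint32_t esGsOffsetBytes, uint32_t threadId,
                                    uint32_t esGsRingItemSize, // outputMapLocCount * 4 dwords
                                    uint32_t location, uint32_t compIdx)
{
    return (esGsOffsetBytes >> 2) + (threadId * esGsRingItemSize) + (location * 4 + compIdx);
}

// Off-chip: byte offset = (location * 4 + compIdx) * 4
static uint32_t EsGsOutOffsetOffChip(uint32_t location, uint32_t compIdx)
{
    return (location * 4 + compIdx) * 4;
}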
+Value* PatchInOutImportExport::CalcEsGsRingOffsetForInput( uint32_t location, // Input location uint32_t compIdx, // Input Component index Value* pVertexIdx, // [in] Vertex index Instruction* pInsertPos) // [in] Where to insert the instruction { - const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage; - LLPC_ASSERT(inOutUsage.gs.pEsGsOffsets != nullptr); + Value* pRingOffset = nullptr; + if (m_pContext->IsGsOnChip()) + { + const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage; + LLPC_ASSERT(inOutUsage.gs.pEsGsOffsets != nullptr); - Value* pVertexOffset = ExtractElementInst::Create(inOutUsage.gs.pEsGsOffsets, - pVertexIdx, - "", - pInsertPos); + Value* pVertexOffset = ExtractElementInst::Create(inOutUsage.gs.pEsGsOffsets, + pVertexIdx, + "", + pInsertPos); - // byteOffset = vertexOffset[N] * 4 + (location * 4 + compIdx) * 64 * 4; - auto pRingBufOffset = BinaryOperator::CreateMul(pVertexOffset, - ConstantInt::get(m_pContext->Int32Ty(), 4), - "", - pInsertPos); + // ringOffset = vertexOffset[N] + (location * 4 + compIdx); + pRingOffset = + BinaryOperator::CreateAdd(pVertexOffset, + ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx)), + "", + pInsertPos); + } + else + { + const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage; + LLPC_ASSERT(inOutUsage.gs.pEsGsOffsets != nullptr); - pRingBufOffset = - BinaryOperator::CreateAdd(pRingBufOffset, - ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx) * 64 * 4), - "", - pInsertPos); + Value* pVertexOffset = ExtractElementInst::Create(inOutUsage.gs.pEsGsOffsets, + pVertexIdx, + "", + pInsertPos); + + // ringOffset = vertexOffset[N] * 4 + (location * 4 + compIdx) * 64 * 4; + pRingOffset = BinaryOperator::CreateMul(pVertexOffset, + ConstantInt::get(m_pContext->Int32Ty(), 4), + "", + pInsertPos); - return pRingBufOffset; + pRingOffset = + BinaryOperator::CreateAdd(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), (location * 4 + compIdx) * 64 * 4), + "", + pInsertPos); + } + + return pRingOffset; } // ===================================================================================================================== -// Calculates the byte offset to store the output value to GS-VS ring buffer based on the specified output info. -Value* PatchInOutImportExport::CalcGsVsRingBufferOffsetForOutput( +// Calculates the offset to store the output value to GS-VS ring based on the specified output info. 
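Note on CalcEsGsRingOffsetForInput above: on-chip, vertexOffset[N] is already a dword base and the attribute sits directly behind it; off-chip, each attribute channel appears to be interleaved across the 64 lanes of a wave, which is where the * 64 * 4 scaling comes from. In plain arithmetic:

#include <cstdint>

// On-chip: dword offset = vertexOffset + location * 4 + compIdx
static uint32_t EsGsInOffsetOnChip(uint32_t vertexOffset, uint32_t location, uint32_t compIdx)
{
    return vertexOffset + (location * 4 + compIdx);
}

// Off-chip: byte offset = vertexOffset * 4 + (location * 4 + compIdx) * 64 * 4
// (the 64 interleaves one channel across what appears to be a 64-lane wave)
static uint32_t EsGsInOffsetOffChip(uint32_t vertexOffset, uint32_t location, uint32_t compIdx)
{
    return (vertexOffset * 4) + ((location * 4 + compIdx) * 64 * 4);
}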
+Value* PatchInOutImportExport::CalcGsVsRingOffsetForOutput( uint32_t location, // Output location uint32_t compIdx, // Output component Value* pVertexIdx, // [in] Vertex index + Value* pGsVsOffset, // [in] ES-GS ring offset in bytes Instruction* pInsertPos) // [in] Where to insert the instruction { auto pResUsage = m_pContext->GetShaderResourceUsage(ShaderStageGeometry); - uint32_t outputVertices = pResUsage->builtInUsage.gs.outputVertices; + Value* pRingOffset = nullptr; + if (m_pContext->IsGsOnChip()) + { + // ringOffset = esGsLdsSize + + // gsVsOffset + + // threadId * gsVsRingItemSize + + // (vertexIdx * vertexSize) + location * 4 + compIdx - // byteOffset = ((location * 4 + compIdx) * maxVertices + vertexIdx) * 4; - auto pRingBufOffset = BinaryOperator::CreateAdd(ConstantInt::get(m_pContext->Int32Ty(), - (location * 4 + compIdx) * outputVertices), - pVertexIdx, - "", - pInsertPos); + uint32_t gsVsRingItemSize = 4 * + pResUsage->inOutUsage.outputMapLocCount * + pResUsage->builtInUsage.gs.outputVertices; - pRingBufOffset = BinaryOperator::CreateMul(pRingBufOffset, - ConstantInt::get(m_pContext->Int32Ty(), 4), - "", - pInsertPos); + auto pEsGsLdsSize = ConstantInt::get(m_pContext->Int32Ty(), pResUsage->inOutUsage.gs.esGsLdsSize); + + pGsVsOffset = BinaryOperator::CreateExact(Instruction::LShr, + pGsVsOffset, + ConstantInt::get(m_pContext->Int32Ty(), 2), + "", + pInsertPos); + + auto pRingItemOffset = BinaryOperator::CreateMul(m_pThreadId, + ConstantInt::get(m_pContext->Int32Ty(), gsVsRingItemSize), + "", + pInsertPos); + + uint32_t vertexSize = pResUsage->inOutUsage.outputMapLocCount * 4; + auto pVertexItemOffset = BinaryOperator::CreateMul(pVertexIdx, + ConstantInt::get(m_pContext->Int32Ty(), vertexSize), + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pEsGsLdsSize, pGsVsOffset, "", pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, pRingItemOffset, "", pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, pVertexItemOffset, "", pInsertPos); + + pRingOffset = BinaryOperator::CreateAdd(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), (location * 4) + compIdx), + "", + pInsertPos); + + } + else + { + // ringOffset = ((location * 4 + compIdx) * maxVertices + vertexIdx) * 4; + + uint32_t outputVertices = pResUsage->builtInUsage.gs.outputVertices; + + pRingOffset = BinaryOperator::CreateAdd(ConstantInt::get(m_pContext->Int32Ty(), + (location * 4 + compIdx) * outputVertices), + pVertexIdx, + "", + pInsertPos); + + pRingOffset = BinaryOperator::CreateMul(pRingOffset, + ConstantInt::get(m_pContext->Int32Ty(), 4), + "", + pInsertPos); + } - return pRingBufOffset; + return pRingOffset; } // ===================================================================================================================== // Reads value from LDS. 
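Note on CalcGsVsRingOffsetForOutput above: on-chip, the GS-VS region sits in LDS behind the ES-GS region (esGsLdsSize), addressed in dwords per thread and per emitted vertex; off-chip, the layout is attribute-major across the maximum vertex count, in bytes. Both formulas as plain arithmetic:

#include <cstdint>

// On-chip dword offset, matching the comment in the hunk above:
// esGsLdsSize + gsVsOffset / 4 + threadId * itemSize + vertexIdx * vertexSize
//             + location * 4 + compIdx
static uint32_t GsVsOutOffsetOnChip(uint32_t esGsLdsSize, uint32_t gsVsOffsetBytes,
                                    uint32_t threadId, uint32_t gsVsRingItemSize,
                                    uint32_t vertexIdx, uint32_t vertexSize,
                                    uint32_t location, uint32_t compIdx)
{
    return esGsLdsSize + (gsVsOffsetBytes >> 2) + (threadId * gsVsRingItemSize) +
           (vertexIdx * vertexSize) + (location * 4 + compIdx);
}

// Off-chip byte offset: ((location * 4 + compIdx) * maxVertices + vertexIdx) * 4
static uint32_t GsVsOutOffsetOffChip(uint32_t location, uint32_t compIdx,
                                     uint32_t maxVertices, uint32_t vertexIdx)
{
    return ((location * 4 + compIdx) * maxVertices + vertexIdx) * 4;
}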
Value* PatchInOutImportExport::ReadValueFromLds( + bool isOutput, // is the value from output variable Type* pReadTy, // [in] Type of value read from LDS Value* pLdsOffset, // [in] Start offset to do LDS read operations Instruction* pInsertPos) // [in] Where to insert read instructions @@ -4143,12 +4435,21 @@ Value* PatchInOutImportExport::ReadValueFromLds( std::vector loadValues(numChannels); - if (m_pContext->IsTessOffChip() && m_shaderStage == ShaderStageTessEval) // Read from off-chip LDS buffer + const bool isTcsOutput = (isOutput && (m_shaderStage == ShaderStageTessControl)); + const bool isTesInput = ((isOutput == false) && (m_shaderStage == ShaderStageTessEval)); + + if (m_pContext->IsTessOffChip() && (isTcsOutput || isTesInput)) // Read from off-chip LDS buffer { - auto& entryArgIdxs = m_pContext->GetShaderInterfaceData(m_shaderStage)->entryArgIdxs.tes; - const auto& inOutUsage = m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage.tes; + const auto& offChipLdsBase = (m_shaderStage == ShaderStageTessEval) ? + m_pContext->GetShaderInterfaceData(m_shaderStage)->entryArgIdxs.tes.offChipLdsBase : + m_pContext->GetShaderInterfaceData(m_shaderStage)->entryArgIdxs.tcs.offChipLdsBase; + + const auto& pOffChipLdsDesc = (m_shaderStage == ShaderStageTessEval) ? + m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage.tes.pOffChipLdsDesc : + m_pContext->GetShaderResourceUsage(m_shaderStage)->inOutUsage.tcs.pOffChipLdsDesc; + + auto pOffChipLdsBase = GetFunctionArgument(m_pEntryPoint, offChipLdsBase); - auto pOcldsBufferBase = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.offChipLdsBase); // Convert DWORD off-chip LDS offset to byte offset pLdsOffset = BinaryOperator::CreateMul(pLdsOffset, ConstantInt::get(m_pContext->Int32Ty(), 4), @@ -4158,10 +4459,10 @@ Value* PatchInOutImportExport::ReadValueFromLds( for (uint32_t i = 0; i < numChannels; ++i) { std::vector args; - args.push_back(inOutUsage.pOffChipLdsDesc); // rsrc + args.push_back(pOffChipLdsDesc); // rsrc args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex args.push_back(pLdsOffset); // voffset - args.push_back(pOcldsBufferBase); // soffset + args.push_back(pOffChipLdsBase); // soffset args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i * 4)); // offset args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_FLOAT)); // nfmt @@ -4455,39 +4756,29 @@ void PatchInOutImportExport::StoreTessFactorToBuffer( ConstantInt::get(m_pContext->Int32Ty(), 4), "", pInsertPos); - pTfBufferOffset = BinaryOperator::CreateAdd(pTfBufferOffset, - ConstantInt::get(m_pContext->Int32Ty(), tessFactorOffset * 4), - "", - pInsertPos); - if (m_pContext->IsTessOffChip()) + for (uint32_t i = 0; i < tessFactors.size(); ++i) { - // NOTE: GFX9 does not support dynamic tessellation control, so additional 4-byte offset is not required for - // tessellation off-chip mode. - const auto gfxIp = m_pContext->GetGfxIpVersion(); - if (gfxIp.major != 9) + uint32_t tessFactorByteOffset = i * 4 + tessFactorOffset * 4; + if (m_pContext->GetGfxIpVersion().major != 9) { - pTfBufferOffset = BinaryOperator::CreateAdd(pTfBufferOffset, - ConstantInt::get(m_pContext->Int32Ty(), 4), - "", - pInsertPos); + // NOTE: GFX9 does not support dynamic tessellation control, so additional 4-byte offset is not required for + // tessellation off-chip mode. + tessFactorByteOffset += (m_pContext->IsTessOffChip() ? 
4 : 0); } - } - for (uint32_t i = 0; i < tessFactors.size(); ++i) - { std::vector args; - args.push_back(tessFactors[i]); // vdata - args.push_back(inOutUsage.pTessFactorBufDesc); // rsrc - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex - args.push_back(pTfBufferOffset); // voffset - args.push_back(pTfBufferBase); // soffset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), i * 4)); // offset - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_FLOAT)); // nfmt - args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc - args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // slc + args.push_back(tessFactors[i]); // vdata + args.push_back(inOutUsage.pTessFactorBufDesc); // rsrc + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0)); // vindex + args.push_back(pTfBufferOffset); // voffset + args.push_back(pTfBufferBase); // soffset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), tessFactorByteOffset)); // offset + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_DATA_FORMAT_32)); // dfmt + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), BUF_NUM_FORMAT_FLOAT)); // nfmt + args.push_back(ConstantInt::get(m_pContext->BoolTy(), true)); // glc + args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // slc EmitCall(m_pModule, "llvm.amdgcn.tbuffer.store.f32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); } @@ -4627,23 +4918,40 @@ void PatchInOutImportExport::CreateTessBufferStoreFunction() // ===================================================================================================================== // Calculates the DWORD offset to write value to LDS based on the specified VS output info. 
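Note on the simplified StoreTessFactorToBuffer above: the per-factor byte offset is now folded into the intrinsic's immediate offset, and the extra leading dword of the off-chip buffer is skipped only on pre-GFX9 parts (per the NOTE, GFX9 drops dynamic tessellation control). The offset rule in isolation:

#include <cstdint>

// Byte offset for tess factor element i, per the loop in the hunk above.
static uint32_t TessFactorByteOffset(uint32_t i, uint32_t tessFactorOffset,
                                     bool isTessOffChip, uint32_t gfxIpMajor)
{
    uint32_t byteOffset = (i * 4) + (tessFactorOffset * 4);
    if ((gfxIpMajor != 9) && isTessOffChip)
    {
        byteOffset += 4; // Skip the leading dword tied to dynamic tessellation control
    }
    return byteOffset;
}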
Value* PatchInOutImportExport::CalcLdsOffsetForVsOutput( + Type* pOutputTy, // [in] Type of the output uint32_t location, // Base location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert calculation instructions { LLPC_ASSERT(m_shaderStage == ShaderStageVertex); + // attribOffset = location * 4 + compIdx + Value* pAttribOffset = ConstantInt::get(m_pContext->Int32Ty(), location * 4); + + const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); + + if (bitWidth == 64) + { + // For 64-bit data type, the component indexing must multiply by 2 + compIdx *= 2; + } + + pAttribOffset = BinaryOperator::CreateAdd(pAttribOffset, + ConstantInt::get(m_pContext->Int32Ty(), compIdx), + "", + pInsertPos); + const auto& entryArgIdxs = m_pContext->GetShaderInterfaceData(ShaderStageVertex)->entryArgIdxs.vs; auto pRelVertexId = GetFunctionArgument(m_pEntryPoint, entryArgIdxs.relVertexId); const auto& calcFactor = m_pContext->GetShaderResourceUsage(ShaderStageTessControl)->inOutUsage.tcs.calcFactor; auto pVertexStride = ConstantInt::get(m_pContext->Int32Ty(), calcFactor.inVertexStride); - // dwordOffset = relVertexId * vertexStride + location * 4 + // dwordOffset = relVertexId * vertexStride + attribOffset auto pLdsOffset = BinaryOperator::CreateMul(pRelVertexId, pVertexStride, "", pInsertPos); - pLdsOffset = BinaryOperator::CreateAdd(pLdsOffset, - ConstantInt::get(m_pContext->Int32Ty(), location * 4), - "", - pInsertPos); + pLdsOffset = BinaryOperator::CreateAdd(pLdsOffset, pAttribOffset, "", pInsertPos); + return pLdsOffset; } @@ -4930,6 +5238,7 @@ uint32_t PatchInOutImportExport::CalcPatchCountPerThreadGroup( void PatchInOutImportExport::AddExportInstForGenericOutput( Value* pOutput, // [in] Output value uint32_t location, // Location of the output + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert the "exp" instruction { // Check if the shader stage is valid to use "exp" instruction to export output @@ -4945,10 +5254,12 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( const uint32_t compCount = pOutputTy->isVectorTy() ? pOutputTy->getVectorNumElements() : 1; const uint32_t bitWidth = pOutputTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); // Convert the output value to floating-point export value Value* pExport = nullptr; - const uint32_t numChannels = (bitWidth * compCount) / 32; + const uint32_t numChannels = (bitWidth == 64) ? compCount * 2 : compCount; + uint32_t startChannel = (bitWidth == 64) ? compIdx * 2 : compIdx; Type* pExportTy = (numChannels > 1) ? 
VectorType::get(m_pContext->FloatTy(), numChannels) : m_pContext->FloatTy(); if (pOutputTy != pExportTy) @@ -4961,32 +5272,48 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( pExport = pOutput; } + LLPC_ASSERT(numChannels <= 8); + Value* exportValues[8] = { nullptr }; + + if (numChannels == 1) + { + exportValues[0] = pExport; + } + else + { + for (uint32_t i = 0; i < numChannels; ++i) + { + exportValues[i] = ExtractElementInst::Create(pExport, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + } + std::vector args; if (numChannels <= 4) { + LLPC_ASSERT(startChannel + numChannels <= 4); + const uint32_t channelMask = ((1 << (startChannel + numChannels)) - 1) - ((1 << startChannel) - 1); + args.clear(); args.push_back(ConstantInt::get(m_pContext->Int32Ty(), EXP_TARGET_PARAM_0 + location)); // tgt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0xF)); // en + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), channelMask)); // en // src0 ~ src3 - if (numChannels == 1) + for (uint32_t i = 0; i < startChannel; ++i) { - args.push_back(pExport); + // Inactive components (dummy) + args.push_back(UndefValue::get(m_pContext->FloatTy())); } - else + + for (uint32_t i = startChannel; i < startChannel + numChannels; ++i) { - for (uint32_t i = 0; i < numChannels; ++i) - { - auto pCompValue = ExtractElementInst::Create(pExport, - ConstantInt::get(m_pContext->Int32Ty(), i), - "", - pInsertPos); - args.push_back(pCompValue); - } + args.push_back(exportValues[i - startChannel]); } - for (uint32_t i = numChannels; i < 4; ++i) + for (uint32_t i = startChannel + numChannels; i < 4; ++i) { // Inactive components (dummy) args.push_back(UndefValue::get(m_pContext->FloatTy())); @@ -5001,6 +5328,7 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( else { // We have to do exporting twice for this output + LLPC_ASSERT(startChannel == 0); // Other values are disallowed according to GLSL spec LLPC_ASSERT((numChannels == 6) || (numChannels == 8)); // Do the first exporting @@ -5011,31 +5339,26 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( // src0 ~ src3 for (uint32_t i = 0; i < 4; ++i) { - auto pCompValue = ExtractElementInst::Create(pExport, - ConstantInt::get(m_pContext->Int32Ty(), i), - "", - pInsertPos); - args.push_back(pCompValue); + args.push_back(exportValues[i]); } args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // done args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // vm EmitCall(m_pModule, "llvm.amdgcn.exp.f32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); + ++inOutUsage.expCount; // Do the second exporting + const uint32_t channelMask = ((1 << (numChannels - 4)) - 1); + args.clear(); args.push_back(ConstantInt::get(m_pContext->Int32Ty(), EXP_TARGET_PARAM_0 + location + 1)); // tgt - args.push_back(ConstantInt::get(m_pContext->Int32Ty(), 0xF)); // en + args.push_back(ConstantInt::get(m_pContext->Int32Ty(), channelMask)); // en // src0 ~ src3 for (uint32_t i = 4; i < numChannels; ++i) { - auto pCompValue = ExtractElementInst::Create(pExport, - ConstantInt::get(m_pContext->Int32Ty(), i), - "", - pInsertPos); - args.push_back(pCompValue); + args.push_back(exportValues[i]); } for (uint32_t i = numChannels; i < 8; ++i) @@ -5048,7 +5371,7 @@ void PatchInOutImportExport::AddExportInstForGenericOutput( args.push_back(ConstantInt::get(m_pContext->BoolTy(), false)); // vm EmitCall(m_pModule, "llvm.amdgcn.exp.f32", m_pContext->VoidTy(), args, NoAttrib, pInsertPos); - inOutUsage.expCount += 2; + 
++inOutUsage.expCount; } } diff --git a/icd/api/llpc/patch/llpcPatchInOutImportExport.h b/icd/api/llpc/patch/llpcPatchInOutImportExport.h index 386f49bd..6e688637 100644 --- a/icd/api/llpc/patch/llpcPatchInOutImportExport.h +++ b/icd/api/llpc/patch/llpcPatchInOutImportExport.h @@ -50,6 +50,7 @@ class PatchInOutImportExport: public: PatchInOutImportExport(); virtual ~PatchInOutImportExport(); + virtual bool runOnModule(llvm::Module& module); virtual void visitCallInst(llvm::CallInst& callInst); virtual void visitReturnInst(llvm::ReturnInst& retInst); @@ -64,7 +65,10 @@ class PatchInOutImportExport: private: LLPC_DISALLOW_COPY_AND_ASSIGN(PatchInOutImportExport); - llvm::Value* PatchVsGenericInputImport(llvm::Type* pInputTy, uint32_t location, llvm::Instruction* pInsertPos); + llvm::Value* PatchVsGenericInputImport(llvm::Type* pInputTy, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); llvm::Value* PatchTcsGenericInputImport(llvm::Type* pInputTy, uint32_t location, llvm::Value* pLocOffset, @@ -79,6 +83,7 @@ class PatchInOutImportExport: llvm::Instruction* pInsertPos); llvm::Value* PatchGsGenericInputImport(llvm::Type* pInputTy, uint32_t location, + uint32_t compIdx, llvm::Value* pVertexIdx, llvm::Instruction* pInsertPos); llvm::Value* PatchFsGenericInputImport(llvm::Type* pInputTy, @@ -97,19 +102,29 @@ class PatchInOutImportExport: llvm::Value* pVertexIdx, llvm::Instruction* pInsertPos); - void PatchVsGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); + void PatchVsGenericOutputExport(llvm::Value* pOutput, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); void PatchTcsGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Value* pLocOffset, llvm::Value* pCompIdx, llvm::Value* pVertexIdx, llvm::Instruction* pInsertPos); - void PatchTesGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); + void PatchTesGenericOutputExport(llvm::Value* pOutput, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); void PatchGsGenericOutputExport(llvm::Value* pOutput, uint32_t location, + uint32_t compIdx, uint32_t streamId, llvm::Instruction* pInsertPos); - void PatchFsGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); + void PatchFsGenericOutputExport(llvm::Value* pOutput, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); llvm::Value* PatchVsBuiltInInputImport(llvm::Type* pInputTy, uint32_t builtInId, llvm::Instruction* pInsertPos); llvm::Value* PatchTcsBuiltInInputImport(llvm::Type* pInputTy, @@ -151,37 +166,39 @@ class PatchInOutImportExport: void PatchCopyShaderGenericOutputExport(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); void PatchCopyShaderBuiltInOutputExport(llvm::Value* pOutput, uint32_t builtInId, llvm::Instruction* pInsertPos); - void StoreValueToEsGsRingBuffer(llvm::Value* pStoreValue, - uint32_t location, - uint32_t compIdx, - llvm::Instruction* pInsertPos); + void StoreValueToEsGsRing(llvm::Value* pStoreValue, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); - llvm::Value* LoadValueFromEsGsRingBuffer(llvm::Type* pLoadType, - uint32_t location, - uint32_t compIdx, - llvm::Value* pVertexIdx, - llvm::Instruction* pInsertPos); + llvm::Value* LoadValueFromEsGsRing(llvm::Type* pLoadType, + uint32_t location, + uint32_t compIdx, + llvm::Value* pVertexIdx, + llvm::Instruction* pInsertPos); void 
StoreValueToGsVsRingBuffer(llvm::Value* pStoreValue, uint32_t location, uint32_t compIdx, llvm::Instruction* pInsertPos); - llvm::Value* CalcEsGsRingBufferOffsetForOutput(uint32_t location, - uint32_t compIdx, - llvm::Instruction* pInsertPos); + llvm::Value* CalcEsGsRingOffsetForOutput(uint32_t location, + uint32_t compIdx, + llvm::Value* pEsGsOffset, + llvm::Instruction* pInsertPos); - llvm::Value* CalcEsGsRingBufferOffsetForInput(uint32_t location, - uint32_t compIdx, - llvm::Value* pVertexIdx, - llvm::Instruction* pInsertPos); + llvm::Value* CalcEsGsRingOffsetForInput(uint32_t location, + uint32_t compIdx, + llvm::Value* pVertexIdx, + llvm::Instruction* pInsertPos); - llvm::Value* CalcGsVsRingBufferOffsetForOutput(uint32_t location, - uint32_t compIdx, - llvm::Value* pVertexIdx, - llvm::Instruction* pInsertPos); + llvm::Value* CalcGsVsRingOffsetForOutput(uint32_t location, + uint32_t compIdx, + llvm::Value* pVertexIdx, + llvm::Value* pGsVsOffset, + llvm::Instruction* pInsertPos); - llvm::Value* ReadValueFromLds(llvm::Type* pReadTy, llvm::Value* pLdsOffset, llvm::Instruction* pInsertPos); + llvm::Value* ReadValueFromLds(bool isOutput, llvm::Type* pReadTy, llvm::Value* pLdsOffset, llvm::Instruction* pInsertPos); void WriteValueToLds(llvm::Value* pWriteValue, llvm::Value* pLdsOffset, llvm::Instruction* pInsertPos); llvm::Value* CalcTessFactorOffset(bool isOuter, llvm::Value* pElemIdx, llvm::Instruction* pInsertPos); @@ -198,7 +215,10 @@ class PatchInOutImportExport: uint32_t outVertexStride, uint32_t patchConstCount) const; - llvm::Value* CalcLdsOffsetForVsOutput(uint32_t location, llvm::Instruction* pInsertPos); + llvm::Value* CalcLdsOffsetForVsOutput(Type* pOutputTy, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); llvm::Value* CalcLdsOffsetForTcsInput(Type* pInputTy, uint32_t location, @@ -221,7 +241,10 @@ class PatchInOutImportExport: llvm::Value* pVertexIdx, llvm::Instruction* pInsertPos); - void AddExportInstForGenericOutput(llvm::Value* pOutput, uint32_t location, llvm::Instruction* pInsertPos); + void AddExportInstForGenericOutput(llvm::Value* pOutput, + uint32_t location, + uint32_t compIdx, + llvm::Instruction* pInsertPos); void AddExportInstForBuiltInOutput(llvm::Value* pOutput, uint32_t builtInId, llvm::Instruction* pInsertPos); // ----------------------------------------------------------------------------------------------------------------- @@ -253,6 +276,9 @@ class PatchInOutImportExport: bool m_hasGs; // Whether the pipeline has geometry shader GlobalVariable* m_pLds; // Global variable to model LDS + llvm::Value* m_pThreadId; // Thread ID + + std::vector m_expFragColors[MaxColorTargets]; // Exported fragment colors std::vector m_importCalls; // List of "call" instructions to import inputs std::vector m_exportCalls; // List of "call" instructions to export outputs diff --git a/icd/api/llpc/patch/llpcPatchResourceCollect.cpp b/icd/api/llpc/patch/llpcPatchResourceCollect.cpp index e74ed39b..d1c18b16 100644 --- a/icd/api/llpc/patch/llpcPatchResourceCollect.cpp +++ b/icd/api/llpc/patch/llpcPatchResourceCollect.cpp @@ -229,56 +229,41 @@ void PatchResourceCollect::visitCallInst( if ((m_shaderStage == ShaderStageTessControl) || (m_shaderStage == ShaderStageTessEval)) { auto pLocOffset = callInst.getOperand(1); - auto pCompIdx = IsDontCareValue(callInst.getOperand(2)) ? 
nullptr : callInst.getOperand(2); + auto pCompIdx = callInst.getOperand(2); + if (isa<ConstantInt>(pLocOffset)) { // Location offset is constant auto locOffset = cast<ConstantInt>(pLocOffset)->getZExtValue(); loc += locOffset; - if (pCompIdx != nullptr) + auto bitWidth = pInputTy->getScalarSizeInBits(); + if (bitWidth == 64) { - // Vector component addressing - LLPC_ASSERT(pInputTy->isVectorTy() == false); // Must be scalar type - - auto bitWidth = pInputTy->getScalarSizeInBits(); - if (bitWidth == 64) + if (isa<ConstantInt>(pCompIdx)) { - if (isa<ConstantInt>(pCompIdx)) - { - auto compIdx = cast<ConstantInt>(pCompIdx)->getZExtValue(); - - m_activeInputLocs.insert(loc); - if (compIdx >= 2) - { - // NOTE: For the addressing of .z/.w component of 64-bit vector, the count of - // occupied locations are two. - m_activeInputLocs.insert(loc + 1); - } - } - else + auto compIdx = cast<ConstantInt>(pCompIdx)->getZExtValue(); + + m_activeInputLocs.insert(loc); + if (compIdx >= 2) { - // NOTE: If vector component index is not constant, we treat this as dynamic indexing. - m_hasDynIndexedInput = true; + // NOTE: For the addressing of .z/.w component of 64-bit vector/scalar, the count of + // occupied locations is two. + m_activeInputLocs.insert(loc + 1); } } else { - // NOTE: For 32-bit scalar, one location is sufficient regardless of vector component - // addressing. - LLPC_ASSERT(bitWidth == 32); - m_activeInputLocs.insert(loc); + // NOTE: If vector component index is not constant, we treat this as dynamic indexing. + m_hasDynIndexedInput = true; } } else { - // Not vector component addressing + // NOTE: For 32-bit vector/scalar, one location is sufficient regardless of vector component + // addressing. + LLPC_ASSERT(bitWidth == 32); m_activeInputLocs.insert(loc); - if (pInputTy->getPrimitiveSizeInBits() > (8 * SizeOfVec4)) - { - LLPC_ASSERT(pInputTy->getPrimitiveSizeInBits() <= (8 * 2 * SizeOfVec4)); - m_activeInputLocs.insert(loc + 1); - } } } else @@ -353,7 +338,7 @@ void PatchResourceCollect::visitCallInst( auto loc = cast<ConstantInt>(callInst.getOperand(0))->getZExtValue(); auto pLocOffset = callInst.getOperand(1); - auto pCompIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + auto pCompIdx = callInst.getOperand(2); if (isa<ConstantInt>(pLocOffset)) { @@ -361,49 +346,33 @@ auto locOffset = cast<ConstantInt>(pLocOffset)->getZExtValue(); loc += locOffset; - if (pCompIdx != nullptr) + auto bitWidth = pOutputTy->getScalarSizeInBits(); + if (bitWidth == 64) { - // Vector component addressing - LLPC_ASSERT(pOutputTy->isVectorTy() == false); // Must be scalar type - - auto bitWidth = pOutputTy->getScalarSizeInBits(); - if (bitWidth == 64) + if (isa<ConstantInt>(pCompIdx)) { - if (isa<ConstantInt>(pCompIdx)) - { - auto compIdx = cast<ConstantInt>(pCompIdx)->getZExtValue(); + auto compIdx = cast<ConstantInt>(pCompIdx)->getZExtValue(); - m_importedOutputLocs.insert(loc); - if (compIdx >= 2) - { - // NOTE: For the addressing of .z/.w component of 64-bit vector, the count of - // occupied locations are two. - m_importedOutputLocs.insert(loc + 1); - } - } - else + m_importedOutputLocs.insert(loc); + if (compIdx >= 2) { - // NOTE: If vector component index is not constant, we treat this as dynamic indexing. - m_hasDynIndexedOutput = true; + // NOTE: For the addressing of .z/.w component of 64-bit vector/scalar, the count of + // occupied locations is two. + m_importedOutputLocs.insert(loc + 1); } } else { - // NOTE: For 32-bit scalar, one location is sufficient regardless of vector component - // addressing. 
- LLPC_ASSERT(bitWidth == 32); - m_importedOutputLocs.insert(loc); + // NOTE: If vector component index is not constant, we treat this as dynamic indexing. + m_hasDynIndexedOutput = true; } } else { - // Not vector component addressing + // NOTE: For 32-bit vector/scalar, one location is sufficient regardless of vector component + // addressing. + LLPC_ASSERT(bitWidth == 32); m_importedOutputLocs.insert(loc); - if (pOutputTy->getPrimitiveSizeInBits() > (8 * SizeOfVec4)) - { - LLPC_ASSERT(pOutputTy->getPrimitiveSizeInBits() <= (8 * 2 * SizeOfVec4)); - m_importedOutputLocs.insert(loc + 1); - } } } else @@ -431,25 +400,19 @@ void PatchResourceCollect::visitCallInst( auto loc = cast<ConstantInt>(callInst.getOperand(0))->getZExtValue(); auto pLocOffset = callInst.getOperand(1); - auto pCompIdx = IsDontCareValue(callInst.getOperand(2)) ? nullptr : callInst.getOperand(2); + auto pCompIdx = callInst.getOperand(2); if (isa<ConstantInt>(pLocOffset)) { // Location offset is constant - if (pCompIdx != nullptr) - { - // Vector component addressing - LLPC_ASSERT(pOutputTy->isVectorTy() == false); // Must be scalar type - - auto bitWidth = pOutputTy->getScalarSizeInBits(); - LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); + auto bitWidth = pOutputTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); - if ((bitWidth == 64) && (isa<ConstantInt>(pCompIdx) == false)) - { - // NOTE: If vector component index is not constant and it is vector component addressing for - // 64-bit vector, we treat this as dynamic indexing. - m_hasDynIndexedOutput = true; - } + if ((bitWidth == 64) && (isa<ConstantInt>(pCompIdx) == false)) + { + // NOTE: If vector component index is not constant and it is vector component addressing for + // 64-bit vector, we treat this as dynamic indexing. + m_hasDynIndexedOutput = true; } } else diff --git a/icd/api/llpc/patch/llpcVertexFetch.cpp b/icd/api/llpc/patch/llpcVertexFetch.cpp index b6825f96..cafa6903 100644 --- a/icd/api/llpc/patch/llpcVertexFetch.cpp +++ b/icd/api/llpc/patch/llpcVertexFetch.cpp @@ -983,6 +983,7 @@ VertexFetch::VertexFetch( Value* VertexFetch::Run( Type* pInputTy, // [in] Type of vertex input uint32_t location, // Location of vertex input + uint32_t compIdx, // Index used for vector element indexing Instruction* pInsertPos) // [in] Where to insert vertex fetch instructions { Value* pVertex = nullptr; @@ -1221,92 +1222,114 @@ Value* VertexFetch::Run( // Finalize vertex fetch Type* pBasicTy = pInputTy->isVectorTy() ? pInputTy->getVectorElementType() : pInputTy; const uint32_t bitWidth = pBasicTy->getScalarSizeInBits(); + LLPC_ASSERT((bitWidth == 32) || (bitWidth == 64)); - const uint32_t inputCompCount = pInputTy->isVectorTy() ? pInputTy->getVectorNumElements() : 1; - const uint32_t vertexCompCount = inputCompCount * bitWidth / 32; - const uint32_t fetchCompCount = pVertexFetch->getType()->isVectorTy() ? 
- pVertexFetch->getType()->getVectorNumElements() : 1; - if (vertexCompCount == fetchCompCount) + // Get default fetch values + Constant* pDefaults = nullptr; + + if (pBasicTy->isIntegerTy()) { - // Exact match, vertex input takes values from vertex fetch results - pVertex = pVertexFetch; + if (bitWidth == 32) + { + pDefaults = m_fetchDefaults.pInt; + } + else + { + LLPC_ASSERT(bitWidth == 64); + pDefaults = m_fetchDefaults.pInt64; + } } - else if (vertexCompCount < fetchCompCount) + else if (pBasicTy->isFloatingPointTy()) { - // Vertex input takes part of values from vertex fetch results - if (vertexCompCount == 1) + if (bitWidth == 32) { - Constant* pIndex = ConstantInt::get(m_pContext->Int32Ty(), 0); - pVertex = ExtractElementInst::Create(pVertexFetch, pIndex, "", pInsertPos); + pDefaults = m_fetchDefaults.pFloat; } else { - shuffleMask.clear(); - for (uint32_t i = 0; i < vertexCompCount; ++i) - { - shuffleMask.push_back(ConstantInt::get(m_pContext->Int32Ty(), i)); - } - pVertex = new ShuffleVectorInst(pVertexFetch, pVertexFetch, ConstantVector::get(shuffleMask), "", pInsertPos); + LLPC_ASSERT(bitWidth == 64); + pDefaults = m_fetchDefaults.pDouble; } } else { - // Vertex input takes values from both vertex fetch results and the default fetch values - Constant* pDefaults = nullptr; + LLPC_NEVER_CALLED(); + } + + const uint32_t defaultCompCount = pDefaults->getType()->getVectorNumElements(); + std::vector<Value*> defaultValues(defaultCompCount); - // Get default fetch values - if (pBasicTy->isIntegerTy()) + for (uint32_t i = 0; i < defaultValues.size(); ++i) + { + defaultValues[i] = ExtractElementInst::Create(pDefaults, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); + } + + // Get vertex fetch values + const uint32_t fetchCompCount = pVertexFetch->getType()->isVectorTy() ? + pVertexFetch->getType()->getVectorNumElements() : 1; + std::vector<Value*> fetchValues(fetchCompCount); + + if (fetchCompCount == 1) + { + fetchValues[0] = pVertexFetch; + } + else + { + for (uint32_t i = 0; i < fetchCompCount; ++i) { - if (bitWidth == 32) - { - pDefaults = m_fetchDefaults.pInt; - } - else - { - LLPC_ASSERT(bitWidth == 64); - pDefaults = m_fetchDefaults.pInt64; - } + fetchValues[i] = ExtractElementInst::Create(pVertexFetch, + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); } - else if (pBasicTy->isFloatingPointTy()) + } + + // Construct vertex fetch results + const uint32_t inputCompCount = pInputTy->isVectorTy() ? pInputTy->getVectorNumElements() : 1; + const uint32_t vertexCompCount = inputCompCount * bitWidth / 32; + + std::vector<Value*> vertexValues(vertexCompCount); + + // NOTE: Original component index is based on the basic scalar type. + compIdx *= ((bitWidth == 64) ? 
2 : 1); + + // Vertex input might take values from vertex fetch values or default fetch values + for (uint32_t i = 0; i < vertexCompCount; i++) + { + if (compIdx + i < fetchCompCount) { - if (bitWidth == 32) - { - pDefaults = m_fetchDefaults.pFloat; - } - else - { - LLPC_ASSERT(bitWidth == 64); - pDefaults = m_fetchDefaults.pDouble; - } + vertexValues[i] = fetchValues[compIdx + i]; + } + else if (compIdx + i < defaultCompCount) + { + vertexValues[i] = defaultValues[compIdx + i]; } else { LLPC_NEVER_CALLED(); + vertexValues[i] = UndefValue::get(m_pContext->Int32Ty()); } + } + if (vertexCompCount == 1) + { + pVertex = vertexValues[0]; + } + else + { Type* pVertexTy = VectorType::get(m_pContext->Int32Ty(), vertexCompCount); pVertex = UndefValue::get(pVertexTy); - if (fetchCompCount == 1) - { - Constant* pIndex = ConstantInt::get(m_pContext->Int32Ty(), 0); - pVertex = InsertElementInst::Create(pVertex, pVertexFetch, pIndex, "", pInsertPos); - } - else - { - for (uint32_t i = 0; i < fetchCompCount; ++i) - { - Constant* pIndex = ConstantInt::get(m_pContext->Int32Ty(), i); - Value* pVertexComp = ExtractElementInst::Create(pVertexFetch, pIndex, "", pInsertPos); - pVertex = InsertElementInst::Create(pVertex, pVertexComp, pIndex, "", pInsertPos); - } - } - - for (uint32_t i = fetchCompCount; i < vertexCompCount; ++i) + for (uint32_t i = 0; i < vertexCompCount; ++i) { - Constant* pIndex = ConstantInt::get(m_pContext->Int32Ty(), i); - Value* pVertexComp = ExtractElementInst::Create(pDefaults, pIndex, "", pInsertPos); - pVertex = InsertElementInst::Create(pVertex, pVertexComp, pIndex, "", pInsertPos); + pVertex = InsertElementInst::Create(pVertex, + vertexValues[i], + ConstantInt::get(m_pContext->Int32Ty(), i), + "", + pInsertPos); } } diff --git a/icd/api/llpc/patch/llpcVertexFetch.h b/icd/api/llpc/patch/llpcVertexFetch.h index 2fd8f9b9..f9481690 100644 --- a/icd/api/llpc/patch/llpcVertexFetch.h +++ b/icd/api/llpc/patch/llpcVertexFetch.h @@ -69,7 +69,7 @@ class VertexFetch static const VertexFormatInfo* GetVertexFormatInfo(VkFormat format); - llvm::Value* Run(llvm::Type* pInputTy, uint32_t location, llvm::Instruction* pInsertPos); + llvm::Value* Run(llvm::Type* pInputTy, uint32_t location, uint32_t compIdx, llvm::Instruction* pInsertPos); // Gets variable corresponding to vertex index llvm::Value* GetVertexIndex() { return m_pVertexIndex; } diff --git a/icd/api/llpc/translator/SPIRVInternal.h b/icd/api/llpc/translator/SPIRVInternal.h index fde8ae57..056c5565 100644 --- a/icd/api/llpc/translator/SPIRVInternal.h +++ b/icd/api/llpc/translator/SPIRVInternal.h @@ -1210,6 +1210,7 @@ union ShaderInOutMetadata { uint32_t Value : 16; // Generic location or SPIR-V built-in ID uint32_t IsLoc : 1; // Whether value is a location uint32_t IsBuiltIn : 1; // Whether value is a SPIR-V built-in ID + uint32_t Component : 2; // Component offset of inputs and outputs uint32_t Signedness : 1; // Signedness of the input/output, valid // for integer (0 - unsigned, 1 - signed) uint32_t InterpMode : 2; // Interpolation mode (fragment shader) @@ -1219,7 +1220,7 @@ union ShaderInOutMetadata { // output (tessellation shader) uint32_t StreamId : 2; // ID of output stream (geometry shader) - uint32_t Unused : 8; + uint32_t Unused : 6; }; uint32_t U32All; }; @@ -1236,6 +1237,8 @@ struct ShaderInOutDecorate { bool IsBuiltIn; // Whether this is a SPIR-V built-in + uint32_t Component; // Component offset of inputs and outputs + bool PerPatch; // Whether this is a per-patch input/output // (tessellation shader) struct diff --git 
a/icd/api/llpc/translator/SPIRVReader.cpp b/icd/api/llpc/translator/SPIRVReader.cpp index fc52c3fc..45c26e41 100644 --- a/icd/api/llpc/translator/SPIRVReader.cpp +++ b/icd/api/llpc/translator/SPIRVReader.cpp @@ -3465,6 +3465,10 @@ SPIRVToLLVM::transShaderDecoration(SPIRVValue *BV, Value *V) { InOutDec.Value.BuiltIn = BuiltInPerVertex; } + SPIRVWord Component = SPIRVID_INVALID; + if (BV->hasDecorate(DecorationComponent, 0, &Component)) + InOutDec.Component = Component; + if (BV->hasDecorate(DecorationFlat)) InOutDec.Interp.Mode = InterpModeFlat; @@ -3482,7 +3486,7 @@ SPIRVToLLVM::transShaderDecoration(SPIRVValue *BV, Value *V) { SPIRVWord StreamId = SPIRVID_INVALID; if (BV->hasDecorate(DecorationStream, 0, &StreamId)) - InOutDec.StreamId = StreamId; + InOutDec.StreamId = StreamId; Type* MDTy = nullptr; SPIRVType* BT = BV->getType()->getPointerElementType(); @@ -3724,6 +3728,10 @@ SPIRVToLLVM::buildShaderInOutMetadata(SPIRVType *BT, InOutDec.IsBuiltIn = true; } + SPIRVWord Component = SPIRVID_INVALID; + if (BT->hasDecorate(DecorationComponent, 0, &Component)) + InOutDec.Component = Component; + if (BT->hasDecorate(DecorationFlat)) InOutDec.Interp.Mode = InterpModeFlat; @@ -3759,6 +3767,7 @@ InOutMD.Value = InOutDec.Value.Loc; } + InOutMD.Component = InOutDec.Component; InOutMD.InterpMode = InOutDec.Interp.Mode; InOutMD.InterpLoc = InOutDec.Interp.Loc; InOutMD.PerPatch = InOutDec.PerPatch; @@ -3826,6 +3835,7 @@ SPIRVToLLVM::buildShaderInOutMetadata(SPIRVType *BT, InOutMD.Value = StartLoc; } + InOutMD.Component = InOutDec.Component; InOutMD.InterpMode = InOutDec.Interp.Mode; InOutMD.InterpLoc = InOutDec.Interp.Loc; InOutMD.PerPatch = InOutDec.PerPatch; @@ -3861,6 +3871,11 @@ SPIRVToLLVM::buildShaderInOutMetadata(SPIRVType *BT, MemberDec.Value.BuiltIn = MemberBuiltIn; } + SPIRVWord MemberComponent = SPIRVID_INVALID; + if (BT->hasMemberDecorate( + MemberIdx, DecorationComponent, 0, &MemberComponent)) + MemberDec.Component = MemberComponent; + if (BT->hasMemberDecorate(MemberIdx, DecorationFlat)) MemberDec.Interp.Mode = InterpModeFlat; diff --git a/icd/api/open_strings/entry_points.txt b/icd/api/open_strings/entry_points.txt index 7ac0eaaf..83c86ff6 100644 --- a/icd/api/open_strings/entry_points.txt +++ b/icd/api/open_strings/entry_points.txt @@ -245,3 +245,9 @@ vkGetFenceFdKHR @dext KHR_external_fence_fd vkImportFenceWin32HandleKHR @dext KHR_external_fence_win32 vkGetFenceWin32HandleKHR @dext KHR_external_fence_win32 + +vkCmdWriteBufferMarkerAMD @dext AMD_buffer_marker + +vkCreateDebugReportCallbackEXT @iext EXT_debug_report +vkDestroyDebugReportCallbackEXT @iext EXT_debug_report +vkDebugReportMessageEXT @iext EXT_debug_report diff --git a/icd/api/open_strings/extensions.txt b/icd/api/open_strings/extensions.txt index d39d91b9..64c7c977 100644 --- a/icd/api/open_strings/extensions.txt +++ b/icd/api/open_strings/extensions.txt @@ -10,6 +10,7 @@ VK_KHR_external_memory_capabilities VK_KHR_external_semaphore_capabilities VK_KHR_external_fence_capabilities VK_KHX_device_group_creation +VK_EXT_debug_report ############################################################################### # DEVICE EXTENSIONS @@ -57,3 +58,4 @@ VK_AMD_shader_fragment_mask VK_EXT_sample_locations VK_KHR_win32_keyed_mutex VK_EXT_global_priority +VK_AMD_buffer_marker diff --git a/icd/api/open_strings/g_entry_points_decl.h b/icd/api/open_strings/g_entry_points_decl.h index 76507632..ae5d3bff 100644 --- a/icd/api/open_strings/g_entry_points_decl.h +++ 
b/icd/api/open_strings/g_entry_points_decl.h @@ -892,3 +892,19 @@ extern const char vkGetFenceWin32HandleKHR_name[]; static const char* VKGETFENCEWIN32HANDLEKHR_name = vkGetFenceWin32HandleKHR_name; #define vkGetFenceWin32HandleKHR_condition_type vk::secure::entry::ENTRY_POINT_DEVICE_EXTENSION #define vkGetFenceWin32HandleKHR_condition_value vk::DeviceExtensions::KHR_EXTERNAL_FENCE_WIN32 +extern const char vkCmdWriteBufferMarkerAMD_name[]; +static const char* VKCMDWRITEBUFFERMARKERAMD_name = vkCmdWriteBufferMarkerAMD_name; +#define vkCmdWriteBufferMarkerAMD_condition_type vk::secure::entry::ENTRY_POINT_DEVICE_EXTENSION +#define vkCmdWriteBufferMarkerAMD_condition_value vk::DeviceExtensions::AMD_BUFFER_MARKER +extern const char vkCreateDebugReportCallbackEXT_name[]; +static const char* VKCREATEDEBUGREPORTCALLBACKEXT_name = vkCreateDebugReportCallbackEXT_name; +#define vkCreateDebugReportCallbackEXT_condition_type vk::secure::entry::ENTRY_POINT_INSTANCE_EXTENSION +#define vkCreateDebugReportCallbackEXT_condition_value vk::InstanceExtensions::EXT_DEBUG_REPORT +extern const char vkDestroyDebugReportCallbackEXT_name[]; +static const char* VKDESTROYDEBUGREPORTCALLBACKEXT_name = vkDestroyDebugReportCallbackEXT_name; +#define vkDestroyDebugReportCallbackEXT_condition_type vk::secure::entry::ENTRY_POINT_INSTANCE_EXTENSION +#define vkDestroyDebugReportCallbackEXT_condition_value vk::InstanceExtensions::EXT_DEBUG_REPORT +extern const char vkDebugReportMessageEXT_name[]; +static const char* VKDEBUGREPORTMESSAGEEXT_name = vkDebugReportMessageEXT_name; +#define vkDebugReportMessageEXT_condition_type vk::secure::entry::ENTRY_POINT_INSTANCE_EXTENSION +#define vkDebugReportMessageEXT_condition_value vk::InstanceExtensions::EXT_DEBUG_REPORT diff --git a/icd/api/open_strings/g_entry_points_impl.h b/icd/api/open_strings/g_entry_points_impl.h index d0ac8e5c..549c5cb3 100644 --- a/icd/api/open_strings/g_entry_points_impl.h +++ b/icd/api/open_strings/g_entry_points_impl.h @@ -241,3 +241,7 @@ const char vkImportFenceFdKHR_name[] = "vkImportFenceFdKHR"; const char vkGetFenceFdKHR_name[] = "vkGetFenceFdKHR"; const char vkImportFenceWin32HandleKHR_name[] = "vkImportFenceWin32HandleKHR"; const char vkGetFenceWin32HandleKHR_name[] = "vkGetFenceWin32HandleKHR"; +const char vkCmdWriteBufferMarkerAMD_name[] = "vkCmdWriteBufferMarkerAMD"; +const char vkCreateDebugReportCallbackEXT_name[] = "vkCreateDebugReportCallbackEXT"; +const char vkDestroyDebugReportCallbackEXT_name[] = "vkDestroyDebugReportCallbackEXT"; +const char vkDebugReportMessageEXT_name[] = "vkDebugReportMessageEXT"; diff --git a/icd/api/open_strings/g_extensions_decl.h b/icd/api/open_strings/g_extensions_decl.h index 5ac07169..ed7fd7fa 100644 --- a/icd/api/open_strings/g_extensions_decl.h +++ b/icd/api/open_strings/g_extensions_decl.h @@ -42,6 +42,8 @@ extern const char VK_KHR_external_fence_capabilities_name[]; static const char* VK_KHR_EXTERNAL_FENCE_CAPABILITIES_name = VK_KHR_external_fence_capabilities_name; extern const char VK_KHX_device_group_creation_name[]; static const char* VK_KHX_DEVICE_GROUP_CREATION_name = VK_KHX_device_group_creation_name; +extern const char VK_EXT_debug_report_name[]; +static const char* VK_EXT_DEBUG_REPORT_name = VK_EXT_debug_report_name; extern const char VK_KHR_bind_memory2_name[]; static const char* VK_KHR_BIND_MEMORY2_name = VK_KHR_bind_memory2_name; extern const char VK_KHR_dedicated_allocation_name[]; @@ -130,3 +132,5 @@ extern const char VK_KHR_win32_keyed_mutex_name[]; static const char* VK_KHR_WIN32_KEYED_MUTEX_name 
= VK_KHR_win32_keyed_mutex_name; extern const char VK_EXT_global_priority_name[]; static const char* VK_EXT_GLOBAL_PRIORITY_name = VK_EXT_global_priority_name; +extern const char VK_AMD_buffer_marker_name[]; +static const char* VK_AMD_BUFFER_MARKER_name = VK_AMD_buffer_marker_name; diff --git a/icd/api/open_strings/g_extensions_impl.h b/icd/api/open_strings/g_extensions_impl.h index c3df04f9..1077dad6 100644 --- a/icd/api/open_strings/g_extensions_impl.h +++ b/icd/api/open_strings/g_extensions_impl.h @@ -33,6 +33,7 @@ const char VK_KHR_external_memory_capabilities_name[] = "VK_KHR_external_memory_ const char VK_KHR_external_semaphore_capabilities_name[] = "VK_KHR_external_semaphore_capabilities"; const char VK_KHR_external_fence_capabilities_name[] = "VK_KHR_external_fence_capabilities"; const char VK_KHX_device_group_creation_name[] = "VK_KHX_device_group_creation"; +const char VK_EXT_debug_report_name[] = "VK_EXT_debug_report"; const char VK_KHR_bind_memory2_name[] = "VK_KHR_bind_memory2"; const char VK_KHR_dedicated_allocation_name[] = "VK_KHR_dedicated_allocation"; const char VK_KHR_descriptor_update_template_name[] = "VK_KHR_descriptor_update_template"; @@ -77,3 +78,4 @@ const char VK_AMD_shader_fragment_mask_name[] = "VK_AMD_shader_fragment_mask"; const char VK_EXT_sample_locations_name[] = "VK_EXT_sample_locations"; const char VK_KHR_win32_keyed_mutex_name[] = "VK_KHR_win32_keyed_mutex"; const char VK_EXT_global_priority_name[] = "VK_EXT_global_priority"; +const char VK_AMD_buffer_marker_name[] = "VK_AMD_buffer_marker"; diff --git a/icd/api/open_strings/g_func_table.cpp b/icd/api/open_strings/g_func_table.cpp index 6b992d90..022ddced 100644 --- a/icd/api/open_strings/g_func_table.cpp +++ b/icd/api/open_strings/g_func_table.cpp @@ -999,6 +999,26 @@ extern void GetNextDeviceLayerTable( pInstance, pDevice, remainingCount, pRemainingTables, VK_SECURE_ENTRY(vkGetFenceWin32HandleKHR))); #endif +#if VK_AMD_buffer_marker + pNextLayerFuncs->vkCmdWriteBufferMarkerAMD = + reinterpret_cast<PFN_vkCmdWriteBufferMarkerAMD>(vk::GetIcdProcAddr( + pInstance, pDevice, remainingCount, pRemainingTables, + VK_SECURE_ENTRY(vkCmdWriteBufferMarkerAMD))); +#endif +#if VK_EXT_debug_report + pNextLayerFuncs->vkCreateDebugReportCallbackEXT = + reinterpret_cast<PFN_vkCreateDebugReportCallbackEXT>(vk::GetIcdProcAddr( + pInstance, pDevice, remainingCount, pRemainingTables, + VK_SECURE_ENTRY(vkCreateDebugReportCallbackEXT))); + pNextLayerFuncs->vkDestroyDebugReportCallbackEXT = + reinterpret_cast<PFN_vkDestroyDebugReportCallbackEXT>(vk::GetIcdProcAddr( + pInstance, pDevice, remainingCount, pRemainingTables, + VK_SECURE_ENTRY(vkDestroyDebugReportCallbackEXT))); + pNextLayerFuncs->vkDebugReportMessageEXT = + reinterpret_cast<PFN_vkDebugReportMessageEXT>(vk::GetIcdProcAddr( + pInstance, pDevice, remainingCount, pRemainingTables, + VK_SECURE_ENTRY(vkDebugReportMessageEXT))); +#endif } diff --git a/icd/api/open_strings/g_func_table.h b/icd/api/open_strings/g_func_table.h index 188f0c36..d2fc6b6a 100644 --- a/icd/api/open_strings/g_func_table.h +++ b/icd/api/open_strings/g_func_table.h @@ -304,6 +304,14 @@ struct EntryPointTable PFN_vkImportFenceWin32HandleKHR vkImportFenceWin32HandleKHR; PFN_vkGetFenceWin32HandleKHR vkGetFenceWin32HandleKHR; #endif +#if VK_AMD_buffer_marker + PFN_vkCmdWriteBufferMarkerAMD vkCmdWriteBufferMarkerAMD; +#endif +#if VK_EXT_debug_report + PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT; + PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT; + PFN_vkDebugReportMessageEXT vkDebugReportMessageEXT; +#endif }; diff --git a/icd/api/vert_buf_binding_mgr.cpp b/icd/api/vert_buf_binding_mgr.cpp 
index c13e6681..7190f931 100644 --- a/icd/api/vert_buf_binding_mgr.cpp +++ b/icd/api/vert_buf_binding_mgr.cpp @@ -155,7 +155,7 @@ void VertBufBindingMgr::BindVertexBuffers( // a final partial element. Rounding down matches our current behavior for buffer views. if (pBinding->view.stride > 1) { - pBinding->view.range = Util::RoundUpToMultiple(pBinding->size, pBinding->view.stride); + pBinding->view.range = Util::RoundDownToMultiple(pBinding->size, pBinding->view.stride); } else { @@ -221,7 +221,7 @@ void VertBufBindingMgr::GraphicsPipelineChanged( // for a final partial element. Rounding down matches our current behavior for buffer views. if (pBinding->view.stride > 1) { - pBinding->view.range = Util::RoundUpToMultiple(pBinding->size, pBinding->view.stride); + pBinding->view.range = Util::RoundDownToMultiple(pBinding->size, pBinding->view.stride); } else { diff --git a/icd/api/vk_buffer.cpp b/icd/api/vk_buffer.cpp index b3f40101..52a425dc 100644 --- a/icd/api/vk_buffer.cpp +++ b/icd/api/vk_buffer.cpp @@ -46,7 +46,8 @@ namespace vk // based on its declared usage bits at create time. These masks come in handy when trying to decide optimal PAL // caches coherency flags during a pipeline barrier. void Buffer::CalcBarrierUsage( - VkBufferUsageFlags usage) + const Device* pDevice, + VkBufferUsageFlags usage) { m_inputCacheMask = 0; m_outputCacheMask = Pal::CoherCpu | Pal::CoherMemory; // Always allow CPU writes and memory writes @@ -61,6 +62,13 @@ void Buffer::CalcBarrierUsage( // Also need Pal::CoherShader here as vkCmdCopyQueryPoolResults uses a compute shader defined in the Vulkan // API layer when used with timestamp queries. m_outputCacheMask |= Pal::CoherCopy | Pal::CoherShader; + + // Buffer markers fall under the same PAL coherency rules as timestamp writes + if (pDevice->IsExtensionEnabled(DeviceExtensions::AMD_BUFFER_MARKER)) + { + m_inputCacheMask |= Pal::CoherTimestamp; + m_outputCacheMask |= Pal::CoherTimestamp; + } } if (usage & (VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) @@ -116,12 +124,12 @@ Buffer::Buffer( { if (pGpuMemory[deviceIdx] != nullptr) { - m_pGpuMemory[deviceIdx] = pGpuMemory[deviceIdx]; + m_pGpuMemory[deviceIdx] = pGpuMemory[deviceIdx]; m_gpuVirtAddr[deviceIdx] = pGpuMemory[deviceIdx]->Desc().gpuVirtAddr; } } - CalcBarrierUsage(usage); + CalcBarrierUsage(pDevice, usage); } // ===================================================================================================================== @@ -378,7 +386,7 @@ VkResult Buffer::GetMemoryRequirements( // however, we'll specify such an alignment requirement which should fit formatted buffer use // with any kind of format pMemoryRequirements->alignment = (ubUsageEnabled) ? ubRequiredAlignment : 4; - pMemoryRequirements->size = (!ubUsageEnabled) ? m_size : Util::RoundUpToMultiple(m_size, ubRequiredAlignment); + pMemoryRequirements->size = Util::RoundUpToMultiple(m_size, pMemoryRequirements->alignment); } // Allow all available memory types for buffers diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index f2ec4222..bd4805ad 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -2657,7 +2657,9 @@ void CmdBuffer::ResetEvent( // ===================================================================================================================== // Given a bitmask of VkAccessFlags, computes the representative PAL CacheCoherencyUsageFlags that will be written // in the srcCacheMask field of a pipeline BarrierTransition. 
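An aside on the two VK_AMD_buffer_marker touchpoints here (the Pal::CoherTimestamp plumbing in CalcBarrierUsage above and in the barrier conversion below): the extension lets an application drop small immediate writes into a buffer so that, after a device loss, it can see how far the GPU actually got. A minimal application-side sketch, not part of this patch; the device, cmdBuf, and markerBuffer handles are assumed to already exist and the extension to be enabled on the device:

    // Resolve the extension entry point added to the dispatch table by this patch.
    PFN_vkCmdWriteBufferMarkerAMD pfnWriteMarker = reinterpret_cast<PFN_vkCmdWriteBufferMarkerAMD>(
        vkGetDeviceProcAddr(device, "vkCmdWriteBufferMarkerAMD"));

    // Write marker 0x1 once all prior commands have drained past the bottom of the pipe;
    // after a hang, the last marker visible in markerBuffer brackets the failing command.
    pfnWriteMarker(cmdBuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, markerBuffer, 0, 0x1);

Because these writes land via the timestamp path, the CoherTimestamp bits added above keep barriers against marker buffers correct.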
-Pal::uint32 CmdBuffer::ConvertBarrierSrcAccessFlags(VkAccessFlags accessMask) +Pal::uint32 CmdBuffer::ConvertBarrierSrcAccessFlags( + const Device* pDevice, + VkAccessFlags accessMask) { Pal::uint32 coher = 0; @@ -2686,6 +2688,11 @@ Pal::uint32 CmdBuffer::ConvertBarrierSrcAccessFlags(VkAccessFlags accessMask) // Also need Pal::CoherShader here as vkCmdCopyQueryPoolResults uses a compute shader defined in the Vulkan // API layer when used with timestamp queries. coher |= Pal::CoherCopy | Pal::CoherResolve | Pal::CoherClear | Pal::CoherShader; + + if (pDevice->IsExtensionEnabled(DeviceExtensions::AMD_BUFFER_MARKER)) + { + coher |= Pal::CoherTimestamp; + } } if (accessMask & VK_ACCESS_MEMORY_WRITE_BIT) @@ -2705,7 +2712,9 @@ Pal::uint32 CmdBuffer::ConvertBarrierSrcAccessFlags(VkAccessFlags accessMask) // ===================================================================================================================== // Given a bitmask of VkAccessFlags, computes the representative PAL CacheCoherencyUsageFlags that will be written // in the dstCacheMask field of a pipeline BarrierTransition. -Pal::uint32 CmdBuffer::ConvertBarrierDstAccessFlags(VkAccessFlags accessMask) +Pal::uint32 CmdBuffer::ConvertBarrierDstAccessFlags( + const Device* pDevice, + VkAccessFlags accessMask) { // With the more loose memory barrier semantics introduced we practically have to always invalidate all relevant // caches. The complete set is limited based on the usage allowed by the resource at the caller side. @@ -2742,6 +2751,7 @@ Pal::uint32 CmdBuffer::ConvertBarrierDstAccessFlags(VkAccessFlags accessMask) // Convert src access and dst access mask to the PAL CacheCoherencyUsageFlags that will be written // in the srcCacheMask and dstCacheMask field of a pipeline BarrierTransition. void CmdBuffer::ConvertBarrierCacheFlags( + const Device* pDevice, VkAccessFlags srcAccess, VkAccessFlags dstAccess, uint32_t supportInputCacheMask, @@ -2749,7 +2759,7 @@ void CmdBuffer::ConvertBarrierCacheFlags( uint32_t barrierOptions, Pal::BarrierTransition* pResult) { - pResult->srcCacheMask = (supportOutputCacheMask != 0xFFFFFFFF) ? supportOutputCacheMask & ConvertBarrierSrcAccessFlags(srcAccess) : ConvertBarrierSrcAccessFlags(srcAccess); + pResult->srcCacheMask = supportOutputCacheMask & ConvertBarrierSrcAccessFlags(pDevice, srcAccess); // srccachemask is 0 for all read only source access like VK_ACCESS_*_READ_BIT // etc. hence, only validate against all input caches if we are going from write to any other access flag. @@ -2761,7 +2771,7 @@ void CmdBuffer::ConvertBarrierCacheFlags( } else { - pResult->dstCacheMask = (supportInputCacheMask != 0xFFFFFFFF) ? 
supportInputCacheMask & ConvertBarrierDstAccessFlags(dstAccess) : ConvertBarrierDstAccessFlags(dstAccess); + pResult->dstCacheMask = supportInputCacheMask & ConvertBarrierDstAccessFlags(pDevice, dstAccess); } } @@ -2837,7 +2847,13 @@ void CmdBuffer::ExecuteBarriers( for (uint32_t i = 0; i < memBarrierCount; ++i) { - ConvertBarrierCacheFlags(pMemoryBarriers[i].srcAccessMask, pMemoryBarriers[i].dstAccessMask, 0xFFFFFFFF, 0xFFFFFFFF, barrierOptions, pNextMain); + ConvertBarrierCacheFlags( + m_pDevice, + pMemoryBarriers[i].srcAccessMask, + pMemoryBarriers[i].dstAccessMask, + 0xFFFFFFFF, 0xFFFFFFFF, + barrierOptions, + pNextMain); pNextMain->imageInfo.pImage = nullptr; VK_ASSERT(pMemoryBarriers[i].pNext == nullptr); @@ -2872,7 +2888,14 @@ void CmdBuffer::ExecuteBarriers( uint32_t supportInputCoherMask = pBuffer->GetSupportedInputCoherMask(); uint32_t supportOutputCoherMask = pBuffer->GetSupportedOutputCoherMask(); - ConvertBarrierCacheFlags(pBufferMemoryBarriers[i].srcAccessMask, pBufferMemoryBarriers[i].dstAccessMask, supportInputCoherMask, supportOutputCoherMask, barrierOptions, pNextMain); + ConvertBarrierCacheFlags( + m_pDevice, + pBufferMemoryBarriers[i].srcAccessMask, + pBufferMemoryBarriers[i].dstAccessMask, + supportInputCoherMask, + supportOutputCoherMask, + barrierOptions, + pNextMain); pNextMain->imageInfo.pImage = nullptr; @@ -2909,7 +2932,14 @@ void CmdBuffer::ExecuteBarriers( uint32_t supportOutputCoherMask = pImage->GetSupportedOutputCoherMask(); Pal::BarrierTransition barrierTransition = { 0 }; - ConvertBarrierCacheFlags(pImageMemoryBarriers[i].srcAccessMask, pImageMemoryBarriers[i].dstAccessMask, supportInputCoherMask, supportOutputCoherMask, barrierOptions, &barrierTransition); + ConvertBarrierCacheFlags( + m_pDevice, + pImageMemoryBarriers[i].srcAccessMask, + pImageMemoryBarriers[i].dstAccessMask, + supportInputCoherMask, + supportOutputCoherMask, + barrierOptions, + &barrierTransition); pNextMain->imageInfo.pImage = nullptr; @@ -4109,6 +4139,7 @@ void CmdBuffer::RPSyncPoint( pGlobalTransition->imageInfo.pImage = nullptr; ConvertBarrierCacheFlags( + m_pDevice, syncPoint.barrier.srcAccessMask, syncPoint.barrier.dstAccessMask, 0xffffffff, @@ -5016,6 +5047,30 @@ void CmdBuffer::DbgCmdBarrier(bool preCmd) } #endif +// ===================================================================================================================== +void CmdBuffer::WriteBufferMarker( + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker) +{ + const Buffer* pDestBuffer = Buffer::ObjectFromHandle(dstBuffer); + const Pal::HwPipePoint pipePoint = VkToPalSrcPipePointForMarkers(pipelineStage, m_palEngineType); + + utils::IterateMask deviceGroup(m_palDeviceMask); + + while (deviceGroup.Iterate()) + { + const uint32_t deviceIdx = deviceGroup.Index(); + + PalCmdBuffer(deviceIdx)->CmdWriteImmediate( + pipePoint, + marker, + Pal::ImmediateDataWidth::ImmediateData32Bit, + pDestBuffer->GpuVirtAddr(deviceIdx) + dstOffset); + } +} + // ===================================================================================================================== RenderPassInstanceState::RenderPassInstanceState( PalAllocator* pAllocator) @@ -5780,6 +5835,18 @@ VKAPI_ATTR void VKAPI_CALL vkCmdSetSampleLocationsEXT( { ApiCmdBuffer::ObjectFromHandle(commandBuffer)->SetSampleLocations(pSampleLocationsInfo); } + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL 
vkCmdWriteBufferMarkerAMD( + VkCommandBuffer commandBuffer, + VkPipelineStageFlagBits pipelineStage, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + uint32_t marker) +{ + ApiCmdBuffer::ObjectFromHandle(commandBuffer)->WriteBufferMarker(pipelineStage, dstBuffer, dstOffset, marker); +} + } // namespace entry } // namespace vk diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index 2251b34a..515781e8 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp @@ -393,6 +393,466 @@ VK_TO_PAL_DECL_LOOKUP_TABLE(IMAGE_TILING, ImageTiling VK_TO_PAL_DECL_LOOKUP_TABLE(COMPONENT_SWIZZLE, ChannelSwizzle ) VK_TO_PAL_DECL_LOOKUP_TABLE(PIPELINE_BIND_POINT, PipelineBindPoint ) +// ===================================================================================================================== +// Converts a PAL::Result value to an equivalent string name +const char* PalResultName( + Pal::Result result) +{ + const char* resultName = nullptr; + + switch (result) + { + case Pal::Result::TooManyFlippableAllocations: + resultName = "TooManyFlippableAllocations"; + break; + + case Pal::Result::PresentOccluded: + resultName = "PresentOccluded"; + break; + + case Pal::Result::Unsupported: + resultName = "Unsupported"; + break; + + case Pal::Result::NotReady: + resultName = "NotReady"; + break; + + case Pal::Result::Timeout: + resultName = "Timeout"; + break; + + case Pal::Result::ErrorFenceNeverSubmitted: + resultName = "ErrorFenceNeverSubmitted"; + break; + + case Pal::Result::EventSet: + resultName = "EventSet"; + break; + + case Pal::Result::EventReset: + resultName = "EventReset"; + break; + + case Pal::Result::ErrorInitializationFailed: + resultName = "ErrorInitializationFailed"; + break; + + case Pal::Result::ErrorOutOfMemory: + resultName = "ErrorOutOfMemory"; + break; + + case Pal::Result::ErrorOutOfGpuMemory: + resultName = "ErrorOutOfGpuMemory"; + break; + + case Pal::Result::ErrorDeviceLost: + resultName = "ErrorDeviceLost"; + break; + + case Pal::Result::ErrorIncompatibleLibrary: + resultName = "ErrorIncompatibleLibrary"; + break; + + case Pal::Result::ErrorGpuMemoryMapFailed: + resultName = "ErrorGpuMemoryMapFailed"; + break; + + case Pal::Result::ErrorNotMappable: + resultName = "ErrorNotMappable"; + break; + + case Pal::Result::ErrorUnknown: + resultName = "ErrorUnknown"; + break; + + case Pal::Result::ErrorUnavailable: + resultName = "ErrorUnavailable"; + break; + + case Pal::Result::ErrorInvalidPointer: + resultName = "ErrorInvalidPointer"; + break; + + case Pal::Result::ErrorInvalidValue: + resultName = "ErrorInvalidValue"; + break; + + case Pal::Result::ErrorInvalidOrdinal: + resultName = "ErrorInvalidOrdinal"; + break; + + case Pal::Result::ErrorInvalidMemorySize: + resultName = "ErrorInvalidMemorySize"; + break; + + case Pal::Result::ErrorInvalidFlags: + resultName = "ErrorInvalidFlags"; + break; + + case Pal::Result::ErrorInvalidAlignment: + resultName = "ErrorInvalidAlignment"; + break; + + case Pal::Result::ErrorInvalidFormat: + resultName = "ErrorInvalidFormat"; + break; + + case Pal::Result::ErrorInvalidImage: + resultName = "ErrorInvalidImage"; + break; + + case Pal::Result::ErrorInvalidDescriptorSetData: + resultName = "ErrorInvalidDescriptorSetData"; + break; + + case Pal::Result::ErrorInvalidQueueType: + resultName = "ErrorInvalidQueueType"; + break; + + case Pal::Result::ErrorUnsupportedShaderIlVersion: + resultName = "ErrorUnsupportedShaderIlVersion"; + break; + + case Pal::Result::ErrorBadShaderCode: + resultName = "ErrorBadShaderCode"; + break; + + case 
Pal::Result::ErrorBadPipelineData: + resultName = "ErrorBadPipelineData"; + break; + + case Pal::Result::ErrorGpuMemoryUnmapFailed: + resultName = "ErrorGpuMemoryUnmapFailed"; + break; + + case Pal::Result::ErrorIncompatibleDevice: + resultName = "ErrorIncompatibleDevice"; + break; + + case Pal::Result::ErrorBuildingCommandBuffer: + resultName = "ErrorBuildingCommandBuffer"; + break; + + case Pal::Result::ErrorGpuMemoryNotBound: + resultName = "ErrorGpuMemoryNotBound"; + break; + + case Pal::Result::ErrorImageNotShaderAccessible: + resultName = "ErrorImageNotShaderAccessible"; + break; + + case Pal::Result::ErrorInvalidUsageForFormat: + resultName = "ErrorInvalidUsageForFormat"; + break; + + case Pal::Result::ErrorFormatIncompatibleWithImageUsage: + resultName = "ErrorFormatIncompatibleWithImageUsage"; + break; + + case Pal::Result::ErrorThreadGroupTooBig: + resultName = "ErrorThreadGroupTooBig"; + break; + + case Pal::Result::ErrorInvalidMsaaMipLevels: + resultName = "ErrorInvalidMsaaMipLevels"; + break; + + case Pal::Result::ErrorInvalidSampleCount: + resultName = "ErrorInvalidSampleCount"; + break; + + case Pal::Result::ErrorInvalidImageArraySize: + resultName = "ErrorInvalidImageArraySize"; + break; + + case Pal::Result::ErrorInvalid3dImageArraySize: + resultName = "ErrorInvalid3dImageArraySize"; + break; + + case Pal::Result::ErrorInvalidImageWidth: + resultName = "ErrorInvalidImageWidth"; + break; + + case Pal::Result::ErrorInvalidImageHeight: + resultName = "ErrorInvalidImageHeight"; + break; + + case Pal::Result::ErrorInvalidImageDepth: + resultName = "ErrorInvalidImageDepth"; + break; + + case Pal::Result::ErrorInvalidMipCount: + resultName = "ErrorInvalidMipCount"; + break; + + case Pal::Result::ErrorInvalidBaseMipLevel: + resultName = "ErrorInvalidBaseMipLevel"; + break; + + case Pal::Result::ErrorInvalidViewArraySize: + resultName = "ErrorInvalidViewArraySize"; + break; + + case Pal::Result::ErrorInvalidViewBaseSlice: + resultName = "ErrorInvalidViewBaseSlice"; + break; + + case Pal::Result::ErrorInsufficientImageArraySize: + resultName = "ErrorInsufficientImageArraySize"; + break; + + case Pal::Result::ErrorCubemapNonSquareFaceSize: + resultName = "ErrorCubemapNonSquareFaceSize"; + break; + + case Pal::Result::ErrorInvalidImageTargetUsage: + resultName = "ErrorInvalidImageTargetUsage"; + break; + + case Pal::Result::ErrorMissingDepthStencilUsage: + resultName = "ErrorMissingDepthStencilUsage"; + break; + + case Pal::Result::ErrorInvalidColorTargetType: + resultName = "ErrorInvalidColorTargetType"; + break; + + case Pal::Result::ErrorInvalidDepthTargetType: + resultName = "ErrorInvalidDepthTargetType"; + break; + + case Pal::Result::ErrorInvalidMsaaType: + resultName = "ErrorInvalidMsaaType"; + break; + + case Pal::Result::ErrorInvalidCompressedImageType: + resultName = "ErrorInvalidCompressedImageType"; + break; + + case Pal::Result::ErrorImageAspectUnavailable: + resultName = "ErrorImageAspectUnavailable"; + break; + + case Pal::Result::ErrorInvalidFormatSwizzle: + resultName = "ErrorInvalidFormatSwizzle"; + break; + + case Pal::Result::ErrorViewTypeIncompatibleWithImageType: + resultName = "ErrorViewTypeIncompatibleWithImageType"; + break; + + case Pal::Result::ErrorCubemapIncompatibleWithMsaa: + resultName = "ErrorCubemapIncompatibleWithMsaa"; + break; + + case Pal::Result::ErrorInvalidMsaaFormat: + resultName = "ErrorInvalidMsaaFormat"; + break; + + case Pal::Result::ErrorFormatIncompatibleWithImageFormat: + resultName = "ErrorFormatIncompatibleWithImageFormat"; + break; + 
+ case Pal::Result::ErrorFormatIncompatibleWithImageAspect: + resultName = "ErrorFormatIncompatibleWithImageAspect"; + break; + + case Pal::Result::ErrorFullscreenUnavailable: + resultName = "ErrorFullscreenUnavailable"; + break; + + case Pal::Result::ErrorScreenRemoved: + resultName = "ErrorScreenRemoved"; + break; + + case Pal::Result::ErrorIncompatibleScreenMode: + resultName = "ErrorIncompatibleScreenMode"; + break; + + case Pal::Result::ErrorMultiDevicePresentFailed: + resultName = "ErrorMultiDevicePresentFailed"; + break; + + case Pal::Result::ErrorWindowedPresentUnavailable: + resultName = "ErrorWindowedPresentUnavailable"; + break; + + case Pal::Result::ErrorInvalidResolution: + resultName = "ErrorInvalidResolution"; + break; + + case Pal::Result::ErrorInvalidObjectType: + resultName = "ErrorInvalidObjectType"; + break; + + case Pal::Result::ErrorTooManyMemoryReferences: + resultName = "ErrorTooManyMemoryReferences"; + break; + + case Pal::Result::ErrorNotShareable: + resultName = "ErrorNotShareable"; + break; + + case Pal::Result::ErrorImageFmaskUnavailable: + resultName = "ErrorImageFmaskUnavailable"; + break; + + case Pal::Result::ErrorPrivateScreenRemoved: + resultName = "ErrorPrivateScreenRemoved"; + break; + + case Pal::Result::ErrorPrivateScreenUsed: + resultName = "ErrorPrivateScreenUsed"; + break; + + case Pal::Result::ErrorTooManyPrivateDisplayImages: + resultName = "ErrorTooManyPrivateDisplayImages"; + break; + + case Pal::Result::ErrorPrivateScreenNotEnabled: + resultName = "ErrorPrivateScreenNotEnabled"; + break; + + default: + VK_NOT_IMPLEMENTED; + resultName = "??"; + break; + } + + return resultName; +} + +// ===================================================================================================================== +// Converts a VkResult value to an equivalent string name +const char* VkResultName( + VkResult result) +{ + const char* errName = nullptr; + + switch (result) + { + case VkResult::VK_SUCCESS: + errName = "VK_SUCCESS"; + break; + + case VkResult::VK_NOT_READY: + errName = "VK_NOT_READY"; + break; + + case VkResult::VK_TIMEOUT: + errName = "VK_TIMEOUT"; + break; + + case VkResult::VK_EVENT_SET: + errName = "VK_EVENT_SET"; + break; + + case VkResult::VK_EVENT_RESET: + errName = "VK_EVENT_RESET"; + break; + + case VkResult::VK_INCOMPLETE: + errName = "VK_INCOMPLETE"; + break; + + case VkResult::VK_ERROR_OUT_OF_HOST_MEMORY: + errName = "VK_ERROR_OUT_OF_HOST_MEMORY"; + break; + + case VkResult::VK_ERROR_OUT_OF_DEVICE_MEMORY: + errName = "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + break; + + case VkResult::VK_ERROR_INITIALIZATION_FAILED: + errName = "VK_ERROR_INITIALIZATION_FAILED"; + break; + + case VkResult::VK_ERROR_DEVICE_LOST: + errName = "VK_ERROR_DEVICE_LOST"; + break; + + case VkResult::VK_ERROR_MEMORY_MAP_FAILED: + errName = "VK_ERROR_MEMORY_MAP_FAILED"; + break; + + case VkResult::VK_ERROR_LAYER_NOT_PRESENT: + errName = "VK_ERROR_LAYER_NOT_PRESENT"; + break; + + case VkResult::VK_ERROR_EXTENSION_NOT_PRESENT: + errName = "VK_ERROR_EXTENSION_NOT_PRESENT"; + break; + + case VkResult::VK_ERROR_FEATURE_NOT_PRESENT: + errName = "VK_ERROR_FEATURE_NOT_PRESENT"; + break; + + case VkResult::VK_ERROR_INCOMPATIBLE_DRIVER: + errName = "VK_ERROR_INCOMPATIBLE_DRIVER"; + break; + + case VkResult::VK_ERROR_TOO_MANY_OBJECTS: + errName = "VK_ERROR_TOO_MANY_OBJECTS"; + break; + + case VkResult::VK_ERROR_FORMAT_NOT_SUPPORTED: + errName = "VK_ERROR_FORMAT_NOT_SUPPORTED"; + break; + + case VkResult::VK_ERROR_FRAGMENTED_POOL: + errName = "VK_ERROR_FRAGMENTED_POOL"; + 
break; + + case VkResult::VK_ERROR_OUT_OF_POOL_MEMORY_KHR: + errName = "VK_ERROR_OUT_OF_POOL_MEMORY_KHR"; + break; + + case VkResult::VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR: + errName = "VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR"; + break; + + case VkResult::VK_ERROR_SURFACE_LOST_KHR: + errName = "VK_ERROR_SURFACE_LOST_KHR"; + break; + + case VkResult::VK_ERROR_NATIVE_WINDOW_IN_USE_KHR: + errName = "VK_ERROR_NATIVE_WINDOW_IN_USE_KHR"; + break; + + case VkResult::VK_SUBOPTIMAL_KHR: + errName = "VK_SUBOPTIMAL_KHR"; + break; + + case VkResult::VK_ERROR_OUT_OF_DATE_KHR: + errName = "VK_ERROR_OUT_OF_DATE_KHR"; + break; + + case VkResult::VK_ERROR_INCOMPATIBLE_DISPLAY_KHR: + errName = "VK_ERROR_INCOMPATIBLE_DISPLAY_KHR"; + break; + + case VkResult::VK_ERROR_VALIDATION_FAILED_EXT: + errName = "VK_ERROR_VALIDATION_FAILED_EXT"; + break; + + case VkResult::VK_ERROR_INVALID_SHADER_NV: + errName = "VK_ERROR_INVALID_SHADER_NV"; + break; + + case VkResult::VK_ERROR_NOT_PERMITTED_EXT: + errName = "VK_ERROR_NOT_PERMITTED_EXT"; + break; + + default: + VK_NOT_IMPLEMENTED; + errName = "??"; + break; + }; + + return errName; +} + // ===================================================================================================================== // Converts a non-Success PAL result to an equivalent VK error VkResult PalToVkError( @@ -400,49 +860,64 @@ VkResult PalToVkError( { VK_ASSERT(result != Pal::Result::Success); + VkResult vkResult = VK_SUCCESS; + switch (result) { // These PAL error codes currently aren't handled specially and they indicate success otherwise case Pal::Result::TooManyFlippableAllocations: case Pal::Result::PresentOccluded: - return VK_SUCCESS; + vkResult = VK_SUCCESS; + break; case Pal::Result::Unsupported: - return VK_ERROR_FORMAT_NOT_SUPPORTED; + vkResult = VK_ERROR_FORMAT_NOT_SUPPORTED; + break; case Pal::Result::NotReady: - return VK_NOT_READY; + vkResult = VK_NOT_READY; + break; case Pal::Result::Timeout: case Pal::Result::ErrorFenceNeverSubmitted: - return VK_TIMEOUT; + vkResult = VK_TIMEOUT; + break; case Pal::Result::EventSet: - return VK_EVENT_SET; + vkResult = VK_EVENT_SET; + break; case Pal::Result::EventReset: - return VK_EVENT_RESET; + vkResult = VK_EVENT_RESET; + break; case Pal::Result::ErrorInitializationFailed: - return VK_ERROR_INITIALIZATION_FAILED; + vkResult = VK_ERROR_INITIALIZATION_FAILED; + break; case Pal::Result::ErrorOutOfMemory: - return VK_ERROR_OUT_OF_HOST_MEMORY; + vkResult = VK_ERROR_OUT_OF_HOST_MEMORY; + break; case Pal::Result::ErrorOutOfGpuMemory: - return VK_ERROR_OUT_OF_DEVICE_MEMORY; + vkResult = VK_ERROR_OUT_OF_DEVICE_MEMORY; + break; case Pal::Result::ErrorDeviceLost: - return VK_ERROR_DEVICE_LOST; + vkResult = VK_ERROR_DEVICE_LOST; + break; case Pal::Result::ErrorIncompatibleLibrary: - return VK_ERROR_INCOMPATIBLE_DRIVER; + vkResult = VK_ERROR_INCOMPATIBLE_DRIVER; + break; case Pal::Result::ErrorGpuMemoryMapFailed: - return VK_ERROR_MEMORY_MAP_FAILED; + vkResult = VK_ERROR_MEMORY_MAP_FAILED; + break; case Pal::Result::ErrorNotMappable: - return VK_ERROR_MEMORY_MAP_FAILED; + vkResult = VK_ERROR_MEMORY_MAP_FAILED; + break; case Pal::Result::ErrorUnknown: case Pal::Result::ErrorUnavailable: @@ -499,7 +974,8 @@ VkResult PalToVkError( case Pal::Result::ErrorMultiDevicePresentFailed: case Pal::Result::ErrorWindowedPresentUnavailable: case Pal::Result::ErrorInvalidResolution: - return VK_ERROR_INITIALIZATION_FAILED; + vkResult = VK_ERROR_INITIALIZATION_FAILED; + break; case Pal::Result::ErrorInvalidObjectType: // This is only generated by 
RemapVirtualMemoryPages currently which is only used @@ -520,8 +996,17 @@ VkResult PalToVkError( // There's no private screen support yet. Fall through to the default path. default: VK_NOT_IMPLEMENTED; - return VK_ERROR_INITIALIZATION_FAILED; + vkResult = VK_ERROR_INITIALIZATION_FAILED; + break; } + +#if PAL_ENABLE_PRINTS_ASSERTS + const char* palErrorName = PalResultName(result); + const char* vkErrorName = VkResultName(vkResult); + PAL_DPINFO("Vulkan error: %s(%d), from Pal error: Pal::Result::%s(%d)", vkErrorName, vkResult, palErrorName, result); +#endif + + return vkResult; } // ===================================================================================================================== diff --git a/icd/api/vk_debug_report.cpp b/icd/api/vk_debug_report.cpp new file mode 100644 index 00000000..2ea76d5a --- /dev/null +++ b/icd/api/vk_debug_report.cpp @@ -0,0 +1,175 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2018 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#include "include/vk_debug_report.h" +#include "include/vk_instance.h" +#include "palDbgPrint.h" + +namespace vk +{ +// ===================================================================================================================== +// Create a DebugReportCallback object. 
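From the application's point of view, the object constructed by Create() below comes from standard VK_EXT_debug_report usage. A self-contained sketch (the instance handle is assumed to exist, MyReportCallback is an illustrative name, and printf needs <cstdio>):

    static VKAPI_ATTR VkBool32 VKAPI_CALL MyReportCallback(
        VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, uint64_t object,
        size_t location, int32_t messageCode, const char* pLayerPrefix, const char* pMessage, void* pUserData)
    {
        printf("[%s] %s\n", pLayerPrefix, pMessage);
        return VK_FALSE; // VK_FALSE: do not abort the call that triggered the report
    }

    VkDebugReportCallbackCreateInfoEXT info = {};
    info.sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT;
    info.flags       = VK_DEBUG_REPORT_WARNING_BIT_EXT | VK_DEBUG_REPORT_ERROR_BIT_EXT;
    info.pfnCallback = MyReportCallback;

    // The entry point is resolved dynamically because it belongs to an instance extension.
    PFN_vkCreateDebugReportCallbackEXT pfnCreate = reinterpret_cast<PFN_vkCreateDebugReportCallbackEXT>(
        vkGetInstanceProcAddr(instance, "vkCreateDebugReportCallbackEXT"));

    VkDebugReportCallbackEXT callback = VK_NULL_HANDLE;
    VkResult result = pfnCreate(instance, &info, nullptr, &callback);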
+VkResult DebugReportCallback::Create( + Instance* pInstance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback) +{ + VkResult result = VK_SUCCESS; + + void* pSystemMem = pAllocator->pfnAllocation( + pAllocator->pUserData, + sizeof(DebugReportCallback), + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (pSystemMem == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + if (result == VK_SUCCESS) + { + VK_PLACEMENT_NEW(pSystemMem) DebugReportCallback(); + + *pCallback = DebugReportCallback::HandleFromVoidPointer(pSystemMem); + + result = pInstance->RegisterDebugCallback(DebugReportCallback::ObjectFromHandle(*pCallback)); + + if (result == VK_SUCCESS) + { + DebugReportCallback::ObjectFromHandle(*pCallback)->m_createInfo = *pCreateInfo; + } + else + { + Util::Destructor(DebugReportCallback::ObjectFromHandle(*pCallback)); + pAllocator->pfnFree(pAllocator->pUserData, DebugReportCallback::ObjectFromHandle(*pCallback)); + } + } + + return result; +} + +// ===================================================================================================================== +// Destroy a DebugReportCallback object. +void DebugReportCallback::Destroy( + Instance* pInstance, + const VkAllocationCallbacks* pAllocator) +{ + pInstance->UnregisterDebugCallback(this); + + Util::Destructor(this); + + // Free memory + pAllocator->pfnFree(pAllocator->pUserData, this); +} + +// ===================================================================================================================== +// Inject a message into the debug stream from the Debug Report Callback. +void DebugReportCallback::Message( + Instance* pInstance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage) +{ + pInstance->CallExternalCallbacks(flags, + objectType, + object, + location, + messageCode, + pLayerPrefix, + pMessage); +} + +// ===================================================================================================================== +// Get the flags for this callback +VkDebugReportFlagsEXT DebugReportCallback::GetFlags() +{ + return m_createInfo.flags; +} + +// ===================================================================================================================== +// Get the external callback function pointer for this callback +PFN_vkDebugReportCallbackEXT DebugReportCallback::GetCallbackFunc() +{ + return m_createInfo.pfnCallback; +} + +// ===================================================================================================================== +// Get the client-provided user data pointer for this callback +void* DebugReportCallback::GetUserData() +{ + return m_createInfo.pUserData; +} + +namespace entry +{ +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDebugReportCallbackEXT( + VkInstance instance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback) +{ + Instance* pInstance = Instance::ObjectFromHandle(instance); + + const VkAllocationCallbacks* pAllocCB = pAllocator ? 
pAllocator : pInstance->GetAllocCallbacks(); + + return DebugReportCallback::Create(pInstance, pCreateInfo, pAllocCB, pCallback); +} + +VKAPI_ATTR void VKAPI_CALL vkDestroyDebugReportCallbackEXT( + VkInstance instance, + VkDebugReportCallbackEXT callback, + const VkAllocationCallbacks* pAllocator) +{ + Instance* pInstance = Instance::ObjectFromHandle(instance); + + const VkAllocationCallbacks* pAllocCB = pAllocator ? pAllocator : pInstance->GetAllocCallbacks(); + + DebugReportCallback::ObjectFromHandle(callback)->Destroy(pInstance, pAllocCB); +} + +VKAPI_ATTR void VKAPI_CALL vkDebugReportMessageEXT( + VkInstance instance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage) +{ + Instance* pInstance = Instance::ObjectFromHandle(instance); + + pInstance->CallExternalCallbacks(flags, objectType, object, location, messageCode, pLayerPrefix, pMessage); +} + +} // namespace entry + +} // namespace vk diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index 0d5c20c9..d00dc160 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp @@ -106,7 +106,7 @@ void DescriptorSet::Reassign( // In this case we also have to copy the immutable sampler data from the descriptor set layout to the // descriptor set's appropriate memory locations. - InitImmutableDescriptors(numPalDevices); + InitImmutableDescriptors(pLayout, numPalDevices); } else { @@ -126,19 +126,23 @@ void DescriptorSet::Reassign( // ===================================================================================================================== // Initialize immutable descriptor data in the descriptor set. -void DescriptorSet::InitImmutableDescriptors(uint32_t numPalDevices) +void DescriptorSet::InitImmutableDescriptors( + const DescriptorSetLayout* pLayout, + uint32_t numPalDevices) { - const size_t imageDescDwSize = m_pLayout->VkDevice()->GetProperties().descriptorSizes.imageView / sizeof(uint32_t); - const size_t samplerDescSize = m_pLayout->VkDevice()->GetProperties().descriptorSizes.sampler; + VK_ASSERT(m_pLayout == pLayout); + + const size_t imageDescDwSize = pLayout->VkDevice()->GetProperties().descriptorSizes.imageView / sizeof(uint32_t); + const size_t samplerDescSize = pLayout->VkDevice()->GetProperties().descriptorSizes.sampler; - uint32_t immutableSamplersLeft = m_pLayout->Info().imm.numImmutableSamplers; + uint32_t immutableSamplersLeft = pLayout->Info().imm.numImmutableSamplers; uint32_t binding = 0; - uint32_t* pSrcData = m_pLayout->Info().imm.pImmutableSamplerData; + uint32_t* pSrcData = pLayout->Info().imm.pImmutableSamplerData; while (immutableSamplersLeft > 0) { - const DescriptorSetLayout::BindingInfo& bindingInfo = m_pLayout->Info().bindings[binding]; + const DescriptorSetLayout::BindingInfo& bindingInfo = pLayout->Binding(binding); uint32_t desCount = bindingInfo.info.descriptorCount; if (bindingInfo.imm.dwSize > 0) diff --git a/icd/api/vk_descriptor_set_layout.cpp b/icd/api/vk_descriptor_set_layout.cpp index 2f448cdf..d46a2985 100644 --- a/icd/api/vk_descriptor_set_layout.cpp +++ b/icd/api/vk_descriptor_set_layout.cpp @@ -282,7 +282,8 @@ void DescriptorSetLayout::ConvertImmutableInfo( VkResult DescriptorSetLayout::ConvertCreateInfo( const Device* pDevice, const VkDescriptorSetLayoutCreateInfo* pIn, - CreateInfo* pOut) + CreateInfo* pOut, + BindingInfo* pOutBindings) { if (pIn == nullptr) { @@ -324,7 +325,7 @@ VkResult 
DescriptorSetLayout::ConvertCreateInfo( for (uint32_t inIndex = 0; inIndex < pInfo->bindingCount; ++inIndex) { const VkDescriptorSetLayoutBinding & currentBinding = pInfo->pBindings[inIndex]; - pOut->bindings[currentBinding.binding].info = currentBinding; + pOutBindings[currentBinding.binding].info = currentBinding; } // Now iterate over our output array to convert the binding info. Any gaps in @@ -332,7 +333,7 @@ VkResult DescriptorSetLayout::ConvertCreateInfo( // should be safe to call ConvertBindingInfo on those as well. for (uint32_t bindingNumber = 0; bindingNumber < pOut->count; ++bindingNumber) { - BindingInfo* pBinding = &pOut->bindings[bindingNumber]; + BindingInfo* pBinding = &pOutBindings[bindingNumber]; // Determine the alignment requirement of descriptors in dwords. uint32_t descAlignmentInDw = pDevice->GetProperties().descriptorSizes.alignment / sizeof(uint32_t); @@ -441,10 +442,10 @@ VkResult DescriptorSetLayout::Create( info.count = bindingCount; // Set the bindings array to the appropriate location within the allocated memory - info.bindings = reinterpret_cast(reinterpret_cast(pSysMem) + apiSize); + BindingInfo* pBindings = reinterpret_cast(reinterpret_cast(pSysMem) + apiSize); // Also memset it as not all bindings may be actually used - memset(info.bindings, 0, bindingInfoAuxSize); + memset(pBindings, 0, bindingInfoAuxSize); // Set the base pointer of the immutable sampler data to the appropriate location within the allocated memory info.imm.pImmutableSamplerData = reinterpret_cast(Util::VoidPtrInc(pSysMem, apiSize + bindingInfoAuxSize)); @@ -453,7 +454,8 @@ VkResult DescriptorSetLayout::Create( VkResult result = ConvertCreateInfo( pDevice, pCreateInfo, - &info); + &info, + pBindings); if (result != VK_SUCCESS) { diff --git a/icd/api/vk_dispatch.cpp b/icd/api/vk_dispatch.cpp index 7742de73..c58ca16e 100644 --- a/icd/api/vk_dispatch.cpp +++ b/icd/api/vk_dispatch.cpp @@ -61,6 +61,7 @@ #include "include/vk_shader.h" #include "include/vk_surface.h" #include "include/vk_swapchain.h" +#include "include/vk_debug_report.h" #include @@ -447,6 +448,11 @@ const DispatchTableEntry g_StandardDispatchTable[] = PRIMARY_DISPATCH_ENTRY( vkGetPhysicalDeviceMultisamplePropertiesEXT ), PRIMARY_DISPATCH_ENTRY( vkGetPhysicalDeviceExternalFencePropertiesKHR ), + PRIMARY_DISPATCH_ENTRY( vkCreateDebugReportCallbackEXT ), + PRIMARY_DISPATCH_ENTRY( vkDestroyDebugReportCallbackEXT ), + PRIMARY_DISPATCH_ENTRY( vkDebugReportMessageEXT ), + + PRIMARY_DISPATCH_ENTRY( vkCmdWriteBufferMarkerAMD ), VK_DISPATCH_TABLE_END() }; diff --git a/icd/api/vk_instance.cpp b/icd/api/vk_instance.cpp index 53d1278f..26d597c0 100644 --- a/icd/api/vk_instance.cpp +++ b/icd/api/vk_instance.cpp @@ -53,6 +53,7 @@ #include "palDevice.h" #include "palPlatform.h" #include "palOglPresent.h" +#include "palListImpl.h" #include @@ -85,7 +86,8 @@ Instance::Instance( #endif m_screenCount(0), m_pScreenStorage(nullptr), - m_pDevModeMgr(nullptr) + m_pDevModeMgr(nullptr), + m_debugReportCallbacks(&m_palAllocator) #if PAL_ENABLE_PRINTS_ASSERTS , m_dispatchTableQueryCount(0) #endif @@ -266,6 +268,13 @@ VkResult Instance::Init( return VK_ERROR_OUT_OF_HOST_MEMORY; } + // Initialize mutexes used for debug report extension before registering the callback with the Platform. 
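One note before the mutex initialization and log-callback registration below: PAL's logging hook is a C-style callback, so the driver uses the usual static-thunk pattern, passing this as the opaque client pointer and casting it back inside a static member (see LogCallback further down). A generic sketch of the pattern, with illustrative names only:

    #include <cstdarg>

    class Owner
    {
    public:
        // Static function with a C-compatible signature; pClientData smuggles 'this' through the C API.
        static void Thunk(void* pClientData, unsigned level, const char* pFormat, va_list args)
        {
            static_cast<Owner*>(pClientData)->Handle(level, pFormat, args);
        }

    private:
        void Handle(unsigned level, const char* pFormat, va_list args);
    };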
+ if ((m_logCallbackInternalOnlyMutex.Init() != Pal::Result::Success) || + (m_logCallbackInternalExternalMutex.Init() != Pal::Result::Success)) + { + return VK_ERROR_INITIALIZATION_FAILED; + } + // Thunk PAL's memory allocator callbacks to our own const Util::AllocCallbacks allocCb = { @@ -276,6 +285,15 @@ VkResult Instance::Init( Pal::PlatformCreateInfo createInfo = { 0 }; createInfo.pAllocCb = &allocCb; + + const Util::LogCallbackInfo callbackInfo = + { + this, + &LogCallback + }; + + createInfo.pLogInfo = &callbackInfo; + createInfo.pSettingsPath = "/etc/amd"; // Switch to "null" GPU mode if requested @@ -584,6 +602,7 @@ const InstanceExtensions::Supported& Instance::GetSupportedExtensions() supportedExtensions.AddExtension(VK_INSTANCE_EXTENSION(KHR_EXTERNAL_SEMAPHORE_CAPABILITIES)); supportedExtensions.AddExtension(VK_INSTANCE_EXTENSION(KHR_EXTERNAL_FENCE_CAPABILITIES)); + supportedExtensions.AddExtension(VK_INSTANCE_EXTENSION(EXT_DEBUG_REPORT)); supportedExtensionsPopulated = true; } @@ -821,6 +840,164 @@ VkResult Instance::QueryApplicationProfile(RuntimeSettings* pRuntimeSettings) } #endif +// ===================================================================================================================== +// Callback function used to route debug prints to the VK_EXT_debug_report extension +void PAL_STDCALL Instance::LogCallback( + void* pClientData, + Pal::uint32 level, + Pal::uint64 categoryMask, + const char* pFormat, + va_list args) +{ + Instance* pInstance = reinterpret_cast<Instance*>(pClientData); + pInstance->LogMessage(level, categoryMask, pFormat, args); +} + +// ===================================================================================================================== +// Add the given Debug Report Callback to the instance. +VkResult Instance::RegisterDebugCallback( + DebugReportCallback* pCallback) +{ + VkResult result = VK_SUCCESS; + + Pal::Result palResult = m_debugReportCallbacks.PushBack(pCallback); + + if (palResult == Pal::Result::Success) + { + result = VK_SUCCESS; + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + +// ===================================================================================================================== +// Remove the given Debug Report Callback from the instance.
+void Instance::UnregisterDebugCallback( + DebugReportCallback* pCallback) +{ + auto it = m_debugReportCallbacks.Begin(); + + DebugReportCallback* element = *it.Get(); + + while (element != nullptr) + { + if (pCallback == element) + { + m_debugReportCallbacks.Erase(&it); + + // Each element should only be in the list once; break out of loop once found + element = nullptr; + } + else + { + it.Next(); + element = *it.Get(); + } + } +} + +// ===================================================================================================================== +// Convert log message data to match the format of the external callback, then call required external callbacks +void Instance::LogMessage(uint32_t level, + uint64_t categoryMask, + const char* pFormat, + va_list args) +{ + // Guarantee serialization of this function to keep internal log messages from getting intermixed + m_logCallbackInternalOnlyMutex.Lock(); + + uint32_t flags = 0; + + if (categoryMask == Pal::LogCategoryMaskInternal) + { + if ((level == static_cast<uint32_t>(Pal::LogLevel::Info)) || + (level == static_cast<uint32_t>(Pal::LogLevel::Verbose))) + { + flags = VK_DEBUG_REPORT_INFORMATION_BIT_EXT; + } + else if (level == static_cast<uint32_t>(Pal::LogLevel::Alert)) + { + flags = VK_DEBUG_REPORT_WARNING_BIT_EXT; + } + else if (level == static_cast<uint32_t>(Pal::LogLevel::Error)) + { + flags = VK_DEBUG_REPORT_ERROR_BIT_EXT; + } + else if (level == static_cast<uint32_t>(Pal::LogLevel::Debug)) + { + flags = VK_DEBUG_REPORT_DEBUG_BIT_EXT; + } + else if (level == static_cast<uint32_t>(Pal::LogLevel::Always)) + { + flags = VK_DEBUG_REPORT_DEBUG_BIT_EXT | + VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT; + } + } + else if (categoryMask == Pal::LogCategoryMaskPerformance) + { + flags = VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT; + } + + constexpr uint64_t object = 0; + constexpr size_t location = 0; + constexpr int32_t messageCode = 0; + constexpr char layerPrefix[] = "AMDVLK\0"; + + constexpr uint32_t messageSize = 256; + char message[messageSize]; + + Util::Vsnprintf(message, + messageSize, + pFormat, + args); + + CallExternalCallbacks(flags, + VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT, + object, + location, + messageCode, + layerPrefix, + message); + + m_logCallbackInternalOnlyMutex.Unlock(); +} + +// ===================================================================================================================== +// Call all registered callbacks with the given VkDebugReportFlagsEXT.
+void Instance::CallExternalCallbacks( + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage) +{ + // Guarantee serialization of this function to keep internal and external log messages from getting intermixed + m_logCallbackInternalExternalMutex.Lock(); + + for (auto it = m_debugReportCallbacks.Begin(); it.Get() != nullptr; it.Next()) + { + DebugReportCallback* element = *it.Get(); + + if (flags & element->GetFlags()) + { + PFN_vkDebugReportCallbackEXT pfnCallback = element->GetCallbackFunc(); + void* pUserData = element->GetUserData(); + + (*pfnCallback)(flags, objectType, object, location, messageCode, pLayerPrefix, pMessage, pUserData); + } + } + + m_logCallbackInternalExternalMutex.Unlock(); +} + namespace entry { diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp index af98b7fd..e707dfcd 100644 --- a/icd/api/vk_memory.cpp +++ b/icd/api/vk_memory.cpp @@ -349,25 +349,6 @@ VkResult Memory::Create( } else if (vkResult == VK_SUCCESS) { - // Initialize tiny host visible allocations to zero - const uint32_t NumBytesToZero = 32; - - if ((pAllocInfo->allocationSize < NumBytesToZero) && - (createInfo.heaps[0] != Pal::GpuHeapInvisible)) - { - void* pData = nullptr; - VkResult result = pMemory->Map(0, 0, NumBytesToZero, &pData); - - VK_ASSERT(createInfo.size >= NumBytesToZero); - - if (result == VK_SUCCESS) - { - memset(pData, 0, NumBytesToZero); - - pMemory->Unmap(); - } - } - // notify the memory object that it is counted so that the destructor can decrease the counter accordingly pMemory->SetAllocationCounted(); diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index a2315675..3d545de9 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -2444,6 +2444,8 @@ DeviceExtensions::Supported PhysicalDevice::GetAvailableExtensions( // TODO: Add this extension if the related implementation of Linux is done. 
// availableExtensions.AddExtension(VK_DEVICE_EXTENSION(KHR_EXTERNAL_FENCE_FD)); + availableExtensions.AddExtension(VK_DEVICE_EXTENSION(AMD_BUFFER_MARKER)); + return availableExtensions; } diff --git a/icd/api/vk_pipeline.cpp b/icd/api/vk_pipeline.cpp index ba3be9ca..9efc6312 100644 --- a/icd/api/vk_pipeline.cpp +++ b/icd/api/vk_pipeline.cpp @@ -484,12 +484,6 @@ void Pipeline::CreateLegacyPathElfBinary( gpuVersionNote.gfxipMajorVer = 9; gpuVersionNote.gfxipMinorVer = 0; break; -#ifdef VKI_CLOSED_SOURCE - case Pal::GfxIpLevel::GfxIp10: - gpuVersionNote.gfxipMajorVer = 10; - gpuVersionNote.gfxipMinorVer = 0; - break; -#endif default: VK_NEVER_CALLED(); break; diff --git a/icd/api/vk_query.cpp b/icd/api/vk_query.cpp index 7e0463be..169d8720 100644 --- a/icd/api/vk_query.cpp +++ b/icd/api/vk_query.cpp @@ -158,7 +158,10 @@ VkResult PalQueryPool::Create( { // Allocate and bind GPU memory for the object const bool removeInvisibleHeap = true; - result = pDevice->MemMgr()->AllocAndBindGpuMem(pPalQueryPool, false, &internalMem, removeInvisibleHeap); + const bool persistentMapped = true; + + result = pDevice->MemMgr()->AllocAndBindGpuMem( + pPalQueryPool, false, &internalMem, removeInvisibleHeap, persistentMapped); } if (result == VK_SUCCESS) @@ -227,6 +230,7 @@ VkResult PalQueryPool::GetResults( m_palQueryType, startQuery, queryCount, + m_internalMem.CpuAddr(), &dataSize, pData, static_cast<size_t>(stride)); @@ -406,8 +410,9 @@ VkResult TimestampQueryPool::GetResults( // Although the spec says that dataSize has to be large enough to contain the result of each query, which sort // of sounds like it makes it redundant, clamp the maximum number of queries written to the given dataSize - // just in case, since it's harmless to do. - queryCount = static_cast<uint32_t>(Util::Min(static_cast<size_t>(queryCount), dataSize / querySlotSize)); + // and take account of the supplied stride, since it's harmless to do. + queryCount = Util::Min(queryCount, + static_cast<uint32_t>(dataSize / Util::Max(querySlotSize, static_cast<size_t>(stride)))); // Write results of each query slot for (uint32_t dstSlot = 0; dstSlot < queryCount; ++dstSlot) diff --git a/icd/make/importdefs b/icd/make/importdefs index a0b4c8cf..cda93b6d 100644 --- a/icd/make/importdefs +++ b/icd/make/importdefs @@ -1,7 +1,7 @@ # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. It must # be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -ICD_PAL_CLIENT_MAJOR_VERSION = 366 +ICD_PAL_CLIENT_MAJOR_VERSION = 377 # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. It describes # the interface version of the gpuopen shared module (part of PAL) that the ICD supports.
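Note (illustration only, not part of the patch): the vk_query.cpp hunk above tightens the queryCount clamp in TimestampQueryPool::GetResults() to account for the caller-supplied stride, not just the 8-byte timestamp slot size. A standalone sketch of the same arithmetic, with illustrative names rather than the driver's, shows why the stride matters:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Each written result occupies max(querySlotSize, stride) bytes of the
    // destination buffer, so that is the divisor that must bound queryCount.
    static uint32_t ClampQueryCount(uint32_t queryCount, size_t dataSize, size_t querySlotSize, size_t stride)
    {
        return static_cast<uint32_t>(
            std::min<size_t>(queryCount, dataSize / std::max(querySlotSize, stride)));
    }

    // Example: 64-byte buffer, 8-byte slots, 16-byte stride, 8 queries requested.
    // Old bound: 64 / 8 = 8 results, but the write loop advances 16 bytes per
    // result and would run past the buffer. New bound: 64 / max(8, 16) = 4.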
diff --git a/icd/res/ver.h b/icd/res/ver.h index b450f0d3..6501d652 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -40,7 +40,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 9 +#define VULKAN_ICD_BUILD_VERSION 10 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index f080caa1..d9d07c66 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -105,6 +105,9 @@ void ProcessSettings( // setup default values for the settings. SetupDefaults(pSettings); + // Update PAL settings based on runtime settings and desired driver defaults if needed + UpdatePalSettings(pPalDevice, pSettings); + #ifdef ICD_BUILD_APPPROFILE const AppProfile origProfile = *pAppProfile; // Override defaults based on application profile @@ -136,11 +139,6 @@ void ProcessSettings( { ProcessSettings(pPalDevice, pAppProfile, pSettings); } - else - { - // update PAL settings based on runtime settings if needed - UpdatePalSettings(pPalDevice, pSettings); - } #endif } @@ -180,7 +178,7 @@ void UpdatePalSettings( { Pal::PalPublicSettings* pPalSettings = pPalDevice->GetPublicSettings(); - /* Nothing to do here at the moment */ + pPalSettings->hintDisableSmallSurfColorCompressionSize = 0; } };
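Note (illustration only, not part of the patch): the VK_EXT_debug_report plumbing added in vk_instance.cpp and vk_debug_report.cpp is driven entirely by the application. A minimal usage sketch, assuming an instance created with VK_EXT_DEBUG_REPORT_EXTENSION_NAME enabled, would exercise the new entry points like this:

    #include <vulkan/vulkan.h>
    #include <cstdio>

    static VKAPI_ATTR VkBool32 VKAPI_CALL DebugCallback(
        VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType,
        uint64_t object, size_t location, int32_t messageCode,
        const char* pLayerPrefix, const char* pMessage, void* pUserData)
    {
        printf("[%s] %s\n", pLayerPrefix, pMessage);
        return VK_FALSE; // do not abort the call that triggered the report
    }

    void InstallCallback(VkInstance instance, VkDebugReportCallbackEXT* pHandle)
    {
        auto pfnCreate = reinterpret_cast<PFN_vkCreateDebugReportCallbackEXT>(
            vkGetInstanceProcAddr(instance, "vkCreateDebugReportCallbackEXT"));

        VkDebugReportCallbackCreateInfoEXT info = {};
        info.sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT;
        info.flags       = VK_DEBUG_REPORT_WARNING_BIT_EXT | VK_DEBUG_REPORT_ERROR_BIT_EXT;
        info.pfnCallback = DebugCallback;

        // Lands in vk::entry::vkCreateDebugReportCallbackEXT above, which adds the
        // callback to the Instance's m_debugReportCallbacks list; PAL log messages
        // are then fanned out to it via LogCallback -> LogMessage ->
        // CallExternalCallbacks with the flag mapping shown in vk_instance.cpp.
        pfnCreate(instance, &info, nullptr, pHandle);
    }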