Skip to content

Commit

Permalink
Make descriptor offset relocatable.
Browse files Browse the repository at this point in the history
This commit consists of the original changes made by s-perron
that emit a relocatable entry for each descriptor load, instead
of baking in a constant offset during initial compilation. The
compiler patches up the compiled binary with the actual descriptor
offset at a later time, when that info is available.
  • Loading branch information
Yong He committed Mar 20, 2020
1 parent 428a1e2 commit 4a25fbe
Show file tree
Hide file tree
Showing 8 changed files with 526 additions and 23 deletions.
51 changes: 47 additions & 4 deletions llpc/patch/llpcPatchDescriptorLoad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,18 @@
* @brief LLPC source file: contains implementation of class Llpc::PatchDescriptorLoad.
***********************************************************************************************************************
*/
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#include "llpcBuilderContext.h"
#include "llpcPatchDescriptorLoad.h"
#include "llpcTargetInfo.h"

#include <sstream>

#define DEBUG_TYPE "llpc-patch-descriptor-load"

using namespace llvm;
Expand Down Expand Up @@ -435,8 +439,21 @@ Value* PatchDescriptorLoad::LoadDescriptor(
"",
pInsertPoint);

auto pDescOffset = ConstantInt::get(Type::getInt32Ty(*m_pContext), descOffset);

llvm::Value* pDescOffset = nullptr;
if (!m_pPipelineState->GetBuilderContext()->BuildingRelocatableElf())
{
pDescOffset = ConstantInt::get(Type::getInt32Ty(*m_pContext), descOffset);
}
else
{
IRBuilder<> builder(*m_pContext);
builder.SetInsertPoint(pInsertPoint);
std::stringstream ssRelocName;
ssRelocName << "doff_" << descSet << "_" << binding << "_b";
pDescOffset = builder.CreateIntrinsic(Intrinsic::amdgcn_reloc_constant, {}, {
llvm::MetadataAsValue::get(*m_pContext, llvm::MDNode::get(*m_pContext, llvm::MDString::get(*m_pContext, ssRelocName.str().c_str())))
});
}
pDescElem0 = BinaryOperator::CreateAdd(pDescElem0, pDescOffset, "", pInsertPoint);

if (pDescPtrTy == nullptr)
Expand Down Expand Up @@ -509,8 +526,34 @@ Value* PatchDescriptorLoad::LoadDescriptor(
}
else
{
auto pDescOffset = ConstantInt::get(Type::getInt32Ty(*m_pContext), descOffset);
auto pDescSize = ConstantInt::get(Type::getInt32Ty(*m_pContext), descSize, 0);
llvm::Value* pDescOffset = nullptr;
if (!m_pPipelineState->GetBuilderContext()->BuildingRelocatableElf())
{
pDescOffset = ConstantInt::get(Type::getInt32Ty(*m_pContext), descOffset);
}
else
{
IRBuilder<> builder(*m_pContext);
builder.SetInsertPoint(pInsertPoint);
std::stringstream ssRelocName;
ssRelocName << "doff_" << descSet << "_" << binding;
if (nodeType1 == ResourceMappingNodeType::DescriptorSampler)
{
ssRelocName << "_s";
}
else if (nodeType1 == ResourceMappingNodeType::DescriptorResource)
{
ssRelocName << "_r";
}
else
{
ssRelocName << "_b";
}
pDescOffset = builder.CreateIntrinsic(Intrinsic::amdgcn_reloc_constant, {}, {
llvm::MetadataAsValue::get(*m_pContext, llvm::MDNode::get(*m_pContext, llvm::MDString::get(*m_pContext, ssRelocName.str().c_str())))
});
}
auto pDescSize = ConstantInt::get(Type::getInt32Ty(*m_pContext), descSize, 0);

Value* pOffset = BinaryOperator::CreateMul(pArrayOffset, pDescSize, "", pInsertPoint);
pOffset = BinaryOperator::CreateAdd(pOffset, pDescOffset, "", pInsertPoint);
Expand Down
67 changes: 67 additions & 0 deletions llpc/test/shaderdb/PipelineCs_RelocCombinedTextureSampler.pipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@

// This test case checks that two relocation entries are generated for a combined texture sampler descriptor,
// one for the resource handle offset, and another for the sampler handle offset.
; BEGIN_SHADERTEST
; RUN: amdllpc -spvgen-dir=%spvgendir% -use-relocatable-shader-elf -o test_cs_pipeline_out_reloccombinedtex.elf %gfxip %s && llvm-objdump -triple=amdgcn -mcpu=gfx900 -d test_cs_pipeline_out_reloccombinedtex.elf | FileCheck -check-prefix=SHADERTEST %s && rm test_cs_pipeline_out_reloccombinedtex.elf
// Test that the correct relocated offsets appear in the linked texture-fetching code.
; SHADERTEST-LABEL: 0000000000000000 _amdgpu_cs_main:
// Matching the relocation entry for the resource descriptor
; SHADERTEST-DAG: s_mov_b32 s[[RELOREG_RESOURCE:[0-9]+]], 16 //{{.*}}
// Matching the relocation entry for the sampler descriptor
; SHADERTEST-DAG: s_mov_b32 s[[RELOREG_SAMPLER:[0-9]+]], 48 //{{.*}}
// Loading the resource descriptor
; SHADERTEST: s_load_dwordx8 s[[RESOURCE_REG:\[[0-9]+:[0-9]+\]]], s[{{.*}}:{{.*}}], s[[RELOREG_RESOURCE]] //{{.*}}
// Loading the sampler descriptor
; SHADERTEST: s_load_dwordx4 s[[SAMPLER_REG:\[[0-9]+:[0-9]+\]]], s[{{.*}}:{{.*}}], s[[RELOREG_SAMPLER]] //{{.*}}
// Sampling the texture
; SHADERTEST: image_sample_lz v[0:3], v0, s[[RESOURCE_REG]], s[[SAMPLER_REG]] {{.*}}
; END_SHADERTEST

; BEGIN_SHADERTEST
; RUN: amdllpc -spvgen-dir=%spvgendir% -use-relocatable-shader-elf -v %gfxip %s | FileCheck -check-prefix=SHADERTEST1 %s
; SHADERTEST1-LABEL: {{^// LLPC}} pipeline patching results
; SHADERTEST1: %[[RELOCONST:[0-9]+]] = call i32 @llvm.amdgcn.reloc.constant(metadata !{{.*}})
; SHADERTEST1: %[[RELOCONSTEXT:[0-9]+]] = zext i32 %[[RELOCONST]] to i64
; SHADERTEST1: %[[BUFFERDESCPTR:[0-9]+]] = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %{{.*}}, i64 0, i64 %[[RELOCONSTEXT]]
; SHADERTEST1: %[[BUFFERADDR:[0-9]+]] = bitcast i8 addrspace(4)* %[[BUFFERDESCPTR]] to <4 x i32> addrspace(4)*, !amdgpu.uniform !{{.*}}
; SHADERTEST1: %{{[0-9]+}} = load <4 x i32>, <4 x i32> addrspace(4)* %[[BUFFERADDR]], align 16, !invariant.load !{{.*}}
; SHADERTEST1: AMDLLPC SUCCESS
; END_SHADERTEST

[CsGlsl]
#version 450
#extension GL_ARB_separate_shader_objects : enable

layout(binding = 0) uniform sampler2D texSampler;

layout(set = 1, binding = 0, std430) buffer OUT
{
vec4 o;
};

layout(local_size_x = 2, local_size_y = 3) in;
// Samples texSampler at coordinate (0.0, 0.0) and stores the result in `o`.
void main() {
o = texture(texSampler, vec2(0.0, 0.0));
}


[CsInfo]
entryPoint = main
userDataNode[0].type = DescriptorTableVaPtr
userDataNode[0].offsetInDwords = 0
userDataNode[0].sizeInDwords = 1
userDataNode[0].set = 0
userDataNode[0].next[0].type = DescriptorCombinedTexture
userDataNode[0].next[0].offsetInDwords = 4
userDataNode[0].next[0].sizeInDwords = 12
userDataNode[0].next[0].set = 0
userDataNode[0].next[0].binding = 0
userDataNode[1].type = DescriptorTableVaPtr
userDataNode[1].offsetInDwords = 1
userDataNode[1].sizeInDwords = 1
userDataNode[1].set = 1
userDataNode[1].next[0].type = DescriptorBuffer
userDataNode[1].next[0].offsetInDwords = 16
userDataNode[1].next[0].sizeInDwords = 8
userDataNode[1].next[0].set = 1
userDataNode[1].next[0].binding = 0
58 changes: 58 additions & 0 deletions llpc/test/shaderdb/PipelineCs_RelocConst.pipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// This test case checks that descriptor offset relocation works for buffer descriptors in a compute pipeline.
; BEGIN_SHADERTEST
; RUN: amdllpc -spvgen-dir=%spvgendir% -use-relocatable-shader-elf -o test_cs_pipeline_out_relocconst.elf %gfxip %s && llvm-objdump -triple=amdgcn -mcpu=gfx900 -d test_cs_pipeline_out_relocconst.elf | FileCheck -check-prefix=SHADERTEST %s && rm test_cs_pipeline_out_relocconst.elf
; SHADERTEST-LABEL: 0000000000000000 _amdgpu_cs_main:
; SHADERTEST: s_mov_b32 s[[RELOREG:[0-9]+]], 16 //{{.*}}
; SHADERTEST: s_load_dwordx4 s[{{.*}}:{{.*}}], s[{{.*}}:{{.*}}], s[[RELOREG]] //{{.*}}
; END_SHADERTEST

; BEGIN_SHADERTEST
; RUN: amdllpc -spvgen-dir=%spvgendir% -use-relocatable-shader-elf -v %gfxip %s | FileCheck -check-prefix=SHADERTEST1 %s
; SHADERTEST1-LABEL: {{^// LLPC}} pipeline patching results
; SHADERTEST1: %[[RELOCONST:[0-9]+]] = call i32 @llvm.amdgcn.reloc.constant(metadata !{{.*}})
; SHADERTEST1: %[[RELOCONSTEXT:[0-9]+]] = zext i32 %[[RELOCONST]] to i64
; SHADERTEST1: %[[BUFFERDESCPTR:[0-9]+]] = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %{{.*}}, i64 0, i64 %[[RELOCONSTEXT]]
; SHADERTEST1: %[[BUFFERADDR:[0-9]+]] = bitcast i8 addrspace(4)* %[[BUFFERDESCPTR]] to <4 x i32> addrspace(4)*, !amdgpu.uniform !{{.*}}
; SHADERTEST1: %{{[0-9]+}} = load <4 x i32>, <4 x i32> addrspace(4)* %[[BUFFERADDR]], align 16, !invariant.load !{{.*}}
; SHADERTEST1: AMDLLPC SUCCESS
; END_SHADERTEST

[CsGlsl]
#version 450
#extension GL_ARB_separate_shader_objects : enable

layout(binding = 0) uniform UniformBufferObject {
vec4 i;
} ubo;

layout(set = 1, binding = 0, std430) buffer OUT
{
vec4 o;
};

layout(local_size_x = 2, local_size_y = 3) in;
// Copies the uniform-buffer member ubo.i into `o`.
void main() {
o = ubo.i;
}


[CsInfo]
entryPoint = main
userDataNode[0].type = DescriptorTableVaPtr
userDataNode[0].offsetInDwords = 0
userDataNode[0].sizeInDwords = 1
userDataNode[0].set = 0
userDataNode[0].next[0].type = DescriptorBuffer
userDataNode[0].next[0].offsetInDwords = 4
userDataNode[0].next[0].sizeInDwords = 8
userDataNode[0].next[0].set = 0
userDataNode[0].next[0].binding = 0
userDataNode[1].type = DescriptorTableVaPtr
userDataNode[1].offsetInDwords = 1
userDataNode[1].sizeInDwords = 1
userDataNode[1].set = 1
userDataNode[1].next[0].type = DescriptorBuffer
userDataNode[1].next[0].offsetInDwords = 4
userDataNode[1].next[0].sizeInDwords = 8
userDataNode[1].next[0].set = 1
userDataNode[1].next[0].binding = 0
104 changes: 104 additions & 0 deletions llpc/test/shaderdb/PipelineVsFs_RelocConst.pipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@

// This test case checks that descriptor offset relocation works for buffer descriptors in a vs/fs pipeline.
; BEGIN_SHADERTEST
; RUN: amdllpc -spvgen-dir=%spvgendir% -use-relocatable-shader-elf -o test_pipeline_out_relocconst.elf %gfxip %s && llvm-objdump -triple=amdgcn -mcpu=gfx900 -d test_pipeline_out_relocconst.elf | FileCheck -check-prefix=SHADERTEST %s && rm test_pipeline_out_relocconst.elf
; SHADERTEST-LABEL: 0000000000000000 _amdgpu_vs_main:
; SHADERTEST: s_mov_b32 s[[RELOREG:[0-9]+]], 12 //{{.*}}
; SHADERTEST: s_load_dwordx4 s[{{.*}}:{{.*}}], s[{{.*}}:{{.*}}], s[[RELOREG]] //{{.*}}
; SHADERTEST: {{[0-9A-Za-z]+}} _amdgpu_ps_main:
; END_SHADERTEST

; BEGIN_SHADERTEST
; RUN: amdllpc -spvgen-dir=%spvgendir% -use-relocatable-shader-elf -v %gfxip %s | FileCheck -check-prefix=SHADERTEST1 %s
; SHADERTEST1-LABEL: {{^// LLPC}} pipeline patching results
; SHADERTEST1: %[[RELOCONST:[0-9]+]] = call i32 @llvm.amdgcn.reloc.constant(metadata !{{.*}})
; SHADERTEST1: %[[RELOCONSTEXT:[0-9]+]] = zext i32 %[[RELOCONST]] to i64
; SHADERTEST1: %[[BUFFERDESCPTR:[0-9]+]] = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %{{.*}}, i64 0, i64 %[[RELOCONSTEXT]]
; SHADERTEST1: %[[BUFFERADDR:[0-9]+]] = bitcast i8 addrspace(4)* %[[BUFFERDESCPTR]] to <4 x i32> addrspace(4)*, !amdgpu.uniform !{{.*}}
; SHADERTEST1: %{{[0-9]+}} = load <4 x i32>, <4 x i32> addrspace(4)* %[[BUFFERADDR]], align 16, !invariant.load !{{.*}}
; SHADERTEST1: AMDLLPC SUCCESS
; END_SHADERTEST

[VsGlsl]
#version 450
#extension GL_ARB_separate_shader_objects : enable

layout(binding = 0) uniform UniformBufferObject {
mat4 model;
mat4 view;
vec4 proj;
} ubo;

layout(location = 0) in vec2 inPosition;
layout(location = 1) in vec3 inColor;

layout(location = 0) out vec3 fragColor;

// Writes ubo.proj to gl_Position and forwards inColor as fragColor.
void main() {
gl_Position = ubo.proj;
fragColor = inColor;
}


[VsInfo]
entryPoint = main
userDataNode[0].type = IndirectUserDataVaPtr
userDataNode[0].offsetInDwords = 0
userDataNode[0].sizeInDwords = 1
userDataNode[0].indirectUserDataCount = 0
userDataNode[1].type = DescriptorTableVaPtr
userDataNode[1].offsetInDwords = 1
userDataNode[1].sizeInDwords = 4
userDataNode[1].set = 0
userDataNode[1].next[0].type = DescriptorBuffer
userDataNode[1].next[0].offsetInDwords = 3
userDataNode[1].next[0].sizeInDwords = 8
userDataNode[1].next[0].set = 0
userDataNode[1].next[0].binding = 0

trapPresent = 0
debugMode = 0
enablePerformanceData = 0
vgprLimit = 0
sgprLimit = 0
maxThreadGroupsPerComputeUnit = 0

[FsGlsl]
#version 450
#extension GL_ARB_separate_shader_objects : enable

layout(location = 0) in vec3 fragColor;
layout(location = 0) out vec4 outputColor;
// Outputs the incoming fragColor as RGB with alpha set to 1.0.
void main() {
outputColor = vec4(fragColor, 1.0);
}

[FsInfo]
entryPoint = main
trapPresent = 0
debugMode = 0
enablePerformanceData = 0
vgprLimit = 0
sgprLimit = 0
maxThreadGroupsPerComputeUnit = 0

[GraphicsPipelineState]
topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP
patchControlPoints = 0
deviceIndex = 0
disableVertexReuse = 0
switchWinding = 0
enableMultiView = 0
depthClipEnable = 1
rasterizerDiscardEnable = 0
perSampleShading = 1
numSamples = 8
samplePatternIdx = 48
usrClipPlaneMask = 0
includeDisassembly = 0
alphaToCoverageEnable = 0
dualSourceBlendEnable = 1
colorBuffer[0].format = VK_FORMAT_R32G32B32A32_SFLOAT
colorBuffer[0].channelWriteMask = 15
colorBuffer[0].blendEnable = 1
colorBuffer[0].blendSrcAlphaToColor = 1
12 changes: 6 additions & 6 deletions llpc/util/llpcElfReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ uint32_t ElfReader<Elf>::GetSymbolCount() const
// Gets info of the symbol in the symbol table section according to the specified index.
template<class Elf>
void ElfReader<Elf>::GetSymbol(
uint32_t idx, // Symbol index
ElfSymbol* pSymbol) // [out] Info of the symbol
uint32_t idx, // Symbol index
ElfSymbol* pSymbol) const // [out] Info of the symbol
{
auto& pSection = m_sections[m_symSecIdx];
const char* pStrTab = reinterpret_cast<const char*>(m_sections[m_strtabSecIdx]->pData);
Expand All @@ -216,7 +216,7 @@ void ElfReader<Elf>::GetSymbol(
// =====================================================================================================================
// Gets the count of relocations in the relocation section.
template<class Elf>
uint32_t ElfReader<Elf>::GetRelocationCount()
uint32_t ElfReader<Elf>::GetRelocationCount() const
{
uint32_t relocCount = 0;
if (m_relocSecIdx >= 0)
Expand All @@ -231,8 +231,8 @@ uint32_t ElfReader<Elf>::GetRelocationCount()
// Gets info of the relocation in the relocation section according to the specified index.
template<class Elf>
void ElfReader<Elf>::GetRelocation(
uint32_t idx, // Relocation index
ElfReloc* pReloc) // [out] Info of the relocation
uint32_t idx, // Relocation index
ElfReloc* pReloc) const // [out] Info of the relocation
{
auto& pSection = m_sections[m_relocSecIdx];

Expand Down Expand Up @@ -321,7 +321,7 @@ void ElfReader<Elf>::GetSymbolsBySectionIndex(
secSymbols.push_back(symbol);
}
}

std::sort(secSymbols.begin(), secSymbols.end(),
[](const ElfSymbol& a, const ElfSymbol& b)
{
Expand Down
8 changes: 4 additions & 4 deletions llpc/util/llpcElfReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ static const char ShStrTabName[] = ".shstrtab"; // Name of ".shstrtab" se
static const char StrTabName[] = ".strtab"; // Name of ".strtab" section
static const char SymTabName[] = ".symtab"; // Name of ".symtab" section
static const char NoteName[] = ".note"; // Name of ".note" section
static const char RelocName[] = ".reloc"; // Name of ".reloc" section
static const char RelocName[] = ".rel.text"; // Name of ".rel.text" section (relocation section)
static const char CommentName[] = ".comment"; // Name of ".comment" section

static const uint32_t NT_AMD_AMDGPU_ISA = 11; // Note type of AMDGPU ISA version
Expand Down Expand Up @@ -469,16 +469,16 @@ class ElfReader
bool IsSectionPresent(const char* pName) const { return (m_map.find(pName) != m_map.end()); }

uint32_t GetSymbolCount() const;
void GetSymbol(uint32_t idx, ElfSymbol* pSymbol);
void GetSymbol(uint32_t idx, ElfSymbol* pSymbol) const;

bool IsValidSymbol(const char* pSymbolName);

ElfNote GetNote(Util::Abi::PipelineAbiNoteType noteType) const;

void GetSymbolsBySectionIndex(uint32_t secIndx, std::vector<ElfSymbol>& secSymbols) const;

uint32_t GetRelocationCount();
void GetRelocation(uint32_t idx, ElfReloc* pReloc);
uint32_t GetRelocationCount() const;
void GetRelocation(uint32_t idx, ElfReloc* pReloc) const;

// Gets the section index for the specified section name.
int32_t GetSectionIndex(const char* pName) const
Expand Down
Loading

0 comments on commit 4a25fbe

Please sign in to comment.