Skip to content

Commit

Permalink
Make descriptor offset relocatable.
Browse files Browse the repository at this point in the history
This commit consists of the original changes made by s-perron
that emit a relocatable entry for each descriptor load, instead
of baking in a constant offset during initial compilation. The
compiler patches up the compiled binary with the actual descriptor
offsets at a later time, when that information is available.
  • Loading branch information
Yong He committed Apr 17, 2020
1 parent e1d13e0 commit 20f036f
Show file tree
Hide file tree
Showing 17 changed files with 755 additions and 117 deletions.
5 changes: 5 additions & 0 deletions lgc/include/lgc/util/Internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#pragma once

#include "lgc/CommonDefs.h"
#include "../interface/lgc/BuilderBase.h"

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
Expand Down Expand Up @@ -75,6 +77,9 @@ llvm::CallInst *emitCall(llvm::StringRef funcName, llvm::Type *retTy, llvm::Arra
llvm::CallInst *emitCall(llvm::StringRef funcName, llvm::Type *retTy, llvm::ArrayRef<llvm::Value *> args,
llvm::ArrayRef<llvm::Attribute::AttrKind> attribs, llvm::BasicBlock *insertAtEnd);

// Emits an amdgcn.reloc.constant intrinsic that represents a relocatable value with the given symbol name.
llvm::CallInst* emitRelocationConstant(llvm::IRBuilderBase *builder, const llvm::Twine &symbolName);

// Adds LLVM-style type mangling suffix for the specified return type and args to the name.
void addTypeMangling(llvm::Type *returnTy, llvm::ArrayRef<llvm::Value *> args, std::string &name);

Expand Down
54 changes: 41 additions & 13 deletions lgc/patch/PatchDescriptorLoad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
***********************************************************************************************************************
*/
#include "PatchDescriptorLoad.h"
#include "../interface/lgc/LgcContext.h"
#include "lgc/state/TargetInfo.h"
#include "lgc/util/Internal.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
Expand Down Expand Up @@ -166,8 +168,8 @@ void PatchDescriptorLoad::processDescriptorGetPtr(CallInst *descPtrCall, StringR
// @param resType : Resource type
// @param descSet : Descriptor set
// @param binding : Binding
// @param topNode : Node in top-level descriptor table (TODO: nullptr for shader compilation)
// @param node : The descriptor node itself (TODO: nullptr for shader compilation)
// @param topNode : Node in top-level descriptor table (nullptr for shader compilation)
// @param node : The descriptor node itself (nullptr for shader compilation)
// @param shadow : Whether to load from shadow descriptor table
// @param [in/out] builder : IRBuilder
Value *PatchDescriptorLoad::getDescPtrAndStride(ResourceNodeType resType, unsigned descSet, unsigned binding,
Expand Down Expand Up @@ -198,11 +200,12 @@ Value *PatchDescriptorLoad::getDescPtrAndStride(ResourceNodeType resType, unsign

if (!stride) {
// Stride is not determinable just from the descriptor type requested by the Builder call.
if (!node) {
// TODO: Shader compilation: Get byte stride using a reloc.
llvm_unreachable("");
if (m_pipelineState->getLgcContext()->buildingRelocatableElf()) {
// Shader compilation: Get byte stride using a reloc.
stride = emitRelocationConstant(&builder, "dstride_" + Twine(descSet) + "_" + Twine(binding));
} else {
// Pipeline compilation: Get the stride from the resource type in the node.
assert(node && "expected valid user data node to determine descriptor stride.");
switch (node->type) {
case ResourceNodeType::DescriptorSampler:
stride = builder.getInt32(DescriptorSizeSampler / 4);
Expand Down Expand Up @@ -261,8 +264,8 @@ Value *PatchDescriptorLoad::getDescPtrAndStride(ResourceNodeType resType, unsign
// @param resType : Resource type
// @param descSet : Descriptor set
// @param binding : Binding
// @param topNode : Node in top-level descriptor table (TODO: nullptr for shader compilation)
// @param node : The descriptor node itself (TODO: nullptr for shader compilation)
// @param topNode : Node in top-level descriptor table (nullptr for shader compilation)
// @param node : The descriptor node itself (nullptr for shader compilation)
// @param shadow : Whether to load from shadow descriptor table
// @param [in/out] builder : IRBuilder
Value *PatchDescriptorLoad::getDescPtr(ResourceNodeType resType, unsigned descSet, unsigned binding,
Expand All @@ -287,22 +290,47 @@ Value *PatchDescriptorLoad::getDescPtr(ResourceNodeType resType, unsigned descSe

// Add on the byte offset of the descriptor.
Value *offset = nullptr;
if (!node) {
// TODO: Shader compilation: Get the offset for the descriptor using a reloc. The reloc symbol name
bool useRelocationForOffsets = !node || m_pipelineState->getLgcContext()->buildingRelocatableElf();
if (useRelocationForOffsets) {
// Get the offset for the descriptor using a reloc. The reloc symbol name
// needs to contain the descriptor set and binding, and, for image, fmask or sampler, whether it is
// a sampler.
llvm_unreachable("");
StringRef relocNameSuffix = "";
switch (resType) {
case ResourceNodeType::DescriptorSampler:
case ResourceNodeType::DescriptorYCbCrSampler:
relocNameSuffix = "_s";
break;
case ResourceNodeType::DescriptorResource:
relocNameSuffix = "_r";
break;
case ResourceNodeType::DescriptorBuffer:
case ResourceNodeType::DescriptorBufferCompact:
case ResourceNodeType::DescriptorTexelBuffer:
relocNameSuffix = "_b";
break;
default:
relocNameSuffix = "_x";
break;
}
offset = emitRelocationConstant(&builder, "doff_" + Twine(descSet) + "_" + Twine(binding) + relocNameSuffix);
// LLVM's internal handling of the GEP instruction results in a lot of junk code and prevents selection
// of the offset-from-register variant of the s_load_dwordx4 instruction. To work around this issue,
// we use integer arithmetic here so the amdgpu backend can pick the optimal instruction.
// When relocation is used, the offset is in bytes, not in dwords.
descPtr = builder.CreatePtrToInt(descPtr, builder.getInt64Ty());
descPtr = builder.CreateAdd(descPtr, builder.CreateZExt(offset, builder.getInt64Ty()));
descPtr = builder.CreateIntToPtr(descPtr, builder.getInt32Ty()->getPointerTo(ADDR_SPACE_CONST));
} else {
// Get the offset for the descriptor. Where we are getting the second part of a combined resource,
// add on the size of the first part.
unsigned offsetInDwords = node->offsetInDwords;
if (resType == ResourceNodeType::DescriptorSampler && node->type == ResourceNodeType::DescriptorCombinedTexture)
offsetInDwords += DescriptorSizeResource / 4;
offset = builder.getInt32(offsetInDwords);
descPtr = builder.CreateBitCast(descPtr, builder.getInt32Ty()->getPointerTo(ADDR_SPACE_CONST));
descPtr = builder.CreateGEP(builder.getInt32Ty(), descPtr, offset);
}
descPtr = builder.CreateBitCast(descPtr, builder.getInt32Ty()->getPointerTo(ADDR_SPACE_CONST));
descPtr = builder.CreateGEP(builder.getInt32Ty(), descPtr, offset);

return descPtr;
}

Expand Down
12 changes: 12 additions & 0 deletions lgc/util/Internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

#include "lgc/BuilderBase.h"
#include "lgc/util/Internal.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "llpc-internal"

Expand Down Expand Up @@ -82,6 +83,17 @@ CallInst *emitCall(StringRef funcName, Type *retTy, ArrayRef<Value *> args, Arra
return builder.createNamedCall(funcName, retTy, args, attribs);
}

// =====================================================================================================================
// Emits an amdgcn.reloc.constant intrinsic that represents a relocatable value with the given symbol name.
// The symbol name is attached to the intrinsic call as an MDString operand; the backend emits a relocation
// entry against that symbol, which is resolved when the final descriptor layout is known.
//
// @param builder : [in,out] An IRBuilder for instruction insertion
// @param symbolName : Name of the relocation symbol associated with this relocation
llvm::CallInst *emitRelocationConstant(llvm::IRBuilderBase *builder, const llvm::Twine &symbolName) {
  llvm::LLVMContext &context = builder->getContext();
  // Wrap the symbol name in metadata so it can be passed as the intrinsic's operand.
  llvm::Metadata *symbolNameMd = llvm::MDString::get(context, symbolName.str());
  llvm::MDNode *relocMdNode = llvm::MDNode::get(context, symbolNameMd);
  llvm::Value *mdArg = llvm::MetadataAsValue::get(context, relocMdNode);
  return builder->CreateIntrinsic(llvm::Intrinsic::amdgcn_reloc_constant, {}, {mdArg});
}

// =====================================================================================================================
// Gets LLVM-style name for type.
//
Expand Down
40 changes: 20 additions & 20 deletions llpc/context/llpcCompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -734,15 +734,15 @@ Result Compiler::buildPipelineWithRelocatableElf(Context *context, ArrayRef<cons
IShaderCache *userShaderCache = nullptr;
if (context->isGraphics()) {
auto pipelineInfo = reinterpret_cast<const GraphicsPipelineBuildInfo *>(context->getPipelineBuildInfo());
cacheHash = PipelineDumper::generateHashForGraphicsPipeline(pipelineInfo, true, stage);
cacheHash = PipelineDumper::generateHashForGraphicsPipeline(pipelineInfo, true, true, stage);
#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 38
pUserShaderCache = pPipelineInfo->pShaderCache;
userShaderCache = pipelineInfo->pShaderCache;
#endif
} else {
auto pipelineInfo = reinterpret_cast<const ComputePipelineBuildInfo *>(context->getPipelineBuildInfo());
cacheHash = PipelineDumper::generateHashForComputePipeline(pipelineInfo, true);
cacheHash = PipelineDumper::generateHashForComputePipeline(pipelineInfo, true, true);
#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 38
pUserShaderCache = pPipelineInfo->pShaderCache;
userShaderCache = pipelineInfo->pShaderCache;
#endif
}

Expand Down Expand Up @@ -1083,8 +1083,8 @@ unsigned GraphicsShaderCacheChecker::check(const Module *module, unsigned stageM

IShaderCache *appCache = nullptr;
#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 38
auto pPipelineInfo = reinterpret_cast<const GraphicsPipelineBuildInfo *>(m_pContext->GetPipelineBuildInfo());
pAppCache = pPipelineInfo->pShaderCache;
auto pPipelineInfo = reinterpret_cast<const GraphicsPipelineBuildInfo *>(m_context->getPipelineBuildInfo());
appCache = pPipelineInfo->pShaderCache;
#endif
if (stageMask & shaderStageToMask(ShaderStageFragment)) {
m_fragmentCacheEntryState = m_compiler->lookUpShaderCaches(appCache, &fragmentHash, &m_fragmentElf,
Expand Down Expand Up @@ -1115,7 +1115,7 @@ unsigned GraphicsShaderCacheChecker::check(const Module *module, unsigned stageM
void GraphicsShaderCacheChecker::updateRootUserDateOffset(ElfPackage *pipelineElf) {
ElfWriter<Elf64> writer(m_context->getGfxIpVersion());
// Load ELF binary
auto result = writer.ReadFromBuffer(pipelineElf->data(), pipelineElf->size());
auto result = writer.readFromBuffer(pipelineElf->data(), pipelineElf->size());
assert(result == Result::Success);
(void(result)); // unused
writer.updateElfBinary(m_context, pipelineElf);
Expand Down Expand Up @@ -1171,7 +1171,7 @@ void GraphicsShaderCacheChecker::updateAndMerge(Result result, ElfPackage *outpu

// Merge and store the result in pPipelineElf
ElfWriter<Elf64> writer(m_context->getGfxIpVersion());
auto result = writer.ReadFromBuffer(nonFragmentElf.pCode, nonFragmentElf.codeSize);
auto result = writer.readFromBuffer(nonFragmentElf.pCode, nonFragmentElf.codeSize);
assert(result == Result::Success);
(void(result)); // unused
writer.mergeElfBinary(m_context, &fragmentElf, outputPipelineElf);
Expand Down Expand Up @@ -1243,14 +1243,15 @@ Result Compiler::BuildGraphicsPipeline(const GraphicsPipelineBuildInfo *pipeline
const PipelineShaderInfo *shaderInfo[ShaderStageGfxCount] = {
&pipelineInfo->vs, &pipelineInfo->tcs, &pipelineInfo->tes, &pipelineInfo->gs, &pipelineInfo->fs,
};
bool buildingRelocatableElf = canUseRelocatableGraphicsShaderElf(shaderInfo);

for (unsigned i = 0; i < ShaderStageGfxCount && result == Result::Success; ++i)
result = validatePipelineShaderInfo(shaderInfo[i]);

MetroHash::Hash cacheHash = {};
MetroHash::Hash pipelineHash = {};
cacheHash = PipelineDumper::generateHashForGraphicsPipeline(pipelineInfo, true);
pipelineHash = PipelineDumper::generateHashForGraphicsPipeline(pipelineInfo, false);
cacheHash = PipelineDumper::generateHashForGraphicsPipeline(pipelineInfo, true, buildingRelocatableElf);
pipelineHash = PipelineDumper::generateHashForGraphicsPipeline(pipelineInfo, false, false);

if (result == Result::Success && EnableOuts()) {
LLPC_OUTS("===============================================================================\n");
Expand All @@ -1277,10 +1278,9 @@ Result Compiler::BuildGraphicsPipeline(const GraphicsPipelineBuildInfo *pipeline
}

ShaderEntryState cacheEntryState = ShaderEntryState::New;
bool buildingRelocatableElf = canUseRelocatableGraphicsShaderElf(shaderInfo);
IShaderCache *appCache = nullptr;
#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 38
pAppCache = pPipelineInfo->pShaderCache;
appCache = pipelineInfo->pShaderCache;
#endif
ShaderCache *shaderCache = nullptr;
CacheEntryHandle hEntry = nullptr;
Expand Down Expand Up @@ -1371,8 +1371,8 @@ Result Compiler::BuildComputePipeline(const ComputePipelineBuildInfo *pipelineIn

MetroHash::Hash cacheHash = {};
MetroHash::Hash pipelineHash = {};
cacheHash = PipelineDumper::generateHashForComputePipeline(pipelineInfo, true);
pipelineHash = PipelineDumper::generateHashForComputePipeline(pipelineInfo, false);
cacheHash = PipelineDumper::generateHashForComputePipeline(pipelineInfo, true, buildingRelocatableElf);
pipelineHash = PipelineDumper::generateHashForComputePipeline(pipelineInfo, false, buildingRelocatableElf);

if (result == Result::Success && EnableOuts()) {
const ShaderModuleData *moduleData = reinterpret_cast<const ShaderModuleData *>(pipelineInfo->cs.pModuleData);
Expand All @@ -1397,7 +1397,7 @@ Result Compiler::BuildComputePipeline(const ComputePipelineBuildInfo *pipelineIn
ShaderEntryState cacheEntryState = ShaderEntryState::New;
IShaderCache *appCache = nullptr;
#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 38
pAppCache = pPipelineInfo->pShaderCache;
appCache = pipelineInfo->pShaderCache;
#endif
ShaderCache *shaderCache = nullptr;
CacheEntryHandle hEntry = nullptr;
Expand Down Expand Up @@ -1537,7 +1537,7 @@ Result Compiler::CreateShaderCache(const ShaderCacheCreateInfo *pCreateInfo, //
ShaderCache *pShaderCache = new ShaderCache();

if (pShaderCache != nullptr) {
result = pShaderCache->Init(pCreateInfo, &auxCreateInfo);
result = pShaderCache->init(pCreateInfo, &auxCreateInfo);
if (result != Result::Success) {
pShaderCache->Destroy();
pShaderCache = nullptr;
Expand Down Expand Up @@ -1716,7 +1716,7 @@ void Compiler::buildShaderCacheHash(Context *context, unsigned stageMask, ArrayR
MetroHash64 hasher;

// Update common shader info
PipelineDumper::updateHashForPipelineShaderInfo(stage, shaderInfo, true, &hasher);
PipelineDumper::updateHashForPipelineShaderInfo(stage, shaderInfo, true, &hasher, false);
hasher.Update(pipelineInfo->iaState.deviceIndex);

// Update input/output usage (provided by middle-end caller of this callback).
Expand Down Expand Up @@ -1774,14 +1774,14 @@ void Compiler::linkRelocatableShaderElf(ElfPackage *shaderElfs, ElfPackage *pipe
ElfReader<Elf64> fsReader(m_gfxIp);
if (!shaderElfs[ShaderStageVertex].empty()) {
size_t codeSize = shaderElfs[ShaderStageVertex].size_in_bytes();
result = vsReader.ReadFromBuffer(shaderElfs[ShaderStageVertex].data(), &codeSize);
result = vsReader.readFromBuffer(shaderElfs[ShaderStageVertex].data(), &codeSize);
if (result != Result::Success)
return;
}

if (!shaderElfs[ShaderStageFragment].empty()) {
size_t codeSize = shaderElfs[ShaderStageFragment].size_in_bytes();
result = fsReader.ReadFromBuffer(shaderElfs[ShaderStageFragment].data(), &codeSize);
result = fsReader.readFromBuffer(shaderElfs[ShaderStageFragment].data(), &codeSize);
if (result != Result::Success)
return;
}
Expand All @@ -1790,7 +1790,7 @@ void Compiler::linkRelocatableShaderElf(ElfPackage *shaderElfs, ElfPackage *pipe
} else {
ElfReader<Elf64> csReader(m_gfxIp);
size_t codeSize = shaderElfs[ShaderStageCompute].size_in_bytes();
result = csReader.ReadFromBuffer(shaderElfs[ShaderStageCompute].data(), &codeSize);
result = csReader.readFromBuffer(shaderElfs[ShaderStageCompute].data(), &codeSize);
if (result != Result::Success)
return;
result = writer.linkComputeRelocatableElf(csReader, context);
Expand Down
2 changes: 1 addition & 1 deletion llpc/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ project(shadertest

if(DEFINED XGL_LLVM_SRC_PATH)
# This is a build where LLPC lit testing is integrated into AMDVLK cmake files.
set(AMDLLPC_TEST_DEPS amdllpc spvgen FileCheck count not)
set(AMDLLPC_TEST_DEPS amdllpc spvgen FileCheck llvm-objdump count not)
set(LLVM_DIR ${XGL_LLVM_SRC_PATH})
endif()

Expand Down
2 changes: 1 addition & 1 deletion llpc/test/lit.cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,6 @@

tool_dirs = [config.llvm_tools_dir, config.amdllpc_dir]

tools = ['amdllpc']
tools = ['amdllpc', 'llvm-objdump']

llvm_config.add_tool_substitutions(tools, tool_dirs)
66 changes: 66 additions & 0 deletions llpc/test/shaderdb/PipelineCs_RelocCombinedTextureSampler.pipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@

// This test case checks that two relocation entries are generated for a combined texture sampler descriptor,
// one for the resource handle offset, and another for the sampler handle offset.
; BEGIN_SHADERTEST
; RUN: amdllpc -spvgen-dir=%spvgendir% -use-relocatable-shader-elf -o %t.elf %gfxip %s && llvm-objdump --triple=amdgcn --mcpu=gfx900 -d %t.elf | FileCheck -check-prefix=SHADERTEST %s
// Test that correct relocated offsets are in the linked texture fetching code.
; SHADERTEST-LABEL: 0000000000000000 <_amdgpu_cs_main>:
// Matching the relocation entry for the resource descriptor
; SHADERTEST-DAG: s_mov_b32 s[[RELOREG_RESOURCE:[0-9]+]], 16 //{{.*}}
// Matching the relocation entry for the sampler descriptor
; SHADERTEST-DAG: s_mov_b32 s[[RELOREG_SAMPLER:[0-9]+]], 48 //{{.*}}
// Loading the resource descriptor
; SHADERTEST-DAG: s_load_dwordx8 s[[RESOURCE_REG:\[[0-9]+:[0-9]+\]]], s[{{.*}}:{{.*}}], s[[RELOREG_RESOURCE]] //{{.*}}
// Loading the sampler descriptor
; SHADERTEST-DAG: s_load_dwordx4 s[[SAMPLER_REG:\[[0-9]+:[0-9]+\]]], s[{{.*}}:{{.*}}], s[[RELOREG_SAMPLER]] //{{.*}}
// Sampling the texture
; SHADERTEST: image_sample_lz v[0:3], v0, s[[RESOURCE_REG]], s[[SAMPLER_REG]] {{.*}}
; END_SHADERTEST

; BEGIN_SHADERTEST
; RUN: amdllpc -spvgen-dir=%spvgendir% -use-relocatable-shader-elf -v %gfxip %s | FileCheck -check-prefix=SHADERTEST1 %s
; SHADERTEST1-LABEL: {{^// LLPC}} pipeline patching results
; SHADERTEST1-DAG: ![[METADATANODE1:[0-9]+]] = !{!"doff_0_0_r"}
; SHADERTEST1-DAG: ![[METADATANODE2:[0-9]+]] = !{!"doff_0_0_s"}
; SHADERTEST1-DAG: %[[RELOCONST1:[0-9]+]] = call i32 @llvm.amdgcn.reloc.constant(metadata ![[METADATANODE1]])
; SHADERTEST1-DAG: %[[RELOCONST2:[0-9]+]] = call i32 @llvm.amdgcn.reloc.constant(metadata ![[METADATANODE2]])
; SHADERTEST1: AMDLLPC SUCCESS
; END_SHADERTEST

[CsGlsl]
#version 450
#extension GL_ARB_separate_shader_objects : enable

layout(binding = 0) uniform sampler2D texSampler;

layout(set = 1, binding = 0, std430) buffer OUT
{
vec4 o;
};

layout(local_size_x = 2, local_size_y = 3) in;
void main() {
o = texture(texSampler, vec2(0.0, 0.0));
}


[CsInfo]
entryPoint = main
userDataNode[0].type = DescriptorTableVaPtr
userDataNode[0].offsetInDwords = 0
userDataNode[0].sizeInDwords = 1
userDataNode[0].set = 0
userDataNode[0].next[0].type = DescriptorCombinedTexture
userDataNode[0].next[0].offsetInDwords = 4
userDataNode[0].next[0].sizeInDwords = 12
userDataNode[0].next[0].set = 0
userDataNode[0].next[0].binding = 0
userDataNode[1].type = DescriptorTableVaPtr
userDataNode[1].offsetInDwords = 1
userDataNode[1].sizeInDwords = 1
userDataNode[1].set = 1
userDataNode[1].next[0].type = DescriptorBuffer
userDataNode[1].next[0].offsetInDwords = 16
userDataNode[1].next[0].sizeInDwords = 8
userDataNode[1].next[0].set = 1
userDataNode[1].next[0].binding = 0
Loading

0 comments on commit 20f036f

Please sign in to comment.