Skip to content

Commit

Permalink
Support partial packing with dynamic indexing on VS-TCS
Browse files Browse the repository at this point in the history
TCS input import calls with dynamic indexing (locOffset/elemIdx) and the
corresponding VS output export calls cannot be unpacked - call them
unpackable calls. We should keep unpackable calls unchanged and map
their location properly. There are four things to do:
1. Skip unpackable calls when scalarizing input calls.
2. Skip unpackable calls when re-assembling output calls.
3. The inputLocInfoMap/outputLocInfoMap use the plain location as the key
for unpackable calls; the packable calls are remapped to the lowest
locations, and the unpackable calls are assigned the locations after them.
4. The key used to look up the VS outputLocInfoMap or the TCS
inputLocInfoMap can be either the combination of location and component
or just the location.
  • Loading branch information
xuechen417 authored and JaxLinAMD committed Dec 8, 2020
1 parent 4b4e0c8 commit 3562e51
Show file tree
Hide file tree
Showing 3 changed files with 169 additions and 55 deletions.
46 changes: 40 additions & 6 deletions lgc/patch/PatchInOutImportExport.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -540,12 +540,34 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) {
if (m_pipelineState->canPackInOut() &&
(m_shaderStage == ShaderStageFragment || m_shaderStage == ShaderStageTessControl)) {
// The new InOutLocationInfo is used to map scalarized FS and TCS input import as compact as possible
const uint32_t elemIdxArgIdx = isInterpolantInputImport || m_shaderStage != ShaderStageFragment ? 2 : 1;
origLocInfo.setComponent(cast<ConstantInt>(callInst.getOperand(elemIdxArgIdx))->getZExtValue());
locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo);
const bool isTcs = m_shaderStage == ShaderStageTessControl;
const uint32_t elemIdxArgIdx = (isInterpolantInputImport || isTcs) ? 2 : 1;
bool hasDynIndex = false;
if (isTcs) {
hasDynIndex = !isa<ConstantInt>(callInst.getOperand(1)) || !isa<ConstantInt>(callInst.getOperand(2));
if (!hasDynIndex) {
// TCS input calls at the same location may have dynamic indexing or not
// Try the key as combination of location and component at first
origLocInfo.setComponent(cast<ConstantInt>(callInst.getOperand(elemIdxArgIdx))->getZExtValue());
locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo);
if (locInfoMapIt == resUsage->inOutUsage.inputLocInfoMap.end()) {
// Try the key as the plain location
origLocInfo.setComponent(0);
locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo);
hasDynIndex = true;
}
} else {
locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo);
}
} else {
origLocInfo.setComponent(cast<ConstantInt>(callInst.getOperand(elemIdxArgIdx))->getZExtValue());
locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo);
}
assert(locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end());

loc = locInfoMapIt->second.getLocation();
elemIdx = builder.getInt32(locInfoMapIt->second.getComponent());
if (!hasDynIndex)
elemIdx = builder.getInt32(locInfoMapIt->second.getComponent());
highHalf = locInfoMapIt->second.isHighHalf();
} else {
assert(locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end());
Expand Down Expand Up @@ -825,12 +847,24 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) {
}
} else {
if (m_pipelineState->canPackInOut()) {
assert(m_shaderStage == ShaderStageVertex || m_shaderStage == ShaderStageTessEval);
const bool isVs = m_shaderStage == ShaderStageVertex;
assert(isVs || m_shaderStage == ShaderStageTessEval);
origLocInfo.setComponent(cast<ConstantInt>(callInst.getOperand(1))->getZExtValue());
locInfoMapIt = resUsage->inOutUsage.outputLocInfoMap.find(origLocInfo);
bool relateDynIndex = false;
const bool checkDynIndex = (isVs && m_pipelineState->hasShaderStage(ShaderStageTessControl));
if (checkDynIndex && locInfoMapIt == resUsage->inOutUsage.outputLocInfoMap.end()) {
// The location in TCS may be used with dynamic indexing, try location as the key for a search
origLocInfo.setComponent(0);
locInfoMapIt = resUsage->inOutUsage.outputLocInfoMap.find(origLocInfo);
relateDynIndex = true;
}

if (locInfoMapIt != resUsage->inOutUsage.outputLocInfoMap.end()) {
loc = locInfoMapIt->second.getLocation();
elemIdx = locInfoMapIt->second.getComponent();
// Dynamic indexing related locations just use the location for mapping
if (!relateDynIndex)
elemIdx = locInfoMapIt->second.getComponent();
exist = true;
} else {
exist = false;
Expand Down
119 changes: 81 additions & 38 deletions lgc/patch/PatchResourceCollect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1382,7 +1382,7 @@ void PatchResourceCollect::matchGenericInOut() {
// Do input/output matching and location remapping
for (unsigned idx = 0; idx < 2; ++idx) {
// True means perform packing on input/output, otherwise no packing
if (m_pipelineState->canPackInOut() && m_inOutPackStates[m_shaderStage][idx]) {
if (m_inOutPackStates[m_shaderStage][idx]) {
packInOutLocation();
} else {
if (idx == 0)
Expand Down Expand Up @@ -2473,43 +2473,74 @@ void PatchResourceCollect::packInOutLocation() {
void PatchResourceCollect::fillInOutLocInfoMap() {
if (m_inputCalls.empty())
return;

assert(m_shaderStage == ShaderStageFragment || m_shaderStage == ShaderStageTessControl);

// Create locationInfoMap according to the packed calls
m_locationInfoMapManager->createMap(m_inputCalls, m_shaderStage);

const bool isTcs = m_shaderStage == ShaderStageTessControl;
const bool isFs = m_shaderStage == ShaderStageFragment;
assert(isTcs || isFs);
auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage;
auto &inputLocInfoMap = inOutUsage.inputLocInfoMap;
inputLocInfoMap.clear();

// TCS: @llpc.input.import.generic.%Type%(i32 location, i32 locOffset, i32 elemIdx, i32 vertexIdx)
// FS: @llpc.input.import.generic.%Type%(i32 location, i32 elemIdx, i32 interpMode, i32 interpLoc)
// @llpc.input.import.interpolant.%Type%(i32 location, i32 locOffset, i32 elemIdx,
// i32 interpMode, <2 x float> | i32 auxInterpValue)
const bool isTcs = m_shaderStage == ShaderStageTessControl;
for (auto call : m_inputCalls) {
const bool isInterpolant = !isTcs && call->getNumArgOperands() != 4;

// The locations of TCS with dynamic indexing (locOffset/elemIdx) cannot be unpacked
// NOTE: Dynamic indexing in FS is processed to be constant in the lower pass.
std::vector<CallInst *> packableCalls;
DenseSet<unsigned> unpackableLocs;
if (isTcs) {
for (auto call : m_inputCalls) {
const unsigned loc = cast<ConstantInt>(call->getOperand(0))->getZExtValue();
if (!isa<ConstantInt>(call->getOperand(1)) || !isa<ConstantInt>(call->getOperand(2)))
unpackableLocs.insert(loc);
}
}
if (unpackableLocs.empty()) {
packableCalls = std::move(m_inputCalls);
} else {
for (auto call : m_inputCalls) {
const unsigned loc = cast<ConstantInt>(call->getOperand(0))->getZExtValue();
if (unpackableLocs.count(loc) == 0)
packableCalls.push_back(call);
}
m_inputCalls.clear();
}

auto &inputLocInfoMap = inOutUsage.inputLocInfoMap;

// Create locationMap according to the packable calls
m_locationInfoMapManager->createMap(packableCalls, m_shaderStage);

// Fill inputLocInfoMap of TCS/FS for the packable calls
unsigned newLocIdx = 0;
for (auto call : packableCalls) {
const bool isInterpolant = isFs && call->getNumArgOperands() == 5;
unsigned locOffset = 0;
unsigned compIdxArgIdx = 1;
if (isInterpolant || isTcs) {
assert(isa<ConstantInt>(call->getOperand(1)));
locOffset = cast<ConstantInt>(call->getOperand(1))->getZExtValue();
compIdxArgIdx = 2;
}
assert(isa<ConstantInt>(call->getOperand(compIdxArgIdx)));

// Construct original InOutLocationInfo from the location and elemIdx operands of the FS' or TCS' input import call
// Get the packed InOutLocationInfo from locationInfoMap
InOutLocationInfo origLocInfo;
origLocInfo.setLocation(cast<ConstantInt>(call->getOperand(0))->getZExtValue() + locOffset);
origLocInfo.setComponent(cast<ConstantInt>(call->getOperand(compIdxArgIdx))->getZExtValue());

// Get the packed InOutLocationInfo from locationInfoMap
InOutLocationInfoMap::const_iterator mapIter;
assert(m_locationInfoMapManager->findMap(origLocInfo, mapIter));
m_locationInfoMapManager->findMap(origLocInfo, mapIter);
inputLocInfoMap.insert({origLocInfo, mapIter->second});
inputLocInfoMap[origLocInfo] = mapIter->second;
newLocIdx = std::max(newLocIdx, mapIter->second.getLocation() + 1);
}

// Fill inputLocInfoMap for the unpackable calls
for (auto &locInfo : inputLocInfoMap) {
if (locInfo.second.isInvalid()) {
locInfo.second.setData(0);
locInfo.second.setLocation(newLocIdx++);
}
}
m_inputCalls.clear();
}

// =====================================================================================================================
Expand All @@ -2536,16 +2567,28 @@ void PatchResourceCollect::reassembleOutputExportCalls() {
// Collect ElementsInfo in each packed location
ElementsInfo elemsInfo = {{nullptr}, {nullptr}, 0, 0};
std::vector<ElementsInfo> elementsInfoArray(m_outputCalls.size(), elemsInfo);

const auto &tcsInputLocInfoMap =
m_pipelineState->getShaderResourceUsage(ShaderStageTessControl)->inOutUsage.inputLocInfoMap;
const bool checkDynIndex =
(m_shaderStage == ShaderStageVertex && m_pipelineState->hasShaderStage(ShaderStageTessControl));
for (auto call : m_outputCalls) {
InOutLocationInfo origLocInfo;
origLocInfo.setLocation(cast<ConstantInt>(call->getOperand(0))->getZExtValue());
origLocInfo.setComponent(cast<ConstantInt>(call->getOperand(1))->getZExtValue());

m_deadCalls.push_back(call);
InOutLocationInfoMap::const_iterator mapIter;
if (!m_locationInfoMapManager->findMap(origLocInfo, mapIter)) {
// An unused export call
// No change on the VS output export call corresponding to dynamic indexing TCS input import call
origLocInfo.setComponent(0);
if (checkDynIndex && tcsInputLocInfoMap.count(origLocInfo) == 0) {
// An unused export call
m_deadCalls.push_back(call);
}
continue;
} else {
// To be re-assembaled call
m_deadCalls.push_back(call);
}

const unsigned newLoc = mapIter->second.getLocation();
Expand Down Expand Up @@ -2632,9 +2675,8 @@ void PatchResourceCollect::scalarizeForInOutPacking(Module *module) {
// First gather the input/output calls that need scalarizing.
SmallVector<CallInst *, 4> outputCalls;
SmallVector<CallInst *, 4> inputCalls;
DenseSet<unsigned> dynamicLocs;
for (Function &func : *module) {
if (!m_pipelineState->canPackInOut())
break;
const bool isInterpolant = func.getName().startswith(lgcName::InputImportInterpolant);
if (func.getName().startswith(lgcName::InputImportGeneric) || isInterpolant) {
// This is a generic (possibly interpolated) input. Find its uses in FS (VS-FS, TES-FS) or TCS.
Expand All @@ -2644,19 +2686,17 @@ void PatchResourceCollect::scalarizeForInOutPacking(Module *module) {
const bool isFs = shaderStage == ShaderStageFragment;
const bool isTcs = shaderStage == ShaderStageTessControl;
if (isFs || isTcs) {
// This is a workaround to disable pack for the pipeline if there exists dynamic index in TCS
// TODO: Do partial packing except calls with dynamic index in future change
// NOTE: Dynamic index (location offset or component) in FS is processed to be constant in lower pass.
// NOTE: Dynamic indexing (location offset or component) in FS is processed to be constant in lower pass.
assert(!isInterpolant ||
(isInterpolant && isa<ConstantInt>(call->getOperand(1)) && isa<ConstantInt>(call->getOperand(2))));
const bool hasDynIdx =
isTcs && (!isa<ConstantInt>(call->getOperand(1)) || !isa<ConstantInt>(call->getOperand(2)));
if (hasDynIdx) {
m_pipelineState->setPackInOut(false);
break;
}
// We have a use in FS (VS-FS, TES-FS) or TCS. See if it needs scalarizing.
if (isa<VectorType>(call->getType()) || call->getType()->getPrimitiveSizeInBits() == 64)

// Collect input calls without dynamic indexing that need scalarize
const bool hasDynIndex =
isTcs ? (!isa<ConstantInt>(call->getOperand(1)) || !isa<ConstantInt>(call->getOperand(2))) : false;
if (hasDynIndex)
dynamicLocs.insert(cast<ConstantInt>(call->getOperand(0))->getZExtValue());
if (!hasDynIndex &&
(isa<FixedVectorType>(call->getType()) || call->getType()->getPrimitiveSizeInBits() == 64))
inputCalls.push_back(call);
}
}
Expand All @@ -2676,13 +2716,16 @@ void PatchResourceCollect::scalarizeForInOutPacking(Module *module) {
}
}
}
if (m_pipelineState->canPackInOut()) {
// Scalarize the gathered inputs and outputs.
for (CallInst *call : inputCalls)
scalarizeGenericInput(call);
for (CallInst *call : outputCalls)
scalarizeGenericOutput(call);
// Scalarize the gathered inputs and outputs.
for (CallInst *call : inputCalls) {
const unsigned loc = cast<ConstantInt>(call->getOperand(0))->getZExtValue();
// Don't scalarize the location of the call that has dynamic indexing in TCS
if (m_shaderStage == ShaderStageTessControl && dynamicLocs.count(loc) == 0)
continue;
scalarizeGenericInput(call);
}
for (CallInst *call : outputCalls)
scalarizeGenericOutput(call);
}

// =====================================================================================================================
Expand Down
59 changes: 48 additions & 11 deletions llpc/test/shaderdb/PipelineTess_TestInOutPacking.pipe
Original file line number Diff line number Diff line change
@@ -1,13 +1,26 @@
; BEGIN_SHADERTEST
; RUN: amdllpc -spvgen-dir=%spvgendir% -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
; SHADERTEST: shl i32 %3, 3
; SHADERTEST: or i32 %{{[0-9]*}}, 4
; SHADERTEST: getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @lds, i32 0, i32 %{{[0-9]*}}
; SHADERTEST: shl nuw nsw i32 %{{[0-9]*}}, 3
; SHADERTEST: or i32 %{{[0-9]*}}, 2
; SHADERTEST: getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @lds, i32 0, i32 %{{[0-9]*}}
; SHADERTEST: add nuw nsw i32 %{{[0-9]*}}, 384
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 40
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 1
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 2
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 3
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 8
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 12
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 16
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 20
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 4
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 24
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 28
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 32
; SHADERTEST: %{{[0-9]*}} = add i32 %{{[0-9]*}}, 36
; SHADERTEST: %{{[0-9]*}} = add nuw nsw i32 %{{[0-9]*}}, 4
; SHADERTEST: %{{[0-9]*}} = add nuw nsw i32 %{{[0-9]*}}, 10
; SHADERTEST: %{{[0-9]*}} = add nuw nsw i32 %{{[0-9]*}}, 12
; SHADERTEST: %{{[0-9]*}} = add nuw nsw i32 %{{[0-9]*}}, 8
; SHADERTEST: %{{[0-9]*}} = add nuw nsw i32 %{{[0-9]*}}, 52
; SHADERTEST: %{{[0-9]*}} = add nuw nsw i32 %{{[0-9]*}}, 96
; SHADERTEST: %{{[0-9]*}} = add nuw nsw i32 %{{[0-9]*}}, 24
; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 {{.*}}32, i32 {{.*}}15, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, i1 {{.*}}false, i1 {{.*}}false)
; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 {{.*}}33, i32 {{.*}}3, float %{{[0-9]*}}, float %{{[0-9]*}}, float undef, float undef, i1 {{.*}}false, i1 {{.*}}false)
; SHADERTEST: call float @llvm.amdgcn.interp.p1(float %{{[.i0-9]*}}, i32 immarg 1, i32 immarg 1, i32 %2)
Expand All @@ -28,15 +41,26 @@ version = 40
layout(location = 0) in vec4 inV0;
layout(location = 1) in vec2 inV1;
layout(location = 2) in vec2 inV2;
layout(location = 3) in vec3 inV3;

layout(location = 0) out vec2 out1;
layout(location = 1) out vec2 out2;
layout(location = 2) out vec3 out3;
layout(location = 3) out float out4[3];
layout(location = 6) out int index;
layout(location = 7) out dvec3 out5[2];


void main(void)
{
gl_Position = inV0;
out1 = inV1;
out2 = inV2;
out3 = inV3;
out4[0] = out4[1] = out4[2] = inV0.x;
index = 1;
out5[0] = dvec3(1.0, 1.0, 1.0);
out5[1] = dvec3(2.0, 2.0, 2.0);
}

[VsInfo]
Expand All @@ -48,6 +72,10 @@ entryPoint = main
layout(vertices=3) out;
layout(location = 0) in vec2 inArray1[];
layout(location = 1) in vec2 inArray2[];
layout(location = 2) in vec3 inArray3[];
layout(location = 3) in float inArray4[][3];
layout(location = 6) in int indexArr[];
layout(location = 7) in dvec3 inArray5[][2];

layout(location = 0) out vec4 outArray[];

Expand All @@ -59,7 +87,12 @@ void main(void)
gl_TessLevelInner[0] = 4.0;

gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID].gl_Position;
outArray[gl_InvocationID] = vec4(inArray2[gl_InvocationID].xy, inArray1[gl_InvocationID].xy);
int index = indexArr[gl_InvocationID];
float z = inArray3[gl_InvocationID].z + inArray4[gl_InvocationID][index];
float w = inArray3[0][gl_InvocationID] + inArray3[1][gl_InvocationID] + inArray3[2][gl_InvocationID];
dvec3 dv = inArray5[0][index];
w += float(dv.x + dv.y + dv.z);
outArray[gl_InvocationID] = vec4(inArray2[gl_InvocationID].xy + inArray1[gl_InvocationID].xy, z, w);
}
[TcsInfo]
entryPoint = main
Expand Down Expand Up @@ -161,7 +194,7 @@ options.extendedRobustness.nullDescriptor = 0

[VertexInputState]
binding[0].binding = 0
binding[0].stride = 16
binding[0].stride = 64
binding[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX
attribute[0].location = 0
attribute[0].binding = 0
Expand All @@ -170,8 +203,12 @@ attribute[0].offset = 0
attribute[1].location = 1
attribute[1].binding = 0
attribute[1].format = VK_FORMAT_R32G32_SFLOAT
attribute[1].offset = 0
attribute[1].offset = 16
attribute[2].location = 2
attribute[2].binding = 0
attribute[2].format = VK_FORMAT_R32G32_SFLOAT
attribute[2].offset = 0
attribute[2].offset = 24
attribute[3].location = 3
attribute[3].binding = 0
attribute[3].format = VK_FORMAT_R32G32B32_SFLOAT
attribute[3].offset = 32

0 comments on commit 3562e51

Please sign in to comment.