Skip to content

Commit

Permalink
Implement pack input/output in XX-FS pipeline
Browse files Browse the repository at this point in the history
- Implemented the workflow of packing input/output for XX-FS pipeline
- Use cl::PackInOut as a global switch to turn on/off the feature. It is turned off by default and will be turned on after microbench verification
  • Loading branch information
xuechen417 authored and JaxLinAMD committed Dec 12, 2019
1 parent 3e359b5 commit 40b2a6b
Show file tree
Hide file tree
Showing 9 changed files with 742 additions and 17 deletions.
15 changes: 15 additions & 0 deletions context/llpcContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,21 @@ class Context : public llvm::LLVMContext
// Sets triple and data layout in specified module from the context's target machine.
void SetModuleTargetMachine(llvm::Module* pModule);

// Checks whether input/output packing is allowed for the given shader stage and direction. The decision is
// delegated to the pipeline context.
bool CanPackInOut(ShaderStage shaderStage, bool isOutput) const
{
    const bool canPack = m_pPipelineContext->CanPackInOut(shaderStage, isOutput);
    return canPack;
}

// Checks whether pack input/output is enabled, as reported by the pipeline context.
bool IsPackInOutEnabled() const
{
    const bool isEnabled = m_pPipelineContext->IsPackInOutEnabled();
    return isEnabled;
}

// Enables or disables pack input/output by forwarding the flag to the pipeline context.
void EnablePackInOut(bool packInOut)
{
m_pPipelineContext->EnablePackInOut(packInOut);
}

private:
LLPC_DISALLOW_DEFAULT_CTOR(Context);
LLPC_DISALLOW_COPY_AND_ASSIGN(Context);
Expand Down
33 changes: 32 additions & 1 deletion context/llpcGraphicsContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ namespace llvm
namespace cl
{

// -pack-in-out: global switch for packing input/output (defaults to false, i.e. packing is off)
opt<bool> PackInOut("pack-in-out", desc("Pack input/output"), init(false));

#if LLPC_BUILD_GFX10
extern opt<int> SubgroupSize;
#endif
Expand All @@ -73,7 +76,8 @@ GraphicsContext::GraphicsContext(
m_pPipelineInfo(pPipelineInfo),
m_stageMask(0),
m_activeStageCount(0),
m_gsOnChip(false)
m_gsOnChip(false),
m_packInOut(cl::PackInOut)
{
const PipelineShaderInfo* shaderInfo[ShaderStageGfxCount] =
{
Expand Down Expand Up @@ -647,4 +651,31 @@ uint32_t GraphicsContext::GetShaderWaveSize(
return waveSize;
}

// =====================================================================================================================
// Decides whether input/output packing may be applied for the given shader stage and direction.
bool GraphicsContext::CanPackInOut(
    ShaderStage shaderStage,    // Current shader stage
    bool        isOutput        // Whether it is to pack an output
    ) const
{
    // Packing is allowed only when all of the following hold:
    //   1) Both cl::PackInOut and m_packInOut are enabled.
    //   2) The pipeline's stage mask is exactly vertex + fragment.
    //   3) The query is for the vertex shader's output or the fragment shader's input.
    const bool featureEnabled = cl::PackInOut && m_packInOut;
    if (featureEnabled == false)
    {
        return false;
    }

    const uint32_t vsFsStageMask = ShaderStageToMask(ShaderStageVertex) | ShaderStageToMask(ShaderStageFragment);
    if (GetShaderStageMask() != vsFsStageMask)
    {
        return false;
    }

    const bool isVsOutput = (shaderStage == ShaderStageVertex) && isOutput;
    const bool isFsInput  = (shaderStage == ShaderStageFragment) && (isOutput == false);

    return isVsOutput || isFsInput;
}

} // Llpc
10 changes: 10 additions & 0 deletions context/llpcGraphicsContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ class GraphicsContext: public PipelineContext
// Gets per pipeline options
virtual const PipelineOptions* GetPipelineOptions() const { return &m_pPipelineInfo->options; }

// Checks whether the requirements for packing input/output are satisfied for the given shader stage;
// isOutput tells whether the query is about packing an output
virtual bool CanPackInOut(ShaderStage shaderStage, bool isOutput) const;

// Returns the current pack input/output flag of this graphics context
virtual bool IsPackInOutEnabled() const
{
    return m_packInOut;
}

// Sets the pack input/output flag of this graphics context (true to enable, false to disable)
virtual void EnablePackInOut(bool packInOut)
{
    m_packInOut = packInOut;
}

void InitShaderInfoForNullFs();

private:
Expand All @@ -106,6 +115,7 @@ class GraphicsContext: public PipelineContext
InterfaceData m_intfData[ShaderStageGfxCount]; // Interface data of all graphics shader stages

bool m_gsOnChip; // Whether to enable GS on-chip mode
bool m_packInOut; // Whether to enable/disable pack in/out

llvm::SmallVector<std::unique_ptr<llvm::SmallVectorImpl<ResourceMappingNode>>, 4>
m_allocUserDataNodes; // Allocated merged user data nodes
Expand Down
14 changes: 13 additions & 1 deletion context/llpcPipelineContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -350,10 +350,13 @@ struct ResourceUsage
// Usage of generic input/output
struct
{
// Map from shader specified locations to tightly packed locations
// Map from shader specified locations to tightly packed locations.
std::map<uint32_t, uint32_t> inputLocMap;
std::map<uint32_t, uint32_t> outputLocMap;

// The original and new InOutLocations for shader cache
std::map<uint32_t, uint32_t> inOutLocMap;

std::map<uint32_t, uint32_t> perPatchInputLocMap;
std::map<uint32_t, uint32_t> perPatchOutputLocMap;

Expand Down Expand Up @@ -749,6 +752,15 @@ class PipelineContext
// Set pipeline state in Pipeline object for middle-end
void SetPipelineState(Pipeline* pPipeline) const;

// Checks whether the requirements for packing input/output are satisfied. This default implementation
// never allows packing.
virtual bool CanPackInOut(ShaderStage shaderStage, bool isOutput) const
{
    return false;
}

// Checks whether pack input/output is enabled. This default implementation always reports it as disabled.
virtual bool IsPackInOutEnabled() const
{
    return false;
}

// Enables/disables pack input/output. This default implementation ignores the request.
virtual void EnablePackInOut(bool packInOut)
{
}

static void InitShaderResourceUsage(ShaderStage shaderStage, ResourceUsage* pResUsage);

static void InitShaderInterfaceData(InterfaceData* pIntfData);
Expand Down
194 changes: 180 additions & 14 deletions lower/llpcSpirvLowerGlobal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "SPIRVInternal.h"
#include "llpcBuilder.h"
#include "llpcContext.h"
#include "llpcPipelineContext.h"
#include "llpcSpirvLowerGlobal.h"

#define DEBUG_TYPE "llpc-spirv-lower-global"
Expand Down Expand Up @@ -1407,13 +1408,112 @@ Value* SpirvLowerGlobal::AddCallInstForInOutImport(
inOutInfo.SetInterpLoc(interpLoc);
inOutInfo.SetInterpMode(inOutMeta.InterpMode);
}
pInOutValue = m_pBuilder->CreateReadGenericInput(pInOutTy,
inOutMeta.Value,
pLocOffset,
pElemIdx,
maxLocOffset,
inOutInfo,
pVertexIdx);

// Scalarize FS' input in XX-FS pipeline which is executed once by MapInputToProxy()
if (m_pContext->CanPackInOut(m_shaderStage, false))
{
const bool is64Bit = (pInOutTy->getScalarSizeInBits() == 64);

if (is64Bit == false)
{
if (pInOutTy->isVectorTy() == false)
{
pInOutValue = m_pBuilder->CreateReadGenericInput(pInOutTy,
inOutMeta.Value,
pLocOffset,
pElemIdx,
maxLocOffset,
inOutInfo,
pVertexIdx);
}
else
{
// Scalarize 32-bit vector per component
const uint32_t compCount = pInOutTy->getVectorNumElements();
Type* pCompTy = pInOutTy->getScalarType();
for (uint32_t compIdx = 0; compIdx < compCount; ++compIdx)
{
pElemIdx = m_pBuilder->getInt32(compIdx + elemIdx);
Value* pComp = m_pBuilder->CreateReadGenericInput(pCompTy,
inOutMeta.Value,
pLocOffset,
pElemIdx,
maxLocOffset,
inOutInfo,
pVertexIdx);

// Rebuild the input vector by inserting the component that was just read
pInOutValue = m_pBuilder->CreateInsertElement(pInOutValue,
pComp,
m_pBuilder->getInt32(compIdx));
}
}
}
else
{
// Scalarize 64-bit vector into 32-bit elements
Type* pInputTy = pInOutTy;
if (pInOutTy->isVectorTy() == false)
{
pInputTy = VectorType::get(pInOutTy, 1);
}

uint32_t locAdjust[] = { 0, 0, 1, 1 };
uint32_t compAdjust[] = { 0, 1, 0, 1 };

const uint32_t compCount = pInputTy->getVectorNumElements();
for (uint32_t compIdx = 0; compIdx < compCount; ++compIdx)
{
uint32_t loc = inOutMeta.Value + locAdjust[compIdx];
uint32_t newCompIdx = 2 * compAdjust[compIdx] + elemIdx;

// Cast a 64-bit scalar into a 32-bit vector
Type* pScalarTy = pInputTy->getVectorElementType()->isIntegerTy() ?
m_pContext->Int32Ty() : m_pContext->FloatTy();
Value* pVecValue = UndefValue::get(VectorType::get(pScalarTy, 2));

// Scalarize a 32-bit vector
for (uint32_t j = 0; j < 2; ++j)
{
pElemIdx = m_pBuilder->getInt32(newCompIdx + j);

Value* pElem = m_pBuilder->CreateReadGenericInput(pScalarTy,
loc,
pLocOffset,
pElemIdx,
maxLocOffset,
inOutInfo,
pVertexIdx);

// Restore to a 32-bit vector
pVecValue = m_pBuilder->CreateInsertElement(pVecValue, pElem, m_pBuilder->getInt32(j));
}

// Restore to a 64-bit scalar
Value* pComp = m_pBuilder->CreateBitCast(pVecValue, pInputTy->getVectorElementType());

// Rebuild the 64-bit result: insert the component into the vector, or use it directly for a scalar
if (pInOutTy->isVectorTy())
{
pInOutValue = m_pBuilder->CreateInsertElement(pInOutValue, pComp, m_pBuilder->getInt32(compIdx));
}
else
{
pInOutValue = pComp;
}
}
}
}
else
{
pInOutValue = m_pBuilder->CreateReadGenericInput(pInOutTy,
inOutMeta.Value,
pLocOffset,
pElemIdx,
maxLocOffset,
inOutInfo,
pVertexIdx);
}
}
else
{
Expand Down Expand Up @@ -1713,13 +1813,79 @@ void SpirvLowerGlobal::AddCallInstForOutputExport(
"xfbOffset = " << cast<ConstantInt>(pXfbOffset)->getZExtValue() << "\n");
}

m_pBuilder->CreateWriteGenericOutput(pOutputValue,
location,
pLocOffset,
pElemIdx,
maxLocOffset,
outputInfo,
pVertexIdx);
// Scalarize XX' output in XX-FS pipeline which is executed once by MapOutputToProxy()
if (m_pContext->CanPackInOut(m_shaderStage, true))
{
// Wrap a scalar %Type into a one-element vector (v1%Type) so scalars and vectors are handled uniformly
Value* pOutputVecValue = pOutputValue;
if (pOutputTy->isVectorTy() == false)
{
pOutputVecValue = UndefValue::get(VectorType::get(pOutputTy->getScalarType(), 1));
pOutputVecValue = m_pBuilder->CreateInsertElement(pOutputVecValue, pOutputValue, m_pBuilder->getInt32(0));
}

pOutputTy = pOutputVecValue->getType();
const auto compCount = pOutputTy->getVectorNumElements();
const bool is64Bit = (pOutputTy->getScalarSizeInBits() == 64);

if (is64Bit == false)
{
// Scalarize 32-bit vector per component
for (auto compIdx = 0; compIdx < compCount; ++compIdx)
{
Value* pComp = m_pBuilder->CreateExtractElement(pOutputVecValue, m_pBuilder->getInt32(compIdx));
pElemIdx = m_pBuilder->getInt32(compIdx + elemIdx);
m_pBuilder->CreateWriteGenericOutput(pComp,
location,
pLocOffset,
pElemIdx,
maxLocOffset,
outputInfo,
pVertexIdx);
}
}
else
{
uint32_t locAdjust[] = { 0, 0, 1, 1 };
uint32_t compAdjust[] = { 0, 1, 0, 1 };

// Scalarize 64-bit vector into 32-bit elements
for (auto compIdx = 0; compIdx < compCount; ++compIdx)
{
// Cast 64-bit into a 32-bit vector
Value* pComp = m_pBuilder->CreateExtractElement(pOutputVecValue, m_pBuilder->getInt32(compIdx));
Type* pCastCompTy = pComp->getType()->isIntegerTy() ?
m_pContext->Int32x2Ty() : m_pContext->Floatx2Ty();
pComp = m_pBuilder->CreateBitCast(pComp, pCastCompTy);

// Scalarize a 32-bit vector per component
auto loc = location + locAdjust[compIdx];
const auto newCompIdx = 2 * compAdjust[compIdx] + elemIdx;
for (auto j = 0; j < 2; ++j)
{
Value* pElem = m_pBuilder->CreateExtractElement(pComp, m_pBuilder->getInt32(j));
pElemIdx = m_pBuilder->getInt32(newCompIdx + j);
m_pBuilder->CreateWriteGenericOutput(pElem,
loc,
pLocOffset,
pElemIdx,
maxLocOffset,
outputInfo,
pVertexIdx);
}
}
}
}
else
{
m_pBuilder->CreateWriteGenericOutput(pOutputValue,
location,
pLocOffset,
pElemIdx,
maxLocOffset,
outputInfo,
pVertexIdx);
}
}
}

Expand Down
1 change: 1 addition & 0 deletions patch/llpcPatchCheckShaderCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ bool PatchCheckShaderCache::runOnModule(
// Update input/output usage
StreamMapEntries(pResUsage->inOutUsage.inputLocMap, stream);
StreamMapEntries(pResUsage->inOutUsage.outputLocMap, stream);
StreamMapEntries(pResUsage->inOutUsage.inOutLocMap, stream);
StreamMapEntries(pResUsage->inOutUsage.perPatchInputLocMap, stream);
StreamMapEntries(pResUsage->inOutUsage.perPatchOutputLocMap, stream);
StreamMapEntries(pResUsage->inOutUsage.builtInInputLocMap, stream);
Expand Down
3 changes: 3 additions & 0 deletions patch/llpcPatchNullFragShader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ bool PatchNullFragShader::runOnModule(
GraphicsContext* pGraphicsContext = static_cast<GraphicsContext*>(m_pContext->GetPipelineContext());
pGraphicsContext->InitShaderInfoForNullFs();

// Disable pack input/output for null FS
pGraphicsContext->EnablePackInOut(false);

return true;
}

Expand Down
Loading

0 comments on commit 40b2a6b

Please sign in to comment.