Skip to content

Commit

Permalink
TPCClusterFinder: Improve performance of noisy pad filter on CPU.
Browse files Browse the repository at this point in the history
  • Loading branch information
fweig authored and davidrohr committed Jan 26, 2021
1 parent 9b68a21 commit cd07082
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 48 deletions.
1 change: 0 additions & 1 deletion Detectors/TPC/base/include/TPCBase/CalArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#ifndef ALICEO2_TPC_CALARRAY_H_
#define ALICEO2_TPC_CALARRAY_H_

#include <Vc/Vc>
#include <memory>
#include <vector>
#include <string>
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@

#define GPUCA_THREAD_COUNT_SCAN 512 // TODO: WARNING!!! Must not be GPUTYPE-dependent right now! // TODO: Fix!

#define GPUCA_LB_GPUTPCCFCheckPadBaseline GPUCA_WARP_SIZE
#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
#define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap GPUCA_LB_CLUSTER_FINDER
#define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits GPUCA_LB_CLUSTER_FINDER
#define GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart GPUCA_LB_CLUSTER_FINDER
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -551,7 +551,7 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
checkForNoisyPads &= (rec()->GetParam().rec.noisyPadsQuickCheck ? fragment.index == 0 : true);

if (checkForNoisyPads) {
int nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::getPadsPerBlock(doGPU);
int nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline;
runKernel<GPUTPCCFCheckPadBaseline>(GetGridBlk(nBlocks, lane), {iSlice}, {});
}

Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/TPCClusterFinder/Array2D.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ struct GridSize<1> {
template <>
struct GridSize<2> {
enum {
Width = 4,
Width = 8,
Height = 4,
};
};
Expand Down
119 changes: 98 additions & 21 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 +16,45 @@
#include "PackedCharge.h"
#include "clusterFinderDefs.h"

#ifndef GPUCA_GPUCODE
#ifndef GPUCA_NO_VC
#include <Vc/Vc>
#else
#include <array>
#endif
#endif

using namespace GPUCA_NAMESPACE::gpu;
using namespace GPUCA_NAMESPACE::gpu::tpccf;

template <>
GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer)
{
static_assert(TPC_MAX_FRAGMENT_LEN % NumOfCachedTimebins == 0);

const CfFragment& fragment = clusterer.mPmemory->fragment;
Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));

int basePad = iBlock * PadsPerCacheline;
ChargePos basePos = padToChargePos(basePad, clusterer);

if (not basePos.valid()) {
return;
}

#ifdef GPUCA_GPUCODE
static_assert(TPC_MAX_FRAGMENT_LEN % NumOfCachedTimebins == 0);

int totalCharges = 0;
int consecCharges = 0;
int maxConsecCharges = 0;

int localPadId = iThread / NumOfCachedTimebins;
int localTimeBin = iThread % NumOfCachedTimebins;
short localPadId = iThread / NumOfCachedTimebins;
short localTimeBin = iThread % NumOfCachedTimebins;
bool handlePad = localTimeBin == 0;
int basePad = iBlock * PadsPerBlock;

CfFragment& fragment = clusterer.mPmemory->fragment;

ChargePos basePos = padToChargePos(basePad + localPadId, clusterer);

for (tpccf::TPCFragmentTime t = localTimeBin + fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
ChargePos pos = basePos.delta({0, t});
for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
ChargePos pos = basePos.delta({localPadId, short(t + localTimeBin)});
smem.charges[localPadId][localTimeBin] = (pos.valid()) ? chargeMap[pos].unpack() : 0;
GPUbarrierWarp();
GPUbarrier();
if (handlePad) {
for (int i = 0; i < NumOfCachedTimebins; i++) {
Charge q = smem.charges[localPadId][i];
Expand All @@ -53,28 +65,93 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int nBlocks, int nThreads, int i
}
}

GPUbarrierWarp();
GPUbarrier();

if (handlePad) {
int totalChargesBaseline = clusterer.Param().rec.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
int consecChargesBaseline = clusterer.Param().rec.maxConsecTimeBinAboveThreshold;
bool hasLostBaseline = (totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && maxConsecCharges >= consecChargesBaseline);
clusterer.mPpadHasLostBaseline[basePad + localPadId] |= hasLostBaseline;
updatePadBaseline(basePad + localPadId, clusterer, totalCharges, maxConsecCharges);
}

#else // CPU CODE

constexpr size_t ElemsInTileRow = TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;

#ifndef GPUCA_NO_VC
using UShort8 = Vc::fixed_size_simd<unsigned short, PadsPerCacheline>;

UShort8 totalCharges{Vc::Zero};
UShort8 consecCharges{Vc::Zero};
UShort8 maxConsecCharges{Vc::Zero};
#else
std::array<unsigned short, PadsPerCacheline> totalCharges{0};
std::array<unsigned short, PadsPerCacheline> consecCharges{0};
std::array<unsigned short, PadsPerCacheline> maxConsecCharges{0};
#endif

tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();
const unsigned short* charge = reinterpret_cast<unsigned short*>(&chargeMap[basePos.delta({0, t})]);

for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
#ifndef GPUCA_NO_VC
UShort8 charges{charge + PadsPerCacheline * localtime, Vc::Aligned};

UShort8::mask_type isCharge = charges != 0;

if (isCharge.isNotEmpty()) {
totalCharges(isCharge)++;
consecCharges += 1;
consecCharges(not isCharge) = 0;
maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);
} else {
consecCharges = 0;
}
#else // Vc not available
for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
bool isCharge = charge[PadsPerCacheline * localtime + localpad] != 0;
if (isCharge) {
totalCharges[localpad]++;
consecCharges[localpad]++;
maxConsecCharges[localpad] = CAMath::Max(maxConsecCharges[localpad], consecCharges[localpad]);
} else {
consecCharges[localpad] = 0;
}
}
#endif
}

charge += ElemsInTileRow;
}

for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
updatePadBaseline(basePad + localpad, clusterer, totalCharges[localpad], maxConsecCharges[localpad]);
}
#endif
}

GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int pad, const GPUTPCClusterFinder& clusterer)
GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int& pad, const GPUTPCClusterFinder& clusterer)
{
const GPUTPCGeometry& geo = clusterer.Param().tpcGeometry;

int padOffset = 0;
for (Row r = 0; r < TPC_NUM_OF_ROWS; r++) {
int npads = geo.NPads(r);
int padInRow = pad - padOffset;
if (0 <= padInRow && padInRow < geo.NPads(r)) {
return ChargePos{r, Pad(padInRow), 0};
if (0 <= padInRow && padInRow < CAMath::nextMultipleOf<PadsPerCacheline, int>(npads)) {
int cachelineOffset = padInRow % PadsPerCacheline;
pad -= cachelineOffset;
return ChargePos{r, Pad(padInRow - cachelineOffset), 0};
}
padOffset += geo.NPads(r);
padOffset += npads;
}

return ChargePos{0, 0, INVALID_TIME_BIN};
}

// Marks a pad as having lost its baseline when the charge activity seen in the
// current time fragment exceeds the configured limits: either the total number
// of occupied time bins (scaled per 1000 bins to the fragment length) or the
// longest consecutive run of occupied time bins. The flag is sticky: once set
// in mPpadHasLostBaseline it is never cleared here (|= accumulation).
GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int pad, const GPUTPCClusterFinder& clusterer, int totalCharges, int consecCharges)
{
  const CfFragment& fragment = clusterer.mPmemory->fragment;

  // Threshold on total occupied bins, rescaled from a per-1000-bin setting to
  // the actual (overlap-free) fragment length. A value of 0 disables the check.
  const int totalLimit = clusterer.Param().rec.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
  // Threshold on the longest consecutive run; 0 disables the check.
  const int consecLimit = clusterer.Param().rec.maxConsecTimeBinAboveThreshold;

  bool lostBaseline = false;
  if (totalLimit > 0 && totalCharges >= totalLimit) {
    lostBaseline = true;
  }
  if (consecLimit > 0 && consecCharges >= consecLimit) {
    lostBaseline = true;
  }

  clusterer.mPpadHasLostBaseline[pad] |= lostBaseline;
}
27 changes: 7 additions & 20 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,15 @@ namespace GPUCA_NAMESPACE::gpu
class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
{

private:
// Only use these constants on device side...
// Use getPadsPerBlock() for host side
public:
enum {
PadsPerBlockGPU = 4, // Number of pads in a single cache line
PadsPerBlockCPU = 1,
#ifdef GPUCA_GPUCODE
PadsPerBlock = PadsPerBlockGPU,
#else
PadsPerBlock = PadsPerBlockCPU,
#endif
NumOfCachedTimebins = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / PadsPerBlock,
PadsPerCacheline = 8,
TimebinsPerCacheline = 4,
NumOfCachedTimebins = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / PadsPerCacheline,
};

public:
struct GPUSharedMemory {
tpccf::Charge charges[PadsPerBlock][NumOfCachedTimebins];
tpccf::Charge charges[PadsPerCacheline][NumOfCachedTimebins];
};

#ifdef HAVE_O2HEADERS
Expand All @@ -57,17 +49,12 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
return GPUDataTypes::RecoStep::TPCClusterFinding;
}

// Use this to get num of pads per block on host side. Can't use constant there.
static int getPadsPerBlock(bool isGPU)
{
return (isGPU) ? PadsPerBlockGPU : PadsPerBlockCPU;
}

template <int iKernel = defaultKernel>
GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer);

private:
GPUd() static ChargePos padToChargePos(int pad, const GPUTPCClusterFinder&);
GPUd() static ChargePos padToChargePos(int& pad, const GPUTPCClusterFinder&);
GPUd() static void updatePadBaseline(int pad, const GPUTPCClusterFinder&, int totalCharges, int consecCharges);
};

} // namespace GPUCA_NAMESPACE::gpu
Expand Down
1 change: 1 addition & 0 deletions GPU/GPUTracking/TPCClusterFinder/PackedCharge.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class PackedCharge
GPUdi() tpccf::Charge unpack() const { return tpccf::Charge(mVal & ChargeMask) / tpccf::Charge(1 << DecimalBits); }
GPUdi() bool has3x3Peak() const { return mVal & Has3x3PeakMask; }
GPUdi() bool isSplit() const { return mVal & IsSplitMask; }
GPUdi() bool isZero() const { return mVal == 0; }

private:
BasicType mVal;
Expand Down
9 changes: 6 additions & 3 deletions GPU/GPUTracking/TPCClusterFinder/clusterFinderDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,15 @@ using ulong = unsigned long;
#define SCRATCH_PAD_NOISE_N 16
#endif

#define PADDING_PAD 2
#define PADDING_TIME 3
// Padding of 2 and 3 respectively would be enough. But this ensures that
// rows are always aligned along cache lines. Likewise for TPC_PADS_PER_ROW.
#define PADDING_PAD 8
#define PADDING_TIME 4
#define TPC_PADS_PER_ROW 144

#define TPC_SECTORS 36
#define TPC_ROWS_PER_CRU 18
#define TPC_NUM_OF_ROWS 152
#define TPC_PADS_PER_ROW 138
#define TPC_PADS_PER_ROW_PADDED (TPC_PADS_PER_ROW + PADDING_PAD)
#define TPC_NUM_OF_PADS (TPC_NUM_OF_ROWS * TPC_PADS_PER_ROW_PADDED + PADDING_PAD)
#define TPC_PADS_IN_SECTOR 14560
Expand Down

0 comments on commit cd07082

Please sign in to comment.