/
ParticleTileCullingCS.hlsl
225 lines (188 loc) · 8.49 KB
/
ParticleTileCullingCS.hlsl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
//
// Copyright (c) Microsoft. All rights reserved.
// This code is licensed under the MIT License (MIT).
// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
//
// Developed by Minigraph
//
// Author(s): James Stanard
// Julia Careaga
//
#include "ParticleUtility.hlsli"
// ---- Inputs ----
// Per-bin lists of particle sort keys. Bin b owns the MAX_PARTICLES_PER_BIN slots
// starting at b * MAX_PARTICLES_PER_BIN.
StructuredBuffer<uint> g_BinParticles : register(t0);
// One 32-bit particle counter per bin (read at byte offset BinIndex * 4). The value may
// exceed MAX_PARTICLES_PER_BIN; main() clamps it before use.
ByteAddressBuffer g_BinCounters : register(t1);
// Per-tile depth bound, indexed by tile coordinate.
Texture2D<uint> g_DepthBounds : register(t2);
// Screen-space particle data, indexed by the low 18 bits of a sort key.
StructuredBuffer<ParticleScreenData> g_VisibleParticles : register(t3);
// ---- Outputs ----
// Sorted sort keys, using the same per-bin layout as g_BinParticles.
RWStructuredBuffer<uint> g_SortedParticles : register(u0);
// Per-tile particle hit masks; see ComputeMaskOffset() for the addressing.
RWByteAddressBuffer g_TileHitMasks : register(u1);
// Draw packets for tiles that contain at least one particle counted as "slow path".
RWStructuredBuffer<uint> g_DrawPackets : register(u2);
// Draw packets for tiles whose particles are all on the fast path.
RWStructuredBuffer<uint> g_FastDrawPackets : register(u3);
// Packet counters: byte offset 0 counts g_DrawPackets entries, byte offset 12 counts
// g_FastDrawPackets entries.
RWByteAddressBuffer g_DrawPacketCount : register(u4);
// The thread group always has at least 64 threads, even when a bin has fewer tiles.
#if TILES_PER_BIN < 64
#define GROUP_THREAD_COUNT 64
#else
#define GROUP_THREAD_COUNT TILES_PER_BIN
#endif
// The group is as wide as a bin; any threads beyond TILES_PER_BIN become extra rows.
#define GROUP_SIZE_X TILES_PER_BIN_X
#define GROUP_SIZE_Y (GROUP_THREAD_COUNT / GROUP_SIZE_X)
// Number of 32-bit words needed to hold one bit per thread of the group.
#define MASK_WORDS_PER_ITER (GROUP_THREAD_COUNT / 32)
// Sort keys of the bin currently being processed (sorted in place by BitonicSort).
groupshared uint gs_SortKeys[MAX_PARTICLES_PER_BIN];
// Per-tile hit bits for the batch of GROUP_THREAD_COUNT particles tested in one iteration.
groupshared uint gs_IntersectionMasks[TILES_PER_BIN * MASK_WORDS_PER_ITER];
// Running count of particles that hit each tile.
groupshared uint gs_TileParticleCounts[TILES_PER_BIN];
// Running count of hit particles per tile that were classified as "slow path".
groupshared uint gs_SlowTileParticleCounts[TILES_PER_BIN];
// Per-tile depth bound, loaded from g_DepthBounds and shifted left by 2.
groupshared uint gs_MinMaxDepth[TILES_PER_BIN];
// Cooperative in-place bitonic sort of gs_SortKeys into ascending order.
//   GI          - this thread's flattened index within the group
//   NumElements - number of valid keys; rounded up per pass to a multiple of the pass size
//   NextPow2    - power-of-two length of the (padded) key array; bounds the number of passes
//   NumThreads  - number of threads sharing the work
// The function contains group-wide barriers inside loops bounded by NextPow2, so every
// thread of the group must call it with the same NumElements and NextPow2.
// NOTE(review): InsertZeroBit is defined outside this file; it is assumed to map a pair
// index to the lower element index of that pair for the given stride - confirm.
void BitonicSort(uint GI, uint NumElements, uint NextPow2, uint NumThreads)
{
    // Each pass merges bitonic sequences of length SeqLen; the length doubles every pass.
    for (uint SeqLen = 2; SeqLen <= NextPow2; SeqLen <<= 1)
    {
        // Round the element count up to a whole number of sequences of this length.
        NumElements = (NumElements + SeqLen - 1) & ~(SeqLen - 1);
        const uint PairCount = NumElements >> 1;

        for (uint Stride = SeqLen >> 1; Stride > 0; Stride >>= 1)
        {
            // The threads stride over all compare/exchange pairs of this step.
            for (uint Pair = GI; Pair < PairCount; Pair += NumThreads)
            {
                const uint LowIdx = InsertZeroBit(Pair, Stride);
                const uint HighIdx = LowIdx | Stride;

                const uint LowKey = gs_SortKeys[LowIdx];
                const uint HighKey = gs_SortKeys[HighIdx];

                // Sequences whose SeqLen bit is clear are ordered ascending, the others
                // descending; exchange whenever the pair disagrees with that direction.
                const bool Ascending = (LowIdx & SeqLen) == 0;
                if ((LowKey < HighKey) != Ascending)
                {
                    gs_SortKeys[LowIdx] = HighKey;
                    gs_SortKeys[HighIdx] = LowKey;
                }
            }

            // All exchanges of this step must be visible before the next step reads them.
            GroupMemoryBarrierWithGroupSync();
        }
    }
}
// Returns the byte offset in g_TileHitMasks at which a thread starts writing hit masks.
//   Gid  - bin coordinate (group ID)
//   GTid - thread coordinate within the group
// Each tile owns MAX_PARTICLES_PER_BIN / 8 bytes. The group can have more rows than the
// bin has tile rows; such threads wrap around onto the same tiles and are offset by 4
// bytes per wrap, so that they address a different word of the same tile.
uint ComputeMaskOffset( uint2 Gid, uint2 GTid )
{
    // Tile coordinate of the bin's first tile, and this thread's tile inside the bin.
    const uint2 BinOrigin = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y);
    const uint2 LocalTile = uint2(GTid.x, GTid.y % TILES_PER_BIN_Y);
    const uint2 Tile = BinOrigin + LocalTile;

    // Linear tile index using the global row pitch.
    const uint LinearTile = Tile.x + Tile.y * gTileRowPitch;

    const uint TileBaseInBytes = LinearTile * MAX_PARTICLES_PER_BIN / 8;
    const uint WordOffsetInBytes = GTid.y / TILES_PER_BIN_Y * 4;
    return TileBaseInBytes + WordOffsetInBytes;
}
// Entry point: one thread group processes one bin. The group
//   1. loads the bin's sort keys into group-shared memory and sorts them,
//   2. tests the sorted particles against the tiles of the bin, GROUP_THREAD_COUNT
//      particles at a time, and writes the resulting per-tile hit masks to g_TileHitMasks,
//   3. emits one draw packet for every tile that was hit by at least one particle.
// Gid  - group ID; Gid.xy is the bin coordinate
// GI   - flattened thread index within the group
// GTid - thread ID within the group; GTid.xy selects the tile a thread is responsible for
[RootSignature(Particle_RootSig)]
[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex, uint3 GTid : SV_GroupThreadID )
{
// Each group is assigned a bin
uint BinIndex = Gid.y * gBinsPerRow + Gid.x;
uint ParticleCountInBin = g_BinCounters.Load(BinIndex * 4);
// The count depends only on the group ID, so all threads of the group take this exit
// together, before any barrier is reached.
if (ParticleCountInBin == 0)
return;
// Get the start location for particles in this bin
uint BinStart = BinIndex * MAX_PARTICLES_PER_BIN;
// Each thread is assigned a tile
uint2 TileCoord = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y) + GTid.xy;
// Only the first TILES_PER_BIN threads own a tile; they initialize the per-tile state.
if (GI < TILES_PER_BIN)
{
gs_TileParticleCounts[GI] = 0;
gs_SlowTileParticleCounts[GI] = 0;
// NOTE(review): the << 2 presumably brings the stored bound into the same encoding as
// the sort keys it is compared against below - confirm against the pass that writes
// g_DepthBounds.
gs_MinMaxDepth[GI] = g_DepthBounds[TileCoord] << 2;
}
// Sometimes the counter value exceeds the actual storage size
ParticleCountInBin = min(MAX_PARTICLES_PER_BIN, ParticleCountInBin);
// Compute the next power of two for the bitonic sort. A count that already is a power of
// two (a single bit set) is used as is.
uint NextPow2 = countbits(ParticleCountInBin) <= 1 ? ParticleCountInBin : (2u << firstbithigh(ParticleCountInBin));
// Fill in the sort key array. Each sort key has passenger data in the least significant
// bits, so that as the sort keys are moved around, they retain a pointer to the particle
// they refer to. Slots between the particle count and NextPow2 are padded with the
// largest key so that the padding ends up behind all real particles.
for (uint k = GI; k < NextPow2; k += GROUP_THREAD_COUNT)
gs_SortKeys[k] = k < ParticleCountInBin ? g_BinParticles[BinStart + k] : 0xffffffff;
// Makes the key array and the per-tile state initialized above visible to the whole group.
GroupMemoryBarrierWithGroupSync();
// Sort the particles from front to back.
BitonicSort(GI, ParticleCountInBin, NextPow2, GROUP_THREAD_COUNT);
// Tile coordinate of the bin's upper-left tile
const int2 StartTile = Gid.xy * uint2(TILES_PER_BIN_X, TILES_PER_BIN_Y);
// Each thread writes the hit mask for one tile
uint OutOffsetInBytes = ComputeMaskOffset(Gid.xy, GTid.xy);
// Loop over all sorted particles, group-size count at a time
for (uint Iter = 0; Iter < ParticleCountInBin; Iter += GROUP_THREAD_COUNT)
{
// Reset temporary particle intersection masks. There are MASK_WORDS_PER_ITER words per
// tile (two words, i.e. 64 bits, when the group has 64 threads).
//[unroll]
for (uint C = GI; C < TILES_PER_BIN * MASK_WORDS_PER_ITER; C += GROUP_THREAD_COUNT)
gs_IntersectionMasks[C] = 0;
GroupMemoryBarrierWithGroupSync();
// The array index of the particle this thread will test
uint SortIdx = Iter + GI;
// Compute word and bit to set (from thread index)
uint WordOffset = GI >> 5;
uint BitOffset = GI & 31;
// Only do the loads and stores if this is a valid index. Threads past the end of the
// particle list skip the test but still take part in the barriers below.
if (SortIdx < ParticleCountInBin)
{
uint SortKey = gs_SortKeys[SortIdx];
// The low 18 bits of the key are the particle's index into g_VisibleParticles
uint GlobalIdx = SortKey & 0x3FFFF;
// After this phase, all we care about is its global index
g_SortedParticles[BinStart + SortIdx] = SortKey;
// Bounds packs four 8-bit tile coordinates: min x (bits 0-7), min y (bits 8-15),
// max x (bits 16-23), max y (bits 24-31).
uint Bounds = g_VisibleParticles[GlobalIdx].Bounds;
int2 MinTile = uint2(Bounds >> 0, Bounds >> 8) & 0xFF;
int2 MaxTile = uint2(Bounds >> 16, Bounds >> 24) & 0xFF;
// Make the range relative to this bin and clamp it to the bin's tiles. A particle that
// does not overlap the bin ends up with an empty range.
MinTile = max(MinTile - StartTile, 0);
MaxTile = min(MaxTile - StartTile, int2(TILES_PER_BIN_X, TILES_PER_BIN_Y) - 1);
for (int y = MinTile.y; y <= MaxTile.y; y++)
{
for (int x = MinTile.x; x <= MaxTile.x; x++)
{
uint TileIndex = y * TILES_PER_BIN_X + x;
uint TileMaxZ = gs_MinMaxDepth[TileIndex];
// The particle counts as a hit when its key compares below the tile's depth bound.
uint Inside = SortKey < TileMaxZ ? 1 : 0;
// A hit is additionally counted as "slow path" when the key is greater than the bound
// shifted left by 16.
// NOTE(review): presumably this compares against the other half of a packed min/max
// depth pair - confirm against the g_DepthBounds encoding.
uint SlowPath = SortKey > (TileMaxZ << 16) ? Inside : 0;
InterlockedAdd(gs_SlowTileParticleCounts[TileIndex], SlowPath);
// Set this thread's bit in the tile's mask for the current batch.
InterlockedOr(gs_IntersectionMasks[TileIndex * MASK_WORDS_PER_ITER + WordOffset], Inside << BitOffset);
}
}
}
// All hit bits of this batch must be in place before the masks are copied out.
GroupMemoryBarrierWithGroupSync();
#if TILES_PER_BIN < GROUP_THREAD_COUNT
// Copy the hit masks from LDS to the output buffer. Here, each thread copies a single word
if (GI < TILES_PER_BIN * MASK_WORDS_PER_ITER)
{
// Threads wrap around the tiles: GI % TILES_PER_BIN selects the tile and
// GI / TILES_PER_BIN selects which of the tile's words this thread copies.
uint TileIndex = GI % TILES_PER_BIN;
uint Offset = TileIndex * MASK_WORDS_PER_ITER + (GI / TILES_PER_BIN);
uint Mask = gs_IntersectionMasks[Offset];
InterlockedAdd(gs_TileParticleCounts[TileIndex], countbits(Mask));
g_TileHitMasks.Store(OutOffsetInBytes, Mask);
// Advance past the two words (8 bytes) each tile receives per batch; this branch is
// only compiled when GROUP_THREAD_COUNT is 64, i.e. MASK_WORDS_PER_ITER is 2.
OutOffsetInBytes += 8;
}
#else
// Copy the hit masks from LDS to the output buffer. Here, each thread is assigned a tile.
uint Offset = GI * MASK_WORDS_PER_ITER;
// The tile's words are copied two at a time.
[unroll]
for (uint O = 0; O < MASK_WORDS_PER_ITER; O += 2)
{
uint Mask0 = gs_IntersectionMasks[Offset+O];
uint Mask1 = gs_IntersectionMasks[Offset+O+1];
InterlockedAdd(gs_TileParticleCounts[GI], countbits(Mask0) + countbits(Mask1));
g_TileHitMasks.Store2( OutOffsetInBytes, uint2(Mask0, Mask1) );
OutOffsetInBytes += 8;
}
#endif
// The masks must have been read by everyone before the next batch clears them; this also
// makes the final tile counts visible to the packet-writing code after the loop.
GroupMemoryBarrierWithGroupSync();
}
// Threads that do not own a tile have nothing left to do.
if (GI >= TILES_PER_BIN)
return;
uint ParticleCountInThisThreadsTile = gs_TileParticleCounts[GI];
// Only tiles that were hit by at least one particle produce a draw packet.
if (ParticleCountInThisThreadsTile > 0)
{
uint SlowParticlesInThisThreadsTile = gs_SlowTileParticleCounts[GI];
// Packet layout: particle count in the low bits, tile x starting at bit 16, tile y
// starting at bit 24.
uint Packet = TileCoord.x << 16 | TileCoord.y << 24 | ParticleCountInThisThreadsTile;
uint NewPacketIndex;
if (SlowParticlesInThisThreadsTile > 0)
{
// Reserve a slot in the regular packet list (counter at byte offset 0).
g_DrawPacketCount.InterlockedAdd(0, 1, NewPacketIndex);
g_DrawPackets[NewPacketIndex] = Packet;
}
else
{
// Reserve a slot in the fast packet list (counter at byte offset 12).
g_DrawPacketCount.InterlockedAdd(12, 1, NewPacketIndex);
g_FastDrawPackets[NewPacketIndex] = Packet;
}
}
}