Skip to content

Commit 01e105b

Browse files
committed
apps/nccl: add unroll in allred8
1 parent 6484dce commit 01e105b

File tree

1 file changed

+6 / -6 lines changed

1 file changed

+6 / -6 lines changed

apps/nccl/src/allreduce.hpp

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -382,7 +382,7 @@ __global__ void __launch_bounds__(512, 1)
382382
__syncthreads();
383383
// Starts allgather
384384
for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) {
385-
for (int i = 0; i < nPeer; i++) {
385+
for (int i = 0; i < NPEERS; i++) {
386386
const int peerIdx = (i + blockIdx.x) % nPeer;
387387
const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
388388
int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -399,13 +399,13 @@ __global__ void __launch_bounds__(512, 1)
399399

400400
for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) {
401401
int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
402-
for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
402+
for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
403403
const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
404404
int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
405405
data = add_vectors<T>(val, data);
406406
}
407407
resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
408-
for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
408+
for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
409409
outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4),
410410
data);
411411
}
@@ -419,7 +419,7 @@ __global__ void __launch_bounds__(512, 1)
419419
}
420420
__syncthreads();
421421
for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
422-
for (int i = 0; i < nPeer; i++) {
422+
for (int i = 0; i < NPEERS; i++) {
423423
const int peerIdx = (i + blockIdx.x) % nPeer;
424424
const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
425425
int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -435,13 +435,13 @@ __global__ void __launch_bounds__(512, 1)
435435

436436
for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
437437
int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
438-
for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
438+
for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
439439
const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
440440
int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
441441
data = add_vectors<T>(val, data);
442442
}
443443
resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
444-
for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
444+
for (int peerIdx = 0; peerIdx < NPEERS; peerIdx++) {
445445
outChannels[peerIdx].write(nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof(int4),
446446
data);
447447
}

0 commit comments

Comments (0)