@@ -382,7 +382,7 @@ __global__ void __launch_bounds__(512, 1)
382
382
__syncthreads ();
383
383
// Starts allgather
384
384
for (size_t idx = threadIdx.x ; idx < nInt4PerChunk; idx += blockDim.x ) {
385
- for (int i = 0 ; i < nPeer ; i++) {
385
+ for (int i = 0 ; i < NPEERS ; i++) {
386
386
const int peerIdx = (i + blockIdx.x ) % nPeer;
387
387
const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1 ;
388
388
int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -399,13 +399,13 @@ __global__ void __launch_bounds__(512, 1)
399
399
400
400
for (size_t idx = threadIdx.x ; idx < nInt4PerChunk; idx += blockDim.x ) {
401
401
int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
402
- for (int peerIdx = 0 ; peerIdx < nPeer ; peerIdx++) {
402
+ for (int peerIdx = 0 ; peerIdx < NPEERS ; peerIdx++) {
403
403
const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1 ;
404
404
int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
405
405
data = add_vectors<T>(val, data);
406
406
}
407
407
resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
408
- for (int peerIdx = 0 ; peerIdx < nPeer ; peerIdx++) {
408
+ for (int peerIdx = 0 ; peerIdx < NPEERS ; peerIdx++) {
409
409
outChannels[peerIdx].write (nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof (int4),
410
410
data);
411
411
}
@@ -419,7 +419,7 @@ __global__ void __launch_bounds__(512, 1)
419
419
}
420
420
__syncthreads ();
421
421
for (size_t idx = threadIdx.x ; idx < restNInt4; idx += blockDim.x ) {
422
- for (int i = 0 ; i < nPeer ; i++) {
422
+ for (int i = 0 ; i < NPEERS ; i++) {
423
423
const int peerIdx = (i + blockIdx.x ) % nPeer;
424
424
const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1 ;
425
425
int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -435,13 +435,13 @@ __global__ void __launch_bounds__(512, 1)
435
435
436
436
for (size_t idx = threadIdx.x ; idx < restNInt4; idx += blockDim.x ) {
437
437
int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
438
- for (int peerIdx = 0 ; peerIdx < nPeer ; peerIdx++) {
438
+ for (int peerIdx = 0 ; peerIdx < NPEERS ; peerIdx++) {
439
439
const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1 ;
440
440
int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
441
441
data = add_vectors<T>(val, data);
442
442
}
443
443
resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
444
- for (int peerIdx = 0 ; peerIdx < nPeer ; peerIdx++) {
444
+ for (int peerIdx = 0 ; peerIdx < NPEERS ; peerIdx++) {
445
445
outChannels[peerIdx].write (nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof (int4),
446
446
data);
447
447
}
0 commit comments