**`10-H_Device-initiated_Communication_with_NVSHMEM/Instructions.md`** (new file, 52 additions)
# SC21 Tutorial: Efficient Distributed GPU Programming for Exascale

- Time: Sunday, 14 November 2021 8AM - 5PM CST
- Location: *online*
- Program Link: https://sc21.supercomputing.org/presentation/?id=tut138&sess=sess188


## Hands-On 10: Device-initiated Communication with NVSHMEM

### Task 0: Using NVSHMEM device API

#### Description

The purpose of this task is to use the NVSHMEM device API instead of MPI to implement a multi-GPU Jacobi solver. The starting point of this task is the MPI variant of the solver. Work on the `TODOs` in `jacobi.cu`:

- Initialize NVSHMEM (same as in Hands-On 8-H):
- Include NVSHMEM headers.
- Initialize and shutdown NVSHMEM using `MPI_COMM_WORLD`.
- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size!
- Calculate halo/boundary row index of top and bottom neighbors.
- Add the necessary inter-PE synchronization.
- Modify `jacobi_kernel`:
- Pass in halo/boundary row index of top and bottom neighbors.
- Use `nvshmem_float_p` to directly push values needed by top and bottom neighbors from the kernel.
- Remove no longer needed MPI communication.
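Taken together, the host-side steps above might look like the following sketch. This is only an outline, not the actual contents of `jacobi.cu`; names such as `chunk_size_max`, `nx`, and `stream` are placeholders:

``` {.cpp}
#include <mpi.h>
#include <nvshmem.h>
#include <nvshmemx.h>

// Bootstrap NVSHMEM on top of an already-initialized MPI job.
MPI_Comm mpi_comm = MPI_COMM_WORLD;
nvshmemx_init_attr_t attr;
attr.mpi_comm = &mpi_comm;
nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);

int mype = nvshmem_my_pe();
int npes = nvshmem_n_pes();

// Symmetric-heap allocations must use the SAME size on every PE, so size
// for the largest chunk even if the domain does not divide evenly.
float* a     = (float*) nvshmem_malloc(nx * (chunk_size_max + 2) * sizeof(float));
float* a_new = (float*) nvshmem_malloc(nx * (chunk_size_max + 2) * sizeof(float));

// ... launch jacobi_kernel, then synchronize all PEs before swapping
// a and a_new, e.g. with nvshmem_barrier_all() on the host or
// nvshmemx_barrier_all_on_stream(stream) to keep the work on a stream ...

nvshmem_free(a);
nvshmem_free(a_new);
nvshmem_finalize();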

Compile with

``` {.bash}
make
```

Submit your compiled application to the batch system with

``` {.bash}
make run
```

Study the performance by inspecting the profile generated with
`make profile`. For `make run` and `make profile`, the environment variable `NP` can be set to change the number of processes.
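For example, to override the default process count (4, per the Makefile) for a run or a profiling run:

``` {.bash}
NP=2 make run
NP=8 make profile
```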

### Task 1: Use `nvshmemx_float_put_nbi_block`

#### Description

This is an optional task: use `nvshmemx_float_put_nbi_block` instead of `nvshmem_float_p` for more efficient multi-node execution. There are no `TODOs` prepared; use the solution of Task 0 as a starting point. Some tips:

- You only need to change `jacobi_kernel`.
- Switching to a 1-dimensional CUDA block can simplify the task.
- The difficult part is calculating the right offsets and sizes for the call to `nvshmemx_float_put_nbi_block`.
- If a CUDA block needs to communicate data with `nvshmemx_float_put_nbi_block`, all threads in that block must call into `nvshmemx_float_put_nbi_block`.
- The [`nvshmem_opt`](https://github.com/NVIDIA/multi-gpu-programming-models/blob/master/nvshmem_opt/jacobi.cu#L154) variant in the [Multi GPU Programming Models Github repository](https://github.com/NVIDIA/multi-gpu-programming-models) implements the same strategy.
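As a rough sketch of the tips above (all identifiers here are hypothetical, not the actual names in `jacobi.cu`): with a 1-dimensional block that covers one contiguous segment of a boundary row, every thread of the block reaches the collective put:

``` {.cpp}
// Inside jacobi_kernel, after the block has computed its segment of a_new.
// block_start is this block's first column, block_nx the number of columns
// it covers; iy_start is the first locally computed row, top_iy the halo
// row index on the top neighbor (all hypothetical names).
if (computes_top_boundary_row) {
    // Collective over the block: ALL of its threads must make this call.
    nvshmemx_float_put_nbi_block(
        a_new + top_iy * nx + block_start,   // destination on the top neighbor PE
        a_new + iy_start * nx + block_start, // local source segment
        block_nx,                            // number of floats to push
        top_pe);
}
```

Because the put is non-blocking (`_nbi`), completion must still be enforced, e.g. with `nvshmem_quiet()` or a barrier, before the neighbor reads its halo row.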
**`10-H_Device-initiated_Communication_with_NVSHMEM/Makefile`** (new file, 43 additions)
# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
NP ?= 4
NVCC=nvcc
JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4
CUDA_HOME ?= /usr/local/cuda
ifndef NVSHMEM_HOME
$(error NVSHMEM_HOME is not set)
endif
ifndef MPI_HOME
$(error MPI_HOME is not set)
endif
GENCODE_SM30 := -gencode arch=compute_30,code=sm_30
GENCODE_SM35 := -gencode arch=compute_35,code=sm_35
GENCODE_SM37 := -gencode arch=compute_37,code=sm_37
GENCODE_SM50 := -gencode arch=compute_50,code=sm_50
GENCODE_SM52 := -gencode arch=compute_52,code=sm_52
GENCODE_SM60 := -gencode arch=compute_60,code=sm_60
GENCODE_SM70 := -gencode arch=compute_70,code=sm_70
GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80
GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80)
ifdef DISABLE_CUB
NVCC_FLAGS = -Xptxas --optimize-float-atomics
else
NVCC_FLAGS = -DHAVE_CUB
endif
NVCC_FLAGS += -dc -Xcompiler -fopenmp -lineinfo -DUSE_NVTX -lnvToolsExt $(GENCODE_FLAGS) -std=c++14 -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include
NVCC_LDFLAGS = -ccbin=mpic++ -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvToolsExt
jacobi: Makefile jacobi.cu
	$(NVCC) $(NVCC_FLAGS) jacobi.cu -c -o jacobi.o
	$(NVCC) $(GENCODE_FLAGS) jacobi.o -o jacobi $(NVCC_LDFLAGS)

.PHONY: clean sanitize run profile
clean:
	rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log

sanitize: jacobi
	$(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10

run: jacobi
	$(JSC_SUBMIT_CMD) -n $(NP) ./jacobi

profile: jacobi
	$(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10
**`10-H_Device-initiated_Communication_with_NVSHMEM/copy.mk`** (new file, 40 additions)
#!/usr/bin/make -f
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
TASKDIR = ../../tasks/10-H_Device-initiated_Communication_with_NVSHMEM
SOLUTIONDIR = ../../solutions/10-H_Device-initiated_Communication_with_NVSHMEM

PROCESSFILES = jacobi.cu
COPYFILES = Makefile Instructions.ipynb Instructions.md


TASKPROCESSFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES))
TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES))
SOLUTIONPROCESSFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES))
SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES))

.PHONY: all task
all: task
task: ${TASKPROCESSFILES} ${TASKCOPYFILES} ${SOLUTIONPROCESSFILES} ${SOLUTIONCOPYFILES}


${TASKPROCESSFILES}: $(PROCESSFILES)
	mkdir -p $(TASKDIR)/
	cppp -USOLUTION $(notdir $@) $@

${SOLUTIONPROCESSFILES}: $(PROCESSFILES)
	mkdir -p $(SOLUTIONDIR)/
	cppp -DSOLUTION $(notdir $@) $@


${TASKCOPYFILES}: $(COPYFILES)
	mkdir -p $(TASKDIR)/
	cp $(notdir $@) $@

${SOLUTIONCOPYFILES}: $(COPYFILES)
	mkdir -p $(SOLUTIONDIR)/
	cp $(notdir $@) $@

%.ipynb: %.md
	pandoc $< -o $@
	# add metadata so the notebook is recognized as Python
	jq -s '.[0] * .[1]' $@ ../template.json | sponge $@