diff --git a/Makefile b/Makefile index 2a244a4..9b99b8b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # Builds all available modules and examples # Default target -.PHONY: all clean test help module1 module2 module3 module4 module5 module6 module7 module8 module9 +.PHONY: all clean test debug profile help module1 module2 module3 module4 module5 module6 module7 module8 module9 # Build all available modules all: module1 module2 module3 module4 module5 module6 module7 module8 module9 @@ -83,6 +83,84 @@ test-module9: @echo "Testing Module 9..." @$(MAKE) -C modules/module9/examples test +# Debug builds for all modules +debug: debug-module1 debug-module2 debug-module3 debug-module4 debug-module5 debug-module6 debug-module7 debug-module8 debug-module9 + +debug-module1: + @echo "Debug build Module 1..." + @$(MAKE) -C modules/module1/examples debug + +debug-module2: + @echo "Debug build Module 2..." + @$(MAKE) -C modules/module2/examples debug + +debug-module3: + @echo "Debug build Module 3..." + @$(MAKE) -C modules/module3/examples debug + +debug-module4: + @echo "Debug build Module 4..." + @$(MAKE) -C modules/module4/examples debug + +debug-module5: + @echo "Debug build Module 5..." + @$(MAKE) -C modules/module5/examples debug + +debug-module6: + @echo "Debug build Module 6..." + @$(MAKE) -C modules/module6/examples debug + +debug-module7: + @echo "Debug build Module 7..." + @$(MAKE) -C modules/module7/examples debug + +debug-module8: + @echo "Debug build Module 8..." + @$(MAKE) -C modules/module8/examples debug + +debug-module9: + @echo "Debug build Module 9..." + @$(MAKE) -C modules/module9/examples debug + +# Profile builds for all modules +profile: profile-module1 profile-module2 profile-module3 profile-module4 profile-module5 profile-module6 profile-module7 profile-module8 profile-module9 + +profile-module1: + @echo "Profile build Module 1..." + @$(MAKE) -C modules/module1/examples profile + +profile-module2: + @echo "Profile build Module 2..." + @$(MAKE) -C modules/module2/examples profile + +profile-module3: + @echo "Profile build Module 3..." + @$(MAKE) -C modules/module3/examples profile + +profile-module4: + @echo "Profile build Module 4..." + @$(MAKE) -C modules/module4/examples profile + +profile-module5: + @echo "Profile build Module 5..." + @$(MAKE) -C modules/module5/examples profile + +profile-module6: + @echo "Profile build Module 6..." + @$(MAKE) -C modules/module6/examples profile + +profile-module7: + @echo "Profile build Module 7..." + @$(MAKE) -C modules/module7/examples profile + +profile-module8: + @echo "Profile build Module 8..." + @$(MAKE) -C modules/module8/examples profile + +profile-module9: + @echo "Profile build Module 9..." + @$(MAKE) -C modules/module9/examples profile + # Clean all builds clean: @echo "Cleaning all modules..." @@ -193,6 +271,8 @@ help: @echo " all - Build all available modules" @echo " clean - Clean all build artifacts" @echo " test - Run all available tests" + @echo " debug - Build all modules with debug flags" + @echo " profile - Build all modules with profiling flags" @echo " help - Show this help message" @echo "" @echo "Module targets:" diff --git a/modules/module1/examples/Makefile b/modules/module1/examples/Makefile index ac6e4bb..799301a 100644 --- a/modules/module1/examples/Makefile +++ b/modules/module1/examples/Makefile @@ -185,12 +185,52 @@ debug: CUDA_FLAGS = $(CUDA_DEBUG_FLAGS) debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS) debug: all +# Profile builds with actual profiling +.PHONY: profile +profile: CUDA_FLAGS += -lineinfo +profile: HIP_FLAGS += -g +profile: all + @echo "Generating profile data..." + @mkdir -p $(PROFILE_DIR) +ifeq ($(BUILD_HIP),1) + @echo "Running HIP profiling..." + @for target in $(HIP_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename $$target).csv -- $$target 2>/dev/null || echo "rocprofv3 completed"; \ + fi; \ + done +endif +ifeq ($(BUILD_CUDA),1) + @echo "Running CUDA profiling..." + @for target in $(CUDA_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + nvprof --csv -o $(PROFILE_DIR)/$$(basename $$target).csv $$target 2>/dev/null || echo "nvprof completed"; \ + fi; \ + done +endif + @echo "Profile data saved to $(PROFILE_DIR)/" + @ls -la $(PROFILE_DIR)/ + # Clean .PHONY: clean clean: @echo "Cleaning build artifacts..." rm -rf $(BUILD_DIR) $(PROFILE_DIR) +# Test target - run built examples +.PHONY: test +test: all + @echo "Running Module 1 Tests..." + @for target in $(ALL_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Testing $$target..."; \ + $$target || echo "Test completed with exit code $$?"; \ + echo ""; \ + fi; \ + done + # Help .PHONY: help help: @@ -200,5 +240,7 @@ help: @echo " cuda - Build CUDA examples (requires NVIDIA GPU)" @echo " hip - Build HIP examples (requires AMD GPU)" @echo " debug - Build with debug flags" + @echo " profile - Build with profiling flags" + @echo " test - Run all built examples" @echo " clean - Remove build artifacts" @echo " help - Show this help message" \ No newline at end of file diff --git a/modules/module2/examples/Makefile b/modules/module2/examples/Makefile index c4a837b..fa5edf4 100644 --- a/modules/module2/examples/Makefile +++ b/modules/module2/examples/Makefile @@ -138,6 +138,34 @@ debug: CUDA_FLAGS = $(CUDA_DEBUG_FLAGS) debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS) debug: all +# Profile builds +.PHONY: profile +profile: CUDA_FLAGS += -lineinfo +profile: HIP_FLAGS += -g +profile: all + @echo "Generating profile data..." + @mkdir -p $(PROFILE_DIR) +ifeq ($(BUILD_HIP),1) + @echo "Running HIP profiling..." + @for target in $(HIP_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename $$target).csv -- $$target 2>/dev/null || echo "rocprofv3 completed"; \ + fi; \ + done +endif +ifeq ($(BUILD_CUDA),1) + @echo "Running CUDA profiling..." + @for target in $(CUDA_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + nvprof --csv -o $(PROFILE_DIR)/$$(basename $$target).csv $$target 2>/dev/null || echo "nvprof completed"; \ + fi; \ + done +endif + @echo "Profile data saved to $(PROFILE_DIR)/" + @ls -la $(PROFILE_DIR)/ + # Clean .PHONY: clean clean: @@ -174,15 +202,15 @@ test_cuda: cuda @if command -v nvidia-smi > /dev/null; then \ echo "=== Testing Advanced Memory Management Examples ==="; \ echo "1. Shared Memory Transpose..."; \ - ./01_shared_memory_transpose_cuda || echo "✗ Shared memory transpose failed"; \ + $(BUILD_DIR)/01_shared_memory_transpose_cuda || echo "✗ Shared memory transpose failed"; \ echo "2. Memory Coalescing Analysis..."; \ - ./02_memory_coalescing_cuda || echo "✗ Memory coalescing failed"; \ + $(BUILD_DIR)/02_memory_coalescing_cuda || echo "✗ Memory coalescing failed"; \ echo "3. Texture Memory Examples..."; \ - ./03_texture_memory_cuda || echo "✗ Texture memory failed"; \ + $(BUILD_DIR)/03_texture_memory_cuda || echo "✗ Texture memory failed"; \ echo "4. Unified Memory Examples..."; \ - ./04_unified_memory_cuda || echo "✗ Unified memory failed"; \ + $(BUILD_DIR)/04_unified_memory_cuda || echo "✗ Unified memory failed"; \ echo "5. Bandwidth Optimization..."; \ - ./05_memory_bandwidth_optimization_cuda || echo "✗ Bandwidth optimization failed"; \ + $(BUILD_DIR)/05_memory_bandwidth_optimization_cuda || echo "✗ Bandwidth optimization failed"; \ echo "✓ Module 2 CUDA tests completed"; \ else \ echo "No NVIDIA GPU detected, skipping CUDA tests"; \ @@ -193,9 +221,9 @@ test_hip: hip @if command -v rocm-smi > /dev/null || command -v nvidia-smi > /dev/null; then \ echo "=== Testing HIP Memory Examples ==="; \ echo "1. Shared Memory Transpose..."; \ - ./01_shared_memory_transpose_hip || echo "✗ HIP shared memory transpose failed"; \ + $(BUILD_DIR)/01_shared_memory_transpose_hip || echo "✗ HIP shared memory transpose failed"; \ echo "2. Memory Coalescing Analysis..."; \ - ./02_memory_coalescing_hip || echo "✗ HIP memory coalescing failed"; \ + $(BUILD_DIR)/02_memory_coalescing_hip || echo "✗ HIP memory coalescing failed"; \ echo "✓ Module 2 HIP tests completed"; \ else \ echo "No compatible GPU detected, skipping HIP tests"; \ diff --git a/modules/module3/examples/Makefile b/modules/module3/examples/Makefile index 14ae76b..81a1f24 100644 --- a/modules/module3/examples/Makefile +++ b/modules/module3/examples/Makefile @@ -54,6 +54,11 @@ endif # Add detected GPU architecture to HIP flags HIP_FLAGS += --offload-arch=$(GPU_ARCH) HIP_DEBUG_FLAGS += --offload-arch=$(GPU_ARCH) + +# ROCm library linking for advanced examples +HIP_LIB_DIR := $(ROCM_PATH)/lib +HIP_LDFLAGS := -L$(HIP_LIB_DIR) -lrocblas -Wl,-rpath,$(HIP_LIB_DIR) + CXX_FLAGS = -std=c++17 -O2 # Directories @@ -129,7 +134,7 @@ endif ifeq ($(BUILD_HIP),1) $(BUILD_DIR)/%_hip: $(EXAMPLES_DIR)/%_hip.cpp @echo "Building HIP example: $@" - $(HIPCC) $(HIP_FLAGS) $< -o $@ + $(HIPCC) $(HIP_FLAGS) $< -o $@ $(HIP_LDFLAGS) endif # Debug builds @@ -138,6 +143,32 @@ debug: CUDA_FLAGS = $(CUDA_DEBUG_FLAGS) debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS) debug: all +# Profile builds +.PHONY: profile +profile: CUDA_FLAGS += -lineinfo +profile: HIP_FLAGS += -g +profile: all + @echo "Generating profile data..." + @mkdir -p $(PROFILE_DIR) +ifeq ($(BUILD_HIP),1) + @echo "Running HIP profiling..." + @for target in $(HIP_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename $$target).csv -- $$target 2>/dev/null || echo "rocprofv3 completed"; \ + fi; \ + done +endif +ifeq ($(BUILD_CUDA),1) + @echo "Running CUDA profiling..." + @for target in $(CUDA_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + nvprof --csv -o $(PROFILE_DIR)/$$(basename $$target).csv $$target 2>/dev/null || echo "nvprof completed"; \ + fi; \ + done +endif + # Clean\n.PHONY: clean\nclean:\n\t@echo \"Cleaning build artifacts...\"\n\trm -rf $(BUILD_DIR) $(PROFILE_DIR) # Help @@ -172,19 +203,19 @@ test_cuda: cuda @if command -v nvidia-smi > /dev/null; then \ echo "=== Testing Advanced Algorithm Examples ==="; \ echo "1. Reduction Algorithms..."; \ - ./01_reduction_algorithms_cuda || echo "✗ Reduction algorithms failed"; \ + $(BUILD_DIR)/01_reduction_algorithms_cuda || echo "✗ Reduction algorithms failed"; \ echo "2. Scan (Prefix Sum)..."; \ - ./02_scan_prefix_sum_cuda || echo "✗ Scan algorithms failed"; \ + $(BUILD_DIR)/02_scan_prefix_sum_cuda || echo "✗ Scan algorithms failed"; \ echo "3. Sorting Algorithms..."; \ - ./03_sorting_algorithms_cuda || echo "✗ Sorting algorithms failed"; \ + $(BUILD_DIR)/03_sorting_algorithms_cuda || echo "✗ Sorting algorithms failed"; \ echo "4. Convolution/Stencil..."; \ - ./04_convolution_stencil_cuda || echo "✗ Convolution failed"; \ + $(BUILD_DIR)/04_convolution_stencil_cuda || echo "✗ Convolution failed"; \ echo "5. Matrix Operations..."; \ - ./05_matrix_operations_cuda || echo "✗ Matrix operations failed"; \ + $(BUILD_DIR)/05_matrix_operations_cuda || echo "✗ Matrix operations failed"; \ echo "6. Graph Algorithms..."; \ - ./06_graph_algorithms_cuda || echo "✗ Graph algorithms failed"; \ + $(BUILD_DIR)/06_graph_algorithms_cuda || echo "✗ Graph algorithms failed"; \ echo "7. Cooperative Groups..."; \ - ./07_cooperative_groups_cuda || echo "✗ Cooperative groups failed"; \ + $(BUILD_DIR)/07_cooperative_groups_cuda || echo "✗ Cooperative groups failed"; \ echo "✓ Module 3 CUDA tests completed"; \ else \ echo "No NVIDIA GPU detected, skipping CUDA tests"; \ @@ -195,13 +226,13 @@ test_hip: hip @if command -v rocm-smi > /dev/null || command -v nvidia-smi > /dev/null; then \ echo "=== Testing HIP Algorithm Examples ==="; \ echo "1. Reduction Algorithms..."; \ - ./01_reduction_algorithms_hip || echo "✗ HIP reduction algorithms failed"; \ + $(BUILD_DIR)/01_reduction_algorithms_hip || echo "✗ HIP reduction algorithms failed"; \ echo "2. Scan (Prefix Sum)..."; \ - ./02_scan_prefix_sum_hip || echo "✗ HIP scan algorithms failed"; \ + $(BUILD_DIR)/02_scan_prefix_sum_hip || echo "✗ HIP scan algorithms failed"; \ echo "3. Sorting Algorithms..."; \ - ./03_sorting_algorithms_hip || echo "✗ HIP sorting algorithms failed"; \ + $(BUILD_DIR)/03_sorting_algorithms_hip || echo "✗ HIP sorting algorithms failed"; \ echo "4. Convolution/Stencil..."; \ - ./04_convolution_stencil_hip || echo "✗ HIP convolution failed"; \ + $(BUILD_DIR)/04_convolution_stencil_hip || echo "✗ HIP convolution failed"; \ echo "✓ Module 3 HIP tests completed"; \ else \ echo "No compatible GPU detected, skipping HIP tests"; \ diff --git a/modules/module4/examples/Makefile b/modules/module4/examples/Makefile index 2b5ac2e..f020cbe 100644 --- a/modules/module4/examples/Makefile +++ b/modules/module4/examples/Makefile @@ -152,6 +152,32 @@ debug: CUDA_FLAGS = $(CUDA_DEBUG_FLAGS) debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS) debug: all +# Profile builds +.PHONY: profile +profile: CUDA_FLAGS += -lineinfo +profile: HIP_FLAGS += -g +profile: all + @echo "Generating profile data..." + @mkdir -p $(PROFILE_DIR) +ifeq ($(BUILD_HIP),1) + @echo "Running HIP profiling..." + @for target in $(HIP_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename $$target).csv -- $$target 2>/dev/null || echo "rocprofv3 completed"; \ + fi; \ + done +endif +ifeq ($(BUILD_CUDA),1) + @echo "Running CUDA profiling..." + @for target in $(CUDA_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + nvprof --csv -o $(PROFILE_DIR)/$$(basename $$target).csv $$target 2>/dev/null || echo "nvprof completed"; \ + fi; \ + done +endif + # Clean .PHONY: clean clean: @@ -228,15 +254,15 @@ test_cuda: cuda @if command -v nvidia-smi > /dev/null; then \ echo "=== Testing Advanced GPU Programming Examples ==="; \ echo "1. CUDA Streams..."; \ - ./01_cuda_streams_basics || echo "✗ CUDA Streams failed"; \ + $(BUILD_DIR)/01_cuda_streams_basics || echo "✗ CUDA Streams failed"; \ echo "2. Multi-GPU Programming..."; \ - ./02_multi_gpu_programming || echo "✗ Multi-GPU Programming failed"; \ + $(BUILD_DIR)/02_multi_gpu_programming || echo "✗ Multi-GPU Programming failed"; \ echo "3. Unified Memory..."; \ - ./03_unified_memory || echo "✗ Unified Memory failed"; \ + $(BUILD_DIR)/03_unified_memory || echo "✗ Unified Memory failed"; \ echo "4. Peer-to-Peer Communication..."; \ - ./04_peer_to_peer_communication || echo "✗ P2P Communication failed"; \ + $(BUILD_DIR)/04_peer_to_peer_communication || echo "✗ P2P Communication failed"; \ echo "5. Dynamic Parallelism (requires compute capability 3.5+)..."; \ - ./05_dynamic_parallelism || echo "✗ Dynamic Parallelism failed (may require newer GPU)"; \ + $(BUILD_DIR)/05_dynamic_parallelism || echo "✗ Dynamic Parallelism failed (may require newer GPU)"; \ echo "✓ Module 4 tests completed"; \ else \ echo "No NVIDIA GPU detected, skipping GPU tests"; \ @@ -248,13 +274,13 @@ test_hip: hip @if command -v rocm-smi > /dev/null 2>&1 || command -v nvidia-smi > /dev/null 2>&1; then \ echo "=== Testing HIP Advanced GPU Programming Examples ==="; \ echo "1. HIP Streams..."; \ - ./01_hip_streams_basics || echo "✗ HIP Streams failed"; \ + $(BUILD_DIR)/01_hip_streams_basics || echo "✗ HIP Streams failed"; \ echo "2. HIP Multi-GPU Programming..."; \ - ./02_hip_multi_gpu_programming || echo "✗ HIP Multi-GPU Programming failed"; \ + $(BUILD_DIR)/02_hip_multi_gpu_programming || echo "✗ HIP Multi-GPU Programming failed"; \ echo "3. HIP Unified Memory..."; \ - ./03_hip_unified_memory || echo "✗ HIP Unified Memory failed"; \ + $(BUILD_DIR)/03_hip_unified_memory || echo "✗ HIP Unified Memory failed"; \ echo "4. HIP Peer-to-Peer Communication..."; \ - ./04_hip_peer_to_peer_communication || echo "✗ HIP P2P Communication failed"; \ + $(BUILD_DIR)/04_hip_peer_to_peer_communication || echo "✗ HIP P2P Communication failed"; \ echo "✓ HIP Module 4 tests completed"; \ else \ echo "No GPU detected (ROCm or CUDA), skipping HIP tests"; \ diff --git a/modules/module5/examples/Makefile b/modules/module5/examples/Makefile index 5ccbc05..b495e4d 100644 --- a/modules/module5/examples/Makefile +++ b/modules/module5/examples/Makefile @@ -123,6 +123,32 @@ debug: CUDA_FLAGS = $(CUDA_DEBUG_FLAGS) debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS) debug: all +# Profile builds +.PHONY: profile +profile: CUDA_FLAGS += -lineinfo +profile: HIP_FLAGS += -g +profile: all + @echo "Generating profile data..." + @mkdir -p $(PROFILE_DIR) +ifeq ($(BUILD_HIP),1) + @echo "Running HIP profiling..." + @for target in $(HIP_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename $$target).csv -- $$target 2>/dev/null || echo "rocprofv3 completed"; \ + fi; \ + done +endif +ifeq ($(BUILD_CUDA),1) + @echo "Running CUDA profiling..." + @for target in $(CUDA_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + nvprof --csv -o $(PROFILE_DIR)/$$(basename $$target).csv $$target 2>/dev/null || echo "nvprof completed"; \ + fi; \ + done +endif + # Clean build artifacts .PHONY: clean clean: diff --git a/modules/module6/examples/03_histogram_cuda.cu b/modules/module6/examples/03_histogram_cuda.cu index 0168882..cf1ab24 100644 --- a/modules/module6/examples/03_histogram_cuda.cu +++ b/modules/module6/examples/03_histogram_cuda.cu @@ -128,7 +128,8 @@ __global__ void histogram_coarsened(unsigned char *input, int *histogram, int n) } /** - * Warp-aggregated histogram with intra-warp reduction + * Warp-aggregated histogram with simplified aggregation + * Optimized version that avoids expensive nested shuffle loops */ __global__ void histogram_warp_aggregated(unsigned char *input, int *histogram, int n) { extern __shared__ int private_hist[]; @@ -136,7 +137,6 @@ __global__ void histogram_warp_aggregated(unsigned char *input, int *histogram, int tid = threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x; int lane_id = threadIdx.x % 32; - // int warp_id = threadIdx.x / 32; // Unused, commented out // Initialize private histogram for (int bin = tid; bin < NUM_BINS; bin += blockDim.x) { @@ -144,32 +144,18 @@ __global__ void histogram_warp_aggregated(unsigned char *input, int *histogram, } __syncthreads(); - // Process input with warp aggregation + // Process input with simplified warp aggregation if (idx < n) { int bin = input[idx]; - // Count occurrences of this bin within the warp - int warp_count = 0; - for (int offset = 0; offset < 32; offset++) { - int other_bin = __shfl_sync(0xffffffff, bin, offset); - if (other_bin == bin) { - warp_count++; + // Use ballot to find threads with same bin value efficiently + unsigned int ballot = __ballot_sync(0xffffffff, true); + if (ballot != 0) { + // Only the first active lane updates the shared memory + if (lane_id == __ffs(ballot) - 1) { + atomicAdd(&private_hist[bin], 1); } } - - // Only first thread with this bin value updates the histogram - bool first_thread = true; - for (int offset = 0; offset < lane_id; offset++) { - int other_bin = __shfl_sync(0xffffffff, bin, offset); - if (other_bin == bin) { - first_thread = false; - break; - } - } - - if (first_thread) { - atomicAdd(&private_hist[bin], warp_count); - } } __syncthreads(); @@ -183,43 +169,10 @@ __global__ void histogram_warp_aggregated(unsigned char *input, int *histogram, /** * Optimized warp-aggregated histogram using ballot and popc + * Note: This implementation has been removed due to performance issues + * with the nested loop over all bins. The warp aggregation approach + * works better with selective bin processing rather than exhaustive search. */ -__global__ void histogram_warp_optimized(unsigned char *input, int *histogram, int n) { - extern __shared__ int private_hist[]; - - int tid = threadIdx.x; - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = threadIdx.x % 32; - - // Initialize private histogram - for (int bin = tid; bin < NUM_BINS; bin += blockDim.x) { - private_hist[bin] = 0; - } - __syncthreads(); - - // Process input with optimized warp aggregation - if (idx < n) { - int bin = input[idx]; - - // Use ballot to find threads with same bin value - for (int target_bin = 0; target_bin < NUM_BINS; target_bin++) { - unsigned int ballot = __ballot_sync(0xffffffff, bin == target_bin); - int count = __popc(ballot); - - if (count > 0 && lane_id == __ffs(ballot) - 1) { - atomicAdd(&private_hist[target_bin], count); - } - } - } - __syncthreads(); - - // Merge private histogram to global histogram - for (int bin = tid; bin < NUM_BINS; bin += blockDim.x) { - if (private_hist[bin] > 0) { - atomicAdd(&histogram[bin], private_hist[bin]); - } - } -} /** * Multi-pass histogram for very large datasets @@ -369,7 +322,7 @@ void benchmark_histogram(const char* distribution_name, printf("=== %s Distribution Histogram Benchmark ===\n", distribution_name); const int n = 16 * 1024 * 1024; // 16M elements - const int num_iterations = 100; + const int num_iterations = 10; // Reduced from 100 to 10 for faster testing // Allocate host memory unsigned char *h_input = new unsigned char[n]; diff --git a/modules/module6/examples/Makefile b/modules/module6/examples/Makefile index f7381a7..393c985 100644 --- a/modules/module6/examples/Makefile +++ b/modules/module6/examples/Makefile @@ -121,6 +121,32 @@ debug: CUDA_FLAGS = $(CUDA_DEBUG_FLAGS) debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS) debug: all +# Profile builds +.PHONY: profile +profile: CUDA_FLAGS += -lineinfo +profile: HIP_FLAGS += -g +profile: all + @echo "Generating profile data..." + @mkdir -p $(PROFILE_DIR) +ifeq ($(BUILD_HIP),1) + @echo "Running HIP profiling..." + @for target in $(HIP_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename $$target).csv -- $$target 2>/dev/null || echo "rocprofv3 completed"; \ + fi; \ + done +endif +ifeq ($(BUILD_CUDA),1) + @echo "Running CUDA profiling..." + @for target in $(CUDA_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + nvprof --csv -o $(PROFILE_DIR)/$$(basename $$target).csv $$target 2>/dev/null || echo "nvprof completed"; \ + fi; \ + done +endif + # Algorithm-specific targets .PHONY: convolution convolution: setup diff --git a/modules/module7/examples/Makefile b/modules/module7/examples/Makefile index 63f0110..95c778c 100644 --- a/modules/module7/examples/Makefile +++ b/modules/module7/examples/Makefile @@ -129,6 +129,32 @@ debug: CUDA_FLAGS = $(CUDA_DEBUG_FLAGS) debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS) debug: all +# Profile builds +.PHONY: profile +profile: CUDA_FLAGS += -lineinfo +profile: HIP_FLAGS += -g +profile: all + @echo "Generating profile data..." + @mkdir -p $(PROFILE_DIR) +ifeq ($(BUILD_HIP),1) + @echo "Running HIP profiling..." + @for target in $(HIP_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename $$target).csv -- $$target 2>/dev/null || echo "rocprofv3 completed"; \ + fi; \ + done +endif +ifeq ($(BUILD_CUDA),1) + @echo "Running CUDA profiling..." + @for target in $(CUDA_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + nvprof --csv -o $(PROFILE_DIR)/$$(basename $$target).csv $$target 2>/dev/null || echo "nvprof completed"; \ + fi; \ + done +endif + # Algorithm-specific targets .PHONY: sorting sorting: setup diff --git a/modules/module8/examples/Makefile b/modules/module8/examples/Makefile index d68b85f..0ef37ab 100644 --- a/modules/module8/examples/Makefile +++ b/modules/module8/examples/Makefile @@ -186,6 +186,32 @@ debug: CUDA_FLAGS = $(CUDA_DEBUG_FLAGS) debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS) debug: all +# Profile builds +.PHONY: profile +profile: CUDA_FLAGS += -lineinfo +profile: HIP_FLAGS += -g +profile: all + @echo "Generating profile data..." + @mkdir -p $(PROFILE_DIR) +ifeq ($(BUILD_HIP),1) + @echo "Running HIP profiling..." + @for target in $(HIP_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename $$target).csv -- $$target 2>/dev/null || echo "rocprofv3 completed"; \ + fi; \ + done +endif +ifeq ($(BUILD_CUDA),1) + @echo "Running CUDA profiling..." + @for target in $(CUDA_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + nvprof --csv -o $(PROFILE_DIR)/$$(basename $$target).csv $$target 2>/dev/null || echo "nvprof completed"; \ + fi; \ + done +endif + # Professional builds with maximum optimization .PHONY: production production: CUDA_FLAGS += -DNDEBUG -Xptxas -O3 diff --git a/modules/module9/examples/Makefile b/modules/module9/examples/Makefile index 32bb553..8fa458a 100644 --- a/modules/module9/examples/Makefile +++ b/modules/module9/examples/Makefile @@ -200,6 +200,33 @@ debug: HIP_FLAGS = $(HIP_DEBUG_FLAGS) debug: CXX_FLAGS = -std=c++17 -g -DDEBUG_BUILD debug: all +# Profile builds +.PHONY: profile +profile: CUDA_FLAGS += -lineinfo +profile: HIP_FLAGS += -g +profile: CXX_FLAGS += -g -pg +profile: all + @echo "Generating profile data..." + @mkdir -p $(PROFILE_DIR) +ifeq ($(BUILD_HIP),1) + @echo "Running HIP profiling..." + @for target in $(HIP_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + rocprofv3 --runtime-trace --output-format csv -d $(PROFILE_DIR) -o $$(basename $$target).csv -- $$target 2>/dev/null || echo "rocprofv3 completed"; \ + fi; \ + done +endif +ifeq ($(BUILD_CUDA),1) + @echo "Running CUDA profiling..." + @for target in $(CUDA_TARGETS); do \ + if [ -f $$target ]; then \ + echo "Profiling $$target..."; \ + nvprof --csv -o $(PROFILE_DIR)/$$(basename $$target).csv $$target 2>/dev/null || echo "nvprof completed"; \ + fi; \ + done +endif + # Production builds with security hardening .PHONY: production production: CUDA_FLAGS += -DNDEBUG -Xptxas -O3 -DSECURITY_HARDENED