From b7b7a0a48a7bb810c52e8d2e08f232d9c7c83d7e Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Sat, 13 Sep 2025 00:12:44 -0400 Subject: [PATCH 1/5] Update docker compose --- docker/scripts/build.sh | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/docker/scripts/build.sh b/docker/scripts/build.sh index a688315..9ef8e25 100755 --- a/docker/scripts/build.sh +++ b/docker/scripts/build.sh @@ -42,7 +42,8 @@ check_requirements() { exit 1 fi - if ! command -v docker-compose &> /dev/null; then + # Check for Docker Compose (v2 with 'docker compose' or legacy v1 with 'docker-compose') + if ! docker compose version &> /dev/null && ! command -v docker-compose &> /dev/null; then error "Docker Compose is not installed. Please install Docker Compose first." exit 1 fi @@ -77,18 +78,19 @@ build_platform() { fi } -# Build using docker-compose +# Build using docker compose (v2) or docker-compose (v1) build_compose() { local service=$1 - log "Building $service using docker-compose..." + log "Building $service using docker compose..." 
cd "$DOCKER_DIR" - if docker-compose build "$service"; then - success "$service built successfully using docker-compose" + # Try docker compose (v2) first, then fall back to docker-compose (v1) + if docker compose build "$service" 2>/dev/null || docker-compose build "$service" 2>/dev/null; then + success "$service built successfully using docker compose" return 0 else - error "Failed to build $service using docker-compose" + error "Failed to build $service using docker compose" return 1 fi } @@ -103,7 +105,7 @@ Usage: $0 [OPTIONS] [PLATFORM] Options: -h, --help Show this help message -a, --all Build all platforms (CUDA + ROCm) - -c, --compose Use docker-compose for building + -c, --compose Use docker compose for building --clean Clean existing images before building --no-cache Build without using Docker cache --pull Pull base images before building @@ -116,7 +118,7 @@ Examples: $0 cuda Build CUDA container only $0 rocm Build ROCm container only $0 --all Build both CUDA and ROCm containers - $0 --compose cuda Build using docker-compose + $0 --compose cuda Build using docker compose $0 --clean --all Clean and rebuild all containers EOF @@ -246,7 +248,7 @@ main() { log "Next steps:" log " - Run: ./docker/scripts/run.sh cuda (for NVIDIA GPUs)" log " - Run: ./docker/scripts/run.sh rocm (for AMD GPUs)" - log " - Or use: docker-compose -f docker/docker-compose.yml up cuda-dev" + log " - Or use: docker compose -f docker/docker-compose.yml up cuda-dev" else warning "Some builds failed. Check the logs above for details." 
exit 1 From 9fd0cf8b21e86a9e2800b286578817cb52ead343 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Sat, 13 Sep 2025 01:03:18 -0400 Subject: [PATCH 2/5] Fixed the GPU docker of Nvidia --- docker/cuda/Dockerfile | 184 +++++++++++++++++++---------------------- docker/scripts/run.sh | 90 ++++++++++++-------- 2 files changed, 140 insertions(+), 134 deletions(-) diff --git a/docker/cuda/Dockerfile b/docker/cuda/Dockerfile index edfd6ec..e320889 100644 --- a/docker/cuda/Dockerfile +++ b/docker/cuda/Dockerfile @@ -13,9 +13,9 @@ LABEL ubuntu.version="22.04" # Avoid interactive prompts during package installation ARG DEBIAN_FRONTEND=noninteractive -# Install essential development tools +# Install essential development tools for GPU programming RUN apt-get update && apt-get install -y \ - # Basic development tools + # Core development tools build-essential \ cmake \ git \ @@ -25,49 +25,33 @@ RUN apt-get update && apt-get install -y \ nano \ htop \ tree \ - # Python development + # Minimal Python for basic scripting (not data science) python3 \ python3-pip \ python3-dev \ # Additional utilities pkg-config \ software-properties-common \ - apt-transport-https \ - ca-certificates \ - gnupg \ - lsb-release \ - # GPU monitoring tools + # GPU monitoring tools (installed but won't work during build) nvidia-utils-535 \ # Debugging and profiling tools gdb \ valgrind \ strace \ - # Network tools for downloading samples + # Network tools net-tools \ iputils-ping \ && rm -rf /var/lib/apt/lists/* -# Install NVIDIA profiling tools (Nsight Systems, Compute) - Latest 2025 versions -RUN apt-get update && apt-get install -y \ - nsight-systems-2025.1.1 \ - nsight-compute-2025.1.1 \ - && rm -rf /var/lib/apt/lists/* || \ - # Fallback to 2024 versions if 2025 not available yet - (apt-get update && apt-get install -y \ - nsight-systems-2024.6.1 \ - nsight-compute-2024.3.1 \ - && rm -rf /var/lib/apt/lists/*) - -# Install Python packages for data analysis and visualization +# Install optional CUDA 
tools if available +RUN apt-get update && \ + (apt-get install -y cuda-tools-12-9 || apt-get install -y cuda-tools || true) && \ + rm -rf /var/lib/apt/lists/* + +# Install minimal Python packages for basic development (no heavy data science libs) RUN pip3 install --no-cache-dir \ numpy \ - matplotlib \ - seaborn \ - pandas \ - jupyter \ - jupyterlab \ - plotly \ - scipy + matplotlib # Set up CUDA environment variables ENV PATH=/usr/local/cuda/bin:${PATH} @@ -78,8 +62,8 @@ ENV CUDA_VERSION=12.9.1 ENV NVIDIA_VISIBLE_DEVICES=all ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility -# Verify CUDA installation -RUN nvcc --version && nvidia-smi +# Verify CUDA compiler installation (skip nvidia-smi as no GPU during build) +RUN nvcc --version # Create development workspace WORKDIR /workspace @@ -98,85 +82,85 @@ RUN echo 'alias ll="ls -alF"' >> /root/.bashrc && \ echo 'export PS1="\[\e[1;32m\][CUDA-DEV]\[\e[0m\] \w $ "' >> /root/.bashrc # Create a simple GPU test script -RUN cat > /workspace/test-gpu.sh << 'EOF' -#!/bin/bash -echo "=== GPU Programming 101 - CUDA Environment Test ===" -echo "Date: $(date)" -echo "" - -echo "=== CUDA Compiler ===" -nvcc --version -echo "" - -echo "=== GPU Information ===" -nvidia-smi --query-gpu=name,memory.total,compute_cap,driver_version --format=csv -echo "" - -echo "=== CUDA Samples Test ===" -if [ -d "/usr/local/cuda/samples" ]; then - echo "CUDA samples directory found" -else - echo "CUDA samples not found - this is normal for newer CUDA versions" -fi - -echo "=== Environment Variables ===" -echo "CUDA_HOME: $CUDA_HOME" -echo "PATH: $PATH" -echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH" -echo "" - -echo "=== Build Test ===" -cd /tmp -cat > test.cu << 'CUDA_EOF' -#include -#include - -__global__ void hello() { - printf("Hello from GPU thread %d!\n", threadIdx.x); -} - -int main() { - printf("CUDA Test Program\n"); - hello<<<1, 5>>>(); - cudaDeviceSynchronize(); - printf("GPU kernel completed!\n"); - return 0; -} -CUDA_EOF - -echo "Compiling test 
CUDA program..." -if nvcc -o test test.cu; then - echo "✓ Compilation successful" - echo "Running test program:" - ./test - echo "✓ CUDA environment is working correctly!" -else - echo "✗ Compilation failed" - exit 1 -fi - -rm -f test test.cu -echo "" -echo "=== All tests completed ===" -EOF +RUN printf '#!/bin/bash\n\ +echo "=== GPU Programming 101 - CUDA Environment Test ==="\n\ +echo "Date: $(date)"\n\ +echo ""\n\ +\n\ +echo "=== CUDA Compiler ==="\n\ +nvcc --version\n\ +echo ""\n\ +\n\ +echo "=== GPU Information ==="\n\ +if nvidia-smi --query-gpu=name,memory.total,compute_cap,driver_version --format=csv 2>/dev/null; then\n\ + echo "GPU detected successfully"\n\ +else\n\ + echo "No GPU detected or nvidia-smi not available"\n\ +fi\n\ +echo ""\n\ +\n\ +echo "=== Environment Variables ==="\n\ +echo "CUDA_HOME: $CUDA_HOME"\n\ +echo "PATH: $PATH"\n\ +echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"\n\ +echo ""\n\ +\n\ +echo "=== Build Test ==="\n\ +cd /tmp\n\ +cat > test.cu << '"'"'CUDA_EOF'"'"'\n\ +#include \n\ +#include \n\ +\n\ +__global__ void hello() {\n\ + printf("Hello from GPU thread %%d!\\n", threadIdx.x);\n\ +}\n\ +\n\ +int main() {\n\ + printf("CUDA Test Program\\n");\n\ + \n\ + int deviceCount;\n\ + cudaError_t error = cudaGetDeviceCount(&deviceCount);\n\ + \n\ + if (error != cudaSuccess) {\n\ + printf("CUDA Error: %%s\\n", cudaGetErrorString(error));\n\ + printf("No CUDA-capable devices found\\n");\n\ + return 0;\n\ + }\n\ + \n\ + printf("Found %%d CUDA device(s)\\n", deviceCount);\n\ + hello<<<1, 5>>>();\n\ + cudaDeviceSynchronize();\n\ + printf("GPU kernel completed!\\n");\n\ + return 0;\n\ +}\n\ +CUDA_EOF\n\ +\n\ +echo "Compiling test CUDA program..."\n\ +if nvcc -o test test.cu; then\n\ + echo "✓ Compilation successful"\n\ + echo "Running test program:"\n\ + ./test\n\ + echo "✓ CUDA environment is working correctly!"\n\ +else\n\ + echo "✗ Compilation failed"\n\ + exit 1\n\ +fi\n\ +\n\ +rm -f test test.cu\n\ +echo ""\n\ +echo "=== All tests completed ==="\n' 
> /workspace/test-gpu.sh RUN chmod +x /workspace/test-gpu.sh -# Install additional CUDA samples and utilities +# Install CUDA samples for learning and reference RUN cd /workspace && \ git clone https://github.com/NVIDIA/cuda-samples.git && \ cd cuda-samples && \ git checkout v12.9 -# Create jupyter kernel for CUDA (for notebooks) -RUN python3 -m ipykernel install --name cuda-kernel --display-name "CUDA Python" - -# Expose Jupyter port -EXPOSE 8888 - # Default command CMD ["/bin/bash"] -# Health check to verify GPU access +# Health check to verify GPU access (will only work when GPU is available) HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD nvidia-smi > /dev/null 2>&1 || exit 1 \ No newline at end of file + CMD nvcc --version > /dev/null 2>&1 || exit 1 \ No newline at end of file diff --git a/docker/scripts/run.sh b/docker/scripts/run.sh index 509f0f8..2b56085 100755 --- a/docker/scripts/run.sh +++ b/docker/scripts/run.sh @@ -36,7 +36,7 @@ warning() { # Check if image exists check_image() { local image_name=$1 - if ! docker images "$image_name" | grep -q "$image_name"; then + if ! docker image inspect "$image_name" >/dev/null 2>&1; then error "Image $image_name not found. 
Please build it first using:" error " ./docker/scripts/build.sh" return 1 @@ -48,20 +48,26 @@ check_image() { detect_gpu() { local gpu_type="none" - # Check for NVIDIA - if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then + # Check for NVIDIA GPU presence (multiple methods) + if command -v nvidia-smi &> /dev/null; then gpu_type="nvidia" - log "NVIDIA GPU detected" + log "NVIDIA GPU tools detected (nvidia-smi found)" >&2 + elif [ -c /dev/nvidia0 ] || [ -c /dev/nvidiactl ]; then + gpu_type="nvidia" + log "NVIDIA GPU devices detected (/dev/nvidia*)" >&2 + elif lspci 2>/dev/null | grep -i nvidia &> /dev/null; then + gpu_type="nvidia" + log "NVIDIA GPU detected via lspci" >&2 # Check for AMD elif command -v rocm-smi &> /dev/null && rocm-smi &> /dev/null; then gpu_type="amd" - log "AMD GPU detected" + log "AMD GPU detected" >&2 elif [ -c /dev/kfd ] && [ -c /dev/dri/renderD128 ]; then gpu_type="amd" - log "AMD GPU devices detected" + log "AMD GPU devices detected" >&2 else - warning "No GPU detected or GPU tools not available" - warning "Container will run in CPU-only mode" + warning "No GPU detected or GPU tools not available" >&2 + warning "Container will run in CPU-only mode" >&2 fi echo "$gpu_type" @@ -74,10 +80,15 @@ run_cuda() { local gpu_args="" local ports_args="-p 8888:8888" local extra_args=() + local no_gpu_requested=false # Parse additional arguments while [[ $# -gt 0 ]]; do case $1 in + -h|--help) + show_usage + exit 0 + ;; --name) container_name="$2" shift 2 @@ -87,19 +98,13 @@ run_cuda() { shift 2 ;; --no-gpu) - gpu_args="" + no_gpu_requested=true shift ;; --detach|-d) extra_args+=("--detach") shift ;; - --jupyter) - extra_args+=("--detach") - ports_args="-p 8888:8888" - log "Starting Jupyter Lab on http://localhost:8888" - shift - ;; *) extra_args+=("$1") shift @@ -112,8 +117,17 @@ run_cuda() { fi # Set up GPU access for NVIDIA - if [ "$(detect_gpu)" = "nvidia" ] && [ -z "${gpu_args+x}" ]; then + local detected_gpu=$(detect_gpu) + log 
"Detected GPU type: $detected_gpu" + log "No GPU requested: $no_gpu_requested" + + if [ "$detected_gpu" = "nvidia" ] && [ "$no_gpu_requested" = false ]; then gpu_args="--gpus all" + log "Enabling NVIDIA GPU access" + elif [ "$no_gpu_requested" = true ]; then + log "GPU access explicitly disabled with --no-gpu" + else + log "GPU access disabled (no NVIDIA GPU detected or other reason)" fi # Remove existing container if it exists @@ -133,12 +147,14 @@ run_cuda() { --name "$container_name" --hostname "cuda-dev" -it - "$ports_args" -v "$PROJECT_ROOT:/workspace/gpu-programming-101:rw" -v "gpu101-cuda-home:/root" -w "/workspace/gpu-programming-101" ) + # Add port mapping + cmd+=($ports_args) + # Add GPU args if available if [ -n "$gpu_args" ]; then cmd+=($gpu_args) @@ -166,10 +182,15 @@ run_rocm() { local gpu_args="" local ports_args="-p 8889:8888" local extra_args=() + local no_gpu_requested=false # Parse additional arguments while [[ $# -gt 0 ]]; do case $1 in + -h|--help) + show_usage + exit 0 + ;; --name) container_name="$2" shift 2 @@ -179,19 +200,13 @@ run_rocm() { shift 2 ;; --no-gpu) - gpu_args="" + no_gpu_requested=true shift ;; --detach|-d) extra_args+=("--detach") shift ;; - --jupyter) - extra_args+=("--detach") - ports_args="-p 8889:8888" - log "Starting Jupyter Lab on http://localhost:8889" - shift - ;; *) extra_args+=("$1") shift @@ -204,8 +219,12 @@ run_rocm() { fi # Set up GPU access for AMD - if [ "$(detect_gpu)" = "amd" ] && [ -z "${gpu_args+x}" ]; then + local detected_gpu=$(detect_gpu) + if [ "$detected_gpu" = "amd" ] && [ "$no_gpu_requested" = false ]; then gpu_args="--device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined" + log "Enabling AMD GPU access" + elif [ "$no_gpu_requested" = true ]; then + log "GPU access explicitly disabled with --no-gpu" fi # Remove existing container if it exists @@ -225,7 +244,6 @@ run_rocm() { --name "$container_name" --hostname "rocm-dev" -it - "$ports_args" -v 
"$PROJECT_ROOT:/workspace/gpu-programming-101:rw" -v "gpu101-rocm-home:/root" -w "/workspace/gpu-programming-101" @@ -233,6 +251,9 @@ run_rocm() { -e HSA_OVERRIDE_GFX_VERSION=10.3.0 ) + # Add port mapping + cmd+=($ports_args) + # Add GPU args if available if [ -n "$gpu_args" ]; then cmd+=($gpu_args) @@ -294,29 +315,30 @@ Platforms: Options: -h, --help Show this help message --name NAME Set custom container name - --port PORT Map Jupyter to custom port (default: 8888 for CUDA, 8889 for ROCm) + --port PORT Map port to host (default: 8888 for CUDA, 8889 for ROCm) --no-gpu Disable GPU access (CPU-only mode) --detach, -d Run in background (detached mode) - --jupyter Start in detached mode with Jupyter Lab --auto Auto-detect GPU and run appropriate container Examples: $0 cuda Run CUDA container interactively $0 rocm Run ROCm container interactively - $0 cuda --jupyter Start CUDA container with Jupyter Lab - $0 rocm --detach Run ROCm container in background + $0 cuda --detach Run CUDA container in background + $0 rocm --no-gpu Run ROCm container in CPU-only mode $0 --auto Auto-detect GPU type and run appropriate container $0 compose cuda-dev Run using docker-compose -Jupyter Access: - CUDA container: http://localhost:8888 - ROCm container: http://localhost:8889 - Container Management: List containers: docker ps -a Stop container: docker stop gpu101-cuda-dev Remove container: docker rm gpu101-cuda-dev Container logs: docker logs gpu101-cuda-dev + Enter container: docker exec -it gpu101-cuda-dev bash + +GPU Programming Setup: + Inside container: /workspace/test-gpu.sh # Test GPU environment + Build examples: cd modules/module1/examples && make + CUDA samples: cd /workspace/cuda-samples EOF } From 6c7d8569e266889b062c23db6a802a33bee78724 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Sat, 13 Sep 2025 08:53:57 -0400 Subject: [PATCH 3/5] Fixed the Dockerfile for AMD ROCm --- docker/rocm/Dockerfile | 261 +++++++++++++---------------------------- docker/scripts/run.sh | 21 
+++- 2 files changed, 98 insertions(+), 184 deletions(-) diff --git a/docker/rocm/Dockerfile b/docker/rocm/Dockerfile index 085f8b9..21de9c0 100644 --- a/docker/rocm/Dockerfile +++ b/docker/rocm/Dockerfile @@ -13,9 +13,9 @@ LABEL ubuntu.version="22.04" # Avoid interactive prompts during package installation ARG DEBIAN_FRONTEND=noninteractive -# Update and install essential development tools +# Install essential development tools for GPU programming RUN apt-get update && apt-get install -y \ - # Basic development tools + # Core development tools build-essential \ cmake \ git \ @@ -25,17 +25,13 @@ RUN apt-get update && apt-get install -y \ nano \ htop \ tree \ - # Python development + # Minimal Python for basic scripting (not data science) python3 \ python3-pip \ python3-dev \ # Additional utilities pkg-config \ software-properties-common \ - apt-transport-https \ - ca-certificates \ - gnupg \ - lsb-release \ # Debugging and profiling tools gdb \ valgrind \ @@ -45,35 +41,21 @@ RUN apt-get update && apt-get install -y \ iputils-ping \ && rm -rf /var/lib/apt/lists/* -# Install ROCm development packages +# Install core ROCm development packages (keep minimal) RUN apt-get update && apt-get install -y \ - # Core ROCm packages - rocm-dev \ - rocm-libs \ + # Core ROCm packages for GPU programming hip-dev \ hip-samples \ hipblas-dev \ - hipfft-dev \ - hipsparse-dev \ - # ROCm profiling and debugging tools + # ROCm profiling tools (essential for performance work) rocprofiler-dev \ roctracer-dev \ - roctx \ - # Additional ROCm libraries - rocrand-dev \ - rocthrust-dev \ && rm -rf /var/lib/apt/lists/* -# Install Python packages for data analysis and visualization +# Install minimal Python packages for basic development (no heavy data science libs) RUN pip3 install --no-cache-dir \ numpy \ - matplotlib \ - seaborn \ - pandas \ - jupyter \ - jupyterlab \ - plotly \ - scipy + matplotlib # Set up ROCm environment variables ENV ROCM_PATH=/opt/rocm @@ -84,11 +66,8 @@ ENV 
HIP_PLATFORM=amd ENV HSA_OVERRIDE_GFX_VERSION=11.0.0 ENV ROCM_VERSION=6.4.3 -# Add ROCm binaries to PATH -ENV PATH=/opt/rocm/bin:/opt/rocm/hip/bin:${PATH} - -# Verify ROCm installation -RUN hipcc --version && rocminfo > /dev/null 2>&1 || echo "ROCm info check completed (may fail without GPU)" +# Verify HIP compiler installation (skip rocminfo as no GPU during build) +RUN hipcc --version # Create development workspace WORKDIR /workspace @@ -107,127 +86,80 @@ RUN echo 'alias ll="ls -alF"' >> /root/.bashrc && \ echo 'alias rocm-info="rocminfo"' >> /root/.bashrc && \ echo 'export PS1="\[\e[1;34m\][ROCm-DEV]\[\e[0m\] \w $ "' >> /root/.bashrc -# Create a comprehensive GPU test script -RUN cat > /workspace/test-gpu.sh << 'EOF' -#!/bin/bash -echo "=== GPU Programming 101 - ROCm Environment Test ===" -echo "Date: $(date)" -echo "" - -echo "=== HIP Compiler ===" -hipcc --version -echo "" - -echo "=== ROCm Version ===" -if command -v rocminfo > /dev/null 2>&1; then - rocminfo | head -20 -else - echo "rocminfo command not available" -fi -echo "" - -echo "=== GPU Information ===" -if command -v rocm-smi > /dev/null 2>&1; then - rocm-smi --showproductname --showmeminfo vram || echo "No AMD GPU detected or accessible" -else - echo "rocm-smi not available" -fi -echo "" - -echo "=== Environment Variables ===" -echo "ROCM_PATH: $ROCM_PATH" -echo "HIP_PATH: $HIP_PATH" -echo "HIP_PLATFORM: $HIP_PLATFORM" -echo "PATH: $PATH" -echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH" -echo "" - -echo "=== HIP Platform Detection ===" -cat > /tmp/platform_test.cpp << 'HIP_EOF' -#include -#include - -int main() { - int deviceCount; - hipError_t error = hipGetDeviceCount(&deviceCount); - - if (error != hipSuccess) { - std::cout << "HIP Error: " << hipGetErrorString(error) << std::endl; - std::cout << "This may be normal if no GPU is available" << std::endl; - } else { - std::cout << "Number of HIP devices: " << deviceCount << std::endl; - - for (int i = 0; i < deviceCount; i++) { - hipDeviceProp_t props; - 
hipGetDeviceProperties(&props, i); - std::cout << "Device " << i << ": " << props.name << std::endl; - } - } - - return 0; -} -HIP_EOF - -echo "Compiling platform detection test..." -if hipcc -o /tmp/platform_test /tmp/platform_test.cpp; then - echo "✓ Compilation successful" - echo "Running platform test:" - /tmp/platform_test -else - echo "✗ Platform test compilation failed" -fi - -echo "" -echo "=== Build Test ===" -cd /tmp -cat > test.hip.cpp << 'HIP_EOF' -#include -#include - -__global__ void hello() { - printf("Hello from HIP thread %d!\n", hipThreadIdx_x); -} - -int main() { - printf("HIP Test Program\n"); - - // Check for HIP devices - int deviceCount; - hipError_t error = hipGetDeviceCount(&deviceCount); - - if (error == hipSuccess && deviceCount > 0) { - printf("Found %d HIP device(s)\n", deviceCount); - hello<<<1, 5>>>(); - hipDeviceSynchronize(); - printf("GPU kernel completed!\n"); - } else { - printf("No HIP devices found or error: %s\n", hipGetErrorString(error)); - printf("This is normal when running without GPU access\n"); - } - - return 0; -} -HIP_EOF - -echo "Compiling test HIP program..." -if hipcc -o test test.hip.cpp; then - echo "✓ Compilation successful" - echo "Running test program:" - ./test - echo "✓ HIP environment is working correctly!" 
-else - echo "✗ Compilation failed" - exit 1 -fi - -rm -f test test.hip.cpp platform_test platform_test.cpp -echo "" -echo "=== All tests completed ===" -EOF +# Create a simple GPU test script +RUN printf '#!/bin/bash\n\ +echo "=== GPU Programming 101 - ROCm Environment Test ==="\n\ +echo "Date: $(date)"\n\ +echo ""\n\ +\n\ +echo "=== HIP Compiler ==="\n\ +hipcc --version\n\ +echo ""\n\ +\n\ +echo "=== GPU Information ==="\n\ +if rocm-smi --showproductname --showmeminfo vram 2>/dev/null; then\n\ + echo "AMD GPU detected successfully"\n\ +else\n\ + echo "No AMD GPU detected or rocm-smi not available"\n\ +fi\n\ +echo ""\n\ +\n\ +echo "=== Environment Variables ==="\n\ +echo "ROCM_PATH: $ROCM_PATH"\n\ +echo "HIP_PATH: $HIP_PATH"\n\ +echo "HIP_PLATFORM: $HIP_PLATFORM"\n\ +echo "PATH: $PATH"\n\ +echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"\n\ +echo ""\n\ +\n\ +echo "=== Build Test ==="\n\ +cd /tmp\n\ +cat > test.hip.cpp << '"'"'HIP_EOF'"'"'\n\ +#include \n\ +#include \n\ +\n\ +__global__ void hello() {\n\ + printf("Hello from HIP thread %%d!\\n", hipThreadIdx_x);\n\ +}\n\ +\n\ +int main() {\n\ + printf("HIP Test Program\\n");\n\ + \n\ + int deviceCount;\n\ + hipError_t error = hipGetDeviceCount(&deviceCount);\n\ + \n\ + if (error != hipSuccess) {\n\ + printf("HIP Error: %%s\\n", hipGetErrorString(error));\n\ + printf("No HIP-capable devices found\\n");\n\ + return 0;\n\ + }\n\ + \n\ + printf("Found %%d HIP device(s)\\n", deviceCount);\n\ + hello<<<1, 5>>>();\n\ + hipDeviceSynchronize();\n\ + printf("GPU kernel completed!\\n");\n\ + return 0;\n\ +}\n\ +HIP_EOF\n\ +\n\ +echo "Compiling test HIP program..."\n\ +if hipcc -o test test.hip.cpp; then\n\ + echo "✓ Compilation successful"\n\ + echo "Running test program:"\n\ + ./test\n\ + echo "✓ HIP environment is working correctly!"\n\ +else\n\ + echo "✗ Compilation failed"\n\ + exit 1\n\ +fi\n\ +\n\ +rm -f test test.hip.cpp\n\ +echo ""\n\ +echo "=== All tests completed ==="\n' > /workspace/test-gpu.sh RUN chmod +x 
/workspace/test-gpu.sh -# Install HIP samples +# Install HIP samples for learning and reference RUN cd /workspace && \ if [ -d "/opt/rocm/hip/samples" ]; then \ cp -r /opt/rocm/hip/samples ./hip-samples; \ @@ -235,36 +167,9 @@ RUN cd /workspace && \ git clone https://github.com/ROCm-Developer-Tools/HIP-Examples.git hip-examples; \ fi -# Create jupyter kernel for HIP (for notebooks) -RUN python3 -m ipykernel install --name hip-kernel --display-name "HIP Python" - -# Set up HIP for both AMD and NVIDIA compatibility -RUN cat > /workspace/setup-hip-nvidia.sh << 'EOF' -#!/bin/bash -# Switch HIP to NVIDIA backend (for systems with NVIDIA GPUs) -export HIP_PLATFORM=nvidia -export HIP_COMPILER=nvcc -echo "HIP configured for NVIDIA backend" -echo "HIP_PLATFORM=$HIP_PLATFORM" -EOF - -RUN cat > /workspace/setup-hip-amd.sh << 'EOF' -#!/bin/bash -# Switch HIP to AMD backend (default) -export HIP_PLATFORM=amd -unset HIP_COMPILER -echo "HIP configured for AMD backend" -echo "HIP_PLATFORM=$HIP_PLATFORM" -EOF - -RUN chmod +x /workspace/setup-hip-*.sh - -# Expose Jupyter port -EXPOSE 8888 - # Default command CMD ["/bin/bash"] -# Health check to verify ROCm access +# Health check to verify HIP compiler access (will only work when GPU is available) HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD rocminfo > /dev/null 2>&1 || hipcc --version > /dev/null 2>&1 || exit 1 \ No newline at end of file + CMD hipcc --version > /dev/null 2>&1 || exit 1 \ No newline at end of file diff --git a/docker/scripts/run.sh b/docker/scripts/run.sh index 2b56085..b01b26a 100755 --- a/docker/scripts/run.sh +++ b/docker/scripts/run.sh @@ -274,15 +274,15 @@ run_rocm() { "${cmd[@]}" } -# Run with docker-compose +# Run with docker compose (v2) or docker-compose (v1) run_compose() { local service=$1 shift - log "Starting $service using docker-compose..." + log "Starting $service using docker compose..." 
cd "$DOCKER_DIR" - # Parse arguments for docker-compose + # Parse arguments for docker compose local compose_args=() while [[ $# -gt 0 ]]; do case $1 in @@ -297,7 +297,15 @@ run_compose() { esac done - docker-compose up "${compose_args[@]}" "$service" + # Try docker compose (v2) first, then fall back to docker-compose (v1) + if docker compose up "${compose_args[@]}" "$service" 2>/dev/null; then + log "Started $service using docker compose (v2)" + elif docker-compose up "${compose_args[@]}" "$service" 2>/dev/null; then + log "Started $service using docker-compose (v1)" + else + error "Failed to start $service using docker compose" + return 1 + fi } # Show usage @@ -310,7 +318,7 @@ Usage: $0 [PLATFORM] [OPTIONS] Platforms: cuda Run NVIDIA CUDA container rocm Run AMD ROCm container - compose SERVICE Run using docker-compose + compose SERVICE Run using docker compose Options: -h, --help Show this help message @@ -326,7 +334,7 @@ Examples: $0 cuda --detach Run CUDA container in background $0 rocm --no-gpu Run ROCm container in CPU-only mode $0 --auto Auto-detect GPU type and run appropriate container - $0 compose cuda-dev Run using docker-compose + $0 compose cuda-dev Run using docker compose Container Management: List containers: docker ps -a @@ -339,6 +347,7 @@ GPU Programming Setup: Inside container: /workspace/test-gpu.sh # Test GPU environment Build examples: cd modules/module1/examples && make CUDA samples: cd /workspace/cuda-samples + HIP samples: cd /workspace/hip-examples EOF } From 013c76123bd574f75471f4b16b7ba8875e4ada8b Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Sat, 13 Sep 2025 09:08:12 -0400 Subject: [PATCH 4/5] Fixed the CI issue of markdown --- .../workflows/markdown-link-check-config.json | 15 +++++++++++---- README.md | 16 ++++++++-------- modules/module1/README.md | 2 +- modules/module1/examples/README.md | 2 +- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/.github/workflows/markdown-link-check-config.json 
b/.github/workflows/markdown-link-check-config.json index 3be3e4d..5bb5727 100644 --- a/.github/workflows/markdown-link-check-config.json +++ b/.github/workflows/markdown-link-check-config.json @@ -7,10 +7,17 @@ "pattern": "^https://github.com/yourusername/" } ], - "aliveStatusCodes": [200, 206, 999], - "timeout": "10s", + "httpHeaders": [ + { + "urls": ["https://rocmdocs.amd.com/"], + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + } + ], + "timeout": "15s", "retryOn429": true, - "retryCount": 5, + "retryCount": 3, "fallbackRetryDelay": "30s", - "aliveStatusCodes": [200, 206] + "aliveStatusCodes": [200, 206, 999] } \ No newline at end of file diff --git a/README.md b/README.md index a3bfc24..3195e32 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ *From beginner fundamentals to production-ready optimization techniques* -**Quick Navigation:** [🚀 Quick Start](#-quick-start) • [📚 Modules](#-modules) • [🐳 Docker Setup](#-docker-development) • [📖 Documentation](SUMMARY.md) • [🤝 Contributing](CONTRIBUTING.md) +**Quick Navigation:** [🚀 Quick Start](#-quick-start) • [📚 Modules](#-modules) • [🐳 Docker Setup](#-docker-development) • [🤝 Contributing](CONTRIBUTING.md) --- @@ -114,7 +114,7 @@ cd modules/module1/examples **📈 Progressive Learning Path: 70+ Examples • 50+ Hours • Beginner to Expert** -**[📖 View Detailed Curriculum →](SUMMARY.md)** +**[📖 View Learning Modules →](modules/)** ## 🛠️ Prerequisites @@ -306,13 +306,13 @@ make check-hip ./docker/scripts/build.sh --clean --all ``` -**[📖 Full Troubleshooting Guide →](docs/troubleshooting.md)** +**[📖 Need Help? 
Check Common Issues →](README.md#-troubleshooting)** ## 📖 Documentation | Document | Description | |----------|-------------| -| [**SUMMARY.md**](SUMMARY.md) | Complete curriculum overview and learning paths | +| **README.md** | Main project documentation and getting started guide | | [**CONTRIBUTING.md**](CONTRIBUTING.md) | How to contribute to the project | | [**Docker Guide**](docker/README.md) | Complete Docker setup and usage | | [**Module READMEs**](modules/) | Individual module documentation | @@ -327,13 +327,13 @@ We welcome contributions from the community! This project thrives on: - 🔧 **Optimizations**: Performance improvements and best practices - 🌐 **Platform Support**: Cross-platform compatibility improvements -**[📖 Contributing Guidelines →](CONTRIBUTING.md)** • **[🐛 Report Issues →](../../issues)** • **[💡 Request Features →](../../issues/new?template=feature_request.md)** +**[📖 Contributing Guidelines →](CONTRIBUTING.md)** • **[🐛 Report Issues →](https://github.com/AIComputing101/gpu-programming-101/issues)** • **[💡 Request Features →](https://github.com/AIComputing101/gpu-programming-101/issues/new?template=feature_request.md)** ## 🏆 Community & Support - 🌟 **Star this project** if you find it helpful! 
-- 🐛 **Report bugs** using our [issue templates](../../issues/new/choose) -- 💬 **Join discussions** in [GitHub Discussions](../../discussions) +- 🐛 **Report bugs** using our [issue templates](https://github.com/AIComputing101/gpu-programming-101/issues/new/choose) +- 💬 **Join discussions** in [GitHub Discussions](https://github.com/AIComputing101/gpu-programming-101/discussions) - 📧 **Get help** from the community and maintainers ## 📄 License @@ -371,7 +371,7 @@ Stephen Shao, "GPU Programming 101: A Comprehensive Educational Project for CUDA **Ready to unlock the power of GPU computing?** -**[🚀 Get Started Now](#-quick-start)** • **[📚 View Curriculum](SUMMARY.md)** • **[🐳 Try Docker](docker/README.md)** +**[🚀 Get Started Now](#-quick-start)** • **[📚 View Modules](modules/)** • **[🐳 Try Docker](docker/README.md)** --- diff --git a/modules/module1/README.md b/modules/module1/README.md index 4c6c0c1..7b8023b 100644 --- a/modules/module1/README.md +++ b/modules/module1/README.md @@ -110,7 +110,7 @@ After completing this module: ## Additional Resources - [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) -- [HIP Programming Guide](https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP-GUIDE.html) +- [HIP Programming Guide](https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP/index.html) - [GPU Performance Best Practices](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/) --- diff --git a/modules/module1/examples/README.md b/modules/module1/examples/README.md index 61f82e1..307f64d 100644 --- a/modules/module1/examples/README.md +++ b/modules/module1/examples/README.md @@ -337,7 +337,7 @@ export HIP_PLATFORM=nvidia ## Additional Resources - [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) -- [HIP Programming Guide](https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP-GUIDE.html) +- [HIP Programming Guide](https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP/index.html) - [GPU 
Performance Guidelines](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/) ## Exercises From 208cf97fc80a91f25f6a0726da456dd3e0445cc8 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Sat, 13 Sep 2025 09:15:00 -0400 Subject: [PATCH 5/5] All instances of the old ROCm documentation URL have been updated --- modules/module1/README.md | 2 +- modules/module1/examples/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/module1/README.md b/modules/module1/README.md index 7b8023b..14b5886 100644 --- a/modules/module1/README.md +++ b/modules/module1/README.md @@ -110,7 +110,7 @@ After completing this module: ## Additional Resources - [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) -- [HIP Programming Guide](https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP/index.html) +- [HIP Programming Guide](https://rocmdocs.amd.com/projects/HIP/en/latest/) - [GPU Performance Best Practices](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/) --- diff --git a/modules/module1/examples/README.md b/modules/module1/examples/README.md index 307f64d..ab801e8 100644 --- a/modules/module1/examples/README.md +++ b/modules/module1/examples/README.md @@ -337,7 +337,7 @@ export HIP_PLATFORM=nvidia ## Additional Resources - [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) -- [HIP Programming Guide](https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP/index.html) +- [HIP Programming Guide](https://rocmdocs.amd.com/projects/HIP/en/latest/) - [GPU Performance Guidelines](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/) ## Exercises