# gitlab-gpu-pipeline.yml

# Defaults applied to jobs in this pipeline: the container image to run in.
default:
    image:
        # Image reference assembled from CI variables.
        name: '$CI_REGISTRY_USER/$gpu_image_name:$gpu_image_version'
        # Override the image entrypoint with a single empty string.
        entrypoint:
            - ''

# Pipeline-wide variables.
variables:
    # Passed to cmake as HOST_ARCH by the build job; "snb" is also possible.
    HOST: 'noarch'
    GPU_VENDOR: 'nvidia'
    # Passed to cmake as DEVICE_ARCH by the build job.
    GPU_MODEL: 'sm_60'

# Stages in execution order; each job below selects one via its `stage` key.
stages:
    - pre_build
    - build
    - test
    - check
    - performance_eval

# Builds ImpalaJIT and easi from source, installs both into ./self_usr and
# publishes that directory as an artifact.
gpu_pre_build:
    stage: pre_build
    allow_failure: false
    variables:
        GIT_STRATEGY: 'clone'
    tags: [sccs, build]
    before_script:
        - git branch -vva
        - echo $commit_author_name
    script:
        - 'echo "HOST arch.:" $HOST'
        - 'echo "GPU model:" $GPU_MODEL'
        - mkdir ./self_usr
        # Folded scalars below: the lines are joined with single spaces into
        # one `&&`-chained shell command, so keep every line at the same
        # indentation.
        # ImpalaJIT: cloned from the default branch (no tag is pinned here).
        - >-
          git clone https://github.com/uphoffc/ImpalaJIT.git impalajit &&
          mkdir -p impalajit/build && cd impalajit/build &&
          cmake .. -DCMAKE_INSTALL_PREFIX=../../self_usr -DCMAKE_CXX_FLAGS="-fPIC" &&
          make -j $(nproc) install &&
          cd ../..
        # easi v1.0.0, configured with ASAGI off and ImpalaJIT on.
        - >-
          git clone --depth 1 --branch v1.0.0 https://github.com/SeisSol/easi.git easi &&
          mkdir -p easi/build && cd easi/build &&
          cmake .. -DASAGI=OFF -DIMPALAJIT=ON -DCMAKE_INSTALL_PREFIX=../../self_usr -DCMAKE_CXX_FLAGS="-fPIC" &&
          make -j $(nproc) install &&
          cd ../..
    artifacts:
        expire_in: '2 days'
        paths:
            - ./self_usr

# Shared shell steps. Jobs splice this list into their `script` through the
# YAML alias *common_gpu_steps.
.common_gpu_test_script: &common_gpu_steps
    - export CTEST_OUTPUT_ON_FAILURE=1
    # hipsycl backend: prepend the CUDA library directory to the link-time
    # and run-time library search paths.
    - if [[ "${BACKEND}" == "hipsycl" ]]; then
        export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH ;
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH ;
      fi ;
    # hip backend: source the ROCm profile script and select the nvidia
    # HIP platform.
    - if [[ "${BACKEND}" == "hip" ]]; then
        . /etc/profile.d/rocm.sh ;
        export HIP_PLATFORM=nvidia ;
      fi ;
    # Lists the locally installed dependencies; this step fails when
    # ./self_usr/lib or ./self_usr/include is missing.
    - ls -l ./self_usr/lib && ls -l ./self_usr/include
    # CMAKE_PREFIX_PATH takes the install prefix itself, whereas LIBRARY_PATH
    # and CPATH are searched directly by the compiler and therefore must name
    # the lib/ and include/ subdirectories rather than the prefix root.
    - export CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:$PWD/self_usr;
      export LIBRARY_PATH=$LIBRARY_PATH:$PWD/self_usr/lib ;
      export CPATH=$CPATH:$PWD/self_usr/include ;

# Configures and builds the project with cmake and make: once per backend
# (job matrix) and once per precision (shell loop), each into its own
# ./build_<backend>_<precision> directory, which is kept as an artifact.
gpu_build:
    stage: build
    allow_failure: false
    retry: 2
    tags: [sccs, gpubuild]
    needs:
        - job: gpu_pre_build
    parallel:
        matrix:
            - BACKEND: [cuda, hipsycl, hip]
    before_script:
        - git submodule init
        # Replace every "../.." in .gitmodules with "https://github.com".
        - sed -i 's/\.\.\/\.\./https\:\/\/github\.com/g' .gitmodules
        - git submodule sync
        - git submodule --quiet update --init --recursive
    script:
        - *common_gpu_steps
        - pip3 install git+https://github.com/SeisSol/gemmforge.git@v0.0.207
        # Strict mode is enabled only after the shared steps have run.
        - set -euo pipefail
        # Folded scalar: all lines are joined with single spaces into one
        # shell command line, so keep every line at the same indentation.
        - >-
          for precision in double single; do
          mkdir -p ./build_${BACKEND}_${precision} && cd ./build_${BACKEND}_${precision} ;
          cmake ..
          -DCMAKE_BUILD_TYPE=Release
          -DDEVICE_BACKEND=${BACKEND}
          -DDEVICE_ARCH=${GPU_MODEL}
          -DHOST_ARCH=${HOST}
          -DPRECISION=${precision}
          -DPROXY_PYBINDING=ON ;
          make -j $(nproc) ;
          cd .. ;
          done;
        - set +u
    artifacts:
        expire_in: '2 days'
        paths:
            - 'build_*'


# Runs the elastic convergence test from the SeisSol/Examples repository
# inside each build directory: once per backend (job matrix) and once per
# precision (shell loop).
gpu_convergence_test:
    stage: test
    allow_failure: false
    needs:
        - job: gpu_build
        # The shared steps list ./self_usr/lib and ./self_usr/include. A job
        # with `needs` only fetches artifacts from the jobs named here, and
        # gpu_build publishes build_* only, so gpu_pre_build is listed to
        # bring in ./self_usr.
        - job: gpu_pre_build
    tags:
        - sccs
        - gpu-nvidia-sm_60
    parallel:
        matrix:
            - BACKEND: [cuda, hipsycl, hip]
    script:
        - *common_gpu_steps
        - git clone https://github.com/SeisSol/Examples.git tests
        - pip3 install -r ./tests/convergence_elastic/requirements.txt
        # Strict mode is enabled only after the shared steps have run.
        - set -euo pipefail
        # Multi-line plain scalar: YAML joins these lines with single spaces
        # into one shell command line.
        - for precision in double single; do
            cd ./build_${BACKEND}_${precision} ;
            echo "[${BACKEND}] Elastic Convergence test with precision - ${precision}" ;
            cp -r ../tests/convergence_elastic/* . ;
            PYTHONPATH=$PWD python3 ./elastic_convergence_runner
            --executable $PWD/SeisSol_Release_*
            --tmp-dir /tmp/seissol
            --sizes 4 8 16
            --expected-errors 1e-2 1e-4 5e-5
            --norm-type LInf
            --end-time 0.5
            --allow-run-as-root ;
            cd .. ;
          done;
        - set +u
    artifacts:
        paths:
            - build_*
        expire_in: 2 days
    retry: 2


# Runs one TPV scenario per matrix entry inside the matching build directory
# and publishes the results as output-* artifacts.
gpu_run_tpv:
    stage: test
    allow_failure: false
    retry: 2
    needs:
        - job: gpu_build
    tags: [sccs, gpu-nvidia-sm_60]
    parallel:
        matrix:
            - precision: ['single']
              tpv: ['5', '5-nuc', '6', '13', '16', '101', '101-slip', '104']
              backend: ['cuda']
    script:
        - 'echo "run TPV${tpv} with ${backend} backend"'
        # Fetch the scenario inputs and pin them to a fixed commit.
        - git clone https://github.com/SeisSol/precomputed-seissol.git
        - ls
        - cd precomputed-seissol
        - git checkout 2ae63167d0c5e35cb01c56198dba94a2b7a224d9
        - cd ..
        - cd ./build_${backend}_${precision} ;
        - cp -r ../precomputed-seissol/tpv${tpv}/* .
        - ls
        - mkdir ./output
        # Use one OpenMP thread fewer than the number of processors reported
        # by nproc.
        - export NUM_COMP_CORES=$(expr $(nproc) - 1)
        - export OMP_NUM_THREADS=$NUM_COMP_CORES
        - export OMP_PLACES="cores($NUM_COMP_CORES)"
        # Lift the soft stack-size limit before starting the executable.
        - ulimit -Ss unlimited
        - ./SeisSol_Release_*_elastic ./parameters.par
        - mv ./precomputed/${precision} ./output/precomputed
        # NOTE(review): `{$tpv}` keeps its braces literally, so the directory
        # is named e.g. output-tpv{5}-single-cuda. The check jobs cd into the
        # same spelling, so change both sides together if this is cleaned up.
        - mv ./output ../output-tpv{$tpv}-${precision}-${backend}
    artifacts:
        expire_in: '2 days'
        paths:
            - 'output-*'


# Compares the fault output of each TPV run against the precomputed
# reference shipped in the run's artifact, using compare-faults.py.
check_faultoutput:
    stage: check
    allow_failure: false
    tags:
        - sccs
        - helper
    needs:
        - job: gpu_run_tpv
    variables:
        elastic_domain_file: "tpv.xdmf"
        fault_file: "tpv-fault.xdmf"
        # Passed to the comparison script as --epsilon.
        epsilon: "0.05"
    parallel:
        matrix:
            - precision: [single]
              tpv: [5, 5-nuc, 6, 13, 16, 101, 101-slip, 104]
              backend: [cuda]
    before_script:
        # The version specifier must be quoted: an unquoted `>` is a shell
        # redirection, which would install an unconstrained numpy and write
        # pip's output to a file named "=1.12.0".
        - pip3 install 'numpy>=1.12.0' lxml==5.0.0 setuptools seissolxdmf pandas
    script:
        - echo "check TPV${tpv} with ${precision} precision and ${backend} backend"
        - ls
        # The literal braces around $tpv match the directory name created by
        # gpu_run_tpv.
        - cd output-tpv{$tpv}-${precision}-${backend}
        - ls
        - python3 ../postprocessing/validation/compare-faults.py ./${fault_file} ./precomputed/${fault_file} --epsilon ${epsilon}


# Compares the receiver output of each TPV run against the precomputed
# reference, using compare-receivers.py with a mode chosen per scenario.
check_receivers:
    stage: check
    allow_failure: false
    tags: [sccs, helper]
    needs:
        - job: gpu_run_tpv
    variables:
        # Passed to the comparison script as --epsilon.
        epsilon: '0.05'
    parallel:
        matrix:
            - precision: ['single']
              tpv: ['5', '5-nuc', '6', '13', '101', '101-slip', '104']
              backend: ['cuda']
    before_script:
        - pip3 install pandas
    script:
        - 'echo "check TPV${tpv}"'
        - ls
        # The literal braces around $tpv match the directory name created by
        # gpu_run_tpv.
        - cd output-tpv{$tpv}-${precision}-${backend}
        - ls
        # Mode selection: 5, 5-nuc, 6 and 16 use lsw, 105 uses tp, everything
        # else uses rs. The two folded lines form one shell command line.
        # NOTE(review): 16 and 105 are not in this job's matrix, so those
        # branches cannot match at present - confirm whether that is intended.
        - >-
          if [ ${tpv} = 5 ] || [ ${tpv} = 5-nuc ] || [ ${tpv} = 6 ] || [ ${tpv} = 16 ]; then mode=lsw; elif [ ${tpv} = 105 ]; then mode=tp; else mode=rs; fi;
          python3 ../postprocessing/validation/compare-receivers.py . ./precomputed --epsilon ${epsilon} --mode $mode

# Compares the energy output of each TPV run against the precomputed
# reference, using compare-energies.py.
gpu_check_energies:
    stage: check
    allow_failure: false
    tags: [sccs, helper]
    needs:
        - job: gpu_run_tpv
    variables:
        energy_file: 'tpv-energy.csv'
        # Passed to the comparison script as --epsilon.
        epsilon: '0.05'
    parallel:
        matrix:
            - precision: ['single']
              tpv: ['5', '5-nuc', '6', '13', '16', '101', '101-slip', '104']
              backend: ['cuda']
    before_script:
        - pip3 install pandas
    script:
        - 'echo "check TPV${tpv}"'
        - ls
        # The literal braces around $tpv match the directory name created by
        # gpu_run_tpv.
        - cd output-tpv{$tpv}-${precision}-${backend}
        - ls
        - python3 ../postprocessing/validation/compare-energies.py ./${energy_file} ./precomputed/${energy_file} --epsilon ${epsilon}

# Manually triggered job that runs the proxy runner scripts found in each
# build directory: once per backend (job matrix) and once per precision
# (shell loop). Failures do not fail the pipeline.
gpu_performance_test:
    stage: performance_eval
    allow_failure: true
    when: manual # skip by default
    needs:
        - job: gpu_build
          artifacts: true
        # The shared steps list ./self_usr/lib and ./self_usr/include. A job
        # with `needs` only fetches artifacts from the jobs named here, and
        # gpu_build publishes build_* only, so gpu_pre_build is listed to
        # bring in ./self_usr.
        - job: gpu_pre_build
          artifacts: true
        # Ordering dependency only; the TPV outputs are not downloaded.
        - job: gpu_run_tpv
          artifacts: false
    tags:
        - sccs
        - gpu-nvidia-sm_60
    parallel:
        matrix:
            - BACKEND: [cuda, hipsycl, hip]
    script:
        - *common_gpu_steps
        # Strict mode is enabled only after the shared steps have run.
        - set -euo pipefail
        # Multi-line plain scalar: YAML joins these lines with single spaces
        # into one shell command line.
        - for precision in double single; do
            echo "[${BACKEND}] Running SeiSol proxy with precision - ${precision}" ;
            cd ./build_${BACKEND}_${precision} ;
            pip3 install -r ./proxy-runners/requirements.txt ;
            python3 ./proxy-runners/run-all.py -c 25000 -t 100 ;
            cat ./proxy_*.pd ;
            cd .. ;
          done;
        - set +u