diff --git a/.github/actions/build-and-test-pyspark/action.yml b/.github/actions/build-and-test-pyspark/action.yml new file mode 100644 index 000000000000..9cfef4b3bc4a --- /dev/null +++ b/.github/actions/build-and-test-pyspark/action.yml @@ -0,0 +1,112 @@ +name: 'Build and Test PySpark' +author: 'Apache Spark' +description: 'A composite GitHub Action that builds and tests a set of PySpark modules' + +inputs: + job-type: + description: "The type of the job: regular, scheduled, pyspark-coverage-scheduled" + required: true + branch: + description: "The branch" + required: true + java-version: + description: "The Java version" + required: true + hadoop-version: + description: "The Hadoop version" + required: true + hive-version: + description: "The Hive version" + required: true + modules: + description: "The modules to be build and tested as a comma-separated list" + required: true + ansi_enabled: + description: "Use ANSI mode: 'true' or 'false'" + required: false + default: "false" + +runs: + using: 'composite' + steps: + - name: Set env + shell: bash + run: | + echo "MODULES_TO_TEST=${{ inputs.modules }}" >> $GITHUB_ENV + echo "EXCLUDED_TAGS=${{ inputs.excluded-tags }}" >> $GITHUB_ENV + echo "INCLUDED_TAGS=${{ inputs.included-tags }}" >> $GITHUB_ENV + echo "HADOOP_PROFILE=${{ inputs.hadoop-version }}" >> $GITHUB_ENV + echo "HIVE_PROFILE=${{ inputs.hive-version }}" >> $GITHUB_ENV + echo "GITHUB_PREV_SHA=${{ github.event.before }}" >> $GITHUB_ENV + echo "SPARK_LOCAL_IP=localhost" >> $GITHUB_ENV + + # Cache local repositories. Note that GitHub Actions cache has a 2G limit. 
+ - name: Cache Scala, SBT and Maven + uses: actions/cache@v2 + with: + path: | + build/apache-maven-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + + - name: Cache Coursier local repository + uses: actions/cache@v2 + with: + path: ~/.cache/coursier + key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + pyspark-coursier- + + - name: Install Java ${{ inputs.java-version }} + uses: actions/setup-java@v1 + with: + java-version: ${{ inputs.java-version }} + + - name: List Python packages (Python 3.9, PyPy3) + shell: bash + run: | + python3.9 -m pip list + pypy3 -m pip list + + - name: Install Conda for pip packaging test + shell: bash + run: | + curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh + bash miniconda.sh -b -p $HOME/miniconda + + # Run the tests. 
+ - name: Run tests + shell: bash + run: | + export PATH=$PATH:$HOME/miniconda/bin + ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" + + - name: Upload coverage to Codecov + if: inputs.job-type == 'pyspark-coverage-scheduled' + uses: codecov/codecov-action@v2 + with: + files: ./python/coverage.xml + flags: unittests + name: PySpark + + - name: Upload test results to report + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-results-${{ inputs.modules }}--8-${{ inputs.hadoop-version }}-hive2.3 + path: "**/target/test-reports/*.xml" + + - name: Upload unit tests log files + if: failure() + uses: actions/upload-artifact@v2 + with: + name: unit-tests-log-${{ inputs.modules }}--8-${{ inputs.hadoop-version }}-hive2.3 + path: "**/target/unit-tests.log" + +branding: + icon: 'check-circle' + color: 'green' diff --git a/.github/actions/build-and-test-spark/action.yml b/.github/actions/build-and-test-spark/action.yml new file mode 100644 index 000000000000..95437737bd7e --- /dev/null +++ b/.github/actions/build-and-test-spark/action.yml @@ -0,0 +1,128 @@ +name: 'Build and Test Spark' +author: 'Apache Spark' +description: 'A composite GitHub Action that builds and tests a set of Spark modules' + +inputs: + job-type: + description: "The type of the job: regular, scheduled, pyspark-coverage-scheduled" + required: true + branch: + description: "The branch" + required: true + java-version: + description: "The Java version" + required: true + hadoop-version: + description: "The Hadoop version" + required: true + hive-version: + description: "The Hive version" + required: true + envs: + description: "Environment vars as JSON object" + required: false + default: "{}" + modules: + description: "The modules to be build and tested as a comma-separated list" + required: true + included-tags: + description: "Tags to include for testing" + required: false + default: "" + excluded-tags: + description: "Tags to exclude for testing" + required: false + default: 
"" + label: + description: "Job label" + required: false + default: "" + ansi_enabled: + description: "Use ANSI mode: 'true' or 'false'" + required: false + default: "false" + +runs: + using: 'composite' + steps: + - name: Set env + shell: bash + run: | + echo "MODULES_TO_TEST=${{ inputs.modules }}" >> $GITHUB_ENV + echo "EXCLUDED_TAGS=${{ inputs.excluded-tags }}" >> $GITHUB_ENV + echo "INCLUDED_TAGS=${{ inputs.included-tags }}" >> $GITHUB_ENV + echo "HADOOP_PROFILE=${{ inputs.hadoop-version }}" >> $GITHUB_ENV + echo "HIVE_PROFILE=${{ inputs.hive-version }}" >> $GITHUB_ENV + echo "GITHUB_PREV_SHA=${{ github.event.before }}" >> $GITHUB_ENV + echo "SPARK_LOCAL_IP=localhost" >> $GITHUB_ENV + + # Cache local repositories. Note that GitHub Actions cache has a 2G limit. + - name: Cache Scala, SBT and Maven + uses: actions/cache@v2 + with: + path: | + build/apache-maven-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + + - name: Cache Coursier local repository + uses: actions/cache@v2 + with: + path: ~/.cache/coursier + key: ${{ inputs.java-version }}-${{ inputs.hadoop-version }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + ${{ inputs.java-version }}-${{ inputs.hadoop-version }}-coursier- + + - name: Install Java ${{ inputs.java-version }} + uses: actions/setup-java@v1 + with: + java-version: ${{ inputs.java-version }} + + - name: Install Python 3.8 + uses: actions/setup-python@v2 + # We should install one Python that is higher then 3+ for SQL and Yarn because: + # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils. + # - Yarn has a Python specific test too, for example, YarnClusterSuite. 
+ if: contains(inputs.modules, 'yarn') || (contains(inputs.modules, 'sql') && !contains(inputs.modules, 'sql-')) + with: + python-version: 3.8 + architecture: x64 + + - name: Install Python packages (Python 3.8) + if: (contains(inputs.modules, 'sql') && !contains(inputs.modules, 'sql-')) + shell: bash + run: | + python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy xmlrunner + python3.8 -m pip list + + # Run the tests. + - name: Run tests + env: ${{fromJSON(inputs.envs)}} + shell: bash + run: | + # Hive "other tests" test needs larger metaspace size based on experiment. + if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi + export SERIAL_SBT_TESTS=1 + ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" + + - name: Upload test results to report + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-results-${{ inputs.modules }}-${{ inputs.label }}-${{ inputs.java-version }}-${{ inputs.hadoop-version }}-${{ inputs.hive-version }} + path: "**/target/test-reports/*.xml" + + - name: Upload unit tests log files + if: failure() + uses: actions/upload-artifact@v2 + with: + name: unit-tests-log-${{ inputs.modules }}-${{ inputs.label }}-${{ inputs.java-version }}-${{ inputs.hadoop-version }}-${{ inputs.hive-version }} + path: "**/target/unit-tests.log" + +branding: + icon: 'check-circle' + color: 'green' diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 81381eb16d47..0682bddd4318 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -55,81 +55,111 @@ jobs: # Run all jobs for Apache Spark repository # Run only non-scheduled jobs for forked repositories if: github.repository == 'apache/spark' || github.event_name != 'schedule' + env: + GITHUB_PREV_SHA: ${{ github.event.before }} outputs: - java: ${{ 
steps.set-outputs.outputs.java }} - branch: ${{ steps.set-outputs.outputs.branch }} - hadoop: ${{ steps.set-outputs.outputs.hadoop }} - type: ${{ steps.set-outputs.outputs.type }} - envs: ${{ steps.set-outputs.outputs.envs }} + java: ${{ steps.config.outputs.java }} + branch: ${{ steps.config.outputs.branch }} + hadoop: ${{ steps.config.outputs.hadoop }} + type: ${{ steps.config.outputs.type }} + envs: ${{ steps.config.outputs.envs }} + + spark-matrix: ${{ steps.spark.outputs.matrix }} + pyspark-matrix: ${{ steps.pyspark.outputs.matrix }} + + # Run scheduled jobs for Apache Spark only + # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist + spark-required: >- + ${{ steps.config.outputs.type == 'scheduled' + || (steps.config.outputs.type == 'regular' && steps.changes.outputs.build-required == 'true') }} + + # Run PySpark coverage scheduled jobs for Apache Spark only + # Run scheduled jobs with JDK 17 in Apache Spark + # Run regular jobs for commit in both Apache Spark and forked repository, but only if pyspark changes exist + pyspark-required: >- + ${{ steps.config.outputs.type == 'pyspark-coverage-scheduled' + || (steps.config.outputs.type == 'scheduled' && steps.config.outputs.java == '17') + || (steps.config.outputs.type == 'regular' && steps.changes.outputs.pyspark-required == 'true') }} + + # Run scheduled jobs with JDK 17 in Apache Spark + # Run regular jobs for commit in both Apache Spark and forked repository, but only if sparkr changes exist + sparkr-required: >- + ${{ (steps.config.outputs.type == 'scheduled' && steps.config.outputs.java == '17') + || (steps.config.outputs.type == 'regular' && steps.changes.outputs.sparkr-required == 'true') }} + + # Run for regular jobs + lint-required: >- + ${{ steps.config.outputs.type == 'regular' }} + + # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist + java-required: >- + ${{ steps.config.outputs.type == 'regular' && 
steps.changes.outputs.build-required == 'true' }} + + # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist + scala-required: >- + ${{ steps.config.outputs.type == 'regular' && steps.changes.outputs.build-required == 'true' }} + + # Run regular jobs for commit in both Apache Spark and forked repository, but only if tpcds changes exist + tpcds-required: >- + ${{ steps.config.outputs.type == 'regular' && steps.changes.outputs.tpcds-required == 'true' }} + + # Run regular jobs for commit in both Apache Spark and forked repository, but only if docker changes exist + docker-required: >- + ${{ steps.config.outputs.type == 'regular' && steps.changes.outputs.docker-required == 'true' }} + steps: - name: Configure branch and additional environment variables - id: set-outputs + id: config run: | + # default values + java=8 + branch=master + type=scheduled + hadoop=hadoop3 + + # override default values based on event if [ "${{ github.event.schedule }}" = "0 1 * * *" ]; then - echo '::set-output name=java::8' - echo '::set-output name=branch::master' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{}' - echo '::set-output name=hadoop::hadoop2' + envs='{}' + hadoop='hadoop2' elif [ "${{ github.event.schedule }}" = "0 4 * * *" ]; then - echo '::set-output name=java::8' - echo '::set-output name=branch::master' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}' - echo '::set-output name=hadoop::hadoop3' + envs='{"SCALA_PROFILE": "scala2.13"}' + hadoop='hadoop3' elif [ "${{ github.event.schedule }}" = "0 7 * * *" ]; then - echo '::set-output name=java::8' - echo '::set-output name=branch::branch-3.2' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}' - echo '::set-output name=hadoop::hadoop3.2' + branch='branch-3.2' + envs='{"SCALA_PROFILE": "scala2.13"}' + hadoop='hadoop3.2' elif [ "${{ 
github.event.schedule }}" = "0 10 * * *" ]; then - echo '::set-output name=java::8' - echo '::set-output name=branch::master' - echo '::set-output name=type::pyspark-coverage-scheduled' - echo '::set-output name=envs::{"PYSPARK_CODECOV": "true"}' - echo '::set-output name=hadoop::hadoop3' + type=pyspark-coverage-scheduled + envs='{"PYSPARK_CODECOV": "true"}' elif [ "${{ github.event.schedule }}" = "0 13 * * *" ]; then - echo '::set-output name=java::11' - echo '::set-output name=branch::master' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}' - echo '::set-output name=hadoop::hadoop3' + java=11 + envs='{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}' elif [ "${{ github.event.schedule }}" = "0 16 * * *" ]; then - echo '::set-output name=java::17' - echo '::set-output name=branch::master' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}' - echo '::set-output name=hadoop::hadoop3' + java=17 + envs='{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}' elif [ "${{ github.event.schedule }}" = "0 19 * * *" ]; then - echo '::set-output name=java::8' - echo '::set-output name=branch::branch-3.3' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}' - echo '::set-output name=hadoop::hadoop3' + branch='branch-3.3' + envs='{"SCALA_PROFILE": "scala2.13"}' else - echo '::set-output name=java::8' - echo '::set-output name=branch::master' # NOTE: UPDATE THIS WHEN CUTTING BRANCH - echo '::set-output name=type::regular' - echo '::set-output name=envs::{"SPARK_ANSI_SQL_MODE": "${{ inputs.ansi_enabled }}"}' - echo '::set-output name=hadoop::hadoop3' + # NOTE: UPDATE THIS WHEN CUTTING BRANCH + branch=master + type=regular + envs='{"SPARK_ANSI_SQL_MODE": "${{ inputs.ansi_enabled }}"}' fi - precondition: - name: Check changes - runs-on: ubuntu-20.04 - needs: configure-jobs - env: - GITHUB_PREV_SHA: ${{ 
github.event.before }} - outputs: - required: ${{ steps.set-outputs.outputs.required }} - steps: + # output values + echo "::set-output name=java::$java" + echo "::set-output name=branch::$branch" + echo "::set-output name=type::$type" + echo "::set-output name=envs::$envs" + echo "::set-output name=hadoop::$hadoop" + - name: Checkout Spark repository uses: actions/checkout@v2 with: fetch-depth: 0 repository: apache/spark - ref: ${{ needs.configure-jobs.outputs.branch }} + ref: ${{ steps.config.outputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | @@ -137,8 +167,9 @@ jobs: git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty + - name: Check all modules - id: set-outputs + id: changes run: | # is-changed.py is missing in branch-3.2, and it might run in scheduled build, see also SPARK-39517 build=true; pyspark=true; sparkr=true; tpcds=true; docker=true; @@ -149,75 +180,138 @@ jobs: tpcds=`./dev/is-changed.py -m build,catalyst,core,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe` docker=`./dev/is-changed.py -m build,catalyst,core,docker-integration-tests,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe` fi - echo "{\"build\": \"$build\", \"pyspark\": \"$pyspark\", \"sparkr\": \"$sparkr\", \"tpcds\": \"$tpcds\", \"docker\": \"$docker\"}" > required.json - cat required.json - echo "::set-output name=required::$(cat required.json)" - - # Build: build Spark and run the tests for specified modules. 
-  build:
-    name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
-    needs: [configure-jobs, precondition]
-    # Run scheduled jobs for Apache Spark only
-    # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist
-    if: >-
-      needs.configure-jobs.outputs.type == 'scheduled'
-      || (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true')
+
+          echo "::set-output name=build-required::$build"
+          echo "::set-output name=pyspark-required::$pyspark"
+          echo "::set-output name=sparkr-required::$sparkr"
+          echo "::set-output name=tpcds-required::$tpcds"
+          echo "::set-output name=docker-required::$docker"
+
+      - name: Configure Spark matrix
+        id: spark
+        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
+        # Kinesis tests depend on external Amazon kinesis service.
+        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
+        # Note that the modules below are from sparktestsupport/modules.py.
+ run: | + cat > matrix.json << EOF + { + "include": [ + { + "modules": "core, unsafe, kvstore, avro, network-common, network-shuffle, repl, launcher, examples, sketch, graphx", + "java": "${{ steps.config.outputs.java }}", + "hadoop": "${{ steps.config.outputs.hadoop }}", + "hive": "hive2.3" + }, + { + "modules": "catalyst, hive-thriftserver", + "java": "${{ steps.config.outputs.java }}", + "hadoop": "${{ steps.config.outputs.hadoop }}", + "hive": "hive2.3" + }, + { + "modules": "streaming, sql-kafka-0-10, streaming-kafka-0-10, mllib-local, mllib, yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl", + "java": "${{ steps.config.outputs.java }}", + "hadoop": "${{ steps.config.outputs.hadoop }}", + "hive": "hive2.3" + }, + { + "modules": "hive", + "java": "${{ steps.config.outputs.java }}", + "hadoop": "${{ steps.config.outputs.hadoop }}", + "hive": "hive2.3", + "included-tags": "org.apache.spark.tags.SlowHiveTest", + "label": "- slow tests" + }, + { + "modules": "hive", + "java": "${{ steps.config.outputs.java }}", + "hadoop": "${{ steps.config.outputs.hadoop }}", + "hive": "hive2.3", + "excluded-tags": "org.apache.spark.tags.SlowHiveTest", + "label": "- other tests" + }, + { + "modules": "sql", + "java": "${{ steps.config.outputs.java }}", + "hadoop": "${{ steps.config.outputs.hadoop }}", + "hive": "hive2.3", + "included-tags": "org.apache.spark.tags.ExtendedSQLTest", + "label": "- slow tests" + }, + { + "modules": "sql", + "java": "${{ steps.config.outputs.java }}", + "hadoop": "${{ steps.config.outputs.hadoop }}", + "hive": "hive2.3", + "excluded-tags": "org.apache.spark.tags.ExtendedSQLTest", + "label": "- other tests" + } + ] + } + EOF + + # detect changes for each of these module sets + jq -c ".include[] | ." < matrix.json | while read line; do + modules=$(jq -r .modules <<<"$line") + changed="$(./dev/is-changed.py -m "$modules")" + jq ". 
+ { changed: \"$changed\" }" <<<"$line" + done | jq -n '{ include: [inputs] }' > matrix-with-changes.json + + # output json + echo "::set-output name=matrix::$(jq -c . < matrix-with-changes.json)" + echo "The strategy matrix:" + jq . < matrix-with-changes.json + + - name: Configure PySpark matrix + id: pyspark + run: | + cat > matrix.json << EOF + { + "include": [ + { + "modules": "pyspark-sql, pyspark-mllib, pyspark-resource", + "java": "${{ steps.config.outputs.java }}" + }, + { + "modules": "pyspark-core, pyspark-streaming, pyspark-ml", + "java": "${{ steps.config.outputs.java }}" + }, + { + "modules": "pyspark-pandas", + "java": "${{ steps.config.outputs.java }}" + }, + { + "modules": "pyspark-pandas-slow", + "java": "${{ steps.config.outputs.java }}" + } + ] + } + EOF + + # detect changes for each of these module sets + jq -c ".include[] | ." < matrix.json | while read line; do + modules=$(jq -r .modules <<<"$line") + changed="$(./dev/is-changed.py -m "$modules")" + jq ". + { changed: \"$changed\" }" <<<"$line" + done | jq -n '{ include: [inputs] }' > matrix-with-changes.json + + # output json + echo "::set-output name=matrix::$(jq -c . < matrix-with-changes.json)" + echo "The strategy matrix:" + jq . < matrix-with-changes.json + + + # Build Spark and run the tests for specified modules. + spark: + name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }} ${{ matrix.label }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})" + needs: [configure-jobs] + if: needs.configure-jobs.outputs.spark-required == 'true' # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04. runs-on: ubuntu-20.04 strategy: fail-fast: false - matrix: - java: - - ${{ needs.configure-jobs.outputs.java }} - hadoop: - - ${{ needs.configure-jobs.outputs.hadoop }} - hive: - - hive2.3 - # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now. 
- # Kinesis tests depends on external Amazon kinesis service. - # Note that the modules below are from sparktestsupport/modules.py. - modules: - - >- - core, unsafe, kvstore, avro, - network-common, network-shuffle, repl, launcher, - examples, sketch, graphx - - >- - catalyst, hive-thriftserver - - >- - streaming, sql-kafka-0-10, streaming-kafka-0-10, - mllib-local, mllib, - yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl - # Here, we split Hive and SQL tests into some of slow ones and the rest of them. - included-tags: [""] - excluded-tags: [""] - comment: [""] - include: - # Hive tests - - modules: hive - java: ${{ needs.configure-jobs.outputs.java }} - hadoop: ${{ needs.configure-jobs.outputs.hadoop }} - hive: hive2.3 - included-tags: org.apache.spark.tags.SlowHiveTest - comment: "- slow tests" - - modules: hive - java: ${{ needs.configure-jobs.outputs.java }} - hadoop: ${{ needs.configure-jobs.outputs.hadoop }} - hive: hive2.3 - excluded-tags: org.apache.spark.tags.SlowHiveTest - comment: "- other tests" - # SQL tests - - modules: sql - java: ${{ needs.configure-jobs.outputs.java }} - hadoop: ${{ needs.configure-jobs.outputs.hadoop }} - hive: hive2.3 - included-tags: org.apache.spark.tags.ExtendedSQLTest - comment: "- slow tests" - - modules: sql - java: ${{ needs.configure-jobs.outputs.java }} - hadoop: ${{ needs.configure-jobs.outputs.hadoop }} - hive: hive2.3 - excluded-tags: org.apache.spark.tags.ExtendedSQLTest - comment: "- other tests" + matrix: ${{fromJSON(needs.configure-jobs.outputs.spark-matrix)}} env: MODULES_TO_TEST: ${{ matrix.modules }} EXCLUDED_TAGS: ${{ matrix.excluded-tags }} @@ -234,6 +328,7 @@ jobs: fetch-depth: 0 repository: apache/spark ref: ${{ needs.configure-jobs.outputs.branch }} + - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | @@ -241,91 +336,35 @@ jobs: git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark 
Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 2G limit. - - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-coursier- - - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v1 - with: - java-version: ${{ matrix.java }} - - name: Install Python 3.8 - uses: actions/setup-python@v2 - # We should install one Python that is higher then 3+ for SQL and Yarn because: - # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils. - # - Yarn has a Python specific test too, for example, YarnClusterSuite. - if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) - with: - python-version: 3.8 - architecture: x64 - - name: Install Python packages (Python 3.8) - if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) - run: | - python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy xmlrunner - python3.8 -m pip list - # Run the tests. - - name: Run tests - env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }} - run: | - # Hive "other tests" test needs larger metaspace size based on experiment. 
- if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi - export SERIAL_SBT_TESTS=1 - ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: failure() - uses: actions/upload-artifact@v2 + + - name: "Build and test" + uses: ./.github/actions/build-and-test-spark + # should be 'true' if not 'false', but we want to fall back to running tests if output is unexpected + if: matrix.changed != 'false' with: - name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} - path: "**/target/unit-tests.log" + job-type: ${{ needs.configure-jobs.outputs.type }} + branch: ${{ needs.configure-jobs.outputs.branch }} + java-version: ${{ needs.configure-jobs.outputs.java }} + hadoop-version: ${{ needs.configure-jobs.outputs.hadoop }} + hive-version: hive2.3 + envs: ${{ needs.configure-jobs.outputs.envs }} + modules: ${{ matrix.modules }} + included-tags: ${{ matrix.included-tags }} + excluded-tags: ${{ matrix.excluded-tags }} + label: ${{ matrix.label }} + ansi_enabled: ${{ inputs.ansi_enabled }} + # Build PySpark and run the tests for specified modules. 
pyspark: - needs: [configure-jobs, precondition] - # Run PySpark coverage scheduled jobs for Apache Spark only - # Run scheduled jobs with JDK 17 in Apache Spark - # Run regular jobs for commit in both Apache Spark and forked repository, but only if pyspark changes exist - if: >- - needs.configure-jobs.outputs.type == 'pyspark-coverage-scheduled' - || (needs.configure-jobs.outputs.type == 'scheduled' && needs.configure-jobs.outputs.java == '17') - || (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).pyspark == 'true') name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }}" + needs: [configure-jobs] + if: needs.configure-jobs.outputs.pyspark-required == 'true' runs-on: ubuntu-20.04 container: image: dongjoon/apache-spark-github-action-image:20220207 strategy: fail-fast: false - matrix: - java: - - ${{ needs.configure-jobs.outputs.java }} - modules: - - >- - pyspark-sql, pyspark-mllib, pyspark-resource - - >- - pyspark-core, pyspark-streaming, pyspark-ml - - >- - pyspark-pandas - - >- - pyspark-pandas-slow + matrix: ${{fromJSON(needs.configure-jobs.outputs.pyspark-matrix)}} env: MODULES_TO_TEST: ${{ matrix.modules }} HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }} @@ -344,6 +383,7 @@ jobs: fetch-depth: 0 repository: apache/spark ref: ${{ needs.configure-jobs.outputs.branch }} + - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | @@ -351,71 +391,24 @@ jobs: git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 2G limit. 
- - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - pyspark-coursier- - - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v1 - with: - java-version: ${{ matrix.java }} - - name: List Python packages (Python 3.9, PyPy3) - run: | - python3.9 -m pip list - pypy3 -m pip list - - name: Install Conda for pip packaging test - run: | - curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - # Run the tests. - - name: Run tests - env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }} - run: | - export PATH=$PATH:$HOME/miniconda/bin - ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" - - name: Upload coverage to Codecov - if: needs.configure-jobs.outputs.type == 'pyspark-coverage-scheduled' - uses: codecov/codecov-action@v2 - with: - files: ./python/coverage.xml - flags: unittests - name: PySpark - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-results-${{ matrix.modules }}--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3 - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: failure() - uses: actions/upload-artifact@v2 + + - name: "Build and test" + uses: ./.github/actions/build-and-test-pyspark + # should be 'true' if not 'false', but we want to fall back to running tests if output is unexpected + if: matrix.changed != 'false' with: - name: unit-tests-log-${{ matrix.modules }}--8-${{ 
needs.configure-jobs.outputs.hadoop }}-hive2.3 - path: "**/target/unit-tests.log" + job-type: ${{ needs.configure-jobs.outputs.type }} + branch: ${{ needs.configure-jobs.outputs.branch }} + java-version: ${{ needs.configure-jobs.outputs.java }} + hadoop-version: ${{ needs.configure-jobs.outputs.hadoop }} + hive-version: hive2.3 + modules: ${{ matrix.modules }} + ansi_enabled: ${{ inputs.ansi_enabled }} sparkr: - needs: [configure-jobs, precondition] - # Run scheduled jobs with JDK 17 in Apache Spark - # Run regular jobs for commit in both Apache Spark and forked repository, but only if sparkr changes exist - if: >- - (needs.configure-jobs.outputs.type == 'scheduled' && needs.configure-jobs.outputs.java == '17') - || (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).sparkr == 'true') name: "Build modules: sparkr" + needs: [configure-jobs] + if: needs.configure-jobs.outputs.sparkr-required == 'true' runs-on: ubuntu-20.04 container: image: dongjoon/apache-spark-github-action-image:20220207 @@ -480,9 +473,9 @@ jobs: # Static analysis, and documentation build lint: - needs: configure-jobs - if: needs.configure-jobs.outputs.type == 'regular' - name: Linters, licenses, dependencies and documentation generation + name: "Linters, licenses, dependencies and documentation generation" + needs: [configure-jobs] + if: needs.configure-jobs.outputs.lint-required == 'true' runs-on: ubuntu-20.04 env: LC_ALL: C.UTF-8 @@ -595,17 +588,16 @@ jobs: bundle exec jekyll build java-11-17: - needs: [configure-jobs, precondition] - # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist - if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true' - name: Java ${{ matrix.java }} build with Maven + name: "Java ${{ matrix.java }} build with Maven" + needs: [configure-jobs] + if: needs.configure-jobs.outputs.java-required == 'true' + runs-on: 
ubuntu-20.04 strategy: fail-fast: false matrix: java: - 11 - 17 - runs-on: ubuntu-20.04 steps: - name: Checkout Spark repository uses: actions/checkout@v2 @@ -651,10 +643,9 @@ jobs: rm -rf ~/.m2/repository/org/apache/spark scala-213: - needs: [configure-jobs, precondition] - # Run regular jobs for commit in both Apache Spark and forked repository, but only if changes exist - if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true' - name: Scala 2.13 build with SBT + name: "Scala 2.13 build with SBT" + needs: [configure-jobs] + if: needs.configure-jobs.outputs.scala-required == 'true' runs-on: ubuntu-20.04 steps: - name: Checkout Spark repository @@ -697,10 +688,9 @@ jobs: ./build/sbt -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile tpcds-1g: - needs: [configure-jobs, precondition] - # Run regular jobs for commit in both Apache Spark and forked repository, but only if tpcds changes exist - if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).tpcds == 'true' - name: Run TPC-DS queries with SF=1 + name: "Run TPC-DS queries with SF=1" + needs: [configure-jobs] + if: needs.configure-jobs.outputs.tpcds-required == 'true' runs-on: ubuntu-20.04 env: SPARK_LOCAL_IP: localhost @@ -793,10 +783,9 @@ jobs: path: "**/target/unit-tests.log" docker-integration-tests: - needs: [configure-jobs, precondition] - # Run regular jobs for commit in both Apache Spark and forked repository, but only if docker changes exist - if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).docker == 'true' - name: Run Docker integration tests + name: "Run Docker integration tests" + needs: [configure-jobs] + if: needs.configure-jobs.outputs.docker-required == 'true' runs-on: ubuntu-20.04 env: 
HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }} diff --git a/dev/is-changed.py b/dev/is-changed.py index 85f0d3cda6df..7df94627da78 100755 --- a/dev/is-changed.py +++ b/dev/is-changed.py @@ -53,7 +53,7 @@ def parse_opts(): def main(): opts = parse_opts() - test_modules = opts.modules.split(",") + test_modules = [m.strip() for m in opts.modules.split(",")] changed_files = [] if os.environ.get("APACHE_SPARK_REF"): changed_files = identify_changed_files_from_git_commits(