From 67317e30f7c74b06eb826f057270d967dfb02b55 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 7 Oct 2024 15:46:35 -0700 Subject: [PATCH 01/38] added to needs of job stop_ec2_instance so all jobs run before it starts to stop the instance. Added if: always() so that stop_ec2_instance always runs even when test fail --- .github/workflows/run-simulators.yml | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 885e386a4..9f34127c7 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -27,7 +27,7 @@ jobs: sleep 10 instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') done - + # Check if instance state is "stopped" if [[ "$instance_state" == "stopped" ]]; then echo "Instance is stopped, starting it..." @@ -42,7 +42,7 @@ jobs: exit 1 fi - # wait for status checks to pass + # wait for status checks to pass TIMEOUT=300 # Timeout in seconds START_TIME=$(date +%s) END_TIME=$((START_TIME + TIMEOUT)) @@ -69,7 +69,7 @@ jobs: name: check_simulator_version_updates runs-on: ubuntu-latest needs: start_ec2_instance - steps: + steps: - name: Check for Simulator Version Updates env: PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} @@ -109,11 +109,11 @@ jobs: echo "NVIDIA Driver is not set" exit 1 fi - ' + ' - name: NVIDIA Driver is not set if: ${{ failure() }} run: | - echo "NVIDIA SMI is not working, please run the steps here on the instance:" + echo "NVIDIA SMI is not working, please run the steps here on the instance:" echo "https://scenic-lang.atlassian.net/wiki/spaces/KAN/pages/2785287/Setting+Up+AWS+VM?parentProduct=JSW&initialAllowedFeatures=byline-contributors.byline-extensions.page-comments.delete.page-reactions.inline-comments.non-licensed-share&themeState=dark%253Adark%2520light%253Alight%2520spacing%253Aspacing%2520colorMode%253Alight&locale=en-US#Install-NVIDIA-Drivers" run_carla_simulators: @@ -133,12 +133,12 @@ jobs: source venv/bin/activate && carla_versions=($(find /software -maxdepth 1 -type d -name 'carla*')) && for version in "${carla_versions[@]}"; do - echo "============================= CARLA $version =============================" + echo "============================= CARLA $version =============================" export CARLA_ROOT="$version" pytest tests/simulators/carla done ' - + run_webots_simulators: name: run_webots_simulators runs-on: ubuntu-latest @@ -164,12 +164,13 @@ jobs: done kill %1 ' - + stop_ec2_instance: name: stop_ec2_instance runs-on: ubuntu-latest - needs: [run_carla_simulators, run_webots_simulators] - steps: + needs: [start_ec2_instance, check_simulator_version_updates, check_nvidia_smi, run_carla_simulators, run_webots_simulators] + if: always() + steps: - name: Stop EC2 Instance env: INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} @@ -186,7 +187,7 @@ jobs: sleep 10 instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') done - + # Check if instance state is "stopped" if [[ "$instance_state" == "running" ]]; then echo "Instance is running, stopping it..." From 96c758ac11d3c3ed7e53dd545060fc7e30950793 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 7 Oct 2024 16:07:41 -0700 Subject: [PATCH 02/38] changed start_ec2_instance to create volume from latest snapshot and attatch to instance before starting instance --- .github/workflows/run-simulators.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 9f34127c7..af244df9d 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -11,6 +11,29 @@ jobs: concurrency: group: sim steps: + - name: Create Volume from Latest Snapshot and Attach to Instance + env: + INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} + run: | + # Retrieve the latest snapshot ID + LATEST_SNAPSHOT_ID=$(aws ec2 describe-snapshots --owner-ids self --query 'Snapshots | sort_by(@, &StartTime) | [-1].SnapshotId' --output text) + echo "Using latest snapshot with ID: $LATEST_SNAPSHOT_ID" + + # Create a new volume from the latest snapshot + volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone $(aws ec2 describe-instances --instance-ids $INSTANCE_ID --query "Reservations[0].Instances[0].Placement.AvailabilityZone" --output text) --volume-type gp2 --query "VolumeId" --output text) + echo "Created volume with ID: $volume_id" + + # Wait until the volume is available + aws ec2 wait volume-available --volume-ids $volume_id + echo "Volume is now available" + + # Attach the volume to the instance + aws ec2 attach-volume --volume-id $volume_id --instance-id $INSTANCE_ID --device /dev/sdf + echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sdf" + - name: Start EC2 Instance env: INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} From 8453174e21f8b1533e40407e6a46aac8c2cdc701 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 7 Oct 2024 16:28:08 -0700 Subject: [PATCH 03/38] changed stop_ec2_instance so that it stops the instance, then takes snapshot of volume, then deletes volume) --- .github/workflows/run-simulators.yml | 53 ++++++++++++++++++---------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index af244df9d..542985037 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -25,6 +25,7 @@ jobs: # Create a new volume from the latest snapshot volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone $(aws ec2 describe-instances --instance-ids $INSTANCE_ID --query "Reservations[0].Instances[0].Placement.AvailabilityZone" --output text) --volume-type gp2 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" + echo "::set-output name=volume_id::$volume_id" # Wait until the volume is available aws ec2 wait volume-available --volume-ids $volume_id @@ -201,26 +202,42 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} run: | - # Get the instance state + # Get the instance state and stop it if running instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - - # If the machine is pending wait for it to fully start - while [ "$instance_state" == "pending" ]; do - echo "Instance is pending startup, waiting for it to fully start..." - sleep 10 - instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - done - - # Check if instance state is "stopped" if [[ "$instance_state" == "running" ]]; then - echo "Instance is running, stopping it..." - aws ec2 stop-instances --instance-ids $INSTANCE_ID - elif [[ "$instance_state" == "stopping" ]]; then - echo "Instance is stopping..." + echo "Instance is running, stopping it..." + aws ec2 stop-instances --instance-ids $INSTANCE_ID + aws ec2 wait instance-stopped --instance-ids $INSTANCE_ID + echo "Instance has stopped." elif [[ "$instance_state" == "stopped" ]]; then - echo "Instance is already stopped..." - exit 0 + echo "Instance is already stopped." else - echo "Unknown instance state: $instance_state" - exit 1 + echo "Unexpected instance state: $instance_state" + exit 1 fi + + - name: Take Snapshot of Volume + env: + VOLUME_ID: ${{ needs.start_ec2_instance.outputs.created_volume_id }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} + run: | + # Create a snapshot of the volume + snapshot_id=$(aws ec2 create-snapshot --volume-id $VOLUME_ID --description "Snapshot before deletion" --query "SnapshotId" --output text) + echo "Snapshot ID: $snapshot_id" + + # Wait for the snapshot to complete + aws ec2 wait snapshot-completed --snapshot-ids $snapshot_id + echo "Snapshot completed." + + - name: Delete Volume + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} + VOLUME_ID: ${{ needs.start_ec2_instance.outputs.created_volume_id }} + run: | + # Delete the volume after snapshot is complete + aws ec2 delete-volume --volume-id $VOLUME_ID + echo "Volume $VOLUME_ID deleted." From 2eebb2237fb93d144096152fa54cb7bc12d064e4 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 7 Oct 2024 16:36:36 -0700 Subject: [PATCH 04/38] corrected volume id output --- .github/workflows/run-simulators.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 542985037..1177015a1 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -218,7 +218,7 @@ jobs: - name: Take Snapshot of Volume env: - VOLUME_ID: ${{ needs.start_ec2_instance.outputs.created_volume_id }} + VOLUME_ID: ${{ needs.start_ec2_instance.outputs.volume_id }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} From 252c837e5b8088b3f59693a1525f09ba4e161245 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 7 Oct 2024 16:58:12 -0700 Subject: [PATCH 05/38] changed volume to be root volume /dev/sda1 --- .github/workflows/run-simulators.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 1177015a1..771d5fa65 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -32,8 +32,8 @@ jobs: echo "Volume is now available" # Attach the volume to the instance - aws ec2 attach-volume --volume-id $volume_id --instance-id $INSTANCE_ID --device /dev/sdf - echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sdf" + aws ec2 attach-volume --volume-id $volume_id --instance-id $INSTANCE_ID --device /dev/sda1 + echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sda1" - name: Start EC2 Instance env: @@ -236,7 +236,7 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} - VOLUME_ID: ${{ needs.start_ec2_instance.outputs.created_volume_id }} + VOLUME_ID: ${{ needs.start_ec2_instance.outputs.volume_id }} run: | # Delete the volume after snapshot is complete aws ec2 delete-volume --volume-id $VOLUME_ID From 8ee54ab6e3e451e3905409eb479d75e29f487cf8 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 8 Oct 2024 09:34:19 -0700 Subject: [PATCH 06/38] updated volume Id to use Github variables --- .github/workflows/run-simulators.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 771d5fa65..7a7d1ebc4 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -25,7 +25,9 @@ jobs: # Create a new volume from the latest snapshot volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone $(aws ec2 describe-instances --instance-ids $INSTANCE_ID --query "Reservations[0].Instances[0].Placement.AvailabilityZone" --output text) --volume-type gp2 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" - echo "::set-output name=volume_id::$volume_id" + + # Store volume_id in GITHUB_ENV + echo "volume_id=$volume_id" >> $GITHUB_ENV # Wait until the volume is available aws ec2 wait volume-available --volume-ids $volume_id @@ -218,7 +220,7 @@ jobs: - name: Take Snapshot of Volume env: - VOLUME_ID: ${{ needs.start_ec2_instance.outputs.volume_id }} + VOLUME_ID: ${{ env.volume_id }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} @@ -236,7 +238,7 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} - VOLUME_ID: ${{ needs.start_ec2_instance.outputs.volume_id }} + VOLUME_ID: ${{ env.volume_id }} run: | # Delete the volume after snapshot is complete aws ec2 delete-volume --volume-id $VOLUME_ID From 4caa5cc749041fbf32ed7aa5444a00bd1a18239e Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 8 Oct 2024 10:25:34 -0700 Subject: [PATCH 07/38] testing volume id --- .github/workflows/run-simulators.yml | 45 ++++++++++++++-------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 7a7d1ebc4..dc0c5d873 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -142,28 +142,28 @@ jobs: echo "NVIDIA SMI is not working, please run the steps here on the instance:" echo "https://scenic-lang.atlassian.net/wiki/spaces/KAN/pages/2785287/Setting+Up+AWS+VM?parentProduct=JSW&initialAllowedFeatures=byline-contributors.byline-extensions.page-comments.delete.page-reactions.inline-comments.non-licensed-share&themeState=dark%253Adark%2520light%253Alight%2520spacing%253Aspacing%2520colorMode%253Alight&locale=en-US#Install-NVIDIA-Drivers" - run_carla_simulators: - name: run_carla_simulators - runs-on: ubuntu-latest - needs: [check_simulator_version_updates, check_nvidia_smi] - steps: - - name: Run CARLA Tests - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{secrets.SSH_HOST}} - USER_NAME: ${{secrets.SSH_USERNAME}} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - cd /home/ubuntu/actions/Scenic && - source venv/bin/activate && - carla_versions=($(find /software -maxdepth 1 -type d -name 'carla*')) && - for version in "${carla_versions[@]}"; do - echo "============================= CARLA $version =============================" - export CARLA_ROOT="$version" - pytest tests/simulators/carla - done - ' + # run_carla_simulators: + # name: run_carla_simulators + # runs-on: ubuntu-latest + # needs: [check_simulator_version_updates, check_nvidia_smi] + # steps: + # - name: Run CARLA Tests + # env: + # PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + # HOSTNAME: ${{secrets.SSH_HOST}} + # USER_NAME: ${{secrets.SSH_USERNAME}} + # run: | + # echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + # ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' + # cd /home/ubuntu/actions/Scenic && + # source venv/bin/activate && + # carla_versions=($(find /software -maxdepth 1 -type d -name 'carla*')) && + # for version in "${carla_versions[@]}"; do + # echo "============================= CARLA $version =============================" + # export CARLA_ROOT="$version" + # pytest tests/simulators/carla + # done + # ' run_webots_simulators: name: run_webots_simulators @@ -226,6 +226,7 @@ jobs: AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} run: | # Create a snapshot of the volume + echo "Volume ID is: $VOLUME_ID" snapshot_id=$(aws ec2 create-snapshot --volume-id $VOLUME_ID --description "Snapshot before deletion" --query "SnapshotId" --output text) echo "Snapshot ID: $snapshot_id" From bea467eab0551d958c712a8ae19191222fcb4ef6 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 8 Oct 2024 10:29:35 -0700 Subject: [PATCH 08/38] change needs for stop_ec2_instance for testing purposes --- .github/workflows/run-simulators.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index dc0c5d873..e1dfc0932 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -194,7 +194,7 @@ jobs: stop_ec2_instance: name: stop_ec2_instance runs-on: ubuntu-latest - needs: [start_ec2_instance, check_simulator_version_updates, check_nvidia_smi, run_carla_simulators, run_webots_simulators] + needs: [start_ec2_instance, check_simulator_version_updates, check_nvidia_smi, run_webots_simulators] if: always() steps: - name: Stop EC2 Instance From 5b8e3bdc1e93257262dd95a221289ff5de64f891 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 8 Oct 2024 11:21:17 -0700 Subject: [PATCH 09/38] testing volume id --- .github/workflows/run-simulators.yml | 327 +++++++++++++-------------- 1 file changed, 159 insertions(+), 168 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index e1dfc0932..fb9c516a8 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -10,13 +10,13 @@ jobs: runs-on: ubuntu-latest concurrency: group: sim + env: + INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} steps: - name: Create Volume from Latest Snapshot and Attach to Instance - env: - INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} run: | # Retrieve the latest snapshot ID LATEST_SNAPSHOT_ID=$(aws ec2 describe-snapshots --owner-ids self --query 'Snapshots | sort_by(@, &StartTime) | [-1].SnapshotId' --output text) @@ -25,9 +25,7 @@ jobs: # Create a new volume from the latest snapshot volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone $(aws ec2 describe-instances --instance-ids $INSTANCE_ID --query "Reservations[0].Instances[0].Placement.AvailabilityZone" --output text) --volume-type gp2 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" - - # Store volume_id in GITHUB_ENV - echo "volume_id=$volume_id" >> $GITHUB_ENV + echo "::set-output name=volume_id::$volume_id" # Wait until the volume is available aws ec2 wait volume-available --volume-ids $volume_id @@ -37,110 +35,105 @@ jobs: aws ec2 attach-volume --volume-id $volume_id --instance-id $INSTANCE_ID --device /dev/sda1 echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sda1" - - name: Start EC2 Instance - env: - INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} - run: | - # Get the instance state - instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - - # If the machine is stopping wait for it to fully stop - while [ "$instance_state" == "stopping" ]; do - echo "Instance is stopping, waiting for it to fully stop..." - sleep 10 - instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - done - - # Check if instance state is "stopped" - if [[ "$instance_state" == "stopped" ]]; then - echo "Instance is stopped, starting it..." - aws ec2 start-instances --instance-ids $INSTANCE_ID - elif [[ "$instance_state" == "pending" ]]; then - echo "Instance startup is pending, continuing..." - elif [[ "$instance_state" == "running" ]]; then - echo "Instance is already running..." - exit 0 - else - echo "Unknown instance state: $instance_state" - exit 1 - fi - - # wait for status checks to pass - TIMEOUT=300 # Timeout in seconds - START_TIME=$(date +%s) - END_TIME=$((START_TIME + TIMEOUT)) - while true; do - response=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID) - system_status=$(echo "$response" | jq -r '.InstanceStatuses[0].SystemStatus.Status') - instance_status=$(echo "$response" | jq -r '.InstanceStatuses[0].InstanceStatus.Status') - - if [[ "$system_status" == "ok" && "$instance_status" == "ok" ]]; then - echo "Both SystemStatus and InstanceStatus are 'ok'" - exit 0 - fi - - CURRENT_TIME=$(date +%s) - if [[ "$CURRENT_TIME" -ge "$END_TIME" ]]; then - echo "Timeout: Both SystemStatus and InstanceStatus have not reached 'ok' state within $TIMEOUT seconds." - exit 1 - fi - - sleep 10 # Check status every 10 seconds - done - - check_simulator_version_updates: - name: check_simulator_version_updates - runs-on: ubuntu-latest - needs: start_ec2_instance - steps: - - name: Check for Simulator Version Updates - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST }} - USER_NAME: ${{ secrets.SSH_USERNAME }} - GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - cd /home/ubuntu/actions/ && - rm -rf Scenic && - git clone --branch $(basename "${{ github.ref }}") --single-branch https://$GH_ACCESS_TOKEN@github.com/BerkeleyLearnVerify/Scenic.git && - cd Scenic && - python3 -m venv venv && - source venv/bin/activate && - python3 -m pip install -e .[test-full] && - python3 .github/check_latest_simulators.py - ' - - check_nvidia_smi: - name: check_nvidia_smi - runs-on: ubuntu-latest - needs: start_ec2_instance - continue-on-error: true - steps: - - name: Check NVIDIA SMI - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST}} - USER_NAME: ${{ secrets.SSH_USERNAME}} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - output=$(nvidia-smi) - echo "$output" - if [ -z "$output" ]; then - echo "NVIDIA Driver is not set" - exit 1 - fi - ' - - name: NVIDIA Driver is not set - if: ${{ failure() }} - run: | - echo "NVIDIA SMI is not working, please run the steps here on the instance:" - echo "https://scenic-lang.atlassian.net/wiki/spaces/KAN/pages/2785287/Setting+Up+AWS+VM?parentProduct=JSW&initialAllowedFeatures=byline-contributors.byline-extensions.page-comments.delete.page-reactions.inline-comments.non-licensed-share&themeState=dark%253Adark%2520light%253Alight%2520spacing%253Aspacing%2520colorMode%253Alight&locale=en-US#Install-NVIDIA-Drivers" + # - name: Start EC2 Instance + # run: | + # # Get the instance state + # instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') + + # # If the machine is stopping wait for it to fully stop + # while [ "$instance_state" == "stopping" ]; do + # echo "Instance is stopping, waiting for it to fully stop..." + # sleep 10 + # instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') + # done + + # # Check if instance state is "stopped" + # if [[ "$instance_state" == "stopped" ]]; then + # echo "Instance is stopped, starting it..." + # aws ec2 start-instances --instance-ids $INSTANCE_ID + # elif [[ "$instance_state" == "pending" ]]; then + # echo "Instance startup is pending, continuing..." + # elif [[ "$instance_state" == "running" ]]; then + # echo "Instance is already running..." + # exit 0 + # else + # echo "Unknown instance state: $instance_state" + # exit 1 + # fi + + # # wait for status checks to pass + # TIMEOUT=300 # Timeout in seconds + # START_TIME=$(date +%s) + # END_TIME=$((START_TIME + TIMEOUT)) + # while true; do + # response=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID) + # system_status=$(echo "$response" | jq -r '.InstanceStatuses[0].SystemStatus.Status') + # instance_status=$(echo "$response" | jq -r '.InstanceStatuses[0].InstanceStatus.Status') + + # if [[ "$system_status" == "ok" && "$instance_status" == "ok" ]]; then + # echo "Both SystemStatus and InstanceStatus are 'ok'" + # exit 0 + # fi + + # CURRENT_TIME=$(date +%s) + # if [[ "$CURRENT_TIME" -ge "$END_TIME" ]]; then + # echo "Timeout: Both SystemStatus and InstanceStatus have not reached 'ok' state within $TIMEOUT seconds." + # exit 1 + # fi + + # sleep 10 # Check status every 10 seconds + # done + + # check_simulator_version_updates: + # name: check_simulator_version_updates + # runs-on: ubuntu-latest + # needs: start_ec2_instance + # steps: + # - name: Check for Simulator Version Updates + # env: + # PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + # HOSTNAME: ${{ secrets.SSH_HOST }} + # USER_NAME: ${{ secrets.SSH_USERNAME }} + # GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} + # run: | + # echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + # ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' + # cd /home/ubuntu/actions/ && + # rm -rf Scenic && + # git clone --branch $(basename "${{ github.ref }}") --single-branch https://$GH_ACCESS_TOKEN@github.com/BerkeleyLearnVerify/Scenic.git && + # cd Scenic && + # python3 -m venv venv && + # source venv/bin/activate && + # python3 -m pip install -e .[test-full] && + # python3 .github/check_latest_simulators.py + # ' + + # check_nvidia_smi: + # name: check_nvidia_smi + # runs-on: ubuntu-latest + # needs: start_ec2_instance + # continue-on-error: true + # steps: + # - name: Check NVIDIA SMI + # env: + # PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + # HOSTNAME: ${{ secrets.SSH_HOST}} + # USER_NAME: ${{ secrets.SSH_USERNAME}} + # run: | + # echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + # ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' + # output=$(nvidia-smi) + # echo "$output" + # if [ -z "$output" ]; then + # echo "NVIDIA Driver is not set" + # exit 1 + # fi + # ' + # - name: NVIDIA Driver is not set + # if: ${{ failure() }} + # run: | + # echo "NVIDIA SMI is not working, please run the steps here on the instance:" + # echo "https://scenic-lang.atlassian.net/wiki/spaces/KAN/pages/2785287/Setting+Up+AWS+VM?parentProduct=JSW&initialAllowedFeatures=byline-contributors.byline-extensions.page-comments.delete.page-reactions.inline-comments.non-licensed-share&themeState=dark%253Adark%2520light%253Alight%2520spacing%253Aspacing%2520colorMode%253Alight&locale=en-US#Install-NVIDIA-Drivers" # run_carla_simulators: # name: run_carla_simulators @@ -165,65 +158,61 @@ jobs: # done # ' - run_webots_simulators: - name: run_webots_simulators - runs-on: ubuntu-latest - needs: [check_simulator_version_updates, check_nvidia_smi] - steps: - - name: Run Webots Tests - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{secrets.SSH_HOST}} - USER_NAME: ${{secrets.SSH_USERNAME}} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - Xvfb :99 -screen 0 1024x768x16 & - cd /home/ubuntu/actions/Scenic && - source venv/bin/activate && - webots_versions=($(find /software -maxdepth 1 -type d -name 'webots*')) && - export DISPLAY=:99 && - for version in "${webots_versions[@]}"; do - echo "============================= Webots $version =============================" - export WEBOTS_ROOT="$version" - pytest tests/simulators/webots - done - kill %1 - ' + # run_webots_simulators: + # name: run_webots_simulators + # runs-on: ubuntu-latest + # needs: [check_simulator_version_updates, check_nvidia_smi] + # steps: + # - name: Run Webots Tests + # env: + # PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + # HOSTNAME: ${{secrets.SSH_HOST}} + # USER_NAME: ${{secrets.SSH_USERNAME}} + # run: | + # echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + # ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' + # Xvfb :99 -screen 0 1024x768x16 & + # cd /home/ubuntu/actions/Scenic && + # source venv/bin/activate && + # webots_versions=($(find /software -maxdepth 1 -type d -name 'webots*')) && + # export DISPLAY=:99 && + # for version in "${webots_versions[@]}"; do + # echo "============================= Webots $version =============================" + # export WEBOTS_ROOT="$version" + # pytest tests/simulators/webots + # done + # kill %1 + # ' stop_ec2_instance: name: stop_ec2_instance runs-on: ubuntu-latest - needs: [start_ec2_instance, check_simulator_version_updates, check_nvidia_smi, run_webots_simulators] + needs: [start_ec2_instance ] if: always() + env: + VOLUME_ID: ${{ needs.start_ec2_instance.outputs.volume_id }} + INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} steps: - - name: Stop EC2 Instance - env: - INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} - run: | - # Get the instance state and stop it if running - instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - if [[ "$instance_state" == "running" ]]; then - echo "Instance is running, stopping it..." - aws ec2 stop-instances --instance-ids $INSTANCE_ID - aws ec2 wait instance-stopped --instance-ids $INSTANCE_ID - echo "Instance has stopped." - elif [[ "$instance_state" == "stopped" ]]; then - echo "Instance is already stopped." - else - echo "Unexpected instance state: $instance_state" - exit 1 - fi + # - name: Stop EC2 Instance + # run: | + # # Get the instance state and stop it if running + # instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') + # if [[ "$instance_state" == "running" ]]; then + # echo "Instance is running, stopping it..." + # aws ec2 stop-instances --instance-ids $INSTANCE_ID + # aws ec2 wait instance-stopped --instance-ids $INSTANCE_ID + # echo "Instance has stopped." + # elif [[ "$instance_state" == "stopped" ]]; then + # echo "Instance is already stopped." + # else + # echo "Unexpected instance state: $instance_state" + # exit 1 + # fi - name: Take Snapshot of Volume - env: - VOLUME_ID: ${{ env.volume_id }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} run: | # Create a snapshot of the volume echo "Volume ID is: $VOLUME_ID" @@ -234,12 +223,14 @@ jobs: aws ec2 wait snapshot-completed --snapshot-ids $snapshot_id echo "Snapshot completed." + - name: Detach Volume + run: | + # Detach the volume + aws ec2 detach-volume --volume-id $VOLUME_ID + aws ec2 wait volume-available --volume-ids $VOLUME_ID + echo "Volume $VOLUME_ID detached." + - name: Delete Volume - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} - VOLUME_ID: ${{ env.volume_id }} run: | # Delete the volume after snapshot is complete aws ec2 delete-volume --volume-id $VOLUME_ID From 0e0c556c12be01ea03862c4c991babfffa61972d Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 8 Oct 2024 11:32:02 -0700 Subject: [PATCH 10/38] using new github_output --- .github/workflows/run-simulators.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index fb9c516a8..aed9329dd 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -25,7 +25,9 @@ jobs: # Create a new volume from the latest snapshot volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone $(aws ec2 describe-instances --instance-ids $INSTANCE_ID --query "Reservations[0].Instances[0].Placement.AvailabilityZone" --output text) --volume-type gp2 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" - echo "::set-output name=volume_id::$volume_id" + + # Set volume_id as output + echo "volume_id=$volume_id" >> $GITHUB_OUTPUT # Wait until the volume is available aws ec2 wait volume-available --volume-ids $volume_id From 0b39c4d2b4b52ff4f82002d10d687e5c85d80590 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 8 Oct 2024 12:04:11 -0700 Subject: [PATCH 11/38] trying outputs to pass volume id --- .github/workflows/run-simulators.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index aed9329dd..a2a65021a 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -10,6 +10,8 @@ jobs: runs-on: ubuntu-latest concurrency: group: sim + outputs: + volume_id: ${{ steps.create_volume_step.outputs.volume_id }} env: INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} @@ -17,6 +19,7 @@ jobs: AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} steps: - name: Create Volume from Latest Snapshot and Attach to Instance + id: create_volume_step run: | # Retrieve the latest snapshot ID LATEST_SNAPSHOT_ID=$(aws ec2 describe-snapshots --owner-ids self --query 'Snapshots | sort_by(@, &StartTime) | [-1].SnapshotId' --output text) @@ -28,6 +31,7 @@ jobs: # Set volume_id as output echo "volume_id=$volume_id" >> $GITHUB_OUTPUT + cat $GITHUB_OUTPUT # Wait until the volume is available aws ec2 wait volume-available --volume-ids $volume_id From a9659f5a4576408be82d035a43814b8456ef725f Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 8 Oct 2024 12:11:49 -0700 Subject: [PATCH 12/38] volume id passed correctly to last job. checking workflow with tests --- .github/workflows/run-simulators.yml | 328 +++++++++++++-------------- 1 file changed, 164 insertions(+), 164 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index a2a65021a..062b1080e 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -41,159 +41,159 @@ jobs: aws ec2 attach-volume --volume-id $volume_id --instance-id $INSTANCE_ID --device /dev/sda1 echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sda1" - # - name: Start EC2 Instance - # run: | - # # Get the instance state - # instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - - # # If the machine is stopping wait for it to fully stop - # while [ "$instance_state" == "stopping" ]; do - # echo "Instance is stopping, waiting for it to fully stop..." - # sleep 10 - # instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - # done - - # # Check if instance state is "stopped" - # if [[ "$instance_state" == "stopped" ]]; then - # echo "Instance is stopped, starting it..." - # aws ec2 start-instances --instance-ids $INSTANCE_ID - # elif [[ "$instance_state" == "pending" ]]; then - # echo "Instance startup is pending, continuing..." - # elif [[ "$instance_state" == "running" ]]; then - # echo "Instance is already running..." - # exit 0 - # else - # echo "Unknown instance state: $instance_state" - # exit 1 - # fi - - # # wait for status checks to pass - # TIMEOUT=300 # Timeout in seconds - # START_TIME=$(date +%s) - # END_TIME=$((START_TIME + TIMEOUT)) - # while true; do - # response=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID) - # system_status=$(echo "$response" | jq -r '.InstanceStatuses[0].SystemStatus.Status') - # instance_status=$(echo "$response" | jq -r '.InstanceStatuses[0].InstanceStatus.Status') - - # if [[ "$system_status" == "ok" && "$instance_status" == "ok" ]]; then - # echo "Both SystemStatus and InstanceStatus are 'ok'" - # exit 0 - # fi - - # CURRENT_TIME=$(date +%s) - # if [[ "$CURRENT_TIME" -ge "$END_TIME" ]]; then - # echo "Timeout: Both SystemStatus and InstanceStatus have not reached 'ok' state within $TIMEOUT seconds." - # exit 1 - # fi - - # sleep 10 # Check status every 10 seconds - # done - - # check_simulator_version_updates: - # name: check_simulator_version_updates - # runs-on: ubuntu-latest - # needs: start_ec2_instance - # steps: - # - name: Check for Simulator Version Updates - # env: - # PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - # HOSTNAME: ${{ secrets.SSH_HOST }} - # USER_NAME: ${{ secrets.SSH_USERNAME }} - # GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} - # run: | - # echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - # ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - # cd /home/ubuntu/actions/ && - # rm -rf Scenic && - # git clone --branch $(basename "${{ github.ref }}") --single-branch https://$GH_ACCESS_TOKEN@github.com/BerkeleyLearnVerify/Scenic.git && - # cd Scenic && - # python3 -m venv venv && - # source venv/bin/activate && - # python3 -m pip install -e .[test-full] && - # python3 .github/check_latest_simulators.py - # ' - - # check_nvidia_smi: - # name: check_nvidia_smi - # runs-on: ubuntu-latest - # needs: start_ec2_instance - # continue-on-error: true - # steps: - # - name: Check NVIDIA SMI - # env: - # PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - # HOSTNAME: ${{ secrets.SSH_HOST}} - # USER_NAME: ${{ secrets.SSH_USERNAME}} - # run: | - # echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - # ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - # output=$(nvidia-smi) - # echo "$output" - # if [ -z "$output" ]; then - # echo "NVIDIA Driver is not set" - # exit 1 - # fi - # ' - # - name: NVIDIA Driver is not set - # if: ${{ failure() }} - # run: | - # echo "NVIDIA SMI is not working, please run the steps here on the instance:" - # echo "https://scenic-lang.atlassian.net/wiki/spaces/KAN/pages/2785287/Setting+Up+AWS+VM?parentProduct=JSW&initialAllowedFeatures=byline-contributors.byline-extensions.page-comments.delete.page-reactions.inline-comments.non-licensed-share&themeState=dark%253Adark%2520light%253Alight%2520spacing%253Aspacing%2520colorMode%253Alight&locale=en-US#Install-NVIDIA-Drivers" - - # run_carla_simulators: - # name: run_carla_simulators - # runs-on: ubuntu-latest - # needs: [check_simulator_version_updates, check_nvidia_smi] - # steps: - # - name: Run CARLA Tests - # env: - # PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - # HOSTNAME: ${{secrets.SSH_HOST}} - # USER_NAME: ${{secrets.SSH_USERNAME}} - # run: | - # echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - # ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - # cd /home/ubuntu/actions/Scenic && - # source venv/bin/activate && - # carla_versions=($(find /software -maxdepth 1 -type d -name 'carla*')) && - # for version in "${carla_versions[@]}"; do - # echo "============================= CARLA $version =============================" - # export CARLA_ROOT="$version" - # pytest tests/simulators/carla - # done - # ' - - # run_webots_simulators: - # name: run_webots_simulators - # runs-on: ubuntu-latest - # needs: [check_simulator_version_updates, check_nvidia_smi] - # steps: - # - name: Run Webots Tests - # env: - # PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - # HOSTNAME: ${{secrets.SSH_HOST}} - # USER_NAME: ${{secrets.SSH_USERNAME}} - # run: | - # echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - # ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - # Xvfb :99 -screen 0 1024x768x16 & - # cd /home/ubuntu/actions/Scenic && - # source venv/bin/activate && - # webots_versions=($(find /software -maxdepth 1 -type d -name 'webots*')) && - # export DISPLAY=:99 && - # for version in "${webots_versions[@]}"; do - # echo "============================= Webots $version =============================" - # export WEBOTS_ROOT="$version" - # pytest tests/simulators/webots - # done - # kill %1 - # ' + - name: Start EC2 Instance + run: | + # Get the instance state + instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') + + # If the machine is stopping wait for it to fully stop + while [ "$instance_state" == "stopping" ]; do + echo "Instance is stopping, waiting for it to fully stop..." + sleep 10 + instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') + done + + # Check if instance state is "stopped" + if [[ "$instance_state" == "stopped" ]]; then + echo "Instance is stopped, starting it..." + aws ec2 start-instances --instance-ids $INSTANCE_ID + elif [[ "$instance_state" == "pending" ]]; then + echo "Instance startup is pending, continuing..." + elif [[ "$instance_state" == "running" ]]; then + echo "Instance is already running..." + exit 0 + else + echo "Unknown instance state: $instance_state" + exit 1 + fi + + # wait for status checks to pass + TIMEOUT=300 # Timeout in seconds + START_TIME=$(date +%s) + END_TIME=$((START_TIME + TIMEOUT)) + while true; do + response=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID) + system_status=$(echo "$response" | jq -r '.InstanceStatuses[0].SystemStatus.Status') + instance_status=$(echo "$response" | jq -r '.InstanceStatuses[0].InstanceStatus.Status') + + if [[ "$system_status" == "ok" && "$instance_status" == "ok" ]]; then + echo "Both SystemStatus and InstanceStatus are 'ok'" + exit 0 + fi + + CURRENT_TIME=$(date +%s) + if [[ "$CURRENT_TIME" -ge "$END_TIME" ]]; then + echo "Timeout: Both SystemStatus and InstanceStatus have not reached 'ok' state within $TIMEOUT seconds." + exit 1 + fi + + sleep 10 # Check status every 10 seconds + done + + check_simulator_version_updates: + name: check_simulator_version_updates + runs-on: ubuntu-latest + needs: start_ec2_instance + steps: + - name: Check for Simulator Version Updates + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{ secrets.SSH_HOST }} + USER_NAME: ${{ secrets.SSH_USERNAME }} + GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' + cd /home/ubuntu/actions/ && + rm -rf Scenic && + git clone --branch $(basename "${{ github.ref }}") --single-branch https://$GH_ACCESS_TOKEN@github.com/BerkeleyLearnVerify/Scenic.git && + cd Scenic && + python3 -m venv venv && + source venv/bin/activate && + python3 -m pip install -e .[test-full] && + python3 .github/check_latest_simulators.py + ' + + check_nvidia_smi: + name: check_nvidia_smi + runs-on: ubuntu-latest + needs: start_ec2_instance + continue-on-error: true + steps: + - name: Check NVIDIA SMI + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{ secrets.SSH_HOST}} + USER_NAME: ${{ secrets.SSH_USERNAME}} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' + output=$(nvidia-smi) + echo "$output" + if [ -z "$output" ]; then + echo "NVIDIA Driver is not set" + exit 1 + fi + ' + - name: NVIDIA Driver is not set + if: ${{ failure() }} + run: | + echo "NVIDIA SMI is not working, please run the steps here on the instance:" + echo "https://scenic-lang.atlassian.net/wiki/spaces/KAN/pages/2785287/Setting+Up+AWS+VM?parentProduct=JSW&initialAllowedFeatures=byline-contributors.byline-extensions.page-comments.delete.page-reactions.inline-comments.non-licensed-share&themeState=dark%253Adark%2520light%253Alight%2520spacing%253Aspacing%2520colorMode%253Alight&locale=en-US#Install-NVIDIA-Drivers" + + run_carla_simulators: + name: run_carla_simulators + runs-on: ubuntu-latest + needs: [check_simulator_version_updates, check_nvidia_smi] + steps: + - name: Run CARLA Tests + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{secrets.SSH_HOST}} + USER_NAME: ${{secrets.SSH_USERNAME}} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' + cd /home/ubuntu/actions/Scenic && + source venv/bin/activate && + carla_versions=($(find /software -maxdepth 1 -type d -name 'carla*')) && + for version in "${carla_versions[@]}"; do + echo "============================= CARLA $version =============================" + export CARLA_ROOT="$version" + pytest tests/simulators/carla + done + ' + + run_webots_simulators: + name: run_webots_simulators + runs-on: ubuntu-latest + needs: [check_simulator_version_updates, check_nvidia_smi] + steps: + - name: Run Webots Tests + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{secrets.SSH_HOST}} + USER_NAME: ${{secrets.SSH_USERNAME}} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' + Xvfb :99 -screen 0 1024x768x16 & + cd /home/ubuntu/actions/Scenic && + source venv/bin/activate && + webots_versions=($(find /software -maxdepth 1 -type d -name 'webots*')) && + export DISPLAY=:99 && + for version in "${webots_versions[@]}"; do + echo "============================= Webots $version =============================" + export WEBOTS_ROOT="$version" + pytest tests/simulators/webots + done + kill %1 + ' stop_ec2_instance: name: stop_ec2_instance runs-on: ubuntu-latest - needs: [start_ec2_instance ] + needs: [start_ec2_instance, check_simulator_version_updates, check_nvidia_smi, run_carla_simulators, run_webots_simulators] if: always() env: VOLUME_ID: ${{ needs.start_ec2_instance.outputs.volume_id }} @@ -202,21 +202,21 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} steps: - # - name: Stop EC2 Instance - # run: | - # # Get the instance state and stop it if running - # instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - # if [[ "$instance_state" == "running" ]]; then - # echo "Instance is running, stopping it..." - # aws ec2 stop-instances --instance-ids $INSTANCE_ID - # aws ec2 wait instance-stopped --instance-ids $INSTANCE_ID - # echo "Instance has stopped." - # elif [[ "$instance_state" == "stopped" ]]; then - # echo "Instance is already stopped." - # else - # echo "Unexpected instance state: $instance_state" - # exit 1 - # fi + - name: Stop EC2 Instance + run: | + # Get the instance state and stop it if running + instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') + if [[ "$instance_state" == "running" ]]; then + echo "Instance is running, stopping it..." + aws ec2 stop-instances --instance-ids $INSTANCE_ID + aws ec2 wait instance-stopped --instance-ids $INSTANCE_ID + echo "Instance has stopped." + elif [[ "$instance_state" == "stopped" ]]; then + echo "Instance is already stopped." + else + echo "Unexpected instance state: $instance_state" + exit 1 + fi - name: Take Snapshot of Volume run: | From 556436a92eb77a318f0af7e67a4052f079221243 Mon Sep 17 00:00:00 2001 From: lola Date: Thu, 10 Oct 2024 07:28:37 -0700 Subject: [PATCH 13/38] changed volume type and size --- .github/workflows/run-simulators.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 062b1080e..91fb0a323 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -26,7 +26,7 @@ jobs: echo "Using latest snapshot with ID: $LATEST_SNAPSHOT_ID" # Create a new volume from the latest snapshot - volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone $(aws ec2 describe-instances --instance-ids $INSTANCE_ID --query "Reservations[0].Instances[0].Placement.AvailabilityZone" --output text) --volume-type gp2 --query "VolumeId" --output text) + volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type standard --size 400 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" # Set volume_id as output From ceabc75fd47cd9f9bbc3ddf78f91367b0586f5d5 Mon Sep 17 00:00:00 2001 From: lola Date: Thu, 10 Oct 2024 10:59:48 -0700 Subject: [PATCH 14/38] Add disk usage checks before and after simulator tests to evaluate volume size needs --- .github/workflows/run-simulators.yml | 54 ++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 91fb0a323..ddc66d01f 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -41,6 +41,16 @@ jobs: aws ec2 attach-volume --volume-id $volume_id --instance-id $INSTANCE_ID --device /dev/sda1 echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sda1" + - name: Check Disc Usage After Volume Attachment + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{ secrets.SSH_HOST }} + USER_NAME: ${{ secrets.SSH_USERNAME }} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' + + - name: Start EC2 Instance run: | # Get the instance state @@ -146,6 +156,15 @@ jobs: runs-on: ubuntu-latest needs: [check_simulator_version_updates, check_nvidia_smi] steps: + - name: Check Disk Usage Before CARLA Tests + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{ secrets.SSH_HOST }} + USER_NAME: ${{ secrets.SSH_USERNAME }} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' + - name: Run CARLA Tests env: PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} @@ -164,11 +183,29 @@ jobs: done ' + - name: Check Disk Usage After CARLA Tests + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{ secrets.SSH_HOST }} + USER_NAME: ${{ secrets.SSH_USERNAME }} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' + run_webots_simulators: name: run_webots_simulators runs-on: ubuntu-latest needs: [check_simulator_version_updates, check_nvidia_smi] steps: + - name: Check Disk Usage Before Webots Tests + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{ secrets.SSH_HOST }} + USER_NAME: ${{ secrets.SSH_USERNAME }} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' + - name: Run Webots Tests env: PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} @@ -189,6 +226,14 @@ jobs: done kill %1 ' + - name: Check Disk Usage After Webots Tests + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{ secrets.SSH_HOST }} + USER_NAME: ${{ secrets.SSH_USERNAME }} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' stop_ec2_instance: name: stop_ec2_instance @@ -202,6 +247,15 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} steps: + - name: Check Disk Before Stopping EC2 Instance + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{ secrets.SSH_HOST }} + USER_NAME: ${{ secrets.SSH_USERNAME }} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' + - name: Stop EC2 Instance run: | # Get the instance state and stop it if running From 63b071493061bd7301fa836825c602fd1e0b752b Mon Sep 17 00:00:00 2001 From: lola Date: Thu, 10 Oct 2024 11:10:24 -0700 Subject: [PATCH 15/38] Reorder disk usage check to run after EC2 instance start --- .github/workflows/run-simulators.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index ddc66d01f..e4bed1d89 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -41,16 +41,6 @@ jobs: aws ec2 attach-volume --volume-id $volume_id --instance-id $INSTANCE_ID --device /dev/sda1 echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sda1" - - name: Check Disc Usage After Volume Attachment - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST }} - USER_NAME: ${{ secrets.SSH_USERNAME }} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' - - - name: Start EC2 Instance run: | # Get the instance state @@ -100,6 +90,16 @@ jobs: sleep 10 # Check status every 10 seconds done + - name: Check Disc Usage After Volume Attachment + env: + PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + HOSTNAME: ${{ secrets.SSH_HOST }} + USER_NAME: ${{ secrets.SSH_USERNAME }} + run: | + echo "$PRIVATE_KEY" > private_key && chmod 600 private_key + ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' + + check_simulator_version_updates: name: check_simulator_version_updates runs-on: ubuntu-latest From 8025f26ad8559c492032763075b505160489b8a5 Mon Sep 17 00:00:00 2001 From: lola Date: Thu, 10 Oct 2024 11:47:28 -0700 Subject: [PATCH 16/38] checking workflow and volume usage --- .github/workflows/run-simulators.yml | 65 ---------------------------- 1 file changed, 65 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index e4bed1d89..af892e895 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -90,16 +90,6 @@ jobs: sleep 10 # Check status every 10 seconds done - - name: Check Disc Usage After Volume Attachment - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST }} - USER_NAME: ${{ secrets.SSH_USERNAME }} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' - - check_simulator_version_updates: name: check_simulator_version_updates runs-on: ubuntu-latest @@ -156,15 +146,6 @@ jobs: runs-on: ubuntu-latest needs: [check_simulator_version_updates, check_nvidia_smi] steps: - - name: Check Disk Usage Before CARLA Tests - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST }} - USER_NAME: ${{ secrets.SSH_USERNAME }} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' - - name: Run CARLA Tests env: PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} @@ -183,29 +164,11 @@ jobs: done ' - - name: Check Disk Usage After CARLA Tests - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST }} - USER_NAME: ${{ secrets.SSH_USERNAME }} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' - run_webots_simulators: name: run_webots_simulators runs-on: ubuntu-latest needs: [check_simulator_version_updates, check_nvidia_smi] steps: - - name: Check Disk Usage Before Webots Tests - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST }} - USER_NAME: ${{ secrets.SSH_USERNAME }} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' - - name: Run Webots Tests env: PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} @@ -226,14 +189,6 @@ jobs: done kill %1 ' - - name: Check Disk Usage After Webots Tests - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST }} - USER_NAME: ${{ secrets.SSH_USERNAME }} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' stop_ec2_instance: name: stop_ec2_instance @@ -247,15 +202,6 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} steps: - - name: Check Disk Before Stopping EC2 Instance - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST }} - USER_NAME: ${{ secrets.SSH_USERNAME }} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} 'df -h /dev/sda1' - - name: Stop EC2 Instance run: | # Get the instance state and stop it if running @@ -272,17 +218,6 @@ jobs: exit 1 fi - - name: Take Snapshot of Volume - run: | - # Create a snapshot of the volume - echo "Volume ID is: $VOLUME_ID" - snapshot_id=$(aws ec2 create-snapshot --volume-id $VOLUME_ID --description "Snapshot before deletion" --query "SnapshotId" --output text) - echo "Snapshot ID: $snapshot_id" - - # Wait for the snapshot to complete - aws ec2 wait snapshot-completed --snapshot-ids $snapshot_id - echo "Snapshot completed." - - name: Detach Volume run: | # Detach the volume From bbcef09263b99d52635386e42a0853fdfa837571 Mon Sep 17 00:00:00 2001 From: lola Date: Thu, 10 Oct 2024 12:17:06 -0700 Subject: [PATCH 17/38] Update workflow to create 100 GiB sc1 volume from snapshot --- .github/workflows/run-simulators.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index af892e895..d1634811f 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -26,7 +26,7 @@ jobs: echo "Using latest snapshot with ID: $LATEST_SNAPSHOT_ID" # Create a new volume from the latest snapshot - volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type standard --size 400 --query "VolumeId" --output text) + volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type sc1 --size 100 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" # Set volume_id as output From 574bed181d98a32c4c9d644ef074cb7b3bed5862 Mon Sep 17 00:00:00 2001 From: lola Date: Fri, 11 Oct 2024 10:40:38 -0700 Subject: [PATCH 18/38] Fix format workflow by ensuring Python environment and installing isort and black --- .github/workflows/check-formatting.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-formatting.yml b/.github/workflows/check-formatting.yml index 251ec326f..22e4b5bfa 100644 --- a/.github/workflows/check-formatting.yml +++ b/.github/workflows/check-formatting.yml @@ -12,8 +12,16 @@ jobs: - name: Checkout code uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install isort and black + run: pip install isort black + - name: Run black to check formatting - uses: psf/black@stable + run: black --check . - name: Run isort to check import order - uses: isort/isort-action@v1 + run: isort --check-only --diff . From 5ce23eb1341ca389f9f2c6e3a09d9f1644146d50 Mon Sep 17 00:00:00 2001 From: lola Date: Fri, 11 Oct 2024 11:16:33 -0700 Subject: [PATCH 19/38] Increase timeout and temporarily disable volume deletion to allow snapshot creation and resizing in AWS --- .github/workflows/run-simulators.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index d1634811f..6aacecce3 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -26,7 +26,7 @@ jobs: echo "Using latest snapshot with ID: $LATEST_SNAPSHOT_ID" # Create a new volume from the latest snapshot - volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type sc1 --size 100 --query "VolumeId" --output text) + volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type sc1 --size 150 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" # Set volume_id as output @@ -68,7 +68,7 @@ jobs: fi # wait for status checks to pass - TIMEOUT=300 # Timeout in seconds + TIMEOUT=600 # Timeout in seconds START_TIME=$(date +%s) END_TIME=$((START_TIME + TIMEOUT)) while true; do @@ -225,8 +225,8 @@ jobs: aws ec2 wait volume-available --volume-ids $VOLUME_ID echo "Volume $VOLUME_ID detached." - - name: Delete Volume - run: | - # Delete the volume after snapshot is complete - aws ec2 delete-volume --volume-id $VOLUME_ID - echo "Volume $VOLUME_ID deleted." + # - name: Delete Volume + # run: | + # # Delete the volume after snapshot is complete + # aws ec2 delete-volume --volume-id $VOLUME_ID + # echo "Volume $VOLUME_ID deleted." From 1a4eff9028414750d2098b6276291f5718eceb57 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 15 Oct 2024 09:51:24 -0700 Subject: [PATCH 20/38] check workflow with new volume size 100GB --- .github/workflows/run-simulators.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 6aacecce3..280713413 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -26,7 +26,7 @@ jobs: echo "Using latest snapshot with ID: $LATEST_SNAPSHOT_ID" # Create a new volume from the latest snapshot - volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type sc1 --size 150 --query "VolumeId" --output text) + volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type standard --size 100 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" # Set volume_id as output @@ -68,7 +68,7 @@ jobs: fi # wait for status checks to pass - TIMEOUT=600 # Timeout in seconds + TIMEOUT=300 # Timeout in seconds START_TIME=$(date +%s) END_TIME=$((START_TIME + TIMEOUT)) while true; do @@ -225,8 +225,8 @@ jobs: aws ec2 wait volume-available --volume-ids $VOLUME_ID echo "Volume $VOLUME_ID detached." - # - name: Delete Volume - # run: | - # # Delete the volume after snapshot is complete - # aws ec2 delete-volume --volume-id $VOLUME_ID - # echo "Volume $VOLUME_ID deleted." + - name: Delete Volume + run: | + # Delete the volume after snapshot is complete + aws ec2 delete-volume --volume-id $VOLUME_ID + echo "Volume $VOLUME_ID deleted." From 338835bfb4c0a08c2589b5475346ba9055c6b079 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 15 Oct 2024 10:42:53 -0700 Subject: [PATCH 21/38] Refactor workflow to streamline instance startup and monitoring: - Added snapshot readiness check for volume creation - Enhanced logging for detailed instance state tracking - Extended timeout for instance status checks to 10 minutes --- .github/workflows/run-simulators.yml | 35 +++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 280713413..e253e28d6 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -23,7 +23,19 @@ jobs: run: | # Retrieve the latest snapshot ID LATEST_SNAPSHOT_ID=$(aws ec2 describe-snapshots --owner-ids self --query 'Snapshots | sort_by(@, &StartTime) | [-1].SnapshotId' --output text) - echo "Using latest snapshot with ID: $LATEST_SNAPSHOT_ID" + echo "Checking availability for snapshot: $LATEST_SNAPSHOT_ID" + + # Wait until snapshot is in 'completed' status + while true; do + snapshot_status=$(aws ec2 describe-snapshots --snapshot-ids $LATEST_SNAPSHOT_ID --query 'Snapshots[0].State' --output text) + if [ "$snapshot_status" == "completed" ]; then + echo "Snapshot is ready." + break + else + echo "Snapshot still in $snapshot_status state, waiting..." + sleep 10 + fi + done # Create a new volume from the latest snapshot volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type standard --size 100 --query "VolumeId" --output text) @@ -67,8 +79,25 @@ jobs: exit 1 fi - # wait for status checks to pass - TIMEOUT=300 # Timeout in seconds + - name: Get and Log Instance State + run: | + # Capture detailed instance status + instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') + instance_status=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID | jq -r '.InstanceStatuses[0].InstanceStatus.Status') + system_status=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID | jq -r '.InstanceStatuses[0].SystemStatus.Status') + echo "Instance State: $instance_state" + echo "Instance Status: $instance_status" + echo "System Status: $system_status" + + # Check for any errors in status + if [[ "$instance_status" != "ok" || "$system_status" != "ok" ]]; then + echo "Instance failed to initialize correctly. Exiting job with failure." + exit 1 + fi + + - name: Wait for Status Checks to Pass + run: | + TIMEOUT=600 # Timeout in seconds START_TIME=$(date +%s) END_TIME=$((START_TIME + TIMEOUT)) while true; do From 7a43c1ae3aa94fde8e28b43a85b0319c45c8c166 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 15 Oct 2024 10:58:25 -0700 Subject: [PATCH 22/38] Refactor instance status check code for initialization --- .github/workflows/run-simulators.yml | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index e253e28d6..9f03bdc75 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -79,24 +79,7 @@ jobs: exit 1 fi - - name: Get and Log Instance State - run: | - # Capture detailed instance status - instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - instance_status=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID | jq -r '.InstanceStatuses[0].InstanceStatus.Status') - system_status=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID | jq -r '.InstanceStatuses[0].SystemStatus.Status') - echo "Instance State: $instance_state" - echo "Instance Status: $instance_status" - echo "System Status: $system_status" - - # Check for any errors in status - if [[ "$instance_status" != "ok" || "$system_status" != "ok" ]]; then - echo "Instance failed to initialize correctly. Exiting job with failure." - exit 1 - fi - - - name: Wait for Status Checks to Pass - run: | + # wait for status checks to pass TIMEOUT=600 # Timeout in seconds START_TIME=$(date +%s) END_TIME=$((START_TIME + TIMEOUT)) From 843f67e0dd0a96b065f41f367d4c916213937bd2 Mon Sep 17 00:00:00 2001 From: lola Date: Thu, 17 Oct 2024 15:34:54 -0700 Subject: [PATCH 23/38] Revert to original 400 GiB standard volume creation in workflow --- .github/workflows/run-simulators.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 9f03bdc75..dd9596874 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -38,7 +38,7 @@ jobs: done # Create a new volume from the latest snapshot - volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type standard --size 100 --query "VolumeId" --output text) + volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type standard --size 400 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" # Set volume_id as output From fd06c06f91db2eca7a7f43f02d9ab35a1bf92149 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 22 Oct 2024 16:34:03 -0700 Subject: [PATCH 24/38] Increase CARLA connection timeout and improve error handling - Extended CARLA server connection attempt loop from 30 to 120 seconds. - Added pytest failure if unable to connect to CARLA within 2 minutes. - Increased CarlaSimulator communication timeout from 10 to 60 seconds. --- tests/simulators/carla/test_actions.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index f0aede475..cf35cc8a8 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -43,10 +43,12 @@ def getCarlaSimulator(getAssetPath): f"bash {CARLA_ROOT}/CarlaUE4.sh -RenderOffScreen", shell=True ) - for _ in range(30): + for _ in range(120): if isCarlaServerRunning(): break time.sleep(1) + else: + pytest.fail("Unable to connect to CARLA.") # Extra 5 seconds to ensure server startup time.sleep(5) @@ -55,7 +57,7 @@ def getCarlaSimulator(getAssetPath): def _getCarlaSimulator(town): path = os.path.join(base, f"{town}.xodr") - simulator = CarlaSimulator(map_path=path, carla_map=town) + simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=60) return simulator, town, path yield _getCarlaSimulator @@ -76,7 +78,7 @@ def test_throttle(getCarlaSimulator): behavior DriveWithThrottle(): while True: take SetThrottleAction(1) - + ego = new Car at (369, -326), with behavior DriveWithThrottle record ego.speed as CarSpeed terminate after 5 steps @@ -109,8 +111,8 @@ def test_brake(getCarlaSimulator): do DriveWithThrottle() for 2 steps do Brake() for 6 steps - ego = new Car at (369, -326), - with blueprint 'vehicle.toyota.prius', + ego = new Car at (369, -326), + with blueprint 'vehicle.toyota.prius', with behavior DriveThenBrake record final ego.speed as CarSpeed terminate after 8 steps From 18b77943946a7e8ed2b12d26e6a1613718320417 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 22 Oct 2024 16:39:48 -0700 Subject: [PATCH 25/38] Change volume type to gp3 --- .github/workflows/run-simulators.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index dd9596874..1a8369965 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -38,7 +38,7 @@ jobs: done # Create a new volume from the latest snapshot - volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type standard --size 400 --query "VolumeId" --output text) + volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type gp3 --size 400 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" # Set volume_id as output From 2da31a094bc3a62d61e016b780175c8d92a5580d Mon Sep 17 00:00:00 2001 From: lola Date: Wed, 23 Oct 2024 15:32:01 -0700 Subject: [PATCH 26/38] Revert changes: restored file to original state --- .github/workflows/check-formatting.yml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/check-formatting.yml b/.github/workflows/check-formatting.yml index 22e4b5bfa..251ec326f 100644 --- a/.github/workflows/check-formatting.yml +++ b/.github/workflows/check-formatting.yml @@ -12,16 +12,8 @@ jobs: - name: Checkout code uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.x' - - - name: Install isort and black - run: pip install isort black - - name: Run black to check formatting - run: black --check . + uses: psf/black@stable - name: Run isort to check import order - run: isort --check-only --diff . + uses: isort/isort-action@v1 From 94f4b35bf951c135c6aac69f9eedf68075b7089b Mon Sep 17 00:00:00 2001 From: lola Date: Thu, 24 Oct 2024 09:54:31 -0700 Subject: [PATCH 27/38] Increase CARLA startup wait time and log connection duration --- tests/simulators/carla/test_actions.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index cf35cc8a8..274f400da 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -43,13 +43,23 @@ def getCarlaSimulator(getAssetPath): f"bash {CARLA_ROOT}/CarlaUE4.sh -RenderOffScreen", shell=True ) - for _ in range(120): + # Start timing the connection process + start_time = time.time() + + for _ in range(300): if isCarlaServerRunning(): break time.sleep(1) else: pytest.fail("Unable to connect to CARLA.") + # End timing and calculate elapsed time + end_time = time.time() + elapsed_time = end_time - start_time + + # Print the time it took to connect + print(f"Connected to CARLA after {elapsed_time:.2f} seconds.") + # Extra 5 seconds to ensure server startup time.sleep(5) From 66d6b032421f73b47ca6688adeb869c462545f29 Mon Sep 17 00:00:00 2001 From: lola Date: Thu, 24 Oct 2024 13:29:23 -0700 Subject: [PATCH 28/38] revert back to previous timeout times --- tests/simulators/carla/test_actions.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index 274f400da..cf35cc8a8 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -43,23 +43,13 @@ def getCarlaSimulator(getAssetPath): f"bash {CARLA_ROOT}/CarlaUE4.sh -RenderOffScreen", shell=True ) - # Start timing the connection process - start_time = time.time() - - for _ in range(300): + for _ in range(120): if isCarlaServerRunning(): break time.sleep(1) else: pytest.fail("Unable to connect to CARLA.") - # End timing and calculate elapsed time - end_time = time.time() - elapsed_time = end_time - start_time - - # Print the time it took to connect - print(f"Connected to CARLA after {elapsed_time:.2f} seconds.") - # Extra 5 seconds to ensure server startup time.sleep(5) From fac58115cee2e3bb19613a9f7ce61559422863f5 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 28 Oct 2024 11:44:01 -0700 Subject: [PATCH 29/38] Increase CARLA startup time to 10 mins, log startup duration, and set volume throughput to 250 --- .github/workflows/run-simulators.yml | 2 +- tests/simulators/carla/test_actions.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 1a8369965..350610373 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -38,7 +38,7 @@ jobs: done # Create a new volume from the latest snapshot - volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type gp3 --size 400 --query "VolumeId" --output text) + volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type gp3 --size 400 --throughput 250 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" # Set volume_id as output diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index cf35cc8a8..cef14c406 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -37,19 +37,25 @@ def isCarlaServerRunning(host="localhost", port=2000): @pytest.fixture(scope="package") def getCarlaSimulator(getAssetPath): carla_process = None + start_time = time.time() if not isCarlaServerRunning(): CARLA_ROOT = checkCarlaPath() carla_process = subprocess.Popen( f"bash {CARLA_ROOT}/CarlaUE4.sh -RenderOffScreen", shell=True ) - for _ in range(120): + for _ in range(600): if isCarlaServerRunning(): break time.sleep(1) else: pytest.fail("Unable to connect to CARLA.") + # Log the time it took for CARLA to start + end_time = time.time() + elapsed_time = end_time - start_time + print(f"CARLA started successfully in {elapsed_time:.2f} seconds.") + # Extra 5 seconds to ensure server startup time.sleep(5) From f1fb28114be94a834dd27f6a8b28feb04e365201 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 28 Oct 2024 12:34:38 -0700 Subject: [PATCH 30/38] Adjust CARLA connection settings: decrease wait loop, increase timeout, add logging --- tests/simulators/carla/test_actions.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index cef14c406..a81c21b56 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -44,7 +44,7 @@ def getCarlaSimulator(getAssetPath): f"bash {CARLA_ROOT}/CarlaUE4.sh -RenderOffScreen", shell=True ) - for _ in range(600): + for _ in range(180): if isCarlaServerRunning(): break time.sleep(1) @@ -62,8 +62,12 @@ def getCarlaSimulator(getAssetPath): base = getAssetPath("maps/CARLA") def _getCarlaSimulator(town): + start_connect_time = time.time() path = os.path.join(base, f"{town}.xodr") - simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=60) + simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=180) + end_connect_time = time.time() + connect_elapsed_time = end_connect_time - start_connect_time + print(f"CARLA connection established in {connect_elapsed_time:.2f} seconds.") return simulator, town, path yield _getCarlaSimulator From 3156704ce6dd1445af0311f917439eee3c0f7a0b Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 28 Oct 2024 13:08:25 -0700 Subject: [PATCH 31/38] Increase CARLA startup loop to 360 iterations and keep timeout at 180s to address connection stability --- tests/simulators/carla/test_actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index a81c21b56..422a464a0 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -44,7 +44,7 @@ def getCarlaSimulator(getAssetPath): f"bash {CARLA_ROOT}/CarlaUE4.sh -RenderOffScreen", shell=True ) - for _ in range(180): + for _ in range(360): if isCarlaServerRunning(): break time.sleep(1) From 6c7748bd885cabacefbfe9bf1bd0eff1bc555518 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 28 Oct 2024 13:36:38 -0700 Subject: [PATCH 32/38] Revert to original connection settings (600 loops, 60s timeout) to investigate instance stability and identify potential issues --- tests/simulators/carla/test_actions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index 422a464a0..5affdcda6 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -44,7 +44,7 @@ def getCarlaSimulator(getAssetPath): f"bash {CARLA_ROOT}/CarlaUE4.sh -RenderOffScreen", shell=True ) - for _ in range(360): + for _ in range(600): if isCarlaServerRunning(): break time.sleep(1) @@ -64,7 +64,7 @@ def getCarlaSimulator(getAssetPath): def _getCarlaSimulator(town): start_connect_time = time.time() path = os.path.join(base, f"{town}.xodr") - simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=180) + simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=60) end_connect_time = time.time() connect_elapsed_time = end_connect_time - start_connect_time print(f"CARLA connection established in {connect_elapsed_time:.2f} seconds.") From ebf69fab725ae3c7fb5b39a3934a94a800c38ee2 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 28 Oct 2024 15:07:07 -0700 Subject: [PATCH 33/38] Increased CARLA map load timeout to 120s, adjusted startup sleep time to 10s. No instance interruption this time. --- tests/simulators/carla/test_actions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index 5affdcda6..17bbcfb51 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -57,14 +57,14 @@ def getCarlaSimulator(getAssetPath): print(f"CARLA started successfully in {elapsed_time:.2f} seconds.") # Extra 5 seconds to ensure server startup - time.sleep(5) + time.sleep(10) base = getAssetPath("maps/CARLA") def _getCarlaSimulator(town): start_connect_time = time.time() path = os.path.join(base, f"{town}.xodr") - simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=60) + simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=120) end_connect_time = time.time() connect_elapsed_time = end_connect_time - start_connect_time print(f"CARLA connection established in {connect_elapsed_time:.2f} seconds.") From a87a4f0a540ed8b197331a93a79218ce21c7ddcb Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 28 Oct 2024 16:21:06 -0700 Subject: [PATCH 34/38] Increased CARLA timeout to 180 seconds and kept 10-second sleep to ensure startup stability. --- tests/simulators/carla/test_actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index 17bbcfb51..dc287f6cf 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -64,7 +64,7 @@ def getCarlaSimulator(getAssetPath): def _getCarlaSimulator(town): start_connect_time = time.time() path = os.path.join(base, f"{town}.xodr") - simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=120) + simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=180) end_connect_time = time.time() connect_elapsed_time = end_connect_time - start_connect_time print(f"CARLA connection established in {connect_elapsed_time:.2f} seconds.") From 38c4b446959898cf73110009935b06ca11735db4 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 28 Oct 2024 17:19:28 -0700 Subject: [PATCH 35/38] Lowered Carla timeout to 180s --- tests/simulators/carla/test_actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index dc287f6cf..ba58fa1a2 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -44,7 +44,7 @@ def getCarlaSimulator(getAssetPath): f"bash {CARLA_ROOT}/CarlaUE4.sh -RenderOffScreen", shell=True ) - for _ in range(600): + for _ in range(180): if isCarlaServerRunning(): break time.sleep(1) From 87c88a1e0c22c53644b99f4f4fdf838f3a4baff0 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 29 Oct 2024 09:00:59 -0700 Subject: [PATCH 36/38] Add SSH keep-alive options to CARLA tests to prevent broken pipe errors --- .github/workflows/run-simulators.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 350610373..258312f17 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -165,7 +165,7 @@ jobs: USER_NAME: ${{secrets.SSH_USERNAME}} run: | echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' + ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 -o ServerAliveCountMax=3 -i private_key ${USER_NAME}@${HOSTNAME} ' cd /home/ubuntu/actions/Scenic && source venv/bin/activate && carla_versions=($(find /software -maxdepth 1 -type d -name 'carla*')) && From e364c5d317a05b4d4a8611b159ffbbf53b903278 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 29 Oct 2024 10:51:01 -0700 Subject: [PATCH 37/38] Removed logging of CARLA connection times --- tests/simulators/carla/test_actions.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/simulators/carla/test_actions.py b/tests/simulators/carla/test_actions.py index ba58fa1a2..7914ad04a 100644 --- a/tests/simulators/carla/test_actions.py +++ b/tests/simulators/carla/test_actions.py @@ -37,7 +37,6 @@ def isCarlaServerRunning(host="localhost", port=2000): @pytest.fixture(scope="package") def getCarlaSimulator(getAssetPath): carla_process = None - start_time = time.time() if not isCarlaServerRunning(): CARLA_ROOT = checkCarlaPath() carla_process = subprocess.Popen( @@ -51,23 +50,14 @@ def getCarlaSimulator(getAssetPath): else: pytest.fail("Unable to connect to CARLA.") - # Log the time it took for CARLA to start - end_time = time.time() - elapsed_time = end_time - start_time - print(f"CARLA started successfully in {elapsed_time:.2f} seconds.") - # Extra 5 seconds to ensure server startup time.sleep(10) base = getAssetPath("maps/CARLA") def _getCarlaSimulator(town): - start_connect_time = time.time() path = os.path.join(base, f"{town}.xodr") simulator = CarlaSimulator(map_path=path, carla_map=town, timeout=180) - end_connect_time = time.time() - connect_elapsed_time = end_connect_time - start_connect_time - print(f"CARLA connection established in {connect_elapsed_time:.2f} seconds.") return simulator, town, path yield _getCarlaSimulator From 3250886828897fcc28bec45e709d553d2384c209 Mon Sep 17 00:00:00 2001 From: lola Date: Thu, 31 Oct 2024 10:27:41 -0700 Subject: [PATCH 38/38] Simplify snapshot and instance status checks using AWS wait commands --- .github/workflows/run-simulators.yml | 41 ++++++---------------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 258312f17..3b04f79df 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -25,17 +25,9 @@ jobs: LATEST_SNAPSHOT_ID=$(aws ec2 describe-snapshots --owner-ids self --query 'Snapshots | sort_by(@, &StartTime) | [-1].SnapshotId' --output text) echo "Checking availability for snapshot: $LATEST_SNAPSHOT_ID" - # Wait until snapshot is in 'completed' status - while true; do - snapshot_status=$(aws ec2 describe-snapshots --snapshot-ids $LATEST_SNAPSHOT_ID --query 'Snapshots[0].State' --output text) - if [ "$snapshot_status" == "completed" ]; then - echo "Snapshot is ready." - break - else - echo "Snapshot still in $snapshot_status state, waiting..." - sleep 10 - fi - done + # Wait for the snapshot to complete + aws ec2 wait snapshot-completed --snapshot-ids $LATEST_SNAPSHOT_ID + echo "Snapshot is ready." # Create a new volume from the latest snapshot volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type gp3 --size 400 --throughput 250 --query "VolumeId" --output text) @@ -79,28 +71,11 @@ jobs: exit 1 fi - # wait for status checks to pass - TIMEOUT=600 # Timeout in seconds - START_TIME=$(date +%s) - END_TIME=$((START_TIME + TIMEOUT)) - while true; do - response=$(aws ec2 describe-instance-status --instance-ids $INSTANCE_ID) - system_status=$(echo "$response" | jq -r '.InstanceStatuses[0].SystemStatus.Status') - instance_status=$(echo "$response" | jq -r '.InstanceStatuses[0].InstanceStatus.Status') - - if [[ "$system_status" == "ok" && "$instance_status" == "ok" ]]; then - echo "Both SystemStatus and InstanceStatus are 'ok'" - exit 0 - fi - - CURRENT_TIME=$(date +%s) - if [[ "$CURRENT_TIME" -ge "$END_TIME" ]]; then - echo "Timeout: Both SystemStatus and InstanceStatus have not reached 'ok' state within $TIMEOUT seconds." - exit 1 - fi - - sleep 10 # Check status every 10 seconds - done + # Wait for instance status checks to pass + echo "Waiting for instance status checks to pass..." + aws ec2 wait instance-status-ok --instance-ids $INSTANCE_ID + echo "Instance is now ready for use." + check_simulator_version_updates: name: check_simulator_version_updates