From 93bb9a8d8dd6d74b41bc5dc745d423ff6fd38be8 Mon Sep 17 00:00:00 2001 From: cassieesvelt <73311224+cassieesvelt@users.noreply.github.com> Date: Mon, 13 May 2024 13:28:16 -0700 Subject: [PATCH] fix when the job fails and doesn't kick the node (#3186) * fix when the job fails and doesn't kick the node * reformat --- .../largescale-deep-learning/Debugging/Compute/run_nhc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/best-practices/largescale-deep-learning/Debugging/Compute/run_nhc.py b/best-practices/largescale-deep-learning/Debugging/Compute/run_nhc.py index 3b3bf15850..250b2660a6 100644 --- a/best-practices/largescale-deep-learning/Debugging/Compute/run_nhc.py +++ b/best-practices/largescale-deep-learning/Debugging/Compute/run_nhc.py @@ -100,6 +100,7 @@ def parse_output(output_file): output, error = process.communicate() print(output) print(error) + if full_errors: raise Exception( "Failures were found while running the node health checks. Please see the std_log_process.txt files under the 'outputs and logs' tab of the job for more information." + full_errors