Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fail loudly and terminate if version upgrade fails #1986

Open
wants to merge 2 commits into
base: dev/v0.7.0
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/fedml/computing/scheduler/comm_utils/job_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,8 @@ def occupy_gpu_ids(self, run_id, request_gpu_num, device_id, inner_id=None,
return cuda_visible_gpu_ids_str

except Exception as e:
logging.error(f"Error {e} Exception {traceback.format_exc()}")
return None
raise Exception(f"Error occurred while occupying gpu ids: {e} \n"
f"Exception {traceback.format_exc()}")

@staticmethod
def search_and_refresh_available_gpu_ids(available_gpu_ids):
Expand Down
12 changes: 4 additions & 8 deletions python/fedml/computing/scheduler/comm_utils/sys_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,7 +787,7 @@ def daemon_ota_upgrade(in_args):
fedml_is_latest_version, local_ver, remote_ver = check_fedml_is_latest_version(in_args.version)
should_upgrade = False if fedml_is_latest_version else True
except Exception as e:
return
raise Exception("Failed to check the latest version with error {}.".format(str(e)))

if not should_upgrade:
return
Expand All @@ -803,7 +803,7 @@ def daemon_ota_upgrade_with_version(in_version="release"):
fedml_is_latest_version, local_ver, remote_ver = check_fedml_is_latest_version(in_version)
should_upgrade = False if fedml_is_latest_version else True
except Exception as e:
return
raise Exception("Failed to check the latest version with error {}.".format(str(e)))

if not should_upgrade:
return
Expand All @@ -829,8 +829,6 @@ def run_cmd(command, show_local_console=False):
print(out_str)

log_return_info(command, 0)

is_cmd_run_ok = True
else:
if err is not None:
try:
Expand All @@ -844,10 +842,8 @@ def run_cmd(command, show_local_console=False):
print(err_str)

log_return_info(command, ret_code)

is_cmd_run_ok = False

return is_cmd_run_ok
raise Exception("Run command '{}' failed with return code {}.".format(command, ret_code))
return True


def get_local_fedml_version(fedml_init_file):
Expand Down
Loading