From f071465e2bb8c9d1046b3562219e0fbe04d8b014 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 25 Feb 2020 23:52:37 -0500 Subject: [PATCH] mgr/volumes: unregister job upon async threads exception If the async threads hit a temporary exception the job is never unregistered and therefore gets skipped by the async threads on subsequent scans. Patrick hit this in nautilus when one of the purge threads hit an exception when trying to log a message. The trash entry was never picked up again by the purge threads. Fixes: http://tracker.ceph.com/issues/44315 Signed-off-by: Venky Shankar (cherry picked from commit 46476ef2e290bd15af2fec2410cb4f3f86b27cd2) --- src/pybind/mgr/volumes/fs/async_job.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/volumes/fs/async_job.py b/src/pybind/mgr/volumes/fs/async_job.py index b47b45ca87bf5..7eb8ca2c37fe3 100644 --- a/src/pybind/mgr/volumes/fs/async_job.py +++ b/src/pybind/mgr/volumes/fs/async_job.py @@ -28,6 +28,7 @@ def run(self): thread_name = thread_id.getName() while retries < JobThread.MAX_RETRIES_ON_EXCEPTION: + vol_job = None try: # fetch next job to execute with self.async_job.lock: @@ -40,10 +41,6 @@ def run(self): # execute the job (outside lock) self.async_job.execute_job(vol_job[0], vol_job[1], should_cancel=lambda: thread_id.should_cancel()) - - # when done, unregister the job - with self.async_job.lock: - self.async_job.unregister_async_job(vol_job[0], vol_job[1], thread_id) retries = 0 except NotImplementedException: raise @@ -56,7 +53,12 @@ def run(self): exc_type, exc_value, exc_traceback = sys.exc_info() log.warning("traceback: {0}".format("".join( traceback.format_exception(exc_type, exc_value, exc_traceback)))) - time.sleep(1) + finally: + # when done, unregister the job + if vol_job: + with self.async_job.lock: + self.async_job.unregister_async_job(vol_job[0], vol_job[1], thread_id) + time.sleep(1) log.error("thread [{0}] reached exception limit, bailing out...".format(thread_name)) self.vc.cluster_log("thread {0} bailing out due to exception".format(thread_name))