From 27330ac85c4353d9124b8788c727e1ce40f55ea8 Mon Sep 17 00:00:00 2001
From: Wangpan
Date: Sun, 17 Feb 2013 09:57:23 +0800
Subject: [PATCH] Fix instance cannot be deleted after soft reboot

The root cause:
1. a soft reboot waits for the instance to reach the 'shutdown' state,
   then starts it again
2. a delete operation waits for the same state transition, then cleans
   up the instance
3. if the soft reboot sees the instance reach 'shutdown' first, it
   starts the instance again immediately
4. the delete operation then enters the _wait_for_destroy loop, which
   may never terminate
5. deleting the instance again has no effect, because the new request
   blocks on the lock still held by the first delete operation.

So the domain id is now checked during the _wait_for_destroy loop; if it
has changed and the instance is still running, we destroy the instance
again. If the domain is booted again after _wait_for_destroy returns,
unfilter_instance may fail because the nwfilter is still in use, so the
same check and re-destroy is applied there as well.

Fixes Bug #1111213

Change-Id: I98dc902dd86fa828f5821465c611953e08f9f637
---
 nova/tests/test_libvirt.py  | 13 ++++---
 nova/virt/libvirt/driver.py | 71 +++++++++++++++++++++++++++++--------
 2 files changed, 66 insertions(+), 18 deletions(-)

diff --git a/nova/tests/test_libvirt.py b/nova/tests/test_libvirt.py
index d2cd5a757ff..fd90e5fa97b 100644
--- a/nova/tests/test_libvirt.py
+++ b/nova/tests/test_libvirt.py
@@ -2755,6 +2755,7 @@ def fake_unfilter_instance(instance, network_info):
 
     def test_destroy_undefines(self):
         mock = self.mox.CreateMock(libvirt.virDomain)
+        mock.ID()
         mock.destroy()
         mock.undefineFlags(1).AndReturn(1)
 
@@ -2764,7 +2765,7 @@ def fake_lookup_by_name(instance_name):
             return mock
 
         def fake_get_info(instance_name):
-            return {'state': power_state.SHUTDOWN}
+            return {'state': power_state.SHUTDOWN, 'id': -1}
 
         conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         self.stubs.Set(conn, '_lookup_by_name', fake_lookup_by_name)
@@ -2775,6 +2776,7 @@ def fake_get_info(instance_name):
 
     def test_destroy_undefines_no_undefine_flags(self):
         mock = self.mox.CreateMock(libvirt.virDomain)
+        mock.ID()
         mock.destroy()
         mock.undefineFlags(1).AndRaise(libvirt.libvirtError('Err'))
         mock.undefine()
@@ -2785,7 +2787,7 @@ def fake_lookup_by_name(instance_name):
             return mock
 
         def fake_get_info(instance_name):
-            return {'state': power_state.SHUTDOWN}
+            return {'state': power_state.SHUTDOWN, 'id': -1}
 
         conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         self.stubs.Set(conn, '_lookup_by_name', fake_lookup_by_name)
@@ -2796,6 +2798,7 @@ def fake_get_info(instance_name):
 
     def test_destroy_undefines_no_attribute_with_managed_save(self):
         mock = self.mox.CreateMock(libvirt.virDomain)
+        mock.ID()
         mock.destroy()
         mock.undefineFlags(1).AndRaise(AttributeError())
         mock.hasManagedSaveImage(0).AndReturn(True)
@@ -2808,7 +2811,7 @@ def fake_lookup_by_name(instance_name):
             return mock
 
         def fake_get_info(instance_name):
-            return {'state': power_state.SHUTDOWN}
+            return {'state': power_state.SHUTDOWN, 'id': -1}
 
         conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         self.stubs.Set(conn, '_lookup_by_name', fake_lookup_by_name)
@@ -2819,6 +2822,7 @@ def fake_get_info(instance_name):
 
     def test_destroy_undefines_no_attribute_no_managed_save(self):
         mock = self.mox.CreateMock(libvirt.virDomain)
+        mock.ID()
         mock.destroy()
         mock.undefineFlags(1).AndRaise(AttributeError())
         mock.hasManagedSaveImage(0).AndRaise(AttributeError())
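The test hunks above all follow one shape: each mox domain mock now expects a
single ID() call (made by _destroy before destroy()), and each fake_get_info
stub grows an 'id' key to match the dict that get_info returns after this
patch. Because the stubs report SHUTDOWN, _wait_for_destroy exits before it
ever compares domain ids, so -1 is only a placeholder. A minimal sketch of
the stub contract (illustrative only, not part of the patch):

    from nova.compute import power_state

    def fake_get_info(instance_name):
        # After this patch get_info() must expose the domain id as well;
        # -1 stands in for "no running domain" in these tests.
        return {'state': power_state.SHUTDOWN, 'id': -1}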
@@ -2830,7 +2834,7 @@ def fake_lookup_by_name(instance_name):
             return mock
 
         def fake_get_info(instance_name):
-            return {'state': power_state.SHUTDOWN}
+            return {'state': power_state.SHUTDOWN, 'id': -1}
 
         conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         self.stubs.Set(conn, '_lookup_by_name', fake_lookup_by_name)
@@ -2841,6 +2845,7 @@ def fake_get_info(instance_name):
 
     def test_private_destroy_not_found(self):
         mock = self.mox.CreateMock(libvirt.virDomain)
+        mock.ID()
         mock.destroy()
         self.mox.ReplayAll()
 
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 386fe836cb5..7282c9e6132 100755
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -493,8 +493,10 @@ def _destroy(self, instance):
 
         # If the instance is already terminated, we're still happy
         # Otherwise, destroy it
+        old_domid = -1
         if virt_dom is not None:
             try:
+                old_domid = virt_dom.ID()
                 virt_dom.destroy()
             except libvirt.libvirtError as e:
                 is_okay = False
@@ -514,14 +516,16 @@ def _destroy(self, instance):
                               locals(), instance=instance)
                     raise
 
-        def _wait_for_destroy():
+        def _wait_for_destroy(expected_domid):
             """Called at an interval until the VM is gone."""
             # NOTE(vish): If the instance disappears during the destroy
             #             we ignore it so the cleanup can still be
             #             attempted because we would prefer destroy to
             #             never fail.
             try:
-                state = self.get_info(instance)['state']
+                dom_info = self.get_info(instance)
+                state = dom_info['state']
+                new_domid = dom_info['id']
             except exception.NotFound:
                 LOG.error(_("During wait destroy, instance disappeared."),
                           instance=instance)
@@ -532,8 +536,23 @@ def _wait_for_destroy():
                          instance=instance)
                 raise utils.LoopingCallDone()
 
-        timer = utils.FixedIntervalLoopingCall(_wait_for_destroy)
+            # NOTE(wangpan): If the instance was booted again after destroy,
+            #                this may be an endless loop, so check the id of
+            #                the domain here; if it changed and the instance
+            #                is still running, we should destroy it again.
+            # see https://bugs.launchpad.net/nova/+bug/1111213 for more details
+            if new_domid != expected_domid:
+                LOG.info(_("Instance may be started again."),
+                         instance=instance)
+                kwargs['is_running'] = True
+                raise utils.LoopingCallDone()
+
+        kwargs = {'is_running': False}
+        timer = utils.FixedIntervalLoopingCall(_wait_for_destroy, old_domid)
         timer.start(interval=0.5).wait()
+        if kwargs['is_running']:
+            LOG.info(_("Going to destroy instance again."), instance=instance)
+            self._destroy(instance)
 
     def destroy(self, instance, network_info, block_device_info=None,
                 destroy_disks=True):
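The _destroy hunks above rely on a small closure trick: a
FixedIntervalLoopingCall callback cannot return a value to its caller, so
_wait_for_destroy records the "started again" outcome by mutating the
enclosing kwargs dict before raising LoopingCallDone. A standalone sketch of
the same pattern under stand-in names (poll, get_domain_id, and flags are
illustrative, not Nova APIs):

    import time

    class LoopingCallDone(Exception):
        """Raised by the callback to stop the polling loop."""

    def poll(callback, interval=0.5):
        # Crude stand-in for utils.FixedIntervalLoopingCall(...).start().wait()
        while True:
            try:
                callback()
            except LoopingCallDone:
                return
            time.sleep(interval)

    def destroy_and_wait(get_domain_id, expected_domid):
        flags = {'is_running': False}

        def _wait_for_destroy():
            new_domid = get_domain_id()
            if new_domid is None:
                # Domain is gone: the destroy finished normally.
                raise LoopingCallDone()
            if new_domid != expected_domid:
                # Domain was booted again meanwhile (new libvirt id):
                # report it through the enclosing dict and stop polling.
                flags['is_running'] = True
                raise LoopingCallDone()

        poll(_wait_for_destroy)
        return flags['is_running']  # caller destroys again when True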
" + "Code=%(errcode)s Error=%(e)s") % + locals(), instance=instance) + reason = "Error unfiltering instance." + raise exception.InstanceTerminationFailure(reason=reason) + except Exception: + retry = False + raise + else: + retry = False + + # FIXME(wangpan): if the instance is booted again here, such as the + # the soft reboot operation boot it here, it will + # become "running deleted", should we check and destroy + # it at the end of this method? # NOTE(vish): we disconnect from volumes regardless block_device_mapping = driver.block_device_info_get_mapping( @@ -2017,7 +2059,8 @@ def get_info(self, instance): 'max_mem': max_mem, 'mem': mem, 'num_cpu': num_cpu, - 'cpu_time': cpu_time} + 'cpu_time': cpu_time, + 'id': virt_dom.ID()} def _create_domain(self, xml=None, domain=None, instance=None, launch_flags=0):