Skip to content

Commit

Permalink
Fix instance can not be deleted after soft reboot
Browse files Browse the repository at this point in the history
The reason is that:
1. soft reboot will wait for instance become to 'shutdown', and then start it
2. delete operation also wait for this, and then clean up the instance
3. if soft reboot found the instance become to 'shutdown' firstly, it will
start it immediately
4. then the delete operation will go to the _wait_for_destroy loop, and the
loop may be endless
5. when we delete the instance again, because the lock was hold by the delete
operation before, this one will wait the lock and don't implement actually.
So the domain id is checked during _wait_for_destroy loop, if it changed and
the instance is still running, we need to destroy it again.
If the domain is booted after _wait_for_destroy, it may result in
unfilter_instance failed because the nwfilter is in use, so doing the same
thing as above.

Fixes Bug #1111213

Change-Id: I98dc902dd86fa828f5821465c611953e08f9f637
  • Loading branch information
aspirer committed Feb 17, 2013
1 parent d49d504 commit 27330ac
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 18 deletions.
13 changes: 9 additions & 4 deletions nova/tests/test_libvirt.py
Expand Up @@ -2755,6 +2755,7 @@ def fake_unfilter_instance(instance, network_info):

def test_destroy_undefines(self):
mock = self.mox.CreateMock(libvirt.virDomain)
mock.ID()
mock.destroy()
mock.undefineFlags(1).AndReturn(1)

Expand All @@ -2764,7 +2765,7 @@ def fake_lookup_by_name(instance_name):
return mock

def fake_get_info(instance_name):
return {'state': power_state.SHUTDOWN}
return {'state': power_state.SHUTDOWN, 'id': -1}

conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.stubs.Set(conn, '_lookup_by_name', fake_lookup_by_name)
Expand All @@ -2775,6 +2776,7 @@ def fake_get_info(instance_name):

def test_destroy_undefines_no_undefine_flags(self):
mock = self.mox.CreateMock(libvirt.virDomain)
mock.ID()
mock.destroy()
mock.undefineFlags(1).AndRaise(libvirt.libvirtError('Err'))
mock.undefine()
Expand All @@ -2785,7 +2787,7 @@ def fake_lookup_by_name(instance_name):
return mock

def fake_get_info(instance_name):
return {'state': power_state.SHUTDOWN}
return {'state': power_state.SHUTDOWN, 'id': -1}

conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.stubs.Set(conn, '_lookup_by_name', fake_lookup_by_name)
Expand All @@ -2796,6 +2798,7 @@ def fake_get_info(instance_name):

def test_destroy_undefines_no_attribute_with_managed_save(self):
mock = self.mox.CreateMock(libvirt.virDomain)
mock.ID()
mock.destroy()
mock.undefineFlags(1).AndRaise(AttributeError())
mock.hasManagedSaveImage(0).AndReturn(True)
Expand All @@ -2808,7 +2811,7 @@ def fake_lookup_by_name(instance_name):
return mock

def fake_get_info(instance_name):
return {'state': power_state.SHUTDOWN}
return {'state': power_state.SHUTDOWN, 'id': -1}

conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.stubs.Set(conn, '_lookup_by_name', fake_lookup_by_name)
Expand All @@ -2819,6 +2822,7 @@ def fake_get_info(instance_name):

def test_destroy_undefines_no_attribute_no_managed_save(self):
mock = self.mox.CreateMock(libvirt.virDomain)
mock.ID()
mock.destroy()
mock.undefineFlags(1).AndRaise(AttributeError())
mock.hasManagedSaveImage(0).AndRaise(AttributeError())
Expand All @@ -2830,7 +2834,7 @@ def fake_lookup_by_name(instance_name):
return mock

def fake_get_info(instance_name):
return {'state': power_state.SHUTDOWN}
return {'state': power_state.SHUTDOWN, 'id': -1}

conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.stubs.Set(conn, '_lookup_by_name', fake_lookup_by_name)
Expand All @@ -2841,6 +2845,7 @@ def fake_get_info(instance_name):

def test_private_destroy_not_found(self):
mock = self.mox.CreateMock(libvirt.virDomain)
mock.ID()
mock.destroy()
self.mox.ReplayAll()

Expand Down
71 changes: 57 additions & 14 deletions nova/virt/libvirt/driver.py
Expand Up @@ -493,8 +493,10 @@ def _destroy(self, instance):

# If the instance is already terminated, we're still happy
# Otherwise, destroy it
old_domid = -1
if virt_dom is not None:
try:
old_domid = virt_dom.ID()
virt_dom.destroy()
except libvirt.libvirtError as e:
is_okay = False
Expand All @@ -514,14 +516,16 @@ def _destroy(self, instance):
locals(), instance=instance)
raise

def _wait_for_destroy():
def _wait_for_destroy(expected_domid):
"""Called at an interval until the VM is gone."""
# NOTE(vish): If the instance disappears during the destroy
# we ignore it so the cleanup can still be
# attempted because we would prefer destroy to
# never fail.
try:
state = self.get_info(instance)['state']
dom_info = self.get_info(instance)
state = dom_info['state']
new_domid = dom_info['id']
except exception.NotFound:
LOG.error(_("During wait destroy, instance disappeared."),
instance=instance)
Expand All @@ -532,8 +536,23 @@ def _wait_for_destroy():
instance=instance)
raise utils.LoopingCallDone()

timer = utils.FixedIntervalLoopingCall(_wait_for_destroy)
# NOTE(wangpan): If the instance was booted again after destroy,
# this may be a endless loop, so check the id of
# domain here, if it changed and the instance is
# still running, we should destroy it again.
# see https://bugs.launchpad.net/nova/+bug/1111213 for more details
if new_domid != expected_domid:
LOG.info(_("Instance may be started again."),
instance=instance)
kwargs['is_running'] = True
raise utils.LoopingCallDone()

kwargs = {'is_running': False}
timer = utils.FixedIntervalLoopingCall(_wait_for_destroy, old_domid)
timer.start(interval=0.5).wait()
if kwargs['is_running']:
LOG.info(_("Going to destroy instance again."), instance=instance)
self._destroy(instance)

def destroy(self, instance, network_info, block_device_info=None,
destroy_disks=True):
Expand Down Expand Up @@ -575,16 +594,39 @@ def _cleanup(self, instance, network_info, block_device_info,
destroy_disks):
self._undefine_domain(instance)
self.unplug_vifs(instance, network_info)
try:
self.firewall_driver.unfilter_instance(instance,
network_info=network_info)
except libvirt.libvirtError as e:
errcode = e.get_error_code()
LOG.error(_("Error from libvirt during unfilter. "
"Code=%(errcode)s Error=%(e)s") %
locals(), instance=instance)
reason = "Error unfiltering instance."
raise exception.InstanceTerminationFailure(reason=reason)
retry = True
while retry:
try:
self.firewall_driver.unfilter_instance(instance,
network_info=network_info)
except libvirt.libvirtError as e:
try:
state = self.get_info(instance)['state']
except exception.NotFound:
state = power_state.SHUTDOWN

if state != power_state.SHUTDOWN:
LOG.warn(_("Instance may be still running, destroy "
"it again."), instance=instance)
self._destroy(instance)
else:
retry = False
errcode = e.get_error_code()
LOG.error(_("Error from libvirt during unfilter. "
"Code=%(errcode)s Error=%(e)s") %
locals(), instance=instance)
reason = "Error unfiltering instance."
raise exception.InstanceTerminationFailure(reason=reason)
except Exception:
retry = False
raise
else:
retry = False

# FIXME(wangpan): if the instance is booted again here, such as the
# the soft reboot operation boot it here, it will
# become "running deleted", should we check and destroy
# it at the end of this method?

# NOTE(vish): we disconnect from volumes regardless
block_device_mapping = driver.block_device_info_get_mapping(
Expand Down Expand Up @@ -2017,7 +2059,8 @@ def get_info(self, instance):
'max_mem': max_mem,
'mem': mem,
'num_cpu': num_cpu,
'cpu_time': cpu_time}
'cpu_time': cpu_time,
'id': virt_dom.ID()}

def _create_domain(self, xml=None, domain=None,
instance=None, launch_flags=0):
Expand Down

0 comments on commit 27330ac

Please sign in to comment.