Skip to content

Commit

Permalink
Fix run/terminate race conditions.
Browse files Browse the repository at this point in the history
 * synchronize run,terminate,stop,start on instance_uuid
 * don't surpress error when unfiltering instance, which
   can result in a zombified instance.
 * Fixes bug 956719
 * Remove debug raise

Change-Id: I8b2eaffdabfd5c1a9414adb1b5ed11e4c48711fc
  • Loading branch information
sleepsonthefloor committed Mar 17, 2012
1 parent eb42e7f commit fe7055a
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 38 deletions.
56 changes: 36 additions & 20 deletions nova/compute/manager.py
Expand Up @@ -645,18 +645,24 @@ def _get_instance_volume_block_device_info(self, context, instance_id):
@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@wrap_instance_fault
def run_instance(self, context, instance_uuid, **kwargs):
self._run_instance(context, instance_uuid, **kwargs)
@utils.synchronized(instance_uuid)
def do_run_instance():
self._run_instance(context, instance_uuid, **kwargs)
do_run_instance()

@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@checks_instance_lock
@wrap_instance_fault
def start_instance(self, context, instance_uuid):
"""Starting an instance on this host."""
# TODO(yamahata): injected_files isn't supported.
# Anyway OSAPI doesn't support stop/start yet
# FIXME(vish): I've kept the files during stop instance, but
# I think start will fail due to the files still
self._run_instance(context, instance_uuid)
@utils.synchronized(instance_uuid)
def do_start_instance():
"""Starting an instance on this host."""
# TODO(yamahata): injected_files isn't supported.
# Anyway OSAPI doesn't support stop/start yet
# FIXME(vish): I've kept the files during stop instance, but
# I think start will fail due to the files still
self._run_instance(context, instance_uuid)
do_start_instance()

def _shutdown_instance(self, context, instance, action_str):
"""Shutdown an instance on this host."""
Expand Down Expand Up @@ -725,25 +731,35 @@ def _delete_instance(self, context, instance):
@wrap_instance_fault
def terminate_instance(self, context, instance_uuid):
"""Terminate an instance on this host."""
elevated = context.elevated()
instance = self.db.instance_get_by_uuid(elevated, instance_uuid)
compute_utils.notify_usage_exists(instance, current_period=True)
try:
self._delete_instance(context, instance)
except exception.InstanceNotFound as e:
LOG.warn(e)
@utils.synchronized(instance_uuid)
def do_terminate_instance():
elevated = context.elevated()
instance = self.db.instance_get_by_uuid(elevated, instance_uuid)
compute_utils.notify_usage_exists(instance, current_period=True)
try:
self._delete_instance(context, instance)
except exception.InstanceTerminationFailure as error:
msg = _('%s. Setting instance vm_state to ERROR')
LOG.error(msg % error)
self._set_instance_error_state(context, instance_uuid)
except exception.InstanceNotFound as e:
LOG.warn(e)
do_terminate_instance()

@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@checks_instance_lock
@wrap_instance_fault
def stop_instance(self, context, instance_uuid):
"""Stopping an instance on this host."""
instance = self.db.instance_get_by_uuid(context, instance_uuid)
self._shutdown_instance(context, instance, 'Stopping')
self._instance_update(context,
instance_uuid,
vm_state=vm_states.STOPPED,
task_state=None)
@utils.synchronized(instance_uuid)
def do_stop_instance():
instance = self.db.instance_get_by_uuid(context, instance_uuid)
self._shutdown_instance(context, instance, 'Stopping')
self._instance_update(context,
instance_uuid,
vm_state=vm_states.STOPPED,
task_state=None)
do_stop_instance()

@exception.wrap_exception(notifier=notifier, publisher_id=publisher_id())
@checks_instance_lock
Expand Down
4 changes: 4 additions & 0 deletions nova/exception.py
Expand Up @@ -330,6 +330,10 @@ class InstanceRebootFailure(Invalid):
message = _("Failed to reboot instance") + ": %(reason)s"


class InstanceTerminationFailure(Invalid):
message = _("Failed to terminate instance") + ": %(reason)s"


class ServiceUnavailable(Invalid):
message = _("Service is unavailable at this time.")

Expand Down
25 changes: 10 additions & 15 deletions nova/tests/test_compute.py
Expand Up @@ -932,26 +932,21 @@ def test_instance_set_to_error_on_uncaught_exception(self):

self.compute.terminate_instance(self.context, instance['uuid'])

def test_instance_set_to_error_on_deleted_instance_doesnt_raise(self):
"""Test that we don't raise InstanceNotFound when trying to set
an instance to ERROR that has already been deleted from under us.
The original exception should be re-raised.
def test_instance_termination_exception_sets_error(self):
"""Test that we handle InstanceTerminationFailure
which is propagated up from the underlying driver.
"""
instance = self._create_fake_instance()
instance_uuid = instance['uuid']

def fake_allocate_network(context, instance, requested_networks):
# Remove the instance to simulate race condition
self.compute.terminate_instance(self.context, instance['uuid'])
raise rpc_common.RemoteError()
def fake_delete_instance(context, instance):
raise exception.InstanceTerminationFailure(reason='')

self.stubs.Set(self.compute, '_allocate_network',
fake_allocate_network)
self.stubs.Set(self.compute, '_delete_instance',
fake_delete_instance)

self.assertRaises(rpc_common.RemoteError,
self.compute.run_instance,
self.context,
instance_uuid)
self.compute.terminate_instance(self.context, instance['uuid'])
instance = db.instance_get_by_uuid(self.context, instance['uuid'])
self.assertEqual(instance['vm_state'], vm_states.ERROR)

def test_network_is_deallocated_on_spawn_failure(self):
"""When a spawn fails the network must be deallocated"""
Expand Down
12 changes: 10 additions & 2 deletions nova/virt/libvirt/connection.py
Expand Up @@ -406,8 +406,16 @@ def _wait_for_destroy():
timer = utils.LoopingCall(_wait_for_destroy)
timer.start(interval=0.5, now=True)

self.firewall_driver.unfilter_instance(instance,
network_info=network_info)
try:
self.firewall_driver.unfilter_instance(instance,
network_info=network_info)
except libvirt.libvirtError as e:
errcode = e.get_error_code()
LOG.warning(_("Error from libvirt during unfilter. "
"Code=%(errcode)s Error=%(e)s") %
locals(), instance=instance)
reason = "Error unfiltering instance."
raise exception.InstanceTerminationFailure(reason=reason)

# NOTE(vish): we disconnect from volumes regardless
block_device_mapping = driver.block_device_info_get_mapping(
Expand Down
7 changes: 6 additions & 1 deletion nova/virt/libvirt/firewall.py
Expand Up @@ -156,7 +156,12 @@ def unfilter_instance(self, instance, network_info):
try:
_nw = self._conn.nwfilterLookupByName(instance_filter_name)
_nw.undefine()
except libvirt.libvirtError:
except libvirt.libvirtError as e:
errcode = e.get_error_code()
if errcode == libvirt.VIR_ERR_OPERATION_INVALID:
# This happens when the instance filter is still in
# use (ie. when the instance has not terminated properly)
raise
LOG.debug(_('The nwfilter(%(instance_filter_name)s) '
'is not found.') % locals(),
instance=instance)
Expand Down

0 comments on commit fe7055a

Please sign in to comment.