Fix VM deletion from down compute node
* free network resources
* free volume resources
* delete.start and delete.end notifications added
* Handle network deallocate in multi_host mode

Fixes bug 1067214

Co-authored-by: Vishvananda Ishaya <vishvananda@gmail.com>
Change-Id: I0d4a7dc5836d39e405824528de214f23b214849f
(cherry picked from commit 3dff433)
jogo authored and vishvananda committed Nov 8, 2012
1 parent 3d418dc commit c0e1247
Showing 3 changed files with 141 additions and 50 deletions.
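In outline, the fix makes the API node check whether any compute service for the instance's host is alive before deciding how to delete. A condensed sketch of that decision (a hypothetical standalone paraphrase of the _delete() changes below, not the method itself):

def delete_or_local_cleanup(self, context, instance, bdms):
    # Find compute services registered for the instance's host; a missing
    # host record now means "no services" instead of an error.
    try:
        services = self.db.service_get_all_compute_by_host(
            context.elevated(), instance['host'])
    except exception.ComputeHostNotFound:
        services = []
    if any(utils.service_is_up(s) for s in services):
        # Normal path: the compute node tears the VM down itself.
        self.compute_rpcapi.terminate_instance(context, instance)
    else:
        # Host is down: free network and volume resources from the
        # API side so the instance does not leak them.
        self._local_delete(context, instance, bdms)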
56 changes: 49 additions & 7 deletions nova/compute/api.py
@@ -33,6 +33,7 @@
 from nova.compute import power_state
 from nova.compute import rpcapi as compute_rpcapi
 from nova.compute import task_states
+from nova.compute import utils as compute_utils
 from nova.compute import vm_states
 from nova.consoleauth import rpcapi as consoleauth_rpcapi
 from nova import crypto
@@ -906,21 +907,23 @@ def _delete(self, context, instance):
                                  host=src_host, cast=False,
                                  reservations=downsize_reservations)
 
-            services = self.db.service_get_all_compute_by_host(
-                context.elevated(), instance['host'])
             is_up = False
+            bdms = self.db.block_device_mapping_get_all_by_instance(
+                context, instance["uuid"])
+            #Note(jogo): db allows for multiple compute services per host
+            try:
+                services = self.db.service_get_all_compute_by_host(
+                    context.elevated(), instance['host'])
+            except exception.ComputeHostNotFound:
+                services = []
             for service in services:
                 if utils.service_is_up(service):
                     is_up = True
                     self.compute_rpcapi.terminate_instance(context, instance)
                     break
-            if is_up == False:
+            if not is_up:
                 # If compute node isn't up, just delete from DB
-                LOG.warning(_('host for instance is down, deleting from '
-                              'database'), instance=instance)
-                self.db.instance_destroy(context, instance['uuid'])
-
+                self._local_delete(context, instance, bdms)
             if reservations:
                 QUOTAS.commit(context, reservations)
         except exception.InstanceNotFound:
@@ -932,6 +935,45 @@ def _delete(self, context, instance):
             if reservations:
                 QUOTAS.rollback(context, reservations)
 
+    def _local_delete(self, context, instance, bdms):
+        LOG.warning(_('host for instance is down, deleting from '
+                      'database'), instance=instance)
+        instance_uuid = instance['uuid']
+        self.db.instance_info_cache_delete(context, instance_uuid)
+        compute_utils.notify_about_instance_usage(
+            context, instance, "delete.start")
+
+        elevated = context.elevated()
+        self.network_api.deallocate_for_instance(elevated,
+                                                 instance)
+        self.db.instance_destroy(context, instance_uuid)
+        system_meta = self.db.instance_system_metadata_get(context,
+                                                           instance_uuid)
+
+        # cleanup volumes
+        for bdm in bdms:
+            if bdm['volume_id']:
+                volume = self.volume_api.get(context, bdm['volume_id'])
+                # NOTE(vish): We don't have access to correct volume
+                #             connector info, so just pass a fake
+                #             connector. This can be improved when we
+                #             expose get_volume_connector to rpc.
+                connector = {'ip': '127.0.0.1', 'initiator': 'iqn.fake'}
+                self.volume_api.terminate_connection(context,
+                                                     volume,
+                                                     connector)
+                self.volume_api.detach(elevated, volume)
+                if bdm['delete_on_termination']:
+                    self.volume_api.delete(context, volume)
+            self.db.block_device_mapping_destroy(context, bdm['id'])
+        instance = self._instance_update(context,
+                                         instance_uuid,
+                                         vm_state=vm_states.DELETED,
+                                         task_state=None,
+                                         terminated_at=timeutils.utcnow())
+        compute_utils.notify_about_instance_usage(
+            context, instance, "delete.end", system_metadata=system_meta)
+
     # NOTE(maoy): we allow delete to be called no matter what vm_state says.
     @wrap_check_policy
     @check_instance_lock
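A side effect worth noting: with _local_delete in place, an instance deleted while its host is down now emits the same delete.start/delete.end usage notifications as a normal delete, so consumers of the notification bus see a consistent stream. A rough sketch of pairing them (the 'compute.instance.' event-type prefix and the 'instance_id' payload key are assumptions about the notifier, not shown in this diff):

pending = set()

def on_notification(event_type, payload):
    # Track deletes that have started but not yet finished.
    if event_type == 'compute.instance.delete.start':
        pending.add(payload['instance_id'])
    elif event_type == 'compute.instance.delete.end':
        pending.discard(payload['instance_id'])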
109 changes: 66 additions & 43 deletions nova/network/manager.py
@@ -223,7 +223,7 @@ def _rpc_allocate_fixed_ip(self, context, instance_id, network_id,
         network = self._get_network_by_id(context, network_id)
         return self.allocate_fixed_ip(context, instance_id, network, **kwargs)
 
-    def deallocate_fixed_ip(self, context, address, host=None):
+    def deallocate_fixed_ip(self, context, address, host=None, teardown=True):
         """Call the superclass deallocate_fixed_ip if i'm the correct host
         otherwise call to the correct host"""
         fixed_ip = self.db.fixed_ip_get_by_address(context, address)
@@ -233,18 +233,27 @@ def deallocate_fixed_ip(self, context, address, host=None):
         # NOTE(tr3buchet): but if we are, host came from instance['host']
         if not network['multi_host']:
             host = network['host']
-        if host != self.host:
-            # need to call deallocate_fixed_ip on correct network host
-            topic = rpc.queue_get_for(context, FLAGS.network_topic, host)
-            args = {'address': address,
-                    'host': host}
-            rpc.call(context, topic,
-                     {'method': 'deallocate_fixed_ip',
-                      'args': args})
-        else:
-            # i am the correct host, run here
-            super(RPCAllocateFixedIP, self).deallocate_fixed_ip(context,
-                                                                address)
+        if host == self.host:
+            # NOTE(vish): deallocate the fixed ip locally
+            return super(RPCAllocateFixedIP, self).deallocate_fixed_ip(context,
+                                                                       address)
+
+        if network['multi_host']:
+            service = self.db.service_get_by_host_and_topic(context,
+                                                            host,
+                                                            'network')
+            if not service or not utils.service_is_up(service):
+                # NOTE(vish): deallocate the fixed ip locally but don't
+                #             teardown network devices
+                return super(RPCAllocateFixedIP, self).deallocate_fixed_ip(
+                        context, address, teardown=False)
+
+        topic = rpc.queue_get_for(context, FLAGS.network_topic, host)
+        args = {'address': address,
+                'host': host}
+        rpc.call(context, topic,
+                 {'method': 'deallocate_fixed_ip',
+                  'args': args})
 
 
 def wrap_check_policy(func):
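The reworked RPCAllocateFixedIP.deallocate_fixed_ip above has three outcomes instead of the old two. A minimal sketch of the dispatch (hypothetical helper; service_up stands in for the db lookup plus utils.service_is_up()):

def choose_deallocate_path(network, host, local_host, service_up):
    # Mirrors the branch order in deallocate_fixed_ip above.
    if host == local_host:
        return 'local'              # run the superclass teardown here
    if network['multi_host'] and not service_up:
        return 'local-no-teardown'  # free DB state, skip device teardown
    return 'rpc'                    # forward to the owning network host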
@@ -594,14 +603,24 @@ def disassociate_floating_ip(self, context, address,
 
         # send to correct host, unless i'm the correct host
         network = self._get_network_by_id(context, fixed_ip['network_id'])
+        interface = FLAGS.public_interface or floating_ip['interface']
         if network['multi_host']:
             instance = self.db.instance_get_by_uuid(context,
                                                     fixed_ip['instance_uuid'])
-            host = instance['host']
+            service = self.db.service_get_by_host_and_topic(
+                    context, instance['host'], 'network')
+            if service and utils.service_is_up(service):
+                host = instance['host']
+            else:
+                # NOTE(vish): if the service is down just deallocate the data
+                #             locally. Set the host to local so the call will
+                #             not go over rpc and set interface to None so the
+                #             teardown in the driver does not happen.
+                host = self.host
+                interface = None
         else:
             host = network['host']
 
-        interface = FLAGS.public_interface or floating_ip['interface']
         if host == self.host:
             # i'm the correct host
             self._disassociate_floating_ip(context, address, interface)
@@ -618,8 +637,9 @@ def _disassociate_floating_ip(self, context, address, interface):
         # disassociate floating ip
         fixed_address = self.db.floating_ip_disassociate(context, address)
 
-        # go go driver time
-        self.l3driver.remove_floating_ip(address, fixed_address, interface)
+        if interface:
+            # go go driver time
+            self.l3driver.remove_floating_ip(address, fixed_address, interface)
         payload = dict(project_id=context.project_id, floating_ip=address)
         notifier.notify(context,
                         notifier.publisher_id("network"),
@@ -1284,7 +1304,7 @@ def allocate_fixed_ip(self, context, instance_id, network, **kwargs):
         self._setup_network_on_host(context, network)
         return address
 
-    def deallocate_fixed_ip(self, context, address, host=None):
+    def deallocate_fixed_ip(self, context, address, host=None, teardown=True):
         """Returns a fixed ip to the pool."""
         fixed_ip_ref = self.db.fixed_ip_get_by_address(context, address)
         vif_id = fixed_ip_ref['virtual_interface_id']
@@ -1301,29 +1321,31 @@ def deallocate_fixed_ip(self, context, address, host=None):
                 self.instance_dns_manager.delete_entry(n,
                                                        self.instance_dns_domain)
 
-        network = self._get_network_by_id(context, fixed_ip_ref['network_id'])
-        self._teardown_network_on_host(context, network)
-
-        if FLAGS.force_dhcp_release:
-            dev = self.driver.get_dev(network)
-            # NOTE(vish): The below errors should never happen, but there may
-            #             be a race condition that is causing them per
-            #             https://code.launchpad.net/bugs/968457, so we log
-            #             an error to help track down the possible race.
-            msg = _("Unable to release %s because vif doesn't exist.")
-            if not vif_id:
-                LOG.error(msg % address)
-                return
-
-            vif = self.db.virtual_interface_get(context, vif_id)
-
-            if not vif:
-                LOG.error(msg % address)
-                return
-
-            # NOTE(vish): This forces a packet so that the release_fixed_ip
-            #             callback will get called by nova-dhcpbridge.
-            self.driver.release_dhcp(dev, address, vif['address'])
+        if teardown:
+            network = self._get_network_by_id(context,
+                                              fixed_ip_ref['network_id'])
+            self._teardown_network_on_host(context, network)
+
+            if FLAGS.force_dhcp_release:
+                dev = self.driver.get_dev(network)
+                # NOTE(vish): The below errors should never happen, but there
+                #             may be a race condition that is causing them per
+                #             https://code.launchpad.net/bugs/968457, so we log
+                #             an error to help track down the possible race.
+                msg = _("Unable to release %s because vif doesn't exist.")
+                if not vif_id:
+                    LOG.error(msg % address)
+                    return
+
+                vif = self.db.virtual_interface_get(context, vif_id)
+
+                if not vif:
+                    LOG.error(msg % address)
+                    return
+
+                # NOTE(vish): This forces a packet so that the release_fixed_ip
+                #             callback will get called by nova-dhcpbridge.
+                self.driver.release_dhcp(dev, address, vif['address'])
 
         self.db.fixed_ip_update(context, address,
                                 {'allocated': False,
@@ -1831,9 +1853,10 @@ def _allocate_fixed_ips(self, context, instance_id, host, networks,
             self.allocate_fixed_ip(context, instance_id,
                                    network, address=address)
 
-    def deallocate_fixed_ip(self, context, address, host=None):
+    def deallocate_fixed_ip(self, context, address, host=None, teardown=True):
         """Returns a fixed ip to the pool."""
-        super(FlatManager, self).deallocate_fixed_ip(context, address)
+        super(FlatManager, self).deallocate_fixed_ip(context, address, host,
+                                                     teardown)
         self.db.fixed_ip_disassociate(context, address)
 
     def _setup_network_on_host(self, context, network):
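Across FlatManager and the dhcp-backed managers, the new teardown flag draws one line: host-local plumbing (bridge teardown, forced dhcp release) is conditional, while the database bookkeeping always runs. A compressed sketch (hypothetical helper objects standing in for the self.driver and self.db calls above):

def deallocate_fixed_ip_sketch(db, driver, context, address, teardown=True):
    if teardown:
        # Only touch devices when this host (or a live owner) can.
        driver.teardown_network_on_host(context, address)
    # The ip always goes back to the pool, so a down network host
    # cannot strand the address.
    db.fixed_ip_update(context, address,
                       {'allocated': False,
                        'virtual_interface_id': None})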
26 changes: 26 additions & 0 deletions nova/tests/compute/test_compute.py
@@ -3002,6 +3002,32 @@ def test_delete_in_resized(self):
 
         db.instance_destroy(self.context, instance['uuid'])
 
+    def test_delete_with_down_host(self):
+        self.network_api_called = False
+
+        def dummy(*args, **kwargs):
+            self.network_api_called = True
+            pass
+        self.stubs.Set(self.compute_api.network_api, 'deallocate_for_instance',
+                       dummy)
+
+        #use old time to disable machine
+        old_time = datetime.datetime(2012, 4, 1)
+
+        instance, instance_uuid = self._run_instance(params={
+                'host': FLAGS.host})
+        timeutils.set_time_override(old_time)
+        self.compute_api.delete(self.context, instance)
+        timeutils.clear_time_override()
+
+        self.assertEqual(instance['task_state'], None)
+        self.assertTrue(self.network_api_called)
+
+        #local delete, so db should be clean
+        self.assertRaises(exception.InstanceNotFound, db.instance_destroy,
+                          self.context,
+                          instance['uuid'])
+
     def test_repeated_delete_quota(self):
         in_use = {'instances': 1}
 
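The test makes the compute host look dead with a clock override rather than by stopping a service. That works because liveness is judged by heartbeat age; roughly (an approximation of the folsom-era utils.service_is_up, not part of this diff):

def service_is_up(service):
    # A service is 'up' if its last heartbeat is recent enough.
    last_heartbeat = service['updated_at'] or service['created_at']
    elapsed = timeutils.utcnow() - last_heartbeat
    return abs(elapsed.total_seconds()) <= FLAGS.service_down_time

With the override set to April 2012, every heartbeat looks ancient, the check returns False, and delete() takes the _local_delete path exercised above.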
