Adds the ability to automatically issue a hard reboot to instances that have been stuck in a 'rebooting' state for longer than a specified window.

Fixes bug 873099.

Change-Id: Ife2c64326fdb3ec849242583d1bd1d96f9f4be0f
jk0 committed Oct 13, 2011
1 parent 52b5611 commit e50e9b4
Showing 12 changed files with 104 additions and 1 deletion.
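
Note: the feature is opt-in via the new reboot_timeout flag introduced in nova/compute/manager.py below. As a rough illustration only (the value is hypothetical), an operator would enable it from a nova flagfile:

    # nova flagfile entry (hypothetical value: hard reboot instances
    # stuck rebooting for more than five minutes)
    --reboot_timeout=300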
12 changes: 12 additions & 0 deletions nova/compute/manager.py
@@ -73,6 +73,10 @@
flags.DEFINE_integer('live_migration_retry_count', 30,
                     "Retry count needed in live_migration."
                     " sleep 1 sec for each count")
flags.DEFINE_integer("reboot_timeout", 0,
                     "Automatically hard reboot an instance if it has been "
                     "stuck in a rebooting state longer than N seconds."
                     " Set to 0 to disable.")
flags.DEFINE_integer("rescue_timeout", 0,
                     "Automatically unrescue an instance after N seconds."
                     " Set to 0 to disable.")
@@ -1784,6 +1788,14 @@ def periodic_tasks(self, context=None):
        if error_list is None:
            error_list = []

        try:
            if FLAGS.reboot_timeout > 0:
                self.driver.poll_rebooting_instances(FLAGS.reboot_timeout)
        except Exception as ex:
            LOG.warning(_("Error during poll_rebooting_instances: %s"),
                        unicode(ex))
            error_list.append(ex)

        try:
            if FLAGS.rescue_timeout > 0:
                self.driver.poll_rescued_instances(FLAGS.rescue_timeout)
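
Note: the periodic_tasks hunk above follows the file's existing guard pattern: each poll runs in its own try/except so a failing driver call is logged and collected rather than aborting the remaining checks. A standalone sketch of that pattern (hypothetical helper, not part of the commit):

    def run_periodic_checks(checks, error_list, log):
        """Run each (name, callable) check in isolation."""
        for name, check in checks:
            try:
                check()
            except Exception as ex:
                # Mirror the manager's behavior: log, collect, keep going.
                log.warning("Error during %s: %s" % (name, ex))
                error_list.append(ex)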
6 changes: 6 additions & 0 deletions nova/db/api.py
@@ -585,6 +585,12 @@ def instance_get_project_vpn(context, project_id):
    return IMPL.instance_get_project_vpn(context, project_id)


def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
    """Get all instances stuck in a rebooting state."""
    return IMPL.instance_get_all_hung_in_rebooting(context, reboot_window,
                                                   session)


def instance_set_state(context, instance_id, state, description=None):
    """Set the state of an instance."""
    return IMPL.instance_set_state(context, instance_id, state, description)
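
Note: for illustration, calling the new DB-layer function looks like this (a sketch assuming nova's usual imports; the 300-second window is hypothetical):

    from nova import context
    from nova import db

    ctxt = context.get_admin_context()
    hung = db.instance_get_all_hung_in_rebooting(ctxt, 300)

Keeping the indirection through nova.db.api means callers stay independent of the SQLAlchemy implementation that follows.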
15 changes: 15 additions & 0 deletions nova/db/sqlalchemy/api.py
@@ -1529,6 +1529,21 @@ def instance_get_floating_address(context, instance_id):
    return fixed_ip_refs[0].floating_ips[0]['address']


@require_admin_context
def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
    reboot_window = datetime.datetime.utcnow() - datetime.timedelta(
        seconds=reboot_window)

    if not session:
        session = get_session()

    results = session.query(models.Instance).\
        filter(models.Instance.updated_at <= reboot_window).\
        filter_by(task_state="rebooting").all()

    return results


@require_context
def instance_update(context, instance_id, values):
    session = get_session()
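
Note the slightly confusing reuse above: the reboot_window argument (seconds) is overwritten with the cutoff datetime it implies. The predicate the query applies is simply "task_state is 'rebooting' and updated_at is older than now minus reboot_window seconds"; a plain-Python sketch of the same test (hypothetical helper, no SQLAlchemy):

    import datetime

    def is_hung_rebooting(updated_at, task_state, reboot_window):
        # Same predicate as the query: stuck in 'rebooting' and last
        # updated at or before (utcnow - reboot_window seconds).
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(
            seconds=reboot_window)
        return task_state == "rebooting" and updated_at <= cutoff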
24 changes: 24 additions & 0 deletions nova/tests/test_db_api.py
@@ -123,3 +123,27 @@ def test_migration_get_all_unconfirmed(self):
        results = db.migration_get_all_unconfirmed(ctxt, 10)
        self.assertEqual(0, len(results))
        db.migration_update(ctxt, migration.id, {"status": "CONFIRMED"})

    def test_instance_get_all_hung_in_rebooting(self):
        ctxt = context.get_admin_context()

        # Ensure no instances are returned.
        results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
        self.assertEqual(0, len(results))

        # Ensure one rebooting instance with updated_at older than 10 seconds
        # is returned.
        updated_at = datetime.datetime(2000, 1, 1, 12, 0, 0)
        values = {"task_state": "rebooting", "updated_at": updated_at}
        instance = db.instance_create(ctxt, values)
        results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
        self.assertEqual(1, len(results))
        db.instance_update(ctxt, instance.id, {"task_state": None})

        # Ensure the newly rebooted instance is not returned.
        updated_at = datetime.datetime.utcnow()
        values = {"task_state": "rebooting", "updated_at": updated_at}
        instance = db.instance_create(ctxt, values)
        results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
        self.assertEqual(0, len(results))
        db.instance_update(ctxt, instance.id, {"task_state": None})
4 changes: 4 additions & 0 deletions nova/tests/test_virt_drivers.py
@@ -172,6 +172,10 @@ def test_unrescue_rescued_instance(self):
                             lambda x: None, network_info)
        self.connection.unrescue(instance_ref, lambda x: None, network_info)

    @catch_notimplementederror
    def test_poll_rebooting_instances(self):
        self.connection.poll_rebooting_instances(10)

    @catch_notimplementederror
    def test_poll_rescued_instances(self):
        self.connection.poll_rescued_instances(10)
5 changes: 5 additions & 0 deletions nova/virt/driver.py
@@ -485,6 +485,11 @@ def inject_network_info(self, instance, nw_info):
        # TODO(Vek): Need to pass context in for access to auth_token
        pass

    def poll_rebooting_instances(self, timeout):
        """Poll for rebooting instances"""
        # TODO(Vek): Need to pass context in for access to auth_token
        raise NotImplementedError()

    def poll_rescued_instances(self, timeout):
        """Poll for rescued instances"""
        # TODO(Vek): Need to pass context in for access to auth_token
3 changes: 3 additions & 0 deletions nova/virt/fake.py
@@ -131,6 +131,9 @@ def rescue(self, context, instance, callback, network_info):
    def unrescue(self, instance, callback, network_info):
        pass

    def poll_rebooting_instances(self, timeout):
        pass

    def poll_rescued_instances(self, timeout):
        pass

6 changes: 6 additions & 0 deletions nova/virt/hyperv.py
@@ -485,10 +485,16 @@ def detach_volume(self, connection_info, instance_name, mountpoint):
        if vm is None:
            raise exception.InstanceNotFound(instance_id=instance_name)

    def poll_rebooting_instances(self, timeout):
        """See xenapi_conn.py implementation."""
        pass

    def poll_rescued_instances(self, timeout):
        """See xenapi_conn.py implementation."""
        pass

    def poll_unconfirmed_resizes(self, resize_confirm_window):
        """See xenapi_conn.py implementation."""
        pass

    def update_available_resource(self, ctxt, host):
4 changes: 4 additions & 0 deletions nova/virt/libvirt/connection.py
@@ -613,6 +613,10 @@ def unrescue(self, instance, callback, network_info):
        os.remove(unrescue_xml_path)
        self.reboot(instance, network_info, xml=unrescue_xml)

    @exception.wrap_exception()
    def poll_rebooting_instances(self, timeout):
        pass

    @exception.wrap_exception()
    def poll_rescued_instances(self, timeout):
        pass
2 changes: 1 addition & 1 deletion nova/virt/xenapi/vm_utils.py
@@ -713,7 +713,7 @@ def set_vm_name_label(cls, session, vm_ref, name_label):

    @classmethod
    def lookup(cls, session, name_label):
-        """Look the instance i up, and returns it if available"""
+        """Look the instance up and return it if available"""
        vm_refs = session.get_xenapi().VM.get_by_name_label(name_label)
        n = len(vm_refs)
        if n == 0:
20 changes: 20 additions & 0 deletions nova/virt/xenapi/vmops.py
@@ -1117,6 +1117,26 @@ def power_on(self, instance):
        vm_ref = self._get_vm_opaque_ref(instance)
        self._start(instance, vm_ref)

    def poll_rebooting_instances(self, timeout):
        """Look for expirable rebooting instances.

            - issue a "hard" reboot to any instance that has been stuck in a
              reboot state for >= the given timeout
        """
        ctxt = nova_context.get_admin_context()
        instances = db.instance_get_all_hung_in_rebooting(ctxt, timeout)

        instances_info = dict(instance_count=len(instances),
                              timeout=timeout)

        if instances_info["instance_count"] > 0:
            LOG.info(_("Found %(instance_count)d hung reboots "
                       "older than %(timeout)d seconds") % instances_info)

        for instance in instances:
            LOG.info(_("Automatically hard rebooting %d"), instance.id)
            self.compute_api.reboot(ctxt, instance.id, "HARD")

    def poll_rescued_instances(self, timeout):
        """Look for expirable rescued instances.
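
Note: taken together, the xenapi path resolves hung instances from the database and pushes each one through the regular compute-API reboot path, so the standard "HARD" reboot semantics apply. A condensed standalone sketch of that loop (hypothetical function; names follow the hunk above):

    def hard_reboot_hung_instances(db_api, compute_api, ctxt, timeout, log):
        # Find instances stuck in 'rebooting' longer than `timeout` seconds
        # and force each through the standard hard-reboot path.
        instances = db_api.instance_get_all_hung_in_rebooting(ctxt, timeout)
        if instances:
            log.info("Found %d hung reboots older than %d seconds"
                     % (len(instances), timeout))
        for instance in instances:
            log.info("Automatically hard rebooting %d" % instance.id)
            compute_api.reboot(ctxt, instance.id, "HARD")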
4 changes: 4 additions & 0 deletions nova/virt/xenapi_conn.py
@@ -265,6 +265,10 @@ def power_on(self, instance):
        """Power on the specified instance"""
        self._vmops.power_on(instance)

    def poll_rebooting_instances(self, timeout):
        """Poll for rebooting instances"""
        self._vmops.poll_rebooting_instances(timeout)

    def poll_rescued_instances(self, timeout):
        """Poll for rescued instances"""
        self._vmops.poll_rescued_instances(timeout)
