Adds the ability to automatically issue a hard reboot to instances that have been stuck in a 'rebooting' state for longer than a specified window.

Fixes bug 873099.

Change-Id: Ife2c64326fdb3ec849242583d1bd1d96f9f4be0f
jk0 committed Oct 13, 2011
1 parent 52b5611 commit e50e9b4
Showing 12 changed files with 104 additions and 1 deletion.
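
Note: the feature is opt-in via the new reboot_timeout flag introduced in nova/compute/manager.py below. As a rough illustration only (the value is hypothetical), an operator would enable it from a nova flagfile:

    # nova flagfile entry (hypothetical value: hard reboot instances
    # stuck rebooting for more than five minutes)
    --reboot_timeout=300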
12 changes: 12 additions & 0 deletions nova/compute/manager.py
@@ -73,6 +73,10 @@
flags.DEFINE_integer('live_migration_retry_count', 30,
                     "Retry count needed in live_migration."
                     " sleep 1 sec for each count")
flags.DEFINE_integer("reboot_timeout", 0,
                     "Automatically hard reboot an instance if it has been "
                     "stuck in a rebooting state longer than N seconds."
                     " Set to 0 to disable.")
flags.DEFINE_integer("rescue_timeout", 0,
                     "Automatically unrescue an instance after N seconds."
                     " Set to 0 to disable.")
@@ -1784,6 +1788,14 @@ def periodic_tasks(self, context=None):
        if error_list is None:
            error_list = []

        try:
            if FLAGS.reboot_timeout > 0:
                self.driver.poll_rebooting_instances(FLAGS.reboot_timeout)
        except Exception as ex:
            LOG.warning(_("Error during poll_rebooting_instances: %s"),
                        unicode(ex))
            error_list.append(ex)

        try:
            if FLAGS.rescue_timeout > 0:
                self.driver.poll_rescued_instances(FLAGS.rescue_timeout)
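
Note: the periodic_tasks hunk above follows the file's existing guard pattern: each poll runs in its own try/except so a failing driver call is logged and collected rather than aborting the remaining checks. A standalone sketch of that pattern (hypothetical helper, not part of the commit):

    def run_periodic_checks(checks, error_list, log):
        """Run each (name, callable) check in isolation."""
        for name, check in checks:
            try:
                check()
            except Exception as ex:
                # Mirror the manager's behavior: log, collect, keep going.
                log.warning("Error during %s: %s" % (name, ex))
                error_list.append(ex)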
6 changes: 6 additions & 0 deletions nova/db/api.py
@@ -585,6 +585,12 @@ def instance_get_project_vpn(context, project_id):
    return IMPL.instance_get_project_vpn(context, project_id)


def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
    """Get all instances stuck in a rebooting state."""
    return IMPL.instance_get_all_hung_in_rebooting(context, reboot_window,
                                                   session)


def instance_set_state(context, instance_id, state, description=None):
    """Set the state of an instance."""
    return IMPL.instance_set_state(context, instance_id, state, description)
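
Note: for illustration, calling the new DB-layer function looks like this (a sketch assuming nova's usual imports; the 300-second window is hypothetical):

    from nova import context
    from nova import db

    ctxt = context.get_admin_context()
    hung = db.instance_get_all_hung_in_rebooting(ctxt, 300)

Keeping the indirection through nova.db.api means callers stay independent of the SQLAlchemy implementation that follows.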
15 changes: 15 additions & 0 deletions nova/db/sqlalchemy/api.py
@@ -1529,6 +1529,21 @@ def instance_get_floating_address(context, instance_id):
    return fixed_ip_refs[0].floating_ips[0]['address']


@require_admin_context
def instance_get_all_hung_in_rebooting(context, reboot_window, session=None):
    reboot_window = datetime.datetime.utcnow() - datetime.timedelta(
        seconds=reboot_window)

    if not session:
        session = get_session()

    results = session.query(models.Instance).\
        filter(models.Instance.updated_at <= reboot_window).\
        filter_by(task_state="rebooting").all()

    return results


@require_context
def instance_update(context, instance_id, values):
    session = get_session()
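
Note the slightly confusing reuse above: the reboot_window argument (seconds) is overwritten with the cutoff datetime it implies. The predicate the query applies is simply "task_state is 'rebooting' and updated_at is older than now minus reboot_window seconds"; a plain-Python sketch of the same test (hypothetical helper, no SQLAlchemy):

    import datetime

    def is_hung_rebooting(updated_at, task_state, reboot_window):
        # Same predicate as the query: stuck in 'rebooting' and last
        # updated at or before (utcnow - reboot_window seconds).
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(
            seconds=reboot_window)
        return task_state == "rebooting" and updated_at <= cutoff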
24 changes: 24 additions & 0 deletions nova/tests/test_db_api.py
@@ -123,3 +123,27 @@ def test_migration_get_all_unconfirmed(self):
        results = db.migration_get_all_unconfirmed(ctxt, 10)
        self.assertEqual(0, len(results))
        db.migration_update(ctxt, migration.id, {"status": "CONFIRMED"})

    def test_instance_get_all_hung_in_rebooting(self):
        ctxt = context.get_admin_context()

        # Ensure no instances are returned.
        results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
        self.assertEqual(0, len(results))

        # Ensure one rebooting instance with updated_at older than 10 seconds
        # is returned.
        updated_at = datetime.datetime(2000, 1, 1, 12, 0, 0)
        values = {"task_state": "rebooting", "updated_at": updated_at}
        instance = db.instance_create(ctxt, values)
        results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
        self.assertEqual(1, len(results))
        db.instance_update(ctxt, instance.id, {"task_state": None})

        # Ensure the newly rebooted instance is not returned.
        updated_at = datetime.datetime.utcnow()
        values = {"task_state": "rebooting", "updated_at": updated_at}
        instance = db.instance_create(ctxt, values)
        results = db.instance_get_all_hung_in_rebooting(ctxt, 10)
        self.assertEqual(0, len(results))
        db.instance_update(ctxt, instance.id, {"task_state": None})
4 changes: 4 additions & 0 deletions nova/tests/test_virt_drivers.py
@@ -172,6 +172,10 @@ def test_unrescue_rescued_instance(self):
                             lambda x: None, network_info)
        self.connection.unrescue(instance_ref, lambda x: None, network_info)

    @catch_notimplementederror
    def test_poll_rebooting_instances(self):
        self.connection.poll_rebooting_instances(10)

    @catch_notimplementederror
    def test_poll_rescued_instances(self):
        self.connection.poll_rescued_instances(10)
5 changes: 5 additions & 0 deletions nova/virt/driver.py
@@ -485,6 +485,11 @@ def inject_network_info(self, instance, nw_info):
        # TODO(Vek): Need to pass context in for access to auth_token
        pass

    def poll_rebooting_instances(self, timeout):
        """Poll for rebooting instances"""
        # TODO(Vek): Need to pass context in for access to auth_token
        raise NotImplementedError()

    def poll_rescued_instances(self, timeout):
        """Poll for rescued instances"""
        # TODO(Vek): Need to pass context in for access to auth_token
3 changes: 3 additions & 0 deletions nova/virt/fake.py
@@ -131,6 +131,9 @@ def rescue(self, context, instance, callback, network_info):
    def unrescue(self, instance, callback, network_info):
        pass

    def poll_rebooting_instances(self, timeout):
        pass

    def poll_rescued_instances(self, timeout):
        pass

6 changes: 6 additions & 0 deletions nova/virt/hyperv.py
@@ -485,10 +485,16 @@ def detach_volume(self, connection_info, instance_name, mountpoint):
        if vm is None:
            raise exception.InstanceNotFound(instance_id=instance_name)

    def poll_rebooting_instances(self, timeout):
        """See xenapi_conn.py implementation."""
        pass

    def poll_rescued_instances(self, timeout):
        """See xenapi_conn.py implementation."""
        pass

    def poll_unconfirmed_resizes(self, resize_confirm_window):
        """See xenapi_conn.py implementation."""
        pass

    def update_available_resource(self, ctxt, host):
4 changes: 4 additions & 0 deletions nova/virt/libvirt/connection.py
@@ -613,6 +613,10 @@ def unrescue(self, instance, callback, network_info):
        os.remove(unrescue_xml_path)
        self.reboot(instance, network_info, xml=unrescue_xml)

    @exception.wrap_exception()
    def poll_rebooting_instances(self, timeout):
        pass

    @exception.wrap_exception()
    def poll_rescued_instances(self, timeout):
        pass
2 changes: 1 addition & 1 deletion nova/virt/xenapi/vm_utils.py
@@ -713,7 +713,7 @@ def set_vm_name_label(cls, session, vm_ref, name_label):

    @classmethod
    def lookup(cls, session, name_label):
-        """Look the instance i up, and returns it if available"""
+        """Look the instance up and return it if available"""
        vm_refs = session.get_xenapi().VM.get_by_name_label(name_label)
        n = len(vm_refs)
        if n == 0:
20 changes: 20 additions & 0 deletions nova/virt/xenapi/vmops.py
@@ -1117,6 +1117,26 @@ def power_on(self, instance):
        vm_ref = self._get_vm_opaque_ref(instance)
        self._start(instance, vm_ref)

    def poll_rebooting_instances(self, timeout):
        """Look for expirable rebooting instances.

            - issue a "hard" reboot to any instance that has been stuck in a
              reboot state for >= the given timeout
        """
        ctxt = nova_context.get_admin_context()
        instances = db.instance_get_all_hung_in_rebooting(ctxt, timeout)

        instances_info = dict(instance_count=len(instances),
                              timeout=timeout)

        if instances_info["instance_count"] > 0:
            LOG.info(_("Found %(instance_count)d hung reboots "
                       "older than %(timeout)d seconds") % instances_info)

        for instance in instances:
            LOG.info(_("Automatically hard rebooting %d"), instance.id)
            self.compute_api.reboot(ctxt, instance.id, "HARD")

    def poll_rescued_instances(self, timeout):
        """Look for expirable rescued instances.
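
Note: taken together, the xenapi path resolves hung instances from the database and pushes each one through the regular compute-API reboot path, so the standard "HARD" reboot semantics apply. A condensed standalone sketch of that loop (hypothetical function; names follow the hunk above):

    def hard_reboot_hung_instances(db_api, compute_api, ctxt, timeout, log):
        # Find instances stuck in 'rebooting' longer than `timeout` seconds
        # and force each through the standard hard-reboot path.
        instances = db_api.instance_get_all_hung_in_rebooting(ctxt, timeout)
        if instances:
            log.info("Found %d hung reboots older than %d seconds"
                     % (len(instances), timeout))
        for instance in instances:
            log.info("Automatically hard rebooting %d" % instance.id)
            compute_api.reboot(ctxt, instance.id, "HARD")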
4 changes: 4 additions & 0 deletions nova/virt/xenapi_conn.py
@@ -265,6 +265,10 @@ def power_on(self, instance):
        """Power on the specified instance"""
        self._vmops.power_on(instance)

    def poll_rebooting_instances(self, timeout):
        """Poll for rebooting instances"""
        self._vmops.poll_rebooting_instances(timeout)

    def poll_rescued_instances(self, timeout):
        """Poll for rescued instances"""
        self._vmops.poll_rescued_instances(timeout)
