Merge pull request #1922 from obino/simplify-restore
Simplify restore logic.
menivaitsi committed Feb 5, 2016
2 parents 484c784 + cbcf71e commit 165e710
Showing 2 changed files with 68 additions and 68 deletions.
132 changes: 66 additions & 66 deletions AppController/djinn.rb
@@ -1698,17 +1698,28 @@ def job_start(secret)
Djinn.log_info("==== Starting AppController ====")

start_infrastructure_manager()
data_restored, need_to_start_jobs = restore_appcontroller_state()

if data_restored
# We need to wait for the 'state', that is, the deployment layout and
# the options for this deployment. It comes either from a state saved
# by a previous start, or from the tools. If the tools communicate the
# deployment's data, then we are the headnode.
if restore_appcontroller_state()
parse_options()
else
erase_old_data()
wait_for_data()
parse_options()
end

if need_to_start_jobs and my_node.is_shadow?
# From here on we have the basic local state that allows us to operate.
# In particular we know our roles and the deployment layout. Let's
# start attaching any permanent disk we may have associated with us.
mount_persistent_storage

# If we are the headnode, we may need to start/set up all other nodes.
# Better to do it early on, since it may take some time for the other
# nodes to start up.
if my_node.is_shadow?
Djinn.log_info("Spawning/setting up other nodes.")
spawn_and_setup_appengine
end
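Taken together, the rewritten startup path above boils down to the following control flow (a condensed restatement of this hunk for orientation only; every method named here is an existing AppController method that appears in the diff, not new code):

    start_infrastructure_manager()
    if restore_appcontroller_state()
      parse_options()                  # reuse the saved deployment state
    else
      erase_old_data()                 # fresh start: wait for the tools
      wait_for_data()
      parse_options()
    end
    mount_persistent_storage           # attach any permanent disk early
    spawn_and_setup_appengine if my_node.is_shadow?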
@@ -2723,56 +2734,38 @@ def backup_appcontroller_state()
end
end

# In multinode deployments, it could be the case that we restored data
# from ZooKeeper on a different machine. In that case, we still need to
# start all the services on this machine.
#
# Returns:
# true, if we do, false otherwise.
def are_we_restoring_from_local()
if HelperFunctions.is_process_running?("zookeeper")
return false
else
return true
end
end

# Restores the state of each of the instance variables that the AppController
# holds by pulling it from ZooKeeper (previously populated by the Shadow
# node, which always has the most up-to-date version of this data).
#
# Returns:
#   Two booleans that indicate whether (1) data was restored to this
#   AppController from either ZooKeeper or locally, and (2) whether we need
#   to start the roles on this machine or not.
#   A boolean to indicate if we were able to restore the state from
#   either ZooKeeper or the local disk.
def restore_appcontroller_state()
Djinn.log_info("Restoring AppController state")
restoring_from_local = true
json_state = ""

if File.exists?(ZK_LOCATIONS_FILE)
Djinn.log_info("Trying to restore data from ZooKeeper.")
json_state = restore_from_zookeeper()
if json_state.empty?
Djinn.log_info("Failed to restore data from ZooKeeper, trying locally.")
json_state = restore_from_local_data()
if json_state == nil
Djinn.log_warn("Unable to restore from ZK or local state, not restoring!")
restoring_from_local = are_we_restoring_from_local()
return false, restoring_from_local
end
else
if not json_state.empty?
Djinn.log_info("Restored data from ZooKeeper.")
restoring_from_local = are_we_restoring_from_local()
end
else
if File.exists?(HelperFunctions::APPCONTROLLER_STATE_LOCATION)
Djinn.log_info("Restoring from local data")
json_state = restore_from_local_data()
else
Djinn.log_info("No recovery data found - skipping recovery process")
return false, restoring_from_local
end

if json_state.empty? and File.exists?(HelperFunctions::APPCONTROLLER_STATE_LOCATION)
Djinn.log_info("Trying to restore data from local data.")
json_state = restore_from_local_data()
if not json_state.empty?
Djinn.log_info("Restored data from local data.")
end
end

if json_state.empty?
Djinn.log_warn("Unable to restore from ZK or local state, not restoring!")
return false
end

Djinn.log_info("Reload State : #{json_state}")

@@secret = json_state['@@secret']
@@ -2801,7 +2794,9 @@ def restore_appcontroller_state()
# of our internal state to use the new public and private IP anywhere the
# old ones were present.
if !HelperFunctions.get_all_local_ips().include?(@my_private_ip)
Djinn.log_info("IP changed old private:#{@my_private_ip} public:#{@my_public_ip}.")
update_state_with_new_local_ip()
Djinn.log_info("IP changed new private:#{@my_private_ip} public:#{@my_public_ip}.")
end

# Now that we've restored our state, update the pointer that indicates
@@ -2815,7 +2810,7 @@ def restore_appcontroller_state()
restore_appserver_state()
end

return true, restoring_from_local
return true
end
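The precedence the new restore_appcontroller_state follows (ZooKeeper first, then the local dump, giving up only when both come back empty) can be exercised in isolation with a small self-contained sketch. This is illustrative Ruby only, not AppScale code; the two lambdas are hypothetical stand-ins for the restore_from_zookeeper and restore_from_local_data helpers used in the diff:

    # Hypothetical stand-ins for the two restore helpers used in the diff.
    from_zookeeper = -> { "" }                      # pretend ZooKeeper had nothing
    from_local     = -> { '{"@@secret":"abc"}' }    # pretend a local dump exists

    state = from_zookeeper.call
    state = from_local.call if state.empty?
    if state.empty?
      puts "Unable to restore from ZK or local state, not restoring!"
    else
      puts "Restored state: #{state}"
    end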


@@ -4251,31 +4246,9 @@ def my_node()
return @nodes[@my_index]
end

# Perform any necessary initialization steps before we begin starting up
# services.
def initialize_server()
head_node_ip = get_public_ip(@options['hostname'])

if not HAProxy.is_running?
HAProxy.initialize_config()
HAProxy.create_app_load_balancer_config(my_node.public_ip,
my_node.private_ip, AppDashboard::PROXY_PORT)
HAProxy.start()
Djinn.log_info("HAProxy configured and started.")
else
Djinn.log_info("HAProxy already configured.")
end

if not Nginx.is_running?
Nginx.initialize_config()
Nginx.create_app_load_balancer_config(my_node.public_ip,
my_node.private_ip, AppDashboard::PROXY_PORT)
Nginx.start()
Djinn.log_info("Nginx configured and started.")
else
Djinn.log_info("Nginx already configured and running.")
end

# If we are in cloud mode, we should mount any volume containing our
# local state.
def mount_persistent_storage()
if my_node.disk
imc = InfrastructureManagerClient.new(@@secret)
begin
@@ -4338,6 +4311,31 @@ def initialize_server()
Djinn.log_run("mv /var/lib/rabbitmq #{PERSISTENT_MOUNT_POINT}")
Djinn.log_run("ln -s #{PERSISTENT_MOUNT_POINT}/rabbitmq /var/lib/rabbitmq")
end
end

# This function performs basic setup ahead of starting the API services.
def initialize_server()
head_node_ip = get_public_ip(@options['hostname'])

if not HAProxy.is_running?
HAProxy.initialize_config()
HAProxy.create_app_load_balancer_config(my_node.public_ip,
my_node.private_ip, AppDashboard::PROXY_PORT)
HAProxy.start()
Djinn.log_info("HAProxy configured and started.")
else
Djinn.log_info("HAProxy already configured.")
end

if not Nginx.is_running?
Nginx.initialize_config()
Nginx.create_app_load_balancer_config(my_node.public_ip,
my_node.private_ip, AppDashboard::PROXY_PORT)
Nginx.start()
Djinn.log_info("Nginx configured and started.")
else
Djinn.log_info("Nginx already configured and running.")
end

# The volume is mounted; let's finish the configuration of static files.
configure_db_nginx()
@@ -4391,14 +4389,16 @@ def set_appcontroller_monit()
'EC2_HOME' => ENV['EC2_HOME'],
'JAVA_HOME' => ENV['JAVA_HOME']
}
start = "/usr/bin/ruby -w #{APPSCALE_HOME}/AppController/djinnServer.rb"
start = "/usr/sbin/service appscale-controller start"
stop = "/usr/sbin/service appscale-controller stop"
match_cmd = "/usr/bin/ruby -w /root/appscale/AppController/djinnServer.rb"

# Let's make sure we don't have 2 jobs monitoring the controller.
FileUtils.rm_rf("/etc/monit/conf.d/controller-17443.cfg")

begin
MonitInterface.start(:controller, start, stop, SERVER_PORT, env)
MonitInterface.start(:controller, start, stop, SERVER_PORT, env,
nil, nil, match_cmd)
rescue Exception => e
Djinn.log_warn("Failed to set local AppController monit: retrying.")
retry
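A note on the match_cmd argument above: the controller is now started and stopped through the appscale-controller service, but the long-running process monit has to watch is still the ruby djinnServer.rb that the service launches, so monit presumably needs a separate pattern to match against the process table. A stanza built from these values might look roughly like the heredoc below (an illustrative sketch only, not the actual output of MonitInterface):

    # Illustrative only: roughly the monit stanza these values could map to.
    stanza = <<-MONIT
      check process controller matching "/usr/bin/ruby -w /root/appscale/AppController/djinnServer.rb"
        start program = "/usr/sbin/service appscale-controller start"
        stop program  = "/usr/sbin/service appscale-controller stop"
    MONIT
    puts stanza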
4 changes: 2 additions & 2 deletions AppController/scripts/appcontroller
@@ -26,8 +26,8 @@ do_start()
exit 0
fi

# If we start from boot, we need to clear the state.
rm -rf /etc/appscale/zookeeper_locations.json
# If we start from boot, we need to clear the monit state. The
# AppController will rebuild it.
rm -rf /etc/monit/conf.d/appscale*cfg

log_daemon_msg "Starting system $DAEMON_NAME daemon"
