Skip to content

Commit

Permalink
Respawn glance services on unexpected death.
Browse files Browse the repository at this point in the history
Fixes bug 923894

Add new '--respawn' option to cause glance services launched via
glance-control to be monitored for unexpected death and resuscitated
as necessary.

This option will cause glance-control itself to remain running.

Deliberately stopped services are not respawned, nor are rapidly
bouncing services (where process death occurred within one second
of the last launch).

Change-Id: I1a9a99cce9b6ad43274836e39ebe4f29c19455af
  • Loading branch information
Eoghan Glynn committed Feb 8, 2012
1 parent eea25b4 commit 8043962
Show file tree
Hide file tree
Showing 6 changed files with 303 additions and 69 deletions.
138 changes: 94 additions & 44 deletions bin/glance-control
Expand Up @@ -67,6 +67,17 @@ And command is one of:
And CONFPATH is the optional configuration file to use."""


def gated_by(predicate):
    """Return a decorator that enables a function only if predicate is truthy.

    ``predicate`` is an already-evaluated value, not a callable: it is
    captured when the decorator is created and its truthiness is tested
    on every call of the decorated function.

    :param predicate: value whose truthiness gates the decorated function
    :returns: a decorator; the function it produces calls through to the
              original (forwarding positional and keyword arguments) when
              ``predicate`` is truthy, and otherwise returns None without
              calling it
    """
    # Imported locally so this helper does not depend on the module's
    # import block.
    import functools

    def wrap(f):
        # Preserve the wrapped function's name and docstring.
        @functools.wraps(f)
        def wrapped_f(*args, **kwargs):
            if predicate:
                return f(*args, **kwargs)
            return None
        return wrapped_f
    return wrap


def pid_files(server, conf):
pid_files = []
if conf.pid_file:
Expand All @@ -80,25 +91,26 @@ def pid_files(server, conf):
yield pid_file, pid


def do_start(server, conf, args):
server_type = '-'.join(server.split('-')[:-1])

for pid_file, pid in pid_files(server, conf):
if os.path.exists('/proc/%s' % pid):
print "%s appears to already be running: %s" % (server, pid_file)
return
else:
print "Removing stale pid file %s" % pid_file
os.unlink(pid_file)
def do_start(verb, server, conf, args):
if verb != 'Respawn':
for pid_file, pid in pid_files(server, conf):
if os.path.exists('/proc/%s' % pid):
print "%s appears to already be running: %s" % \
(server, pid_file)
return
else:
print "Removing stale pid file %s" % pid_file
os.unlink(pid_file)

try:
resource.setrlimit(resource.RLIMIT_NOFILE,
(MAX_DESCRIPTORS, MAX_DESCRIPTORS))
resource.setrlimit(resource.RLIMIT_DATA,
(MAX_MEMORY, MAX_MEMORY))
except ValueError:
print "Unable to increase file descriptor limit. Running as non-root?"
os.environ['PYTHON_EGG_CACHE'] = '/tmp'
try:
resource.setrlimit(resource.RLIMIT_NOFILE,
(MAX_DESCRIPTORS, MAX_DESCRIPTORS))
resource.setrlimit(resource.RLIMIT_DATA,
(MAX_MEMORY, MAX_MEMORY))
except ValueError:
action = 'increase file descriptor limit'
print 'Unable to %s. Running as non-root?' % action
os.environ['PYTHON_EGG_CACHE'] = '/tmp'

def write_pid_file(pid_file, pid):
dir, file = os.path.split(pid_file)
Expand All @@ -113,18 +125,6 @@ def do_start(server, conf, args):
fp.write('%d\n' % pid)
fp.close()

def await_child(pid):
if conf.await_child:
bail_time = time.time() + conf.await_child
while time.time() < bail_time:
reported_pid, status = os.waitpid(pid, os.WNOHANG)
if reported_pid == pid:
global exitcode
# the exit code is encoded in 2nd least significant byte
exitcode = status >> 8
break
time.sleep(0.05)

def redirect_to_null(fds):
with open(os.devnull, 'r+b') as nullfile:
for desc in fds: # close fds
Expand Down Expand Up @@ -154,14 +154,15 @@ def do_start(server, conf, args):
else:
redirect_to_null(output)

@gated_by(conf.capture_output)
def close_stdio_on_exec():
fds = [sys.stdin.fileno(), sys.stdout.fileno(), sys.stderr.fileno()]
for desc in fds: # set close on exec flag
fcntl.fcntl(desc, fcntl.F_SETFD, fcntl.FD_CLOEXEC)

def launch(pid_file, conf_file=None):
args = [server]
print 'Starting %s' % server,
print '%sing %s' % (verb, server),
if conf_file:
args += ['--config-file', conf_file]
print 'with %s' % conf_file,
Expand All @@ -176,23 +177,37 @@ def do_start(server, conf, args):
try:
os.execlp('%s' % server, *args)
except OSError, e:
sys.exit('unable to launch %s. Got error: %s'
% (server, "%s" % e))
msg = 'unable to launch %s. Got error: %s' % (server, e)
sys.exit(msg)
sys.exit(0)
else:
write_pid_file(pid_file, pid)
await_child(pid)
return pid

if not conf.pid_file:
pid_file = '/var/run/glance/%s.pid' % server
else:
pid_file = os.path.abspath(conf.pid_file)
@gated_by(conf.await_child)
def await_child(pid):
bail_time = time.time() + conf.await_child
while time.time() < bail_time:
reported_pid, status = os.waitpid(pid, os.WNOHANG)
if reported_pid == pid:
global exitcode
exitcode = os.WEXITSTATUS(status)
break
time.sleep(0.05)

pid_file = get_pid_file(server, conf)

conf_file = None
if args and os.path.exists(args[0]):
conf_file = os.path.abspath(os.path.expanduser(args[0]))

launch(pid_file, conf_file)
return launch(pid_file, conf_file)


def get_pid_file(pid, conf):
    """Return the path of the pid file for a server.

    :param pid: name of the server (callers pass their ``server`` value);
                despite the parameter name this is not a process id -- the
                name is kept so the signature stays unchanged
    :param conf: configuration object; its ``pid_file`` attribute, when
                 truthy, overrides the default location
    :returns: the absolute form of ``conf.pid_file`` if set, otherwise
              ``/var/run/glance/<server>.pid``
    """
    # Use the argument rather than a module-level ``server`` name, which
    # would be whatever an unrelated loop last assigned (or undefined).
    server = pid
    if conf.pid_file:
        return os.path.abspath(conf.pid_file)
    return '/var/run/glance/%s.pid' % server


def do_stop(server, conf, args, graceful=False):
Expand All @@ -205,15 +220,15 @@ def do_stop(server, conf, args, graceful=False):
pfiles = pid_files(server, conf)
for pid_file, pid in pfiles:
did_anything = True
try:
os.unlink(pid_file)
except OSError:
pass
try:
print 'Stopping %s pid: %s signal: %s' % (server, pid, sig)
os.kill(pid, sig)
except OSError:
print "Process %d not running" % pid
try:
os.unlink(pid_file)
except OSError:
pass
for pid_file, pid in pfiles:
for _junk in xrange(150): # 15 seconds
if not os.path.exists('/proc/%s' % pid):
Expand Down Expand Up @@ -246,11 +261,22 @@ if __name__ == '__main__':
default=False,
help='Capture stdout/err in syslog '
'instead of discarding'),
cfg.BoolOpt('respawn',
default=False,
help='Restart service on unexpected death'),
]
conf.register_cli_opts(opts)

args = conf()

@gated_by(conf.await_child)
@gated_by(conf.respawn)
def mutually_exclusive():
    """Abort when both --await-child and --respawn were requested.

    The two stacked gates mean the body only runs when both option
    values are truthy.
    """
    # sys.exit(msg) writes msg followed by a newline to stderr and exits
    # with status 1; a bare stderr.write left the message unterminated.
    sys.exit('--await-child and --respawn are mutually exclusive')

mutually_exclusive()

if len(args) < 2:
conf.print_usage()
sys.exit(1)
Expand All @@ -276,9 +302,33 @@ if __name__ == '__main__':
"command in this list: %(command_list)s" % locals())
sys.exit(msg)

@gated_by(conf.respawn)
def anticipate_respawn(children):
while children:
pid, status = os.wait()
if pid in children:
(server, conf, args) = children.pop(pid)
pid_file = get_pid_file(server, conf)
running = os.path.exists(pid_file)
one_second_ago = time.time() - 1
bouncing = (running and
os.path.getmtime(pid_file) >= one_second_ago)
if running and not bouncing:
args = (server, conf, args)
new_pid = do_start('Respawn', *args)
children[new_pid] = args
else:
rsn = 'bouncing' if bouncing else 'deliberately stopped'
print 'Supressed respawn as %s was %s.' % (server, rsn)

if command == 'start':
children = {}
for server in servers:
do_start(server, conf, args)
args = (server, conf, args)
pid = do_start('Start', *args)
children[pid] = args

anticipate_respawn(children)

if command == 'stop':
for server in servers:
Expand All @@ -292,7 +342,7 @@ if __name__ == '__main__':
for server in servers:
do_stop(server, conf, args)
for server in servers:
do_start(server, conf, args)
do_start('Restart', server, conf, args)

if command == 'reload' or command == 'force-reload':
for server in servers:
Expand Down
12 changes: 12 additions & 0 deletions doc/source/controllingservers.rst
Expand Up @@ -177,6 +177,18 @@ Glance server programs, and you can specify (as the example above shows)
a configuration file when starting the server.


In order for your launched glance service to be monitored for unexpected death
and respawned if necessary, use the following option:


$ sudo glance-control [service] start --respawn ...


Note that this will cause ``glance-control`` itself to remain running. Also note
that deliberately stopped services are not respawned, nor are rapidly bouncing
services (where process death occurred within one second of the last launch).


By default, output from glance services is discarded when launched with ``glance-control``.
In order to capture such output via syslog, use the following option:

Expand Down
50 changes: 30 additions & 20 deletions glance/tests/functional/__init__.py
Expand Up @@ -129,7 +129,7 @@ def override_conf(filepath, base, override):

return self.conf_file_name

def start(self, expected_exitcode=0, **kwargs):
def start(self, expect_exit=True, expected_exitcode=0, **kwargs):
"""
Starts the server.
Expand All @@ -147,6 +147,7 @@ def start(self, expected_exitcode=0, **kwargs):
return execute(cmd,
no_venv=self.no_venv,
exec_env=self.exec_env,
expect_exit=expect_exit,
expected_exitcode=expected_exitcode)

def stop(self):
Expand All @@ -156,7 +157,8 @@ def stop(self):
cmd = ("%(server_control)s %(server_name)s stop "
"%(conf_file_name)s --pid-file=%(pid_file)s"
% self.__dict__)
return execute(cmd, no_venv=self.no_venv, exec_env=self.exec_env)
return execute(cmd, no_venv=self.no_venv, exec_env=self.exec_env,
expect_exit=True)


class ApiServer(Server):
Expand Down Expand Up @@ -449,6 +451,7 @@ def cleanup(self):
def start_server(self,
server,
expect_launch,
expect_exit=True,
expected_exitcode=0,
**kwargs):
"""
Expand All @@ -460,18 +463,22 @@ def start_server(self,
:param server: the server to launch
:param expect_launch: true iff the server is expected to
successfully start
:param expect_exit: true iff the launched server is expected
to exit in a timely fashion
:param expected_exitcode: expected exitcode from the launcher
"""
self.cleanup()

# Start up the requested server
exitcode, out, err = server.start(expected_exitcode=expected_exitcode,
exitcode, out, err = server.start(expect_exit=expect_exit,
expected_exitcode=expected_exitcode,
**kwargs)
if expect_exit:
self.assertEqual(expected_exitcode, exitcode,
"Failed to spin up the requested server. "
"Got: %s" % err)

self.assertEqual(expected_exitcode, exitcode,
"Failed to spin up the requested server. "
"Got: %s" % err)
self.assertTrue(re.search("Starting glance-[a-z]+ with", out))
self.assertTrue(re.search("Starting glance-[a-z]+ with", out))

self.wait_for_servers([server.bind_port], expect_launch)

Expand Down Expand Up @@ -551,6 +558,19 @@ def wait_for_servers(self, ports, expect_launch=True, timeout=3):
time.sleep(0.05)
self.assertFalse(expect_launch, "Unexpected server launch status")

def stop_server(self, server, name):
    """
    Called to stop a single server in a normal fashion using the
    glance-control stop method to gracefully shut the server down.

    :param server: the server to stop
    :param name: human-readable name of the server (e.g. 'API server'),
                 used only in the failure message
    """
    # Spin down the requested server
    exitcode, out, err = server.stop()
    # name comes first and err second, matching the placeholders; the
    # name already carries its own noun, so none is appended.
    self.assertEqual(0, exitcode,
                     "Failed to spin down the %s. Got: %s" %
                     (name, err))

def stop_servers(self):
"""
Called to stop the started servers in a normal fashion. Note
Expand All @@ -562,20 +582,10 @@ def stop_servers(self):
"""

# Spin down the API and default registry server
exitcode, out, err = self.api_server.stop()
self.assertEqual(0, exitcode,
"Failed to spin down the API server. "
"Got: %s" % err)

exitcode, out, err = self.registry_server.stop()
self.assertEqual(0, exitcode,
"Failed to spin down the Registry server. "
"Got: %s" % err)
self.stop_server(self.api_server, 'API server')
self.stop_server(self.registry_server, 'Registry server')
self.stop_server(self.scrubber_daemon, 'Scrubber daemon')

exitcode, out, err = self.scrubber_daemon.stop()
self.assertEqual(0, exitcode,
"Failed to spin down the Scrubber daemon. "
"Got: %s" % err)
# If all went well, then just remove the test directory.
# We only want to check the logs and stuff if something
# went wrong...
Expand Down
3 changes: 2 additions & 1 deletion glance/tests/functional/test_api.py
Expand Up @@ -21,7 +21,6 @@
import hashlib
import httplib2
import json
import os
import tempfile

from glance.common import utils
Expand Down Expand Up @@ -1293,6 +1292,8 @@ def test_unsupported_default_store(self):
"""
self.cleanup()
self.api_server.default_store = 'shouldnotexist'

# ensure failure exit code is available to assert on
self.api_server.server_control_options += ' --await-child=1'

# ensure that the API server fails to launch
Expand Down

0 comments on commit 8043962

Please sign in to comment.