Skip to content

Commit

Permalink
Prevent L3 agent looping calls from hanging
Browse files Browse the repository at this point in the history
This patch adopts several measures to prevent _sync_routers_task
and _rpc_loop from hanging because of subprocess.Popen.communicate
not returning.

1) Perform a sleep everytime a command is completed, similarly to
what is done in openstack.common.processutils.execute
2) Disable by default GARP, as kernel crashes caused by arping
have been observed
3) Prevent a non-critical keyerror in _router_removed from triggering
again a full sync, which might put the system under significant load.

This patch also adds debug log statements aimed at improving the
ability of debugging similar failures.

Change-Id: I003316bce0f38b7d2ea7d563b5a0a58676834398
Partial-Bug: 1224001
(cherry picked from commit 591ee00)
  • Loading branch information
salv-orlando authored and markmcclain committed Oct 8, 2013
1 parent b988984 commit bc50764
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 21 deletions.
21 changes: 16 additions & 5 deletions neutron/agent/l3_agent.py
Expand Up @@ -163,10 +163,9 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
help=_("TCP Port used by Neutron metadata namespace "
"proxy.")),
cfg.IntOpt('send_arp_for_ha',
default=3,
help=_("Send this many gratuitous ARPs for HA setup, "
"set it below or equal to 0 to disable this "
"feature.")),
default=0,
help=_("Send this many gratuitous ARPs for HA setup, if "
"less than or equal to 0, the feature is disabled")),
cfg.BoolOpt('use_namespaces', default=True,
help=_("Allow overlapping IP.")),
cfg.StrOpt('router_id', default='',
Expand Down Expand Up @@ -308,7 +307,11 @@ def _router_added(self, router_id, router):
self._spawn_metadata_proxy(ri)

def _router_removed(self, router_id):
ri = self.router_info[router_id]
ri = self.router_info.get(router_id)
if ri is None:
LOG.warn(_("Info for router %s were not found. "
"Skipping router removal"), router_id)
return
ri.router['gw_port'] = None
ri.router[l3_constants.INTERFACE_KEY] = []
ri.router[l3_constants.FLOATINGIP_KEY] = []
Expand Down Expand Up @@ -706,13 +709,16 @@ def _rpc_loop(self):
# so we can clear the value of updated_routers
# and removed_routers
try:
LOG.debug(_("Starting RPC loop for %d updated routers"),
len(self.updated_routers))
if self.updated_routers:
router_ids = list(self.updated_routers)
self.updated_routers.clear()
routers = self.plugin_rpc.get_routers(
self.context, router_ids)
self._process_routers(routers)
self._process_router_delete()
LOG.debug(_("RPC loop successfully completed"))
except Exception:
LOG.exception(_("Failed synchronizing routers"))
self.fullsync = True
Expand All @@ -732,6 +738,8 @@ def _router_ids(self):
def _sync_routers_task(self, context):
if self.services_sync:
super(L3NATAgent, self).process_services_sync(context)
LOG.debug(_("Starting _sync_routers_task - fullsync:%s"),
self.fullsync)
if not self.fullsync:
return
try:
Expand All @@ -744,6 +752,7 @@ def _sync_routers_task(self, context):
LOG.debug(_('Processing :%r'), routers)
self._process_routers(routers, all_routers=True)
self.fullsync = False
LOG.debug(_("_sync_routers_task successfully completed"))
except Exception:
LOG.exception(_("Failed synchronizing routers"))
self.fullsync = True
Expand Down Expand Up @@ -809,6 +818,7 @@ def __init__(self, host, conf=None):
self.heartbeat.start(interval=report_interval)

def _report_state(self):
LOG.debug(_("Report state task started"))
num_ex_gw_ports = 0
num_interfaces = 0
num_floating_ips = 0
Expand All @@ -832,6 +842,7 @@ def _report_state(self):
self.use_call)
self.agent_state.pop('start_flag', None)
self.use_call = False
LOG.debug(_("Report state task successfully completed"))
except AttributeError:
# This means the server does not support report_state
LOG.warn(_("Neutron server does not support state report."
Expand Down
38 changes: 22 additions & 16 deletions neutron/agent/linux/utils.py
Expand Up @@ -25,6 +25,7 @@
import tempfile

from eventlet.green import subprocess
from eventlet import greenthread

from neutron.common import utils
from neutron.openstack.common import log as logging
Expand All @@ -43,22 +44,27 @@ def execute(cmd, root_helper=None, process_input=None, addl_env=None,
env = os.environ.copy()
if addl_env:
env.update(addl_env)
obj = utils.subprocess_popen(cmd, shell=False,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env)

_stdout, _stderr = (process_input and
obj.communicate(process_input) or
obj.communicate())
obj.stdin.close()
m = _("\nCommand: %(cmd)s\nExit code: %(code)s\nStdout: %(stdout)r\n"
"Stderr: %(stderr)r") % {'cmd': cmd, 'code': obj.returncode,
'stdout': _stdout, 'stderr': _stderr}
LOG.debug(m)
if obj.returncode and check_exit_code:
raise RuntimeError(m)
try:
obj = utils.subprocess_popen(cmd, shell=False,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env)
_stdout, _stderr = (process_input and
obj.communicate(process_input) or
obj.communicate())
obj.stdin.close()
m = _("\nCommand: %(cmd)s\nExit code: %(code)s\nStdout: %(stdout)r\n"
"Stderr: %(stderr)r") % {'cmd': cmd, 'code': obj.returncode,
'stdout': _stdout, 'stderr': _stderr}
LOG.debug(m)
if obj.returncode and check_exit_code:
raise RuntimeError(m)
finally:
# NOTE(termie): this appears to be necessary to let the subprocess
# call clean something up in between calls, without
# it two execute calls in a row hangs the second one
greenthread.sleep(0)

return return_stderr and (_stdout, _stderr) or _stdout

Expand Down
1 change: 1 addition & 0 deletions neutron/tests/unit/test_l3_agent.py
Expand Up @@ -46,6 +46,7 @@ def setUp(self):
self.conf.set_override('router_id', 'fake_id')
self.conf.set_override('interface_driver',
'neutron.agent.linux.interface.NullDriver')
self.conf.set_override('send_arp_for_ha', 1)
self.conf.root_helper = 'sudo'

self.device_exists_p = mock.patch(
Expand Down

0 comments on commit bc50764

Please sign in to comment.