Skip to content

Commit

Permalink
[haproxy] Fix KeyError when an unknown status is found (#2681)
Browse files Browse the repository at this point in the history
* [haproxy] Fix `KeyError` when an unknown status is found

An unknown status (i.e. one that's not listed in
`Services.ALL_STATUSES`) would make the check fail with a `KeyError`
because that status wasn't in the initialized `dict` of statuses.

The check now sends unknown statuses as:
* an `UNAVAILABLE` status if `collate_status_tags_per_host` is enabled,
in order to keep the number of contexts low
* the unknown status as-is if `collate_status_tags_per_host` is left
disabled, in order to give maximum visibility into the actual statuses

This commit also improves the parsing of status suffixes:
* Introduce normalization of statuses to regard `UP`, `UP 1/2`,
and `UP (agent)` statuses as the same `up` status. This fixes a
regression.
* Use the same logic for the `haproxy.backend_hosts` metric

See related documentation (thanks @alexism for the links):
https://cbonte.github.io/haproxy-dconv/configuration-1.5.html#9.1 :

`17. status [LFBS]: status (UP/DOWN/NOLB/MAINT/MAINT(via)...)`

More precisely, the haproxy code shows that 2 suffixes might be used: ` %d/%d` and ` (agent)`
https://github.com/haproxy/haproxy/blob/a5de024d42c4113fc6e189ea1d0ba6335219e151/src/dumpstats.c#L4117-L4129

Misc:
* move the remaining mock test in `integration/` to `mock/`
  • Loading branch information
olivielpeau committed Jul 22, 2016
1 parent bba3833 commit 9640c1d
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 92 deletions.
89 changes: 62 additions & 27 deletions checks.d/haproxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

# stdlib
from collections import defaultdict
import copy
import re
import time

Expand All @@ -23,26 +24,37 @@ class Services(object):
BACKEND = 'BACKEND'
FRONTEND = 'FRONTEND'
ALL = (BACKEND, FRONTEND)

# Statuses that we normalize to and that are reported by
# `haproxy.count_per_status` by default (unless `collate_status_tags_per_host` is enabled)
ALL_STATUSES = (
'up', 'open', 'down', 'maint', 'nolb'
)

AVAILABLE = 'available'
UNAVAILABLE = 'unavailable'
COLLATED_STATUSES = (AVAILABLE, UNAVAILABLE)

STATUS_MAP = {
BACKEND_STATUS_TO_COLLATED = {
'up': AVAILABLE,
'down': UNAVAILABLE,
'maint': UNAVAILABLE,
'nolb': UNAVAILABLE,
}

STATUS_TO_COLLATED = {
'up': AVAILABLE,
'open': AVAILABLE,
'down': UNAVAILABLE,
'maint': UNAVAILABLE,
'nolb': UNAVAILABLE
'nolb': UNAVAILABLE,
}

STATUSES_TO_SERVICE_CHECK = {
'UP': AgentCheck.OK,
'DOWN': AgentCheck.CRITICAL,
'no check': AgentCheck.UNKNOWN,
'MAINT': AgentCheck.OK,
STATUS_TO_SERVICE_CHECK = {
'up': AgentCheck.OK,
'down': AgentCheck.CRITICAL,
'no_check': AgentCheck.UNKNOWN,
'maint': AgentCheck.OK,
}


Expand Down Expand Up @@ -229,6 +241,10 @@ def _line_to_dict(self, fields, line):
except Exception:
pass
data_dict[fields[i]] = val

if 'status' in data_dict:
data_dict['status'] = self._normalize_status(data_dict['status'])

return data_dict

def _update_data_dict(self, data_dict, back_or_front):
Expand Down Expand Up @@ -287,9 +303,23 @@ def _tag_match_patterns(self, tag, filters):
return True
return False

@staticmethod
def _normalize_status(status):
    """
    Map a raw HAProxy status string onto one of `Services.ALL_STATUSES`.

    Falls back to a tag-friendly form (lowercased, spaces replaced by
    underscores) of the raw status when no known status matches, so
    unknown statuses can still be used as metric tags.
    ex: 'UP 1/2'   -> 'up'
        'no check' -> 'no_check'
    """
    tag_friendly = status.lower().replace(" ", "_")
    # HAProxy may append suffixes such as ' 1/2' or ' (agent)', so a
    # prefix match against the known statuses is sufficient.
    return next(
        (known for known in Services.ALL_STATUSES if tag_friendly.startswith(known)),
        tag_friendly,
    )

def _process_backend_hosts_metric(self, hosts_statuses, services_incl_filter=None,
services_excl_filter=None):
agg_statuses = defaultdict(lambda: {'available': 0, 'unavailable': 0})
agg_statuses = defaultdict(lambda: {status: 0 for status in Services.COLLATED_STATUSES})
for host_status, count in hosts_statuses.iteritems():
try:
service, hostname, status = host_status
Expand All @@ -298,11 +328,10 @@ def _process_backend_hosts_metric(self, hosts_statuses, services_incl_filter=Non

if self._is_service_excl_filtered(service, services_incl_filter, services_excl_filter):
continue
status = status.lower()
if 'up' in status:
agg_statuses[service]['available'] += count
elif 'down' in status or 'maint' in status or 'nolb' in status:
agg_statuses[service]['unavailable'] += count

collated_status = Services.BACKEND_STATUS_TO_COLLATED.get(status)
if collated_status:
agg_statuses[service][collated_status] += count
else:
# create the entries for this service anyway
agg_statuses[service]
Expand All @@ -311,11 +340,11 @@ def _process_backend_hosts_metric(self, hosts_statuses, services_incl_filter=Non
tags = ['service:%s' % service]
self.gauge(
'haproxy.backend_hosts',
agg_statuses[service]['available'],
agg_statuses[service][Services.AVAILABLE],
tags=tags + ['available:true'])
self.gauge(
'haproxy.backend_hosts',
agg_statuses[service]['unavailable'],
agg_statuses[service][Services.UNAVAILABLE],
tags=tags + ['available:false'])
return agg_statuses

Expand All @@ -324,10 +353,15 @@ def _process_status_metric(self, hosts_statuses, collect_status_metrics_by_host,
collate_status_tags_per_host=False, count_status_by_service=True):
agg_statuses_counter = defaultdict(lambda: {status: 0 for status in Services.COLLATED_STATUSES})

# Initialize `statuses_counter`: every value is a defaultdict initialized with the correct
# keys, which depends on the `collate_status_tags_per_host` option
reported_statuses = Services.ALL_STATUSES
if collate_status_tags_per_host:
reported_statuses = Services.COLLATED_STATUSES
statuses_counter = defaultdict(lambda: {status: 0 for status in reported_statuses})
reported_statuses_dict = defaultdict(int)
for reported_status in reported_statuses:
reported_statuses_dict[reported_status] = 0
statuses_counter = defaultdict(lambda: copy.copy(reported_statuses_dict))

for host_status, count in hosts_statuses.iteritems():
hostname = None
Expand All @@ -338,7 +372,6 @@ def _process_status_metric(self, hosts_statuses, collect_status_metrics_by_host,
self.warning('`collect_status_metrics_by_host` is enabled but no host info\
could be extracted from HAProxy stats endpoint for {0}'.format(service))
service, status = host_status
status = status.lower()

if self._is_service_excl_filtered(service, services_incl_filter, services_excl_filter):
continue
Expand All @@ -351,7 +384,8 @@ def _process_status_metric(self, hosts_statuses, collect_status_metrics_by_host,

counter_status = status
if collate_status_tags_per_host:
counter_status = Services.STATUS_MAP.get(status, status)
# An unknown status will be sent as UNAVAILABLE
counter_status = Services.STATUS_TO_COLLATED.get(status, Services.UNAVAILABLE)
statuses_counter[tuple(tags)][counter_status] += count

# Compute aggregates with collated statuses. If collate_status_tags_per_host is enabled we
Expand All @@ -360,7 +394,8 @@ def _process_status_metric(self, hosts_statuses, collect_status_metrics_by_host,
agg_tags = []
if count_status_by_service:
agg_tags.append('service:%s' % service)
agg_statuses_counter[tuple(agg_tags)][Services.STATUS_MAP.get(status, status)] += count
# An unknown status will be sent as UNAVAILABLE
agg_statuses_counter[tuple(agg_tags)][Services.STATUS_TO_COLLATED.get(status, Services.UNAVAILABLE)] += count

for tags, count_per_status in statuses_counter.iteritems():
for status, count in count_per_status.iteritems():
Expand Down Expand Up @@ -421,7 +456,7 @@ def _process_event(self, data, url, services_incl_filter=None,
self.host_status[url][key] = data['status']
return

if status != data['status'] and data['status'] in ('UP', 'DOWN'):
if status != data['status'] and data['status'] in ('up', 'down'):
# If the status of a host has changed, we trigger an event
try:
lastchg = int(data['lastchg'])
Expand All @@ -440,15 +475,15 @@ def _process_event(self, data, url, services_incl_filter=None,

def _create_event(self, status, hostname, lastchg, service_name, back_or_front):
HAProxy_agent = self.hostname.decode('utf-8')
if status == "DOWN":
if status == 'down':
alert_type = "error"
title = "%s reported %s:%s %s" % (HAProxy_agent, service_name, hostname, status)
title = "%s reported %s:%s %s" % (HAProxy_agent, service_name, hostname, status.upper())
else:
if status == "UP":
if status == "up":
alert_type = "success"
else:
alert_type = "info"
title = "%s reported %s:%s back and %s" % (HAProxy_agent, service_name, hostname, status)
title = "%s reported %s:%s back and %s" % (HAProxy_agent, service_name, hostname, status.upper())

tags = ["service:%s" % service_name]
if back_or_front == Services.BACKEND:
Expand All @@ -467,7 +502,7 @@ def _create_event(self, status, hostname, lastchg, service_name, back_or_front):
def _process_service_check(self, data, url, tag_by_host=False,
services_incl_filter=None, services_excl_filter=None):
''' Report a service check, tagged by the service and the backend.
Statuses are defined in `STATUSES_TO_SERVICE_CHECK` mapping.
Statuses are defined in `STATUS_TO_SERVICE_CHECK` mapping.
'''
service_name = data['pxname']
status = data['status']
Expand All @@ -478,13 +513,13 @@ def _process_service_check(self, data, url, tag_by_host=False,
services_excl_filter):
return

if status in Services.STATUSES_TO_SERVICE_CHECK:
if status in Services.STATUS_TO_SERVICE_CHECK:
service_check_tags = ["service:%s" % service_name]
hostname = data['svname']
if data['back_or_front'] == Services.BACKEND:
service_check_tags.append('backend:%s' % hostname)

status = Services.STATUSES_TO_SERVICE_CHECK[status]
status = Services.STATUS_TO_SERVICE_CHECK[status]
message = "%s reported %s:%s %s" % (haproxy_hostname, service_name,
hostname, status)
self.service_check(self.SERVICE_CHECK_NAME, status, message=message,
Expand Down
60 changes: 0 additions & 60 deletions tests/checks/integration/test_haproxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,63 +205,3 @@ def test_open_config(self):
self.assertEquals(self.service_checks[0]['host_name'], '')

self.coverage_report()

# Keeping a mocked test since it tests the internal
# process of service checks
def test_count_per_statuses(self):
# Mocked test: feeds a canned HAProxy CSV stats payload into
# `_process_data` and checks the internal per-status bookkeeping
# (`hosts_statuses`) plus the `_process_backend_hosts_metric` aggregates.
from collections import defaultdict
self.run_check(self.config)

# Canned stats CSV: frontends 'a' and 'b' (OPEN), backend 'b' with five
# hosts in UP/DOWN/MAINT states. The string must stay byte-identical to
# the real HAProxy stats endpoint format.
data = """# pxname,svname,qcur,qmax,scur,smax,slim,stot,bin,bout,dreq,dresp,ereq,econ,eresp,wretr,wredis,status,weight,act,bck,chkfail,chkdown,lastchg,downtime,qlimit,pid,iid,sid,throttle,lbtot,tracked,type,rate,rate_lim,rate_max,check_status,check_code,check_duration,hrsp_1xx,hrsp_2xx,hrsp_3xx,hrsp_4xx,hrsp_5xx,hrsp_other,hanafail,req_rate,req_rate_max,req_tot,cli_abrt,srv_abrt,
a,FRONTEND,,,1,2,12,1,11,11,0,0,0,,,,,OPEN,,,,,,,,,1,1,0,,,,0,1,0,2,,,,0,1,0,0,0,0,,1,1,1,,,
a,BACKEND,0,0,0,0,12,0,11,11,0,0,,0,0,0,0,UP,0,0,0,,0,1221810,0,,1,1,0,,0,,1,0,,0,,,,0,0,0,0,0,0,,,,,0,0,
b,FRONTEND,,,1,2,12,11,11,0,0,0,0,,,,,OPEN,,,,,,,,,1,2,0,,,,0,0,0,1,,,,,,,,,,,0,0,0,,,
b,i-1,0,0,0,1,,1,1,0,,0,,0,0,0,0,UP,1,1,0,0,1,1,30,,1,3,1,,70,,2,0,,1,1,,0,,,,,,,0,,,,0,0,
b,i-2,0,0,1,1,,1,1,0,,0,,0,0,0,0,UP,1,1,0,0,0,1,0,,1,3,2,,71,,2,0,,1,1,,0,,,,,,,0,,,,0,0,
b,i-3,0,0,0,1,,1,1,0,,0,,0,0,0,0,UP,1,1,0,0,0,1,0,,1,3,3,,70,,2,0,,1,1,,0,,,,,,,0,,,,0,0,
b,i-4,0,0,0,1,,1,1,0,,0,,0,0,0,0,DOWN,1,1,0,0,0,1,0,,1,3,3,,70,,2,0,,1,1,,0,,,,,,,0,,,,0,0,
b,i-5,0,0,0,1,,1,1,0,,0,,0,0,0,0,MAINT,1,1,0,0,0,1,0,,1,3,3,,70,,2,0,,1,1,,0,,,,,,,0,,,,0,0,
b,BACKEND,0,0,1,2,0,421,1,0,0,0,,0,0,0,0,UP,6,6,0,,0,1,0,,1,3,0,,421,,1,0,,1,,,,,,,,,,,,,,0,0,
""".split('\n')

# per service
# Statuses keyed by (service, status) when not collecting by host.
# NOTE(review): keys here use the raw upper-case statuses ('OPEN', 'UP');
# confirm against the check's status-normalization behavior.
self.check._process_data(data, True, False, collect_status_metrics=True,
collect_status_metrics_by_host=False)

expected_hosts_statuses = defaultdict(int)
expected_hosts_statuses[('b', 'OPEN')] = 1
expected_hosts_statuses[('b', 'UP')] = 3
expected_hosts_statuses[('b', 'DOWN')] = 1
expected_hosts_statuses[('b', 'MAINT')] = 1
expected_hosts_statuses[('a', 'OPEN')] = 1
self.assertEquals(self.check.hosts_statuses, expected_hosts_statuses)

# backend hosts
# UP hosts count as 'available'; DOWN/MAINT count as 'unavailable'.
# Frontend-only service 'a' still gets a (zeroed) entry.
agg_statuses = self.check._process_backend_hosts_metric(expected_hosts_statuses)
expected_agg_statuses = {
'a': {'available': 0, 'unavailable': 0},
'b': {'available': 3, 'unavailable': 2},
}
self.assertEquals(expected_agg_statuses, dict(agg_statuses))

# with collect_aggregates_only set to True
# Aggregation mode must not change the per-service status counts.
self.check._process_data(data, True, True, collect_status_metrics=True,
collect_status_metrics_by_host=False)
self.assertEquals(self.check.hosts_statuses, expected_hosts_statuses)

# per host
# Statuses keyed by (service, hostname, status) when collecting by host.
self.check._process_data(data, True, False, collect_status_metrics=True,
collect_status_metrics_by_host=True)
expected_hosts_statuses = defaultdict(int)
expected_hosts_statuses[('b', 'FRONTEND', 'OPEN')] = 1
expected_hosts_statuses[('a', 'FRONTEND', 'OPEN')] = 1
expected_hosts_statuses[('b', 'i-1', 'UP')] = 1
expected_hosts_statuses[('b', 'i-2', 'UP')] = 1
expected_hosts_statuses[('b', 'i-3', 'UP')] = 1
expected_hosts_statuses[('b', 'i-4', 'DOWN')] = 1
expected_hosts_statuses[('b', 'i-5', 'MAINT')] = 1
self.assertEquals(self.check.hosts_statuses, expected_hosts_statuses)

# Again, aggregates-only mode must yield the same per-host counts.
self.check._process_data(data, True, True, collect_status_metrics=True,
collect_status_metrics_by_host=True)
self.assertEquals(self.check.hosts_statuses, expected_hosts_statuses)
Loading

0 comments on commit 9640c1d

Please sign in to comment.