Skip to content

Commit

Permalink
Add a varnish service check.
Browse files Browse the repository at this point in the history
The service check uses the `varnishadm debug.health` command to
get the health of available backends. One check per backend will
be submitted, tagged by the backend name.

The BIG question here is how we will let users enable this because
the varnishadm command requires access to the secret key which has
root permissions by default (and is owned by root).
  • Loading branch information
conorbranagan committed Dec 10, 2014
1 parent 27d986b commit 63453b0
Show file tree
Hide file tree
Showing 3 changed files with 193 additions and 67 deletions.
187 changes: 133 additions & 54 deletions checks.d/varnish.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
# stdlib
import xml.parsers.expat # python 2.4 compatible
from collections import defaultdict
import re
import subprocess
import xml.parsers.expat # python 2.4 compatible

# project
from checks import AgentCheck


class BackendStatus(object):
    """Names of the backend health states and their service-check mapping."""

    HEALTHY = 'healthy'
    SICK = 'sick'
    ALL = (HEALTHY, SICK)

    @classmethod
    def to_check_status(cls, status):
        """Translate a backend state string into an AgentCheck status.

        ``HEALTHY`` maps to ``AgentCheck.OK``, ``SICK`` maps to
        ``AgentCheck.CRITICAL`` and any other value maps to
        ``AgentCheck.UNKNOWN``.
        """
        if status == cls.HEALTHY:
            return AgentCheck.OK
        # Anything that is neither healthy nor sick is reported as unknown.
        return AgentCheck.CRITICAL if status == cls.SICK else AgentCheck.UNKNOWN

class Varnish(AgentCheck):
SERVICE_CHECK_NAME = 'varnish.backend_healthy'

# XML parsing bits, a.k.a. Kafka in Code
def _reset(self):
self._current_element = ""
Expand Down Expand Up @@ -47,39 +64,6 @@ def _char_data(self, data):
self._current_str = data

def check(self, instance):
"""Extract stats from varnishstat -x
The text option (-1) is not reliable enough when counters get large.
VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615
2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h
Bitmaps are not supported.
<varnishstat>
<stat>
<name>fetch_304</name>
<value>0</value>
<flag>a</flag>
<description>Fetch no body (304)</description>
</stat>
<stat>
<name>n_sess_mem</name>
<value>334</value>
<flag>i</flag>
<description>N struct sess_mem</description>
</stat>
<stat>
<type>LCK</type>
<ident>vcl</ident>
<name>creat</name>
<value>1</value>
<flag>a</flag>
<description>Created locks</description>
</stat>
</varnishstat>
"""
# Not configured? Not a problem.
if instance.get("varnishstat", None) is None:
raise Exception("varnishstat is not configured")
Expand All @@ -88,16 +72,47 @@ def check(self, instance):
tags = []
else:
tags = list(set(tags))
varnishstat_path = instance.get("varnishstat")
name = instance.get('name')

# Get version and version-specific args from varnishstat -V.
version, use_xml = self._get_version_info(varnishstat_path)

# Parse metrics from varnishstat.
# XML output (-x) is used when the detected version supports it; otherwise
# fall back to the one-shot text listing (-1).
arg = '-x' if use_xml else '-1'
cmd = [varnishstat_path, arg]

# Every metric is tagged with the varnish instance name; -n selects that
# instance when one is configured.
if name is not None:
    cmd.extend(['-n', name])
    tags += [u'varnish_name:%s' % name]
else:
    tags += [u'varnish_name:default']
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
output, error = proc.communicate()
if error:
    self.log.error(error)
# The parser consumes the captured stdout of varnishstat, not the path to
# the binary, and the method is named _parse_varnishstat.
self._parse_varnishstat(output, use_xml, tags)

# Parse service checks from varnishadm.
# The section is opt-in: it only runs when a varnishadm path is configured.
varnishadm_path = instance.get('varnishadm')
if varnishadm_path:
    secretfile_path = instance.get('secretfile', '/etc/varnish/secret')
    # 'sudo' must be its own argv element. With a list-form Popen the first
    # item is the program to execute, so 'sudo /path/to/varnishadm' as one
    # string would be looked up as a single (non-existent) executable.
    cmd = ['sudo', varnishadm_path, '-S', secretfile_path, 'debug.health']
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    output, _ = proc.communicate()
    # No output means there is nothing to report (e.g. the command failed).
    if output:
        self._parse_varnishadm(output)

def _get_version_info(self, varnishstat_path):
# Get the varnish version from varnishstat
output, error = subprocess.Popen([instance.get("varnishstat"), "-V"],
output, error = subprocess.Popen([varnishstat_path, "-V"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()

# Assumptions regarding varnish's version
use_xml = True
arg = "-x" # varnishstat argument
version = 3

m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)
Expand All @@ -118,26 +133,44 @@ def check(self, instance):
# Location of varnishstat
if version <= 2:
use_xml = False
arg = "-1"

cmd = [instance.get("varnishstat"), arg]
if name is not None:
cmd.extend(['-n', name])
tags += [u'varnish_name:%s' % name]
else:
tags += [u'varnish_name:default']
try:
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, error = proc.communicate()
except Exception:
self.log.error(u"Failed to run %s" % repr(cmd))
raise
if error and len(error) > 0:
self.log.error(error)
self._parse_varnishstat(output, use_xml, tags)
return version, use_xml

def _parse_varnishstat(self, output, use_xml, tags=None):
"""Extract stats from varnishstat -x
The text option (-1) is not reliable enough when counters get large.
VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615
2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h
Bitmaps are not supported.
Example XML output (with `use_xml=True`)
<varnishstat>
<stat>
<name>fetch_304</name>
<value>0</value>
<flag>a</flag>
<description>Fetch no body (304)</description>
</stat>
<stat>
<name>n_sess_mem</name>
<value>334</value>
<flag>i</flag>
<description>N struct sess_mem</description>
</stat>
<stat>
<type>LCK</type>
<ident>vcl</ident>
<name>creat</name>
<value>1</value>
<flag>a</flag>
<description>Created locks</description>
</stat>
</varnishstat>
"""
tags = tags or []
if use_xml:
p = xml.parsers.expat.ParserCreate()
Expand Down Expand Up @@ -165,4 +198,50 @@ def _parse_varnishstat(self, output, use_xml, tags=None):
self.log.debug("Varnish (rate) %s %d" % (metric_name, int(gauge_val)))
self.rate(metric_name, float(gauge_val), tags=tags)


def _parse_varnishadm(self, output):
    """Submit one backend-health service check per backend found in `output`.

    `output` is the text printed by `varnishadm debug.health`, for example:

        Backend b0 is Sick
        Current states good: 2 threshold: 3 window: 5
        Average responsetime of good probes: 0.000000
        Oldest Newest
        ================================================================
        -------------------------------------------------------------444 Good IPv4
        -------------------------------------------------------------XXX Good Xmit
        -------------------------------------------------------------RRR Good Recv
        ----------------------------------------------------------HHH--- Happy
        Backend b1 is Sick
        Current states good: 2 threshold: 3 window: 5
        Average responsetime of good probes: 0.000000
        Oldest Newest
        ================================================================
        ----------------------------------------------------------HHH--- Happy

    The "Backend <name> is <State>" line supplies the backend name and its
    state; the "Current states ..." line that follows supplies the message
    attached to the check. Each check is named SERVICE_CHECK_NAME and tagged
    `backend:<name>`. A backend with no "Current" line produces no check.
    """
    # Collect (backend, message) pairs grouped by lowercased state.
    backends_by_status = defaultdict(list)
    backend, status = None, None
    for line in output.split("\n"):
        # split() without an argument discards empty fields, so repeated
        # spaces or tabs cannot shift the token positions used below.
        tokens = line.split()
        if not tokens:
            continue
        if tokens[0] == 'Backend' and len(tokens) >= 2:
            backend = tokens[1]
            # The state is the last word of the line ("... is Sick");
            # tokens[1] is the backend name, not its state.
            status = tokens[-1].lower()
        elif tokens[0] == 'Current' and backend is not None:
            # Drop the leading "Current states" words and keep the counters.
            message = ' '.join(tokens[2:])
            backends_by_status[status].append((backend, message))

    for status, backends in backends_by_status.items():
        check_status = BackendStatus.to_check_status(status)
        for backend, message in backends:
            tags = ['backend:%s' % backend]
            self.service_check(self.SERVICE_CHECK_NAME, check_status,
                               tags=tags, message=message)

35 changes: 25 additions & 10 deletions conf.d/varnish.yaml.example
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
init_config:

instances:
# - varnishstat: (required) String path to varnishstat binary
# name: (optional) String name of varnish instance. Passed to the -n parameter of varnishstat. Will also tag each metric with this name.
# tags: (optional) Additional tags to tag each metric with
#
# Example:
#
- varnishstat: /usr/bin/varnishstat
name: myvarnishinstance
tags:
- instance:production
# The full path to the varnishstat binary
# - varnishstat: /usr/bin/varnishstat

# The (optional) name will be used in the varnishstat command for the
# -n argument and will add a varnish_name:$instancename tag to all metrics.
# name: myvarnishinstance

# The (optional) list of tags will be applied to every emitted metric.
# tags:
# - instance:production

# The (optional) path to the varnishadm binary will signal the check to
# emit a service check status on backend health using `debug.health`.
# The service check will be tagged by backend.
# NOTE: The Agent must be able to run varnishadm with root
# privileges. You can configure your sudoers file for this:
#
# example /etc/sudoers entry:
# dd-agent ALL=(ALL) NOPASSWD:/usr/bin/varnishadm
#
# varnishadm: /usr/bin/varnishadm

# The (optional) path to the varnish secretfile will be used in the
# varnishadm command, if enabled.
# secretfile: /etc/varnish/secret
38 changes: 35 additions & 3 deletions tests/test_varnish.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import logging
import os
import time
import unittest

from nose.plugins.attrib import attr

from tests.common import get_check


Expand Down Expand Up @@ -1853,7 +1854,7 @@ def setUp(self):
"""


def testParsing(self):
def test_parsing(self):
v, instances = get_check('varnish', self.config)
v._parse_varnishstat(self.v_dump, False)
metrics = v.get_metrics()
Expand All @@ -1868,7 +1869,7 @@ def testParsing(self):
if m[0] == "varnish.SMA.s0.g_space"][0], 120606)
assert "varnish.SMA.transient.c_bytes" not in [m[0] for m in metrics]

def testCheck(self):
def test_check(self):
v, instances = get_check('varnish', self.config)
import pprint
try:
Expand All @@ -1879,5 +1880,36 @@ def testCheck(self):
except Exception:
pass

def test_service_check(self):
    """A `debug.health` dump with two backends yields two tagged checks."""
    varnishadm_dump = """
Backend b0 is Sick
Current states good: 0 threshold: 3 window: 5
Average responsetime of good probes: 0.000000
Oldest Newest
================================================================
4444444444444444444444444444444444444444444444444444444444444444 Good IPv4
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Good Xmit
RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR Good Recv
---------------------------------------------------------------- Happy
Backend b1 is Sick
Current states good: 0 threshold: 3 window: 5
Average responsetime of good probes: 0.000000
Oldest Newest
================================================================
---------------------------------------------------------------- Happy
"""
    v, instances = get_check('varnish', self.config)
    v._parse_varnishadm(varnishadm_dump)
    service_checks = v.get_service_checks()
    # assertEqual is used instead of the deprecated assertEquals alias.
    self.assertEqual(len(service_checks), 2)

    # Checks are expected in the order the backends appear in the dump.
    b0_check = service_checks[0]
    self.assertEqual(b0_check['check'], v.SERVICE_CHECK_NAME)
    self.assertEqual(b0_check['tags'], ['backend:b0'])

    b1_check = service_checks[1]
    self.assertEqual(b1_check['check'], v.SERVICE_CHECK_NAME)
    self.assertEqual(b1_check['tags'], ['backend:b1'])

if __name__ == '__main__':
unittest.main()

0 comments on commit 63453b0

Please sign in to comment.