Skip to content

Commit

Permalink
Merge pull request #1213 from DataDog/conor/varnish-service-check
Browse files Browse the repository at this point in the history
Add a varnish service check.
  • Loading branch information
LeoCavaille committed Dec 10, 2014
2 parents 700cf22 + e28661b commit 26c97df
Show file tree
Hide file tree
Showing 3 changed files with 193 additions and 67 deletions.
187 changes: 133 additions & 54 deletions checks.d/varnish.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
# stdlib
import xml.parsers.expat # python 2.4 compatible
from collections import defaultdict
import re
import subprocess
import xml.parsers.expat # python 2.4 compatible

# project
from checks import AgentCheck


class BackendStatus(object):
    """Names of the backend health states and their service check mapping."""

    # Lower-cased state names compared against in `to_check_status`.
    HEALTHY = 'healthy'
    SICK = 'sick'
    ALL = (HEALTHY, SICK)

    @classmethod
    def to_check_status(cls, status):
        """Translate a backend state name into an AgentCheck status.

        `HEALTHY` maps to `AgentCheck.OK`, `SICK` maps to
        `AgentCheck.CRITICAL` and any other value maps to
        `AgentCheck.UNKNOWN`.
        """
        # Start from the fallback and override it for the known states.
        check_status = AgentCheck.UNKNOWN
        if status == cls.HEALTHY:
            check_status = AgentCheck.OK
        elif status == cls.SICK:
            check_status = AgentCheck.CRITICAL
        return check_status

class Varnish(AgentCheck):
SERVICE_CHECK_NAME = 'varnish.backend_healthy'

# XML parsing bits, a.k.a. Kafka in Code
def _reset(self):
self._current_element = ""
Expand Down Expand Up @@ -47,39 +64,6 @@ def _char_data(self, data):
self._current_str = data

def check(self, instance):
"""Extract stats from varnishstat -x
The text option (-1) is not reliable enough when counters get large.
VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615
2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h
Bitmaps are not supported.
<varnishstat>
<stat>
<name>fetch_304</name>
<value>0</value>
<flag>a</flag>
<description>Fetch no body (304)</description>
</stat>
<stat>
<name>n_sess_mem</name>
<value>334</value>
<flag>i</flag>
<description>N struct sess_mem</description>
</stat>
<stat>
<type>LCK</type>
<ident>vcl</ident>
<name>creat</name>
<value>1</value>
<flag>a</flag>
<description>Created locks</description>
</stat>
</varnishstat>
"""
# Not configured? Not a problem.
if instance.get("varnishstat", None) is None:
raise Exception("varnishstat is not configured")
Expand All @@ -88,16 +72,47 @@ def check(self, instance):
tags = []
else:
tags = list(set(tags))
varnishstat_path = instance.get("varnishstat")
name = instance.get('name')

# Get version and version-specific args from varnishstat -V.
version, use_xml = self._get_version_info(varnishstat_path)

# Parse metrics from varnishstat.
arg = '-x' if use_xml else '-1'
cmd = [varnishstat_path, arg]

if name is not None:
cmd.extend(['-n', name])
tags += [u'varnish_name:%s' % name]
else:
tags += [u'varnish_name:default']
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, error = proc.communicate()
if error and len(error) > 0:
self.log.error(error)
self._parse_varnishstat_metrics(varnishstat_path, use_xml, tags)

# Parse service checks from varnishadm.
varnishadm_path = instance.get('varnishadm')
if varnishadm_path:
secretfile_path = instance.get('secretfile', '/etc/varnish/secret')
varnishadm_path = 'sudo %s' % varnishadm_path
cmd = [varnishadm_path, '-S', secretfile_path, 'debug.health']
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
output, _ = proc.communicate()
if output:
self._parse_varnishadm(output)

def _get_version_info(self, varnishstat_path):
# Get the varnish version from varnishstat
output, error = subprocess.Popen([instance.get("varnishstat"), "-V"],
output, error = subprocess.Popen([varnishstat_path, "-V"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()

# Assumptions regarding varnish's version
use_xml = True
arg = "-x" # varnishstat argument
version = 3

m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)
Expand All @@ -118,26 +133,44 @@ def check(self, instance):
# Location of varnishstat
if version <= 2:
use_xml = False
arg = "-1"

cmd = [instance.get("varnishstat"), arg]
if name is not None:
cmd.extend(['-n', name])
tags += [u'varnish_name:%s' % name]
else:
tags += [u'varnish_name:default']
try:
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
output, error = proc.communicate()
except Exception:
self.log.error(u"Failed to run %s" % repr(cmd))
raise
if error and len(error) > 0:
self.log.error(error)
self._parse_varnishstat(output, use_xml, tags)
return version, use_xml

def _parse_varnishstat(self, output, use_xml, tags=None):
"""Extract stats from varnishstat -x
The text option (-1) is not reliable enough when counters get large.
VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615
2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h
Bitmaps are not supported.
Example XML output (with `use_xml=True`)
<varnishstat>
<stat>
<name>fetch_304</name>
<value>0</value>
<flag>a</flag>
<description>Fetch no body (304)</description>
</stat>
<stat>
<name>n_sess_mem</name>
<value>334</value>
<flag>i</flag>
<description>N struct sess_mem</description>
</stat>
<stat>
<type>LCK</type>
<ident>vcl</ident>
<name>creat</name>
<value>1</value>
<flag>a</flag>
<description>Created locks</description>
</stat>
</varnishstat>
"""
tags = tags or []
if use_xml:
p = xml.parsers.expat.ParserCreate()
Expand Down Expand Up @@ -165,4 +198,50 @@ def _parse_varnishstat(self, output, use_xml, tags=None):
self.log.debug("Varnish (rate) %s %d" % (metric_name, int(gauge_val)))
self.rate(metric_name, float(gauge_val), tags=tags)


def _parse_varnishadm(self, output):
    """Emit one backend-health service check per backend found in `output`.

    `output` is the text printed by `varnishadm ... debug.health`. Each
    backend section starts with a "Backend <name> is <State>" line and is
    followed by a "Current states good: ..." summary line. The backend
    name becomes the `backend:<name>` tag, the state (lower-cased) is
    mapped through `BackendStatus.to_check_status`, and the remainder of
    the summary line is sent as the service check message.

    A backend is only reported once its "Current ..." line has been seen.

    Example output:

        Backend b0 is Sick
        Current states good: 2 threshold: 3 window: 5
        Average responsetime of good probes: 0.000000
        Oldest Newest
        ================================================================
        -------------------------------------------------------------444 Good IPv4
        -------------------------------------------------------------XXX Good Xmit
        -------------------------------------------------------------RRR Good Recv
        ----------------------------------------------------------HHH--- Happy
        Backend b1 is Sick
        Current states good: 2 threshold: 3 window: 5
        Average responsetime of good probes: 0.000000
        Oldest Newest
        ================================================================
        ----------------------------------------------------------HHH--- Happy
    """
    # Process status by backend.
    backends_by_status = defaultdict(list)
    backend, status = None, None
    for line in output.split("\n"):
        # Split on any run of whitespace so repeated spaces in the
        # varnishadm output do not produce empty tokens.
        tokens = line.split()
        if not tokens:
            continue
        if tokens[0] == 'Backend' and len(tokens) >= 2:
            backend = tokens[1]
            # "Backend <name> is <State>": the state is the LAST word,
            # not the backend name.
            status = tokens[-1].lower()
        elif tokens[0] == 'Current' and backend is not None:
            # Drop the leading "Current states" and keep the counters.
            message = ' '.join(tokens[2:])
            backends_by_status[status].append((backend, message))

    # `items()` exists on both Python 2 and Python 3 dictionaries.
    for status, backends in backends_by_status.items():
        check_status = BackendStatus.to_check_status(status)
        for backend, message in backends:
            tags = ['backend:%s' % backend]
            self.service_check(self.SERVICE_CHECK_NAME, check_status,
                               tags=tags, message=message)

35 changes: 25 additions & 10 deletions conf.d/varnish.yaml.example
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
init_config:

instances:
# - varnishstat: (required) String path to varnishstat binary
# name: (optional) String name of varnish instance. Passed to the -n parameter of varnishstat. Will also tag each metric with this name.
# tags: (optional) Additional tags to tag each metric with
#
# Example:
#
- varnishstat: /usr/bin/varnishstat
name: myvarnishinstance
tags:
- instance:production
# The full path to the varnishstat binary
# - varnishstat: /usr/bin/varnishstat

# The (optional) name will be used in the varnishstat command for the
# -n argument and will add a varnish_name:$instancename tag to all metrics.
# name: myvarnishinstance

# The (optional) list of tags will be applied to every emitted metric.
# tags:
# - instance:production

# The (optional) path to the varnishadm binary will signal the check to
# emit a service check status on backend health using `debug.health`.
# The service check will be tagged by backend.
# NOTE: The Agent must be able to run varnishadm with root
# privileges. You can configure your sudoers file for this:
#
# example /etc/sudoers entry:
# dd-agent ALL=(ALL) NOPASSWD:/usr/bin/varnishadm
#
# varnishadm: /usr/bin/varnishadm

# The (optional) path to the varnish secretfile will be used in the
# varnishadm command, if enabled.
# secretfile: /etc/varnish/secret
38 changes: 35 additions & 3 deletions tests/test_varnish.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import logging
import os
import time
import unittest

from nose.plugins.attrib import attr

from tests.common import get_check


Expand Down Expand Up @@ -1853,7 +1854,7 @@ def setUp(self):
"""


def testParsing(self):
def test_parsing(self):
v, instances = get_check('varnish', self.config)
v._parse_varnishstat(self.v_dump, False)
metrics = v.get_metrics()
Expand All @@ -1868,7 +1869,7 @@ def testParsing(self):
if m[0] == "varnish.SMA.s0.g_space"][0], 120606)
assert "varnish.SMA.transient.c_bytes" not in [m[0] for m in metrics]

def testCheck(self):
def test_check(self):
v, instances = get_check('varnish', self.config)
import pprint
try:
Expand All @@ -1879,5 +1880,36 @@ def testCheck(self):
except Exception:
pass

def test_service_check(self):
    """Check that `_parse_varnishadm` emits one service check per backend.

    Feeds a two-backend `debug.health` dump to the check and verifies that
    two service checks come out, each carrying the check name and the
    matching `backend:<name>` tag.
    """
    varnishadm_dump = """
Backend b0 is Sick
Current states good: 0 threshold: 3 window: 5
Average responsetime of good probes: 0.000000
Oldest Newest
================================================================
4444444444444444444444444444444444444444444444444444444444444444 Good IPv4
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Good Xmit
RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR Good Recv
---------------------------------------------------------------- Happy
Backend b1 is Sick
Current states good: 0 threshold: 3 window: 5
Average responsetime of good probes: 0.000000
Oldest Newest
================================================================
---------------------------------------------------------------- Happy
"""
    v, instances = get_check('varnish', self.config)
    v._parse_varnishadm(varnishadm_dump)
    service_checks = v.get_service_checks()
    self.assertEqual(len(service_checks), 2)

    for service_check in service_checks:
        self.assertEqual(service_check['check'], v.SERVICE_CHECK_NAME)

    # The check groups backends in a dict before emitting them, so the
    # emission order is not guaranteed; compare the tags order-independently.
    emitted_tags = sorted(sc['tags'] for sc in service_checks)
    self.assertEqual(emitted_tags, [['backend:b0'], ['backend:b1']])

if __name__ == '__main__':
unittest.main()

0 comments on commit 26c97df

Please sign in to comment.