Add a varnish service check.

The service check uses the `varnishadm debug.health` command to get the health of the available backends. One check per backend is submitted, tagged with the backend name. The BIG open question is how we will let users enable this, because the varnishadm command needs to read the secret key file, which is owned by root and has root-only permissions by default.
DataDog · Dec 10, 2014 · e28661b · LotharSee · Dec 30, 2014 · conorbranagan
1 parent 27d986b
commit e28661b
Show file tree

Hide file tree

Showing 3 changed files with 193 additions and 67 deletions.
diff --git a/checks.d/varnish.py b/checks.d/varnish.py
@@ -1,12 +1,29 @@
 # stdlib
-import xml.parsers.expat # python 2.4 compatible
+from collections import defaultdict
 import re
 import subprocess
+import xml.parsers.expat # python 2.4 compatible
 
 # project
 from checks import AgentCheck
 
+
+class BackendStatus(object):
+    """Backend health states and their translation to AgentCheck statuses."""
+    HEALTHY = 'healthy'
+    SICK = 'sick'
+    ALL = (HEALTHY, SICK)
+
+    @classmethod
+    def to_check_status(cls, status):
+        """Return the AgentCheck status constant matching a backend `status`.
+
+        `SICK` maps to `AgentCheck.CRITICAL`, `HEALTHY` maps to
+        `AgentCheck.OK`, and any other value maps to `AgentCheck.UNKNOWN`.
+        """
+        if status == cls.SICK:
+            return AgentCheck.CRITICAL
+        if status == cls.HEALTHY:
+            return AgentCheck.OK
+        # Anything that is not a known state is reported as UNKNOWN.
+        return AgentCheck.UNKNOWN
+
 class Varnish(AgentCheck):
+    SERVICE_CHECK_NAME = 'varnish.backend_healthy'
+
     # XML parsing bits, a.k.a. Kafka in Code
     def _reset(self):
         self._current_element = ""
@@ -47,39 +64,6 @@ def _char_data(self, data):
                 self._current_str = data
 
     def check(self, instance):
-        """Extract stats from varnishstat -x
-
-        The text option (-1) is not reliable enough when counters get large.
-        VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615
-
-        2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
-        https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h
-
-        Bitmaps are not supported.
-
-        <varnishstat>
-            <stat>
-                <name>fetch_304</name>
-                <value>0</value>
-                <flag>a</flag>
-                <description>Fetch no body (304)</description>
-            </stat>
-            <stat>
-                <name>n_sess_mem</name>
-                <value>334</value>
-                <flag>i</flag>
-                <description>N struct sess_mem</description>
-            </stat>
-            <stat>
-                <type>LCK</type>
-                <ident>vcl</ident>
-                <name>creat</name>
-                <value>1</value>
-                <flag>a</flag>
-                <description>Created locks</description>
-            </stat>
-        </varnishstat>
-        """
         # Not configured? Not a problem.
         if instance.get("varnishstat", None) is None:
             raise Exception("varnishstat is not configured")
@@ -88,16 +72,47 @@ def check(self, instance):
             tags = []
         else:
             tags = list(set(tags))
+        varnishstat_path = instance.get("varnishstat")
         name = instance.get('name')
 
+        # Get version and version-specific args from varnishstat -V.
+        version, use_xml = self._get_version_info(varnishstat_path)
+
+        # Parse metrics from varnishstat.
+        arg = '-x' if use_xml else '-1'
+        cmd = [varnishstat_path, arg]
+
+        if name is not None:
+            cmd.extend(['-n', name])
+            tags += [u'varnish_name:%s' % name]
+        else:
+            tags += [u'varnish_name:default']
+        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                     stderr=subprocess.PIPE)
+        output, error = proc.communicate()
+        if error and len(error) > 0:
+            self.log.error(error)
+        self._parse_varnishstat_metrics(varnishstat_path, use_xml, tags)
+
+        # Parse service checks from varnishadm.
+        varnishadm_path = instance.get('varnishadm')
+        if varnishadm_path:
+            secretfile_path = instance.get('secretfile', '/etc/varnish/secret')
+            varnishadm_path = 'sudo %s' % varnishadm_path
+            cmd = [varnishadm_path, '-S', secretfile_path, 'debug.health']
+            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+            output, _ = proc.communicate()
+            if output:
+                self._parse_varnishadm(output)
+
+    def _get_version_info(self, varnishstat_path):
         # Get the varnish version from varnishstat
-        output, error = subprocess.Popen([instance.get("varnishstat"), "-V"],
+        output, error = subprocess.Popen([varnishstat_path, "-V"],
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE).communicate()
 
         # Assumptions regarding varnish's version
         use_xml = True
-        arg = "-x" # varnishstat argument
         version = 3
 
         m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)
@@ -118,26 +133,44 @@ def check(self, instance):
         # Location of varnishstat
         if version <= 2:
             use_xml = False
-            arg = "-1"
 
-        cmd = [instance.get("varnishstat"), arg]
-        if name is not None:
-            cmd.extend(['-n', name])
-            tags += [u'varnish_name:%s' % name]
-        else:
-            tags += [u'varnish_name:default']
-        try:
-            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
-                                         stderr=subprocess.PIPE)
-            output, error = proc.communicate()
-        except Exception:
-            self.log.error(u"Failed to run %s" % repr(cmd))
-            raise
-        if error and len(error) > 0:
-            self.log.error(error)
-        self._parse_varnishstat(output, use_xml, tags)
+        return version, use_xml
 
     def _parse_varnishstat(self, output, use_xml, tags=None):
+        """Extract stats from varnishstat -x
+
+        The text option (-1) is not reliable enough when counters get large.
+        VBE.media_video_prd_services_01(10.93.67.16,,8080).happy18446744073709551615
+
+        2 types of data, "a" for counter ("c" in newer versions of varnish), "i" for gauge ("g")
+        https://github.com/varnish/Varnish-Cache/blob/master/include/tbl/vsc_fields.h
+
+        Bitmaps are not supported.
+
+        Example XML output (with `use_xml=True`)
+        <varnishstat>
+            <stat>
+                <name>fetch_304</name>
+                <value>0</value>
+                <flag>a</flag>
+                <description>Fetch no body (304)</description>
+            </stat>
+            <stat>
+                <name>n_sess_mem</name>
+                <value>334</value>
+                <flag>i</flag>
+                <description>N struct sess_mem</description>
+            </stat>
+            <stat>
+                <type>LCK</type>
+                <ident>vcl</ident>
+                <name>creat</name>
+                <value>1</value>
+                <flag>a</flag>
+                <description>Created locks</description>
+            </stat>
+        </varnishstat>
+        """
         tags = tags or []
         if use_xml:
             p = xml.parsers.expat.ParserCreate()
@@ -165,4 +198,50 @@ def _parse_varnishstat(self, output, use_xml, tags=None):
                     self.log.debug("Varnish (rate) %s %d" % (metric_name, int(gauge_val)))
                     self.rate(metric_name, float(gauge_val), tags=tags)
 
-
+    def _parse_varnishadm(self, output):
+        """Parse `varnishadm debug.health` output and submit service checks.
+
+        One service check named `SERVICE_CHECK_NAME` is submitted for each
+        backend, tagged with `backend:<name>`. The check status is derived
+        from the last word of the `Backend <name> is <state>` line, and the
+        check message is taken from the `Current states` line that follows
+        it (with runs of whitespace collapsed to single spaces). A backend
+        is only reported once its `Current states` line has been seen.
+
+        `output` is the raw text printed by varnishadm.
+
+        Example output:
+
+            Backend b0 is Sick
+            Current states  good:  2 threshold:  3 window:  5
+            Average responsetime of good probes: 0.000000
+            Oldest                                                    Newest
+            ================================================================
+            -------------------------------------------------------------444 Good IPv4
+            -------------------------------------------------------------XXX Good Xmit
+            -------------------------------------------------------------RRR Good Recv
+            ----------------------------------------------------------HHH--- Happy
+            Backend b1 is Sick
+            Current states  good:  2 threshold:  3 window:  5
+            Average responsetime of good probes: 0.000000
+            Oldest                                                    Newest
+            ================================================================
+            ----------------------------------------------------------HHH--- Happy
+
+        """
+        # Process status by backend.
+        backends_by_status = defaultdict(list)
+        backend, status = None, None
+        for line in output.split("\n"):
+            # split() with no argument ignores leading/trailing whitespace and
+            # never yields empty tokens, so blank lines give an empty list.
+            tokens = line.split()
+            if not tokens:
+                continue
+            if tokens[0] == 'Backend' and len(tokens) > 1:
+                backend = tokens[1]
+                # The health word is the LAST token ("Backend b0 is Sick");
+                # tokens[1] is the backend name and would never match a
+                # BackendStatus value.
+                status = tokens[-1].lower()
+            elif tokens[0] == 'Current' and backend is not None:
+                # Everything after "Current states" is the human-readable
+                # summary; this is an empty string if nothing follows.
+                message = ' '.join(tokens[2:])
+                backends_by_status[status].append((backend, message))
+
+        for status, backends in backends_by_status.items():
+            check_status = BackendStatus.to_check_status(status)
+            for backend, message in backends:
+                tags = ['backend:%s' % backend]
+                self.service_check(self.SERVICE_CHECK_NAME, check_status,
+                                   tags=tags, message=message)
+
diff --git a/conf.d/varnish.yaml.example b/conf.d/varnish.yaml.example
@@ -1,13 +1,28 @@
 init_config:
 
 instances:
-# - varnishstat: (required) String path to varnishstat binary
-#   name: (optional) String name of varnish instance. Passed to the -n parameter of varnishstat. Will also tag each metric with this name.
-#   tags: (optional) Additional tags to tag each metric with
-#
-# Example:
-#
-    - varnishstat: /usr/bin/varnishstat
-      name: myvarnishinstance
-      tags:
-        -   instance:production
+        # The full path to the varnishstat binary
+#   -   varnishstat: /usr/bin/varnishstat
+
+        # The (optional) name will be passed to the varnishstat command as the
+        # -n argument and will add a varnish_name:$instancename tag to all metrics.
+#       name: myvarnishinstance
+
+        # The (optional) list of tags will be applied to every emitted metric.
+#       tags:
+#         -  instance:production
+
+        # The (optional) path to the varnishadm binary will signal the check to
+        # emit a service check status on backend health using `debug.health`.
+        # The service check will be tagged by backend.
+        # NOTE: The Agent must be able to run varnishadm with root
+        # privileges. You can configure your sudoers file for this:
+        #
+        # example /etc/sudoers entry:
+        #   dd-agent ALL=(ALL) NOPASSWD:/usr/bin/varnishadm
+        #
+#       varnishadm: /usr/bin/varnishadm
+
+        # The (optional) path to the varnish secretfile will be used in the
+        # varnishadm command, if enabled.
+#       secretfile: /etc/varnish/secret
diff --git a/tests/test_varnish.py b/tests/test_varnish.py
@@ -1,8 +1,9 @@
-import logging
 import os
 import time
 import unittest
 
+from nose.plugins.attrib import attr
+
 from tests.common import get_check
 
 
@@ -1853,7 +1854,7 @@ def setUp(self):
 """
 
 
-    def testParsing(self):
+    def test_parsing(self):
         v, instances = get_check('varnish', self.config)
         v._parse_varnishstat(self.v_dump, False)
         metrics = v.get_metrics()
@@ -1868,7 +1869,7 @@ def testParsing(self):
             if m[0] == "varnish.SMA.s0.g_space"][0], 120606)
         assert "varnish.SMA.transient.c_bytes" not in [m[0] for m in metrics]
 
-    def testCheck(self):
+    def test_check(self):
         v, instances = get_check('varnish', self.config)
         import pprint
         try:
@@ -1879,5 +1880,36 @@ def testCheck(self):
         except Exception:
             pass
 
+    def test_service_check(self):
+        """Check that one service check is submitted per backend found in
+        a `varnishadm debug.health` dump, tagged with the backend name."""
+        varnishadm_dump = """
+Backend b0 is Sick
+Current states  good:  0 threshold:  3 window:  5
+Average responsetime of good probes: 0.000000
+Oldest                                                    Newest
+================================================================
+4444444444444444444444444444444444444444444444444444444444444444 Good IPv4
+XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Good Xmit
+RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR Good Recv
+---------------------------------------------------------------- Happy
+Backend b1 is Sick
+Current states  good:  0 threshold:  3 window:  5
+Average responsetime of good probes: 0.000000
+Oldest                                                    Newest
+================================================================
+---------------------------------------------------------------- Happy
+        """
+        v, _ = get_check('varnish', self.config)
+        v._parse_varnishadm(varnishadm_dump)
+        service_checks = v.get_service_checks()
+        self.assertEqual(len(service_checks), 2)
+
+        # Every submitted check must carry the varnish service check name.
+        for service_check in service_checks:
+            self.assertEqual(service_check['check'], v.SERVICE_CHECK_NAME)
+
+        # The parser groups backends in a dict before submitting, so the
+        # submission order is not guaranteed; compare tags order-independently.
+        reported_tags = sorted(service_check['tags']
+                               for service_check in service_checks)
+        self.assertEqual(reported_tags, [['backend:b0'], ['backend:b1']])
+
+
 if __name__ == '__main__':
     unittest.main()