[snmp] Add support for forced SNMP data types to help w/ buggy devices
[snmp] adding tests, minor fix to PR.

[snmp] with the new SNMP network check, we run the test twice to collect results.

[snmp] log network check exceptions, fix test.

[snmpd-test] add a sleep parameter to ensure interval >= 1
chrissnell authored and truthbk committed Feb 9, 2016
1 parent 960db49 commit 8e36e58
Showing 5 changed files with 117 additions and 9 deletions.
24 changes: 20 additions & 4 deletions checks.d/snmp.py
@@ -328,7 +328,9 @@ def report_raw_metrics(self, metrics, results, tags):
Submit the results to the aggregator.
'''

for metric in metrics:
forced_type = metric.get('forced_type')
if 'OID' in metric:
queried_oid = metric['OID']
if queried_oid in results:
@@ -343,7 +345,7 @@ def report_raw_metrics(self, metrics, results, tags):
queried_oid)
continue
name = metric.get('name', 'unnamed_metric')
self.submit_metric(name, value, tags)
self.submit_metric(name, value, forced_type, tags)

def report_table_metrics(self, metrics, results, tags):
'''
@@ -354,6 +356,7 @@ def report_table_metrics(self, metrics, results, tags):
'''

for metric in metrics:
forced_type = metric.get('forced_type')
if 'table' in metric:
index_based_tags = []
column_based_tags = []
@@ -371,7 +374,7 @@ def report_table_metrics(self, metrics, results, tags):
metric_tags = tags + self.get_index_tags(index, results,
index_based_tags,
column_based_tags)
self.submit_metric(value_to_collect, val, metric_tags)
self.submit_metric(value_to_collect, val, forced_type, metric_tags)

elif 'symbol' in metric:
name = metric['symbol']
@@ -380,7 +383,7 @@ def report_table_metrics(self, metrics, results, tags):
self.log.warning("Several rows corresponding while the metric is supposed to be a scalar")
continue
val = result[0][1]
self.submit_metric(name, val, tags)
self.submit_metric(name, val, forced_type, tags)
elif 'OID' in metric:
pass # This one is already handled by the other batch of requests
else:
@@ -422,7 +425,7 @@ def get_index_tags(self, index, results, index_tags, column_tags):
tags.append("{0}:{1}".format(tag_group, tag_value))
return tags

def submit_metric(self, name, snmp_value, tags=[]):
def submit_metric(self, name, snmp_value, forced_type, tags=[]):
'''
Convert the values reported as pysnmp-Managed Objects to values and
report them to the aggregator
@@ -434,6 +437,19 @@ def submit_metric(self, name, snmp_value, tags=[]):

metric_name = self.normalize(name, prefix="snmp")

if forced_type:
if forced_type.lower() == "gauge":
value = int(snmp_value)
self.gauge(metric_name, value, tags)
elif forced_type.lower() == "counter":
value = int(snmp_value)
self.rate(metric_name, value, tags)
else:
self.warning("Invalid forced-type specified: {0} in {1}".format(forced_type, name))
raise Exception("Invalid forced-type in config file: {0}".format(name))

return

# Ugly hack but couldn't find a cleaner way
# Proper way would be to use the ASN1 method isSameTypeWith but it
# wrongfully returns True in the case of CounterBasedGauge64
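A condensed, standalone sketch of the forced_type dispatch added above, for reference. The gauge and rate callables stand in for the AgentCheck submission methods, and submit_forced is an illustrative name, not part of the check:

    # Standalone sketch of the forced_type branch; `gauge`/`rate` stand in
    # for the aggregator submission methods (illustrative, not the real API).
    def submit_forced(metric_name, snmp_value, forced_type, gauge, rate, tags=None):
        tags = tags or []
        kind = forced_type.lower()
        if kind == "gauge":
            # Point-in-time value: submit the raw SNMP reading as a gauge.
            gauge(metric_name, int(snmp_value), tags)
        elif kind == "counter":
            # Monotonic counter: submit it as a rate so deltas are computed.
            rate(metric_name, int(snmp_value), tags)
        else:
            raise Exception("Invalid forced-type in config file: {0}".format(metric_name))

    # e.g., with the F5 example from conf.d/snmp.yaml.example:
    # submit_forced("F5_TotalCurrentConnections", 42, "gauge", my_gauge, my_rate)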
11 changes: 10 additions & 1 deletion checks/network_checks.py
@@ -93,6 +93,7 @@ def start_pool(self):

self.resultsq = Queue()
self.jobs_status = {}
self.jobs_results = {}
self.pool_started = True

def stop_pool(self):
@@ -122,7 +123,7 @@ def check(self, instance):
if name not in self.jobs_status:
# A given instance should be processed one at a time
self.jobs_status[name] = time.time()
self.pool.apply_async(self._process, args=(instance,))
self.jobs_results[name] = self.pool.apply_async(self._process, args=(instance,))
else:
self.log.error("Instance: %s skipped because it's already running." % name)

@@ -203,6 +204,14 @@ def _process_results(self):
if instance_name in self.jobs_status:
del self.jobs_status[instance_name]

# if an exception happened, log it
if instance_name in self.jobs_results:
ret = self.jobs_results[instance_name].get()
if isinstance(ret, Exception):
self.log.exception("Exception in worker thread: {0}".format(ret))
del self.jobs_results[instance_name]


def _check(self, instance):
"""This function should be implemented by inherited classes"""
raise NotImplementedError
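The bookkeeping above relies on apply_async returning an AsyncResult, and on the worker handing back (rather than raising) its exception so the main thread can inspect it with isinstance. A minimal standalone sketch of that pattern, assuming a thread-based pool; worker and do_check are illustrative names:

    # Minimal sketch of the AsyncResult pattern used above (illustrative names).
    from multiprocessing.dummy import Pool  # thread-backed pool, same API as multiprocessing.Pool

    def do_check(instance):
        # Illustrative check body; raise to simulate an unreachable device.
        raise RuntimeError("cannot reach {0}".format(instance["host"]))

    def worker(instance):
        try:
            return do_check(instance)
        except Exception as e:
            return e  # hand the exception back so the caller can log it

    pool = Pool(4)
    jobs_results = {"instance_1": pool.apply_async(worker, args=({"host": "localhost"},))}

    for name, async_result in jobs_results.items():
        ret = async_result.get()  # blocks until the job has finished
        if isinstance(ret, Exception):
            print("Exception in worker thread: {0}".format(ret))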
10 changes: 10 additions & 0 deletions conf.d/snmp.yaml.example
@@ -28,6 +28,16 @@ instances:
# - OID: 1.3.6.1.2.1.6.5
# name: tcpPassiveOpens
#
# # This monitor auto-detects OID data types from the remote agent's response.
# # If you're dealing with a buggy agent that returns incorrect data types for OIDs,
# # you can force the data type with the 'forced_type' parameter. Valid options for
# # this parameter are 'gauge' and 'counter'.
# # Example: when an F5 Networks load balancer is queried for this OID, it returns
# # it as a Counter64 when it should be a gauge, so we force the data type to gauge:
# - OID: 1.3.6.1.4.1.3375.2.1.1.2.1.8.0
# name: F5_TotalCurrentConnections
# forced_type: gauge
#
# # You can also query a table and specify
# # - which columns to report as value (symbols)
# # - which columns / indexes to use as tags (metric_tags)
8 changes: 5 additions & 3 deletions tests/checks/common.py
@@ -168,13 +168,13 @@ def run_check_twice(self, config, agent_config=None, mocks=None,
self.run_check(config, agent_config, mocks)

def run_check_n(self, config, agent_config=None, mocks=None,
force_reload=False, repeat=1):
force_reload=False, repeat=1, sleep=1):
for i in xrange(repeat):
if not i:
self.run_check(config, agent_config, mocks, force_reload)
else:
self.run_check(config, agent_config, mocks)
time.sleep(1)
time.sleep(sleep)

def run_check(self, config, agent_config=None, mocks=None, force_reload=False):
# If not loaded already, do it!
@@ -325,7 +325,7 @@ def _candidates_size_assert(self, candidates, count=None, at_least=1):
raise

def assertMetric(self, metric_name, value=None, tags=None, count=None,
at_least=1, hostname=None, device_name=None):
at_least=1, hostname=None, device_name=None, metric_type=None):
candidates = []
for m_name, ts, val, mdata in self.metrics:
if m_name == metric_name:
@@ -337,6 +337,8 @@ def assertMetric(self, metric_name, value=None, tags=None, count=None,
continue
if device_name is not None and mdata['device_name'] != device_name:
continue
if metric_type is not None and mdata['type'] != metric_type:
continue

candidates.append((m_name, ts, val, mdata))

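The new sleep argument and metric_type filter combine naturally in a test: pace the check runs so rate metrics get a full flush interval, then assert on the type the aggregator actually flushed. An illustrative (not verbatim) usage inside an AgentCheckTest subclass, mirroring the SNMP test below; the metric name is a placeholder:

    # Illustrative only: the metric name and instance config are placeholders.
    def test_paced_run(self):
        config = {'instances': [self.generate_instance_config(self.SCALAR_OBJECTS)]}
        # Three runs, two seconds apart, so rate metrics have a full interval.
        self.run_check_n(config, repeat=3, sleep=2)
        # Keep only candidates whose flushed type is a gauge.
        self.assertMetric("snmp.someForcedGauge", tags=self.CHECK_TAGS,
                          at_least=1, metric_type=MetricTypes.GAUGE)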
73 changes: 72 additions & 1 deletion tests/checks/integration/test_snmp.py
@@ -6,6 +6,7 @@

# agent
from checks import AgentCheck
from checks.metric_types import MetricTypes
from tests.checks.common import AgentCheckTest

# This test is dependent of having a fully open snmpd responding at localhost:161
@@ -64,6 +65,31 @@ class SNMPTestCase(AgentCheckTest):
}
]

FORCED_METRICS = [
{
'OID': "1.3.6.1.2.1.4.24.6.0", # Gauge32
'name': "IAmAGauge32",
'forced_type': 'counter'

}, {
'OID': "1.3.6.1.2.1.4.31.1.1.6.1", # Counter32
'name': "IAmACounter64",
'forced_type': 'gauge'
}
]
INVALID_FORCED_METRICS = [
{
'OID': "1.3.6.1.2.1.4.24.6.0", # Gauge32
'name': "IAmAGauge32",
'forced_type': 'counter'

}, {
'OID': "1.3.6.1.2.1.4.31.1.1.6.1", # Counter32
'name': "IAmACounter64",
'forced_type': 'histogram'
}
]

SCALAR_OBJECTS = [
{
'OID': "1.3.6.1.2.1.7.1.0",
@@ -265,7 +291,7 @@ def test_table(self):
config = {
'instances': [self.generate_instance_config(self.TABULAR_OBJECTS)]
}
self.run_check_n(config, repeat=3)
self.run_check_n(config, repeat=3, sleep=2)
self.service_checks = self.wait_for_async('get_service_checks', 'service_checks', 1)

# Test metrics
@@ -303,6 +329,51 @@ def test_invalid_metric(self):
tags=self.CHECK_TAGS, count=1)
self.coverage_report()

def test_forcedtype_metric(self):
"""
Forced Types should be reported as metrics of the forced type
"""
config = {
'instances': [self.generate_instance_config(self.FORCED_METRICS)]
}
self.run_check_twice(config)
self.service_checks = self.wait_for_async('get_service_checks', 'service_checks', 1)

for metric in self.FORCED_METRICS:
metric_name = "snmp." + (metric.get('name') or metric.get('symbol'))
if metric.get('forced_type') == MetricTypes.COUNTER:
# rate will be flushed as a gauge, so count should be 0.
self.assertMetric(metric_name, tags=self.CHECK_TAGS,
count=0, metric_type=MetricTypes.GAUGE)
elif metric.get('forced_type') == MetricTypes.GAUGE:
self.assertMetric(metric_name, tags=self.CHECK_TAGS,
count=1, metric_type=MetricTypes.GAUGE)

# # Test service check
self.assertServiceCheck("snmp.can_check", status=AgentCheck.OK,
tags=self.CHECK_TAGS, count=1)
self.coverage_report()

def test_invalid_forcedtype_metric(self):
"""
If a forced type is invalid a warning should be issued + a service check
should be available
"""
config = {
'instances': [self.generate_instance_config(self.INVALID_FORCED_METRICS)]
}

self.run_check(config)

self.warnings = self.wait_for_async('get_warnings', 'warnings', 1)
self.assertWarning("Invalid forced-type specified:", count=1, exact_match=False)

# # Test service check
self.service_checks = self.wait_for_async('get_service_checks', 'service_checks', 1)
self.assertServiceCheck("snmp.can_check", status=AgentCheck.CRITICAL,
tags=self.CHECK_TAGS, count=1)
self.coverage_report()

def test_network_failure(self):
"""
Network failure is reported in service check
