[snmp] Add support for forced SNMP data types to help w/ buggy devices
[snmp] adding tests, minor fix to PR.

[snmp] with the new SNMP network check, we run the test twice to collect results.

[snmp] log network check exceptions, fix test.

[snmpd-test] add a sleep parameter to ensure interval >= 1
chrissnell authored and truthbk committed Feb 9, 2016
1 parent 960db49 commit 8e36e58
Showing 5 changed files with 117 additions and 9 deletions.
24 changes: 20 additions & 4 deletions checks.d/snmp.py
@@ -328,7 +328,9 @@ def report_raw_metrics(self, metrics, results, tags):
Submit the results to the aggregator.
'''

for metric in metrics:
forced_type = metric.get('forced_type')
if 'OID' in metric:
queried_oid = metric['OID']
if queried_oid in results:
@@ -343,7 +345,7 @@ def report_raw_metrics(self, metrics, results, tags):
queried_oid)
continue
name = metric.get('name', 'unnamed_metric')
self.submit_metric(name, value, tags)
self.submit_metric(name, value, forced_type, tags)

def report_table_metrics(self, metrics, results, tags):
'''
@@ -354,6 +356,7 @@ def report_table_metrics(self, metrics, results, tags):
'''

for metric in metrics:
forced_type = metric.get('forced_type')
if 'table' in metric:
index_based_tags = []
column_based_tags = []
@@ -371,7 +374,7 @@ def report_table_metrics(self, metrics, results, tags):
metric_tags = tags + self.get_index_tags(index, results,
index_based_tags,
column_based_tags)
self.submit_metric(value_to_collect, val, metric_tags)
self.submit_metric(value_to_collect, val, forced_type, metric_tags)

elif 'symbol' in metric:
name = metric['symbol']
@@ -380,7 +383,7 @@ def report_table_metrics(self, metrics, results, tags):
self.log.warning("Several rows corresponding while the metric is supposed to be a scalar")
continue
val = result[0][1]
self.submit_metric(name, val, tags)
self.submit_metric(name, val, forced_type, tags)
elif 'OID' in metric:
pass # This one is already handled by the other batch of requests
else:
@@ -422,7 +425,7 @@ def get_index_tags(self, index, results, index_tags, column_tags):
tags.append("{0}:{1}".format(tag_group, tag_value))
return tags

def submit_metric(self, name, snmp_value, tags=[]):
def submit_metric(self, name, snmp_value, forced_type, tags=[]):
'''
Convert the values reported as pysnmp-Managed Objects to values and
report them to the aggregator
@@ -434,6 +437,19 @@ def submit_metric(self, name, snmp_value, tags=[]):

metric_name = self.normalize(name, prefix="snmp")

if forced_type:
if forced_type.lower() == "gauge":
value = int(snmp_value)
self.gauge(metric_name, value, tags)
elif forced_type.lower() == "counter":
value = int(snmp_value)
self.rate(metric_name, value, tags)
else:
self.warning("Invalid forced-type specified: {0} in {1}".format(forced_type, name))
raise Exception("Invalid forced-type in config file: {0}".format(name))

return

# Ugly hack but couldn't find a cleaner way
# Proper way would be to use the ASN1 method isSameTypeWith but it
# wrongfully returns True in the case of CounterBasedGauge64
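A condensed, standalone sketch of the forced_type dispatch added above, for reference. The gauge and rate callables stand in for the AgentCheck submission methods, and submit_forced is an illustrative name, not part of the check:

    # Standalone sketch of the forced_type branch; `gauge`/`rate` stand in
    # for the aggregator submission methods (illustrative, not the real API).
    def submit_forced(metric_name, snmp_value, forced_type, gauge, rate, tags=None):
        tags = tags or []
        kind = forced_type.lower()
        if kind == "gauge":
            # Point-in-time value: submit the raw SNMP reading as a gauge.
            gauge(metric_name, int(snmp_value), tags)
        elif kind == "counter":
            # Monotonic counter: submit it as a rate so deltas are computed.
            rate(metric_name, int(snmp_value), tags)
        else:
            raise Exception("Invalid forced-type in config file: {0}".format(metric_name))

    # e.g., with the F5 example from conf.d/snmp.yaml.example:
    # submit_forced("F5_TotalCurrentConnections", 42, "gauge", my_gauge, my_rate)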
11 changes: 10 additions & 1 deletion checks/network_checks.py
@@ -93,6 +93,7 @@ def start_pool(self):

self.resultsq = Queue()
self.jobs_status = {}
self.jobs_results = {}
self.pool_started = True

def stop_pool(self):
@@ -122,7 +123,7 @@ def check(self, instance):
if name not in self.jobs_status:
# A given instance should be processed one at a time
self.jobs_status[name] = time.time()
self.pool.apply_async(self._process, args=(instance,))
self.jobs_results[name] = self.pool.apply_async(self._process, args=(instance,))
else:
self.log.error("Instance: %s skipped because it's already running." % name)

@@ -203,6 +204,14 @@ def _process_results(self):
if instance_name in self.jobs_status:
del self.jobs_status[instance_name]

# if an exception happened, log it
if instance_name in self.jobs_results:
ret = self.jobs_results[instance_name].get()
if isinstance(ret, Exception):
self.log.exception("Exception in worker thread: {0}".format(ret))
del self.jobs_results[instance_name]


def _check(self, instance):
"""This function should be implemented by inherited classes"""
raise NotImplementedError
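The bookkeeping above relies on apply_async returning an AsyncResult, and on the worker handing back (rather than raising) its exception so the main thread can inspect it with isinstance. A minimal standalone sketch of that pattern, assuming a thread-based pool; worker and do_check are illustrative names:

    # Minimal sketch of the AsyncResult pattern used above (illustrative names).
    from multiprocessing.dummy import Pool  # thread-backed pool, same API as multiprocessing.Pool

    def do_check(instance):
        # Illustrative check body; raise to simulate an unreachable device.
        raise RuntimeError("cannot reach {0}".format(instance["host"]))

    def worker(instance):
        try:
            return do_check(instance)
        except Exception as e:
            return e  # hand the exception back so the caller can log it

    pool = Pool(4)
    jobs_results = {"instance_1": pool.apply_async(worker, args=({"host": "localhost"},))}

    for name, async_result in jobs_results.items():
        ret = async_result.get()  # blocks until the job has finished
        if isinstance(ret, Exception):
            print("Exception in worker thread: {0}".format(ret))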
10 changes: 10 additions & 0 deletions conf.d/snmp.yaml.example
@@ -28,6 +28,16 @@ instances:
# - OID: 1.3.6.1.2.1.6.5
# name: tcpPassiveOpens
#
# # This monitor auto-detects OID data types from the remote agent's response.
# # If you're dealing with a buggy agent that returns incorrect data types for OIDs,
# # you can force the data type with the 'forced_type' parameter. Valid options for
# # this parameter are 'gauge' and 'counter'.
# # Example: when an F5 Networks load balancer is queried for this OID, it returns
# # it as a Counter64 when it should be a gauge, so we force the data type to gauge:
# - OID: 1.3.6.1.4.1.3375.2.1.1.2.1.8.0
# name: F5_TotalCurrentConnections
# forced_type: gauge
#
# # You can also query a table and specify
# # - which columns to report as value (symbols)
# # - which columns / indexes to use as tags (metric_tags)
8 changes: 5 additions & 3 deletions tests/checks/common.py
@@ -168,13 +168,13 @@ def run_check_twice(self, config, agent_config=None, mocks=None,
self.run_check(config, agent_config, mocks)

def run_check_n(self, config, agent_config=None, mocks=None,
force_reload=False, repeat=1):
force_reload=False, repeat=1, sleep=1):
for i in xrange(repeat):
if not i:
self.run_check(config, agent_config, mocks, force_reload)
else:
self.run_check(config, agent_config, mocks)
time.sleep(1)
time.sleep(sleep)

def run_check(self, config, agent_config=None, mocks=None, force_reload=False):
# If not loaded already, do it!
@@ -325,7 +325,7 @@ def _candidates_size_assert(self, candidates, count=None, at_least=1):
raise

def assertMetric(self, metric_name, value=None, tags=None, count=None,
at_least=1, hostname=None, device_name=None):
at_least=1, hostname=None, device_name=None, metric_type=None):
candidates = []
for m_name, ts, val, mdata in self.metrics:
if m_name == metric_name:
@@ -337,6 +337,8 @@ def assertMetric(self, metric_name, value=None, tags=None, count=None,
continue
if device_name is not None and mdata['device_name'] != device_name:
continue
if metric_type is not None and mdata['type'] != metric_type:
continue

candidates.append((m_name, ts, val, mdata))

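The new sleep argument and metric_type filter combine naturally in a test: pace the check runs so rate metrics get a full flush interval, then assert on the type the aggregator actually flushed. An illustrative (not verbatim) usage inside an AgentCheckTest subclass, mirroring the SNMP test below; the metric name is a placeholder:

    # Illustrative only: the metric name and instance config are placeholders.
    def test_paced_run(self):
        config = {'instances': [self.generate_instance_config(self.SCALAR_OBJECTS)]}
        # Three runs, two seconds apart, so rate metrics have a full interval.
        self.run_check_n(config, repeat=3, sleep=2)
        # Keep only candidates whose flushed type is a gauge.
        self.assertMetric("snmp.someForcedGauge", tags=self.CHECK_TAGS,
                          at_least=1, metric_type=MetricTypes.GAUGE)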
73 changes: 72 additions & 1 deletion tests/checks/integration/test_snmp.py
@@ -6,6 +6,7 @@

# agent
from checks import AgentCheck
from checks.metric_types import MetricTypes
from tests.checks.common import AgentCheckTest

# This test is dependent of having a fully open snmpd responding at localhost:161
@@ -64,6 +65,31 @@ class SNMPTestCase(AgentCheckTest):
}
]

FORCED_METRICS = [
{
'OID': "1.3.6.1.2.1.4.24.6.0", # Gauge32
'name': "IAmAGauge32",
'forced_type': 'counter'

}, {
'OID': "1.3.6.1.2.1.4.31.1.1.6.1", # Counter32
'name': "IAmACounter64",
'forced_type': 'gauge'
}
]
INVALID_FORCED_METRICS = [
{
'OID': "1.3.6.1.2.1.4.24.6.0", # Gauge32
'name': "IAmAGauge32",
'forced_type': 'counter'

}, {
'OID': "1.3.6.1.2.1.4.31.1.1.6.1", # Counter32
'name': "IAmACounter64",
'forced_type': 'histogram'
}
]

SCALAR_OBJECTS = [
{
'OID': "1.3.6.1.2.1.7.1.0",
@@ -265,7 +291,7 @@ def test_table(self):
config = {
'instances': [self.generate_instance_config(self.TABULAR_OBJECTS)]
}
self.run_check_n(config, repeat=3)
self.run_check_n(config, repeat=3, sleep=2)
self.service_checks = self.wait_for_async('get_service_checks', 'service_checks', 1)

# Test metrics
@@ -303,6 +329,51 @@ def test_invalid_metric(self):
tags=self.CHECK_TAGS, count=1)
self.coverage_report()

def test_forcedtype_metric(self):
"""
Forced Types should be reported as metrics of the forced type
"""
config = {
'instances': [self.generate_instance_config(self.FORCED_METRICS)]
}
self.run_check_twice(config)
self.service_checks = self.wait_for_async('get_service_checks', 'service_checks', 1)

for metric in self.FORCED_METRICS:
metric_name = "snmp." + (metric.get('name') or metric.get('symbol'))
if metric.get('forced_type') == MetricTypes.COUNTER:
# rate will be flushed as a gauge, so count should be 0.
self.assertMetric(metric_name, tags=self.CHECK_TAGS,
count=0, metric_type=MetricTypes.GAUGE)
elif metric.get('forced_type') == MetricTypes.GAUGE:
self.assertMetric(metric_name, tags=self.CHECK_TAGS,
count=1, metric_type=MetricTypes.GAUGE)

# # Test service check
self.assertServiceCheck("snmp.can_check", status=AgentCheck.OK,
tags=self.CHECK_TAGS, count=1)
self.coverage_report()

def test_invalid_forcedtype_metric(self):
"""
If a forced type is invalid a warning should be issued + a service check
should be available
"""
config = {
'instances': [self.generate_instance_config(self.INVALID_FORCED_METRICS)]
}

self.run_check(config)

self.warnings = self.wait_for_async('get_warnings', 'warnings', 1)
self.assertWarning("Invalid forced-type specified:", count=1, exact_match=False)

# # Test service check
self.service_checks = self.wait_for_async('get_service_checks', 'service_checks', 1)
self.assertServiceCheck("snmp.can_check", status=AgentCheck.CRITICAL,
tags=self.CHECK_TAGS, count=1)
self.coverage_report()

def test_network_failure(self):
"""
Network failure is reported in service check
