"""Riak agent check.

Fetches a Riak node's HTTP stats document (JSON) and reports a fixed set of
its fields as ``riak.*`` gauges. It also reports ``riak.coord_redirs``, the
change in ``coord_redirs_total`` since the previous run.

Example configuration (``conf.d/riak.yaml``)::

    init_config:

    instances:
      - url: http://127.0.0.1:8098/stats

Optional settings read by the check: ``default_timeout`` in ``init_config``
and ``timeout`` per instance (seconds, default 5).
"""
import socket
import time

from checks import AgentCheck
from checks.libs.httplib2 import Http, HttpLib2Error
from util import json
from util import md5


class Riak(AgentCheck):
    """Agent check that turns Riak stats into ``riak.*`` gauges."""

    # Stats reported verbatim as "riak.<name>" when present in the response.
    keys = [
        "vnode_gets",
        "vnode_puts",
        "vnode_index_reads",
        "vnode_index_writes",
        "vnode_index_deletes",
        "node_gets",
        "node_puts",
        "pbc_active",
        "pbc_connects",
        "memory_total",
        "memory_processes",
        "memory_processes_used",
        "memory_atom",
        "memory_atom_used",
        "memory_binary",
        "memory_code",
        "memory_ets",
        "read_repairs",
    ]

    # Stats that Riak publishes once per suffix below (e.g.
    # "node_get_fsm_time_mean"); expanded into full names in __init__.
    stat_keys = [
        "node_get_fsm_siblings",
        "node_get_fsm_objsize",
        "node_get_fsm_time",
        "node_put_fsm_time",
    ]

    def __init__(self, name, init_config, agentConfig, instances=None):
        """Initialise the check and expand ``stat_keys`` into metric names.

        Arguments are forwarded unchanged to ``AgentCheck.__init__``.
        """
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # Build a per-instance list instead of appending to the class-level
        # ``keys``: appending mutated shared state, so every additional Riak
        # instance added the derived names again.
        derived = [
            stat + "_" + suffix
            for suffix in ("mean", "median", "95", "99", "100")
            for stat in self.stat_keys
        ]
        self.keys = list(self.keys) + derived

        # Last seen coord_redirs_total; -1 means "no sample yet".
        # NOTE(review): this baseline is a single value on the check object,
        # so it is shared by every configured instance -- with more than one
        # URL the reported delta mixes nodes. Confirm whether that matters.
        self.prev_coord_redirs_total = -1

    def check(self, instance):
        """Poll one instance and submit its gauges.

        ``instance`` must contain ``url``; ``timeout`` is optional and falls
        back to ``init_config['default_timeout']``, then to 5 seconds.

        A request failure or a non-200 response produces an event and no
        metrics for this run.
        """
        url = instance['url']
        # init_config may be None when the YAML section is left empty.
        default_timeout = (self.init_config or {}).get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        aggregation_key = md5(url).hexdigest()

        try:
            resp, content = Http(timeout=timeout).request(url, "GET")
        except (socket.timeout, socket.error, HttpLib2Error):
            # All request failures are reported through the timeout event,
            # as before; only the three duplicate handlers were merged.
            self.timeout_event(url, timeout, aggregation_key)
            return

        if resp.status != 200:
            # Pass the httplib2 response (the old code passed an undefined
            # name) and stop: the body of an error response is not stats.
            self.status_code_event(url, resp, aggregation_key)
            return

        stats = json.loads(content)

        for key in self.keys:
            if key in stats:
                self.gauge("riak." + key, stats[key])

        # Skip the delta when the node does not publish the counter rather
        # than failing the whole check with a KeyError.
        coord_redirs_total = stats.get("coord_redirs_total")
        if coord_redirs_total is None:
            return

        if self.prev_coord_redirs_total > -1:
            count = coord_redirs_total - self.prev_coord_redirs_total
            self.gauge('riak.coord_redirs', count)

        self.prev_coord_redirs_total = coord_redirs_total

    def timeout_event(self, url, timeout, aggregation_key):
        """Emit an event saying ``url`` timed out after ``timeout`` seconds."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'riak_check',
            'msg_title': 'riak check timeout',
            'msg_text': '%s timed out after %s seconds.' % (url, timeout),
            'aggregation_key': aggregation_key
        })

    def status_code_event(self, url, r, aggregation_key):
        """Emit an event reporting the unexpected HTTP status of ``r``.

        ``r`` is the response object returned by the request in ``check``;
        its status code is read from ``r.status``.
        """
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'riak_check',
            'msg_title': 'Invalid reponse code for riak check',
            'msg_text': '%s returned a status of %s' % (url, r.status),
            'aggregation_key': aggregation_key
        })


if __name__ == '__main__':
    # Manual smoke test: run the check once per configured instance and dump
    # whatever it collected.
    check, instances = Riak.from_yaml('/etc/dd-agent/conf.d/riak.yaml')
    for instance in instances:
        print("\nRunning the check against url: %s" % (instance['url']))
        check.check(instance)
        if check.has_events():
            print('Events: %s' % (check.get_events()))
        print('Metrics: %s' % (check.get_metrics()))