"""Agent check that reports Apache Mesos master metrics.

The check polls three JSON endpoints of a Mesos master
(``/master/roles.json``, ``/master/stats.json`` and ``/master/state.json``)
and submits the values it finds as gauges.

Example configuration (``conf.d/mesos.yaml``)::

    init_config:
    # time to wait on a Mesos API request
    # default_timeout: 5

    instances:
    # url: the API endpoint of your Mesos master
    # - url: "https://server:port"
"""
import time
import requests

from checks import AgentCheck
from util import json, headers
from hashlib import md5
import urllib2

# Task/slave counters read from the top level of state.json.
_STATE_ATTRIBUTES = (
    'deactivated_slaves',
    'failed_tasks',
    'finished_tasks',
    'killed_tasks',
    'lost_tasks',
    'staged_tasks',
    'started_tasks',
)


class Mesos(AgentCheck):
    """Report Mesos master role, stats and state values as gauges."""

    def check(self, instance):
        """Collect and submit metrics for one configured Mesos master.

        Instance keys read:
          * ``url`` (required): base URL of the Mesos master.
          * ``timeout`` (optional): request timeout in seconds; falls back
            to ``default_timeout`` from ``init_config``, then to 5.
          * ``tags`` (optional): list of tags added to every metric.

        Raises:
            Exception: if the instance has no ``url`` key.
        """
        if 'url' not in instance:
            raise Exception('Mesos instance missing "url" value.')

        # Load values from the instance config.
        url = instance['url']
        # ``or []`` tolerates an explicit ``tags: null``; the copy keeps the
        # per-metric tag lists from aliasing the configured list.
        instance_tags = list(instance.get('tags') or [])
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))

        roles = self.get_master_roles(url, timeout)
        if roles is not None:
            self._report_roles(roles, instance_tags)

        stats = self.get_master_stats(url, timeout)
        if stats is not None:
            self._report_stats(stats, instance_tags)

        state = self.get_master_state(url, timeout)
        if state is not None:
            self._report_state(state, instance_tags)

    def _report_resources(self, prefix, resources, names, tags):
        """Gauge ``prefix + name`` for each of ``names`` present in ``resources``."""
        for name in names:
            if name in resources:
                self.gauge(prefix + name, resources[name], tags=tags)

    def _report_roles(self, roles, instance_tags):
        """Submit per-role framework count, weight and cpu/mem resources."""
        for role in roles['roles']:
            tags = ['role:' + role['name']] + instance_tags
            self.gauge('mesos.role.frameworks', len(role['frameworks']), tags=tags)
            self.gauge('mesos.role.weight', role['weight'], tags=tags)
            self._report_resources('mesos.role.', role['resources'],
                                   ('cpus', 'mem'), tags)

    def _report_stats(self, stats, instance_tags):
        """Submit every key of stats.json as ``mesos.stats.<key>``."""
        for key, value in stats.items():
            self.gauge('mesos.stats.' + key, value, tags=instance_tags)

    def _report_state(self, state, instance_tags):
        """Submit master counters plus per-framework and per-slave resources."""
        for attr in _STATE_ATTRIBUTES:
            self.gauge('mesos.state.' + attr, state[attr], tags=instance_tags)

        for framework in state['frameworks']:
            tags = ['framework:' + framework['id']] + instance_tags
            self._report_resources('mesos.state.framework.', framework['resources'],
                                   ('cpus', 'mem'), tags)

        for slave in state['slaves']:
            # No hard-coded 'mesos' tag here, matching the role and
            # framework metrics, which carry only the instance tags.
            tags = ['slave:' + slave['id']] + instance_tags
            self._report_resources('mesos.state.slave.', slave['resources'],
                                   ('cpus', 'mem', 'disk'), tags)

    def get_master_roles(self, url, timeout):
        """Return the parsed ``/master/roles.json`` document, or None on failure."""
        return self.get_json(url + "/master/roles.json", timeout)

    def get_master_stats(self, url, timeout):
        """Return the parsed ``/master/stats.json`` document, or None on failure."""
        return self.get_json(url + "/master/stats.json", timeout)

    def get_master_state(self, url, timeout):
        """Return the parsed ``/master/state.json`` document, or None on failure."""
        return self.get_json(url + "/master/state.json", timeout)

    def get_json(self, url, timeout):
        """GET ``url`` and return its decoded JSON body.

        Returns None, after emitting an event, when the request times out
        or the response status is not 200. Only the timeout exception is
        caught here; any other request or JSON decoding error propagates
        to the caller.
        """
        # Use a hash of the URL as an aggregation key
        aggregation_key = md5(url).hexdigest()

        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.Timeout:
            # If there's a timeout
            self.timeout_event(url, timeout, aggregation_key)
            return None

        # Check the status before parsing so that an error page is reported
        # as an event instead of being decoded as metric data.
        if r.status_code != 200:
            self.status_code_event(url, r, aggregation_key)
            return None

        return r.json()

    def timeout_event(self, url, timeout, aggregation_key):
        """Emit an event recording that ``url`` timed out after ``timeout`` seconds."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            'msg_title': 'URL timeout',
            'msg_text': '%s timed out after %s seconds.' % (url, timeout),
            'aggregation_key': aggregation_key
        })

    def status_code_event(self, url, r, aggregation_key):
        """Emit an event recording the unexpected status code of response ``r``."""
        self.event({
            'timestamp': int(time.time()),
            'event_type': 'http_check',
            'msg_title': 'Invalid reponse code for %s' % url,
            'msg_text': '%s returned a status of %s' % (url, r.status_code),
            'aggregation_key': aggregation_key
        })


if __name__ == '__main__':
    # Manual driver: run the check once per configured instance and dump
    # whatever events and metrics it produced.
    check, instances = Mesos.from_yaml('/etc/dd-agent/conf.d/mesos.yaml')
    for instance in instances:
        print("\nRunning the check against url: %s" % (instance['url']))
        check.check(instance)
        if check.has_events():
            print('Events: %s' % (check.get_events()))

        print('Metrics:\n')
        for i, metric in enumerate(check.get_metrics()):
            print(" %d: %s" % (i, metric))