Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Marathon and Mesos svc #1205

Merged
merged 8 commits into from
Dec 31, 2014
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 39 additions & 11 deletions checks.d/marathon.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
import requests

class Marathon(AgentCheck):

check_name = "marathon.can_connect"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be upper case (https://www.python.org/dev/peps/pep-0008/#constants)
I'll fix it.


def check(self, instance):
if 'url' not in instance:
raise Exception('Marathon instance missing "url" value.')
Expand All @@ -28,35 +31,61 @@ def check(self, instance):
for app in response['apps']:
tags = ['app_id:' + app['id'], 'version:' + app['version']] + instance_tags
for attr in ['taskRateLimit','instances','cpus','mem','tasksStaged','tasksRunning']:
self.gauge('marathon.' + attr, app[attr], tags=tags)
if hasattr(app, attr):
self.gauge('marathon.' + attr, app[attr], tags=tags)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we log something at debug level if the attribute is not present? (See the comments in PR #1228.)

else:
self.warning('Marathon application (id: %s) has no attribute %s' % (app['id'], attr))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That should be self.log.debug, as it can be expected in some cases depending on the version of Marathon.

versions_reply = self.get_v2_app_versions(url, app['id'], timeout)
if versions_reply is not None:
self.gauge('marathon.versions', len(versions_reply['versions']), tags=tags)

def get_v2_apps(self, url, timeout):
# Use a hash of the URL as an aggregation key
aggregation_key = md5(url).hexdigest()
tags = ['url:%s' % url]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may also want the instance_tags here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we? We don't for other service checks.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on what Remi said. We don't use custom tags in service checks for now. They should all share the same keys in the list of key:val tags

msg = None
status = None
try:
r = requests.get(url + "/v2/apps", timeout=timeout)
if r.status_code != 200:
self.status_code_event(url, r, aggregation_key)
status = AgentCheck.CRITICAL
msg = "Got %s when hitting %s" % (r.status_code, url)
except requests.exceptions.Timeout as e:
# If there's a timeout
self.timeout_event(url, timeout, aggregation_key)
raise Exception("Timeout when hitting %s" % url)
msg = "%s seconds timeout when hitting %s" % (timeout, url)
status = AgentCheck.CRITICAL
except Exception as e:
msg = e.message
status = AgentCheck.CRITICAL
finally:
if status is AgentCheck.CRITICAL:
self.service_check(self.check_name, status, tags=tags, message=msg)
raise Exception(msg)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to do both self.warning and raise Exception. The second one is enough.


if r.status_code != 200:
self.status_code_event(url, r, aggregation_key)
raise Exception("Got %s when hitting %s" % (r.status_code, url))
req_json = r.json()
app_count = len(req_json['apps'])

# Condition for request v1.x backward compatibility
if hasattr(r.json, '__call__'):
return r.json()
if app_count is 0:
status = AgentCheck.WARN
msg = "No marathon applications detected at %s" % url
else:
return r.json
instance_count = 0
for app in req_json['apps']:
instance_count += app['instances']
status = AgentCheck.CRITICAL if instance_count is 0 else AgentCheck.OK
msg = "%s Marathon app(s) detected with %s instances running at %s" % (app_count, instance_count, url)


self.service_check(self.check_name, status, tags=tags, message=msg)

return req_json

def get_v2_app_versions(self, url, app_id, timeout):
# Use a hash of the URL as an aggregation key
aggregation_key = md5(url).hexdigest()

try:
r = requests.get(url + "/v2/apps/" + app_id + "/versions", timeout=timeout)
except requests.exceptions.Timeout as e:
Expand All @@ -68,7 +97,6 @@ def get_v2_app_versions(self, url, app_id, timeout):
if r.status_code != 200:
self.status_code_event(url, r, aggregation_key)
self.warning("Got %s when hitting %s" % (r.status_code, url))
return None

return r.json()

Expand Down
38 changes: 24 additions & 14 deletions checks.d/mesos.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import requests

class Mesos(AgentCheck):
check_name = "mesos.can_connect"

def check(self, instance):
if 'url' not in instance:
raise Exception('Mesos instance missing "url" value.')
Expand Down Expand Up @@ -71,25 +73,33 @@ def get_master_state(self, url, timeout):
def get_json(self, url, timeout):
# Use a hash of the URL as an aggregation key
aggregation_key = md5(url).hexdigest()
tags = ["url:%s" % url]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here with the concat of instance.get("tags", [])

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment. We don't send custom tags in service checks for other checks.

msg = None
status = None
try:
r = requests.get(url, timeout=timeout)
if r.status_code != 200:
self.status_code_event(url, r, aggregation_key)
status = AgentCheck.CRITICAL
msg = "Got %s when hitting %s" % (r.status_code, url)
else:
status = AgentCheck.OK
msg = "Mesos master instance detected at %s " % url
except requests.exceptions.Timeout as e:
# If there's a timeout
self.timeout_event(url, timeout, aggregation_key)
self.warning("Timeout when hitting %s" % url)
return None

if r.status_code != 200:
self.status_code_event(url, r, aggregation_key)
self.warning("Got %s when hitting %s" % (r.status_code, url))
return None

# Condition for request v1.x backward compatibility
if hasattr(r.json, '__call__'):
return r.json()
else:
return r.json

msg = "%s seconds timeout when hitting %s" % (timeout, url)
status = AgentCheck.CRITICAL
except Exception as e:
msg = e.message
status = AgentCheck.CRITICAL
finally:
self.service_check(self.check_name, status, tags=tags, message=msg)
if status is AgentCheck.CRITICAL:
self.warning(msg)
return None

return r.json()

def timeout_event(self, url, timeout, aggregation_key):
self.event({
Expand Down