From 382eba220a12cb5fdd9b75322e5504c8c305e353 Mon Sep 17 00:00:00 2001 From: Brendan Shaklovitz Date: Wed, 3 Aug 2016 07:35:49 -0500 Subject: [PATCH] [gearmand] Add per-task gearman metrics (#2672) * Adds gearman.{queued_by_task, running_by_task, workers_by_task} metrics to collect data on each individual task. This lets you see how many of each task are queued up to catch problems with any individual queue not being processed. * Each new metric is tagged by task (`task:TASK_NAME`) * Limits the maximum number of tasks on which per-task metrics are collected --- checks.d/gearmand.py | 43 ++++++++++++++++++++++++++++++------ conf.d/gearmand.yaml.example | 8 +++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/checks.d/gearmand.py b/checks.d/gearmand.py index 823ce74b78..3f81344f00 100644 --- a/checks.d/gearmand.py +++ b/checks.d/gearmand.py @@ -9,6 +9,9 @@ # project from checks import AgentCheck + +MAX_NUM_TASKS = 200 + class Gearman(AgentCheck): SERVICE_CHECK_NAME = 'gearman.can_connect' @@ -20,18 +23,17 @@ def _get_client(self,host,port): return gearman.GearmanAdminClient(["%s:%s" % (host, port)]) - def _get_metrics(self, client, tags): - data = client.get_status() + def _get_aggregate_metrics(self, tasks, tags): running = 0 queued = 0 workers = 0 - for stat in data: + for stat in tasks: running += stat['running'] queued += stat['queued'] workers += stat['workers'] - unique_tasks = len(data) + unique_tasks = len(tasks) self.gauge("gearman.unique_tasks", unique_tasks, tags=tags) self.gauge("gearman.running", running, tags=tags) @@ -41,9 +43,34 @@ def _get_metrics(self, client, tags): self.log.debug("running %d, queued %d, unique tasks %d, workers: %d" % (running, queued, unique_tasks, workers)) + def _get_per_task_metrics(self, tasks, task_filter, tags): + if len(task_filter) > MAX_NUM_TASKS: + self.warning( + "The maximum number of tasks you can specify is {}.".format(MAX_NUM_TASKS)) + + if not len(task_filter) == 0: + tasks = [t for t in tasks if t['task'] in 
task_filter] + + if len(tasks) > MAX_NUM_TASKS: + # Display a warning in the info page + self.warning( + "Too many tasks to fetch. You must choose the tasks you are interested in by editing the gearmand.yaml configuration file or get in touch with Datadog Support") + + for stat in tasks[:MAX_NUM_TASKS]: + running = stat['running'] + queued = stat['queued'] + workers = stat['workers'] + + task_tags = tags[:] + task_tags.append("task:{}".format(stat['task'])) + self.gauge("gearman.running_by_task", running, tags=task_tags) + self.gauge("gearman.queued_by_task", queued, tags=task_tags) + self.gauge("gearman.workers_by_task", workers, tags=task_tags) + def _get_conf(self, instance): host = instance.get('server', None) port = instance.get('port', None) + tasks = instance.get('tasks', []) if host is None: self.warning("Host not set, assuming 127.0.0.1") @@ -55,12 +82,12 @@ def _get_conf(self, instance): tags = instance.get('tags', []) - return host, port, tags + return host, port, tasks, tags def check(self, instance): self.log.debug("Gearman check start") - host, port, tags = self._get_conf(instance) + host, port, task_filter, tags = self._get_conf(instance) service_check_tags = ["server:{0}".format(host), "port:{0}".format(port)] @@ -70,7 +97,9 @@ def check(self, instance): tags += service_check_tags try: - self._get_metrics(client, tags) + tasks = client.get_status() + self._get_aggregate_metrics(tasks, tags) + self._get_per_task_metrics(tasks, task_filter, tags) self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, message="Connection to %s:%s succeeded." % (host, port), tags=service_check_tags) diff --git a/conf.d/gearmand.yaml.example b/conf.d/gearmand.yaml.example index 2abf945222..ab6a060672 100644 --- a/conf.d/gearmand.yaml.example +++ b/conf.d/gearmand.yaml.example @@ -3,6 +3,14 @@ init_config: instances: - server: localhost port: 4730 + # Use the `tasks` parameter to specify the tasks you'd like to + # collect metrics on (up to 200 tasks). 
+ # + # If you have fewer than 200 tasks, you don't have to set this parameter; + # metrics are collected on all tasks by default. + # tasks: + # - task1 + # - task2 # tags: # - optional_tag_1 # - optional_tag_2