From 0f6454546a21b687669c4d5e389c76d50e4c2058 Mon Sep 17 00:00:00 2001 From: Tracey McEvoy Date: Thu, 26 Jan 2017 16:45:21 -0800 Subject: [PATCH 1/8] Adding datadog tracing to ansible-datadog --- defaults/main.yml | 7 +++++++ handlers/main.yml | 7 ++++--- tasks/pkg-redhat.yml | 24 +++++++++++++++++++++++- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/defaults/main.yml b/defaults/main.yml index 859bfaf4..35ee4978 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -13,3 +13,10 @@ datadog_group: root # default apt repo datadog_apt_repo: "deb http://apt.datadoghq.com/ stable main" + +# default install datadog tracing agent +datadog-trace: False + +# temporary hardcoded datadog agents until tracing is fully integrated +datadog_agent_version: datadog-agent-5.10.1-1 +datadog_trace_agent_version: dd-trace-agent-0.99.62-1 diff --git a/handlers/main.yml b/handlers/main.yml index 4c8a0afb..8e9fe0ca 100644 --- a/handlers/main.yml +++ b/handlers/main.yml @@ -1,5 +1,6 @@ --- - - name: restart datadog-agent - action: service name=datadog-agent state=restarted - when: datadog_enabled + service: name=datadog-agent state=restarted + +- name: restart datadog-trace-agent + service: name=dd-trace-agent state=restarted \ No newline at end of file diff --git a/tasks/pkg-redhat.yml b/tasks/pkg-redhat.yml index 78b0e58f..7da3106d 100644 --- a/tasks/pkg-redhat.yml +++ b/tasks/pkg-redhat.yml @@ -3,4 +3,26 @@ template: src=datadog.repo.j2 dest=/etc/yum.repos.d/datadog.repo owner=root group=root mode=0644 - name: Install datadog-agent package - yum: name=datadog-agent state=latest + yum: name={{ datadog_agent_version }} state=present enablerepo=datadog + notify: restart datadog-agent + +- name: Configure datadog-agent + template: src=datadog/datadog.conf.j2 dest=/etc/dd-agent/datadog.conf + notify: restart datadog-agent + +- name: Update yum cache + command: sudo yum makecache + when: datadog-trace is defined and datadog-trace == True + +- name: Copy trace repo file 
into place + template: src=datadog-trace.repo.j2 dest=/etc/yum.repos.d/datadog-trace.repo owner=root group=root mode=0644 + when: datadog-trace is defined and datadog-trace == True + +- name: Install datadog-trace agent + yum: name={{ datadog_trace_agent_version }} state=present + when: datadog-trace is defined and datadog-trace == True + notify: restart datadog-trace-agent + +- name: Uninstall dd-trace package + yum: name=dd-trace-agent state=absent + when: datadog-trace is defined and datadog-trace == False \ No newline at end of file From ec7595714260403e44d5c48ee9521e9c5a1d3725 Mon Sep 17 00:00:00 2001 From: Tracey McEvoy Date: Mon, 30 Jan 2017 14:22:13 +0000 Subject: [PATCH 2/8] Updating config files --- tasks/pkg-redhat.yml | 2 +- templates/datadog-trace.repo.j2 | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 templates/datadog-trace.repo.j2 diff --git a/tasks/pkg-redhat.yml b/tasks/pkg-redhat.yml index 7da3106d..d856ee9f 100644 --- a/tasks/pkg-redhat.yml +++ b/tasks/pkg-redhat.yml @@ -7,7 +7,7 @@ notify: restart datadog-agent - name: Configure datadog-agent - template: src=datadog/datadog.conf.j2 dest=/etc/dd-agent/datadog.conf + template: src=datadog.conf.j2 dest=/etc/dd-agent/datadog.conf notify: restart datadog-agent - name: Update yum cache diff --git a/templates/datadog-trace.repo.j2 b/templates/datadog-trace.repo.j2 new file mode 100644 index 00000000..c07f38f1 --- /dev/null +++ b/templates/datadog-trace.repo.j2 @@ -0,0 +1,7 @@ +[datadog-trace] +name = Datadog, Inc. 
+baseurl = http://yum-trace.datad0g.com.s3.amazonaws.com/x86_64/ +enabled=1 +priority=1 +gpgcheck=1 +gpgkey=https://yum.datadoghq.com/DATADOG_RPM_KEY_E09422B3.public \ No newline at end of file From fcf9ff0c9a96e68fbeaee40218f75a7acba18f29 Mon Sep 17 00:00:00 2001 From: Tracey McEvoy Date: Tue, 31 Jan 2017 15:25:54 +0000 Subject: [PATCH 3/8] Adding ability to add check agents to main.yml --- defaults/main.yml | 3 +++ tasks/main.yml | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/defaults/main.yml b/defaults/main.yml index 35ee4978..84b173fb 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -7,6 +7,9 @@ datadog_config: {} # default checks enabled datadog_checks: {} +# default checks enabled +datadog_check_agents: {} + # default user/group datadog_user: dd-agent datadog_group: root diff --git a/tasks/main.yml b/tasks/main.yml index ffa0bd65..8a345e73 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -36,3 +36,13 @@ with_items: '{{ datadog_checks.keys() }}' notify: - restart datadog-agent + +- name: Create a check agent for each Datadog check + copy: + src={{ item.src }}/{{ item.name }} + dest=/etc/dd-agent/checks.d/{{ item.name }} + owner={{ datadog_user }} + group={{ datadog_group }} + with_items: '{{ datadog_check_agents.keys() }}' + notify: + - restart datadog-agent From f405a8671a7f013cc616c10ed189a5bf6b7c31c4 Mon Sep 17 00:00:00 2001 From: Tracey McEvoy Date: Tue, 31 Jan 2017 16:25:47 +0000 Subject: [PATCH 4/8] Adding datadock config checks to datadog role --- files/nutcracker.py | 217 +++++++++++++++++++++++++ files/redis-labs-enterprise-cluster.py | 195 ++++++++++++++++++++++ files/uwsgi.py | 128 +++++++++++++++ tasks/main.yml | 4 +- 4 files changed, 542 insertions(+), 2 deletions(-) create mode 100644 files/nutcracker.py create mode 100644 files/redis-labs-enterprise-cluster.py create mode 100755 files/uwsgi.py diff --git a/files/nutcracker.py b/files/nutcracker.py new file mode 100644 index 00000000..124ac905 --- /dev/null +++ 
b/files/nutcracker.py @@ -0,0 +1,217 @@ +""" + +To test this, run 'sudo -u dd-agent dd-agent check nutcracker' + +When ready: +- place this file in /etc/dd-agent/checks.d/nutcracker.py +- put the config file in /etc/dd-agent/conf.d/nutcracker.yaml +- service datadog-agent restart +""" + +import hashlib +import json +import md5 +import memcache +import os +import socket +import sys +import time +import uuid + +from checks import AgentCheck + +class NutcrackerCheck(AgentCheck): + SOURCE_TYPE_NAME = 'nutcracker' + SERVICE_CHECK = 'nutcracker.can_connect' + + DEFAULT_HOST = '127.0.0.1' + DEFAULT_PORT = 11211 + DEFAULT_STATS_PORT = 22222 + + # Pool stats. These descriptions are from 'nutcracker --describe-stats' + POOL_STATS = [ + ['curr_connections', 'gauge', None], # Number of current connections + ['total_connections', 'rate', None], # Running total connections made + ['server_ejects', 'rate', None], # times a backend server was ejected + ['client_err', 'rate', None], # errors on client connections + ] + + # Server stats. 
These descriptions are from 'nutcracker --describe-stats' + SERVER_STATS = [ + ['server_eof', 'rate', None], # eof on server connections + ['server_err', 'rate', None], # errors on server connections + ['server_timedout', 'rate', 'timedout'], # timeouts on server connections + ['server_connections', 'gauge', 'connections'], # active server connections + ['requests', 'rate', None], # requests + ['request_bytes', 'rate', None], # total request bytes + ['responses', 'rate', None], # responses + ['response_bytes', 'rate', None], # total response bytes + ['in_queue', 'gauge', None], # requests in incoming queue + ['in_queue_bytes', 'gauge', None], # current request bytes in incoming queue + ['out_queue', 'gauge', None], # requests in outgoing queue + ['out_queue_bytes', 'gauge', None], # current request bytes in outgoing queue + ] + + def _get_raw_stats(self, host, stats_port): + # Connect + self.log.debug("Connecting to %s:%s", host, stats_port) + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect((host, stats_port)) + + # Read + file = s.makefile('r') + data = file.readline(); + s.close() + + # Load + return json.loads(data); + + def _send_datadog_stat(self, item, data, tag_map, prefix): + # Break out the info + stat_key, stat_type, override_name = item + + # Make sure we have a name + if not override_name: + override_name = stat_key + + # Add the prefix if appropriate. + if prefix: + override_name = prefix + "_" + override_name + + try: + # Get the data, make sure it's there. + stat_data = float(data.get(stat_key)) + except: + # Hrm, not there. Let it be zero. + stat_data = 0 + + # Make the datadog metric. 
+ metric = self.normalize(override_name.lower(), self.SOURCE_TYPE_NAME) + + tags = [k+":"+v for k,v in tag_map.iteritems()] + + if stat_type == 'gauge': + self.gauge(metric, stat_data, tags=tags) + return + + if stat_type == 'rate': + metric += "_rate" + self.rate(metric, stat_data, tags=tags) + return + + if stat_type == 'bool': + self.gauge(metric, (1 if stat_data else 0), tags=tags) + return + + raise Exception("Unknown datadog stat type '%s' for key '%s'" % (stat_type, stat_key)) + + def _get_metrics(self, host, port, stats_port, tags, aggregation_key): + try: + raw_stats = self._get_raw_stats(host, stats_port) + except Exception as e: + self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL) + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'get_stats', + 'msg_title': 'Cannot get stats', + 'msg_text': str(e), + 'aggregation_key': aggregation_key + }) + + raise + + + # pprint.pprint(raw_stats) + + # Get all the pool stats + for pool_key, pool_data in raw_stats.iteritems(): + try: + # Pools are not separated from the other keys, blarg. + # Just check if it's a dict with one of the pool keys, if not then skip it. + pool_data['client_connections'] + except: + # Not there, it's not a pool. + self.log.debug(pool_key + ": NOT A POOL"); + continue + + # Start the stat tags. + tags['nutcracker_pool'] = pool_key + + # It's a pool. Process all the non-server stats + for item in self.POOL_STATS: + self._send_datadog_stat(item, pool_data, tags, "pool") + + # Find all the servers. + for server_key, server_data in pool_data.iteritems(): + try: + # Servers are not separated from the other keys, blarg. + # Just check if it's a dict with one of the server keys, if not then skip it. + server_data['in_queue_bytes'] + except: + # Not there, it's not a server. + self.log.debug(server_key + ": NOT A SERVER"); + continue + + # Set the server in the tags. + tags['nutcracker_pool_server'] = server_key + + # It's a server. Send stats. 
+ for item in self.SERVER_STATS: + self._send_datadog_stat(item, server_data, tags, "server") + + # The key for our roundtrip tests. + key = uuid.uuid4().hex + + try: + # Make the connection and do a round trip. + mc = memcache.Client([host+':'+str(port)], debug=0) + + mc.set(key, key) + data = mc.get(key) + mc.delete(key) + empty_data = mc.get(key) + + # Did the get work? + if data != key: + raise Exception("Cannot set and get") + + # Did the delete work? + if empty_data: + raise Exception("Cannot delete") + + except Exception as e: + # Something failed. + metric = self.normalize("test_connect_fail", self.SOURCE_TYPE_NAME) + self.gauge(metric, 1, tags=tags) + + self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL) + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'test_data', + 'msg_title': 'Cannot get/set/delete', + 'msg_text': str(e), + 'aggregation_key': aggregation_key + }) + + raise + + # Connection is ok. + self.service_check(self.SERVICE_CHECK, AgentCheck.OK) + + # Called by datadog as the starting point for this check. 
+ def check(self, instance): + host = instance.get('host', self.DEFAULT_HOST) + port = int(instance.get('port', self.DEFAULT_PORT)) + stats_port = int(instance.get('stats_port', self.DEFAULT_STATS_PORT)) + + tags = {} + for item in instance.get('tags', []): + k, v = item.split(":", 1) + tags[k] = v + + tags["host"] = host + ":" + str(port) + + aggregation_key = hashlib.md5(host+":"+str(port)).hexdigest() + + self._get_metrics(host, port, stats_port, tags, aggregation_key) + diff --git a/files/redis-labs-enterprise-cluster.py b/files/redis-labs-enterprise-cluster.py new file mode 100644 index 00000000..75dd8750 --- /dev/null +++ b/files/redis-labs-enterprise-cluster.py @@ -0,0 +1,195 @@ +""" +To test this, run: +'sudo -u dd-agent dd-agent check redis-labs-enterprise-cluster' + +When ready: +- place this file in /etc/dd-agent/checks.d/redis-labs-enterprise-cluster.py +- put the config in /etc/dd-agent/conf.d/redis-labs-enterprise-cluster.yaml +- service datadog-agent restart +""" + +import base64 +import json +import ssl +import socket +import time +import urllib2 + +from checks import AgentCheck + +GIG = 1024 * 1024 * 1024 + + +class RedisLabsEnterpriseClusterCheck(AgentCheck): + SOURCE_TYPE_NAME = 'rlec' + SERVICE_CHECK = 'rlec.can_connect' + + DEFAULT_HOST = '127.0.0.1' + DEFAULT_PORT = 9443 + + stats_endpoints = [ + 'bdbs', + 'nodes', + 'bdbs/stats/last', + 'shards/stats/last', + 'nodes/stats/last', + 'cluster/stats/last', + # 'cluster/actions', + ] + + def metric_name(self, metric): + return self.normalize(metric.lower(), self.SOURCE_TYPE_NAME) + + def _get_raw_stats(self, host, port, username, password): + data = {} + for endpoint in self.stats_endpoints: + url = 'https://%s:%s/v1/%s' % (host, port, endpoint) + + req = urllib2.Request(url) + + base64string = base64.encodestring('%s:%s' % (username, password)) + base64string = base64string[:-1] + req.add_header("Authorization", "Basic %s" % base64string) + + context = ssl._create_unverified_context() + response 
= urllib2.urlopen(req, context=context) + + data[endpoint] = json.loads(response.read()) + + return data + + def gauge(self, metric, value, *args, **kwargs): + metric = self.metric_name(metric) + return super(RedisLabsEnterpriseClusterCheck, self).gauge( + metric, value, *args, **kwargs + ) + + def _get_metrics_dbs(self, raw_stats): + bdb_stats_map = raw_stats['bdbs/stats/last'] + for item in raw_stats['bdbs']: + uid = item['uid'] + str_uid = str(uid) + bdb_stats = bdb_stats_map[str_uid] + + name = item['name'] + + tags = [ + 'db_name:%s' % name + ] + + mem_gigs = int(item['memory_size']) / GIG + used_gigs = int(bdb_stats['used_memory']) / GIG + + self.gauge('db.total_size_in_gigs', mem_gigs, tags=tags) + self.gauge('db.num_shards', item['shards_count'], tags=tags) + self.gauge('db.read_hits', bdb_stats['read_hits'], tags=tags) + self.gauge('db.read_misses', bdb_stats['read_misses'], tags=tags) + self.gauge('db.write_hits', bdb_stats['write_hits'], tags=tags) + self.gauge('db.write_misses', bdb_stats['write_misses'], tags=tags) + self.gauge('db.num_connections', bdb_stats['conns'], tags=tags) + self.gauge('db.num_keys', bdb_stats['no_of_keys'], tags=tags) + self.gauge('db.bytes_added', bdb_stats['ingress_bytes'], tags=tags) + self.gauge('db.bytes_read', bdb_stats['egress_bytes'], tags=tags) + self.gauge('db.count_evicted', bdb_stats['evicted_objects'], + tags=tags) + self.gauge('db.count_expired', bdb_stats['expired_objects'], + tags=tags) + self.gauge('db.ops_per_sec', + bdb_stats['instantaneous_ops_per_sec'], tags=tags) + self.gauge('db.used_memory_in_gigs', used_gigs, tags=tags) + + def _get_metrics_nodes(self, raw_stats): + node_stats_map = raw_stats['nodes/stats/last'] + for node in raw_stats['nodes']: + # Get the node ip. Don't send these stats if it doesn't match + # the node we're on. 
+ ip = node['addr'] + if ip != socket.gethostbyname(socket.gethostname()): + continue + + uid = node['uid'] + node_stats = node_stats_map[str(uid)] + + tags = [ + 'node_ip:%s' % ip, + 'node_uid:%s' % uid + ] + + is_active = 1 if (node['status'] == 'active') else 0 + ephemeral_gigs = int(node_stats['ephemeral_storage_free']) / GIG + persistent_gigs = int(node_stats['persistent_storage_free']) / GIG + memory_gigs = int(node_stats['free_memory']) / GIG + + self.gauge('node.shard_count', node['shard_count'], tags=tags) + self.gauge('node.active', is_active, tags=tags) + self.gauge('node.connections', node_stats['conns'], tags=tags) + self.gauge('node.aof_rewrites', node_stats['cur_aof_rewrites'], + tags=tags) + self.gauge('node.ephemeral_free_space_gigs', ephemeral_gigs, + tags=tags) + self.gauge('node.persistent_free_space_gigs', persistent_gigs, + tags=tags) + self.gauge('node.free_memory_gigs', memory_gigs, tags=tags) + self.gauge('node.requests', node_stats['total_req'], tags=tags) + + def _get_metrics_shards(self, raw_stats): + """ + At this time it looks like this isn't useful info + + shards_stats_map = raw_stats['shards/stats/last'] + """ + + def _get_metrics_cluster(self, raw_stats): + stats = raw_stats['cluster/stats/last'] + + tags = [] + + ephemeral_gigs = int(stats['ephemeral_storage_free']) / GIG + persistent_gigs = int(stats['persistent_storage_free']) / GIG + memory_gigs = int(stats['free_memory']) / GIG + + self.gauge('cluster.connections', stats['conns'], tags=tags) + self.gauge('cluster.ephemeral_free_space_gigs', ephemeral_gigs, + tags=tags) + self.gauge('cluster.persistent_free_space_gigs', persistent_gigs, + tags=tags) + self.gauge('cluster.free_memory_gigs', memory_gigs, tags=tags) + self.gauge('cluster.requests', stats['total_req'], tags=tags) + self.gauge('cluster.bytes_added', stats['ingress_bytes'], tags=tags) + self.gauge('cluster.bytes_read', stats['egress_bytes'], tags=tags) + self.gauge('cluster.cpu_idle', stats['cpu_idle'], tags=tags) + 
+ def _get_metrics(self, host, port, username, password, tags): + try: + raw_stats = self._get_raw_stats(host, port, username, password) + except Exception: + self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL) + self.increment(self.metric_name('node.get_stats.failure'), 1, tags = [ + 'host:%s' % host, + ]) + + raise + + # Send all stats + self._get_metrics_dbs(raw_stats) + self._get_metrics_nodes(raw_stats) + self._get_metrics_shards(raw_stats) + self._get_metrics_cluster(raw_stats) + + # Connection is ok. + self.service_check(self.SERVICE_CHECK, AgentCheck.OK) + + # Called by datadog as the starting point for this check. + def check(self, instance): + host = instance.get('host', self.DEFAULT_HOST) + port = int(instance.get('port', self.DEFAULT_PORT)) + username = instance['username'] + password = instance['password'] + + tags = {} + for item in instance.get('tags', []): + k, v = item.split(":", 1) + tags[k] = v + + self._get_metrics(host, port, username, password, tags) + diff --git a/files/uwsgi.py b/files/uwsgi.py new file mode 100755 index 00000000..81d698f3 --- /dev/null +++ b/files/uwsgi.py @@ -0,0 +1,128 @@ +""" + +To test this, run 'sudo -u dd-agent dd-agent check uwsgi' + +When ready: +- place this file in /etc/dd-agent/checks.d/uwsgi.py +- put the config file in /etc/dd-agent/conf.d/uwsgi.yaml +- service datadog-agent restart +""" + +import hashlib +import glob +import json +import os +import socket +from stat import ST_CTIME +import time + +from checks import AgentCheck + + +class UwsgiCheck(AgentCheck): + SOURCE_TYPE_NAME = 'uwsgi' + SERVICE_CHECK = 'uwsgi.can_connect' + + def metric_name(self, metric): + return self.normalize(metric.lower(), self.SOURCE_TYPE_NAME) + + def gauge(self, metric, value, *args, **kwargs): + metric = self.metric_name(metric) + return super(UwsgiCheck, self).gauge( + metric, value, *args, **kwargs + ) + + def histogram(self, metric, value, *args, **kwargs): + metric = self.metric_name(metric) + return 
super(UwsgiCheck, self).histogram( + metric, value, *args, **kwargs + ) + + def _get_raw_stats(self): + chosen_socket = None + latest_ctime = 0 + files = glob.glob('/tmp/uwsgi_stats_*.socket') + for fname in files: + stats = os.stat(fname) + + if stats[ST_CTIME] > latest_ctime: + latest_ctime = stats[ST_CTIME] + chosen_socket = fname + + if not chosen_socket: + raise RuntimeError("Cannot find uwsgi stats socket file") + + sock_obj = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock_obj.connect(chosen_socket) + + json_str = '' + while True: + data = sock_obj.recv(4096) + if len(data) < 1: + break + json_str += data.decode('utf8') + + return json.loads(json_str) + + def _send_stats(self, data): + self._send_stats_workers(data) + + def _send_stats_workers(self, data): + code_dir = data['cwd'] + master_pid = data['pid'] + global_tags = [ + 'code_dir:%s' % code_dir, + 'master_pid:%s' % master_pid, + ] + + self.gauge('listen_queue', data['listen_queue'], tags=global_tags) + self.gauge('listen_queue_errors', data['listen_queue_errors'], tags=global_tags) + + for worker in data['workers']: + worker_tags = [ + 'pid:%s' % worker['pid'], + 'worker_id:%s' % worker['id'], + ] + tags = worker_tags + global_tags + + self.gauge('worker.accepting', worker['accepting'], tags=tags) + self.gauge('worker.status.%s' % worker['status'], 1, tags=tags) + self.gauge('worker.running_time', worker['running_time'], + tags=tags) + self.gauge('worker.data_transmitted', worker['tx'], tags=tags) + self.gauge('worker.address_space', worker['vsz'], tags=tags) + self.gauge('worker.rss_memory', worker['rss'], tags=tags) + self.gauge('worker.respawn_count', worker['respawn_count'], + tags=tags) + self.gauge('worker.exceptions_count', worker['exceptions'], + tags=tags) + self.gauge('worker.harakiri_count', worker['harakiri_count'], + tags=tags) + self.histogram('worker.avg_response_time_ms', + worker['avg_rt']/1000, tags=tags) + + def _get_metrics(self, aggregation_key): + try: + raw_stats = 
self._get_raw_stats() + except Exception as e: + self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL) + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'get_stats', + 'msg_title': 'Cannot get stats', + 'msg_text': str(e), + 'aggregation_key': aggregation_key + }) + + raise + + self._send_stats(raw_stats) + + # Connection is ok. + self.service_check(self.SERVICE_CHECK, AgentCheck.OK) + + # Called by datadog as the starting point for this check. + def check(self, instance): + aggregation_key = hashlib.md5().hexdigest() + self._get_metrics(aggregation_key) + diff --git a/tasks/main.yml b/tasks/main.yml index 8a345e73..7dacb213 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -39,8 +39,8 @@ - name: Create a check agent for each Datadog check copy: - src={{ item.src }}/{{ item.name }} - dest=/etc/dd-agent/checks.d/{{ item.name }} + src={{ item.name }}.py + dest=/etc/dd-agent/checks.d/{{ item.name }}.py owner={{ datadog_user }} group={{ datadog_group }} with_items: '{{ datadog_check_agents.keys() }}' From e5154d593d08c8e995572e56271449ceb76c3728 Mon Sep 17 00:00:00 2001 From: Tracey McEvoy Date: Tue, 31 Jan 2017 16:25:47 +0000 Subject: [PATCH 5/8] Adding datadock config checks to datadog role --- files/nutcracker.py | 217 +++++++++++++++++++++++++ files/redis-labs-enterprise-cluster.py | 195 ++++++++++++++++++++++ files/uwsgi.py | 128 +++++++++++++++ tasks/main.yml | 6 +- 4 files changed, 543 insertions(+), 3 deletions(-) create mode 100644 files/nutcracker.py create mode 100644 files/redis-labs-enterprise-cluster.py create mode 100755 files/uwsgi.py diff --git a/files/nutcracker.py b/files/nutcracker.py new file mode 100644 index 00000000..124ac905 --- /dev/null +++ b/files/nutcracker.py @@ -0,0 +1,217 @@ +""" + +To test this, run 'sudo -u dd-agent dd-agent check nutcracker' + +When ready: +- place this file in /etc/dd-agent/checks.d/nutcracker.py +- put the config file in /etc/dd-agent/conf.d/nutcracker.yaml +- service datadog-agent restart +""" 
+ +import hashlib +import json +import md5 +import memcache +import os +import socket +import sys +import time +import uuid + +from checks import AgentCheck + +class NutcrackerCheck(AgentCheck): + SOURCE_TYPE_NAME = 'nutcracker' + SERVICE_CHECK = 'nutcracker.can_connect' + + DEFAULT_HOST = '127.0.0.1' + DEFAULT_PORT = 11211 + DEFAULT_STATS_PORT = 22222 + + # Pool stats. These descriptions are from 'nutcracker --describe-stats' + POOL_STATS = [ + ['curr_connections', 'gauge', None], # Number of current connections + ['total_connections', 'rate', None], # Running total connections made + ['server_ejects', 'rate', None], # times a backend server was ejected + ['client_err', 'rate', None], # errors on client connections + ] + + # Server stats. These descriptions are from 'nutcracker --describe-stats' + SERVER_STATS = [ + ['server_eof', 'rate', None], # eof on server connections + ['server_err', 'rate', None], # errors on server connections + ['server_timedout', 'rate', 'timedout'], # timeouts on server connections + ['server_connections', 'gauge', 'connections'], # active server connections + ['requests', 'rate', None], # requests + ['request_bytes', 'rate', None], # total request bytes + ['responses', 'rate', None], # responses + ['response_bytes', 'rate', None], # total response bytes + ['in_queue', 'gauge', None], # requests in incoming queue + ['in_queue_bytes', 'gauge', None], # current request bytes in incoming queue + ['out_queue', 'gauge', None], # requests in outgoing queue + ['out_queue_bytes', 'gauge', None], # current request bytes in outgoing queue + ] + + def _get_raw_stats(self, host, stats_port): + # Connect + self.log.debug("Connecting to %s:%s", host, stats_port) + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect((host, stats_port)) + + # Read + file = s.makefile('r') + data = file.readline(); + s.close() + + # Load + return json.loads(data); + + def _send_datadog_stat(self, item, data, tag_map, prefix): + # Break out the info + 
stat_key, stat_type, override_name = item + + # Make sure we have a name + if not override_name: + override_name = stat_key + + # Add the prefix if appropriate. + if prefix: + override_name = prefix + "_" + override_name + + try: + # Get the data, make sure it's there. + stat_data = float(data.get(stat_key)) + except: + # Hrm, not there. Let it be zero. + stat_data = 0 + + # Make the datadog metric. + metric = self.normalize(override_name.lower(), self.SOURCE_TYPE_NAME) + + tags = [k+":"+v for k,v in tag_map.iteritems()] + + if stat_type == 'gauge': + self.gauge(metric, stat_data, tags=tags) + return + + if stat_type == 'rate': + metric += "_rate" + self.rate(metric, stat_data, tags=tags) + return + + if stat_type == 'bool': + self.gauge(metric, (1 if stat_data else 0), tags=tags) + return + + raise Exception("Unknown datadog stat type '%s' for key '%s'" % (stat_type, stat_key)) + + def _get_metrics(self, host, port, stats_port, tags, aggregation_key): + try: + raw_stats = self._get_raw_stats(host, stats_port) + except Exception as e: + self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL) + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'get_stats', + 'msg_title': 'Cannot get stats', + 'msg_text': str(e), + 'aggregation_key': aggregation_key + }) + + raise + + + # pprint.pprint(raw_stats) + + # Get all the pool stats + for pool_key, pool_data in raw_stats.iteritems(): + try: + # Pools are not separated from the other keys, blarg. + # Just check if it's a dict with one of the pool keys, if not then skip it. + pool_data['client_connections'] + except: + # Not there, it's not a pool. + self.log.debug(pool_key + ": NOT A POOL"); + continue + + # Start the stat tags. + tags['nutcracker_pool'] = pool_key + + # It's a pool. Process all the non-server stats + for item in self.POOL_STATS: + self._send_datadog_stat(item, pool_data, tags, "pool") + + # Find all the servers. 
+ for server_key, server_data in pool_data.iteritems(): + try: + # Servers are not separated from the other keys, blarg. + # Just check if it's a dict with one of the server keys, if not then skip it. + server_data['in_queue_bytes'] + except: + # Not there, it's not a server. + self.log.debug(server_key + ": NOT A SERVER"); + continue + + # Set the server in the tags. + tags['nutcracker_pool_server'] = server_key + + # It's a server. Send stats. + for item in self.SERVER_STATS: + self._send_datadog_stat(item, server_data, tags, "server") + + # The key for our roundtrip tests. + key = uuid.uuid4().hex + + try: + # Make the connection and do a round trip. + mc = memcache.Client([host+':'+str(port)], debug=0) + + mc.set(key, key) + data = mc.get(key) + mc.delete(key) + empty_data = mc.get(key) + + # Did the get work? + if data != key: + raise Exception("Cannot set and get") + + # Did the delete work? + if empty_data: + raise Exception("Cannot delete") + + except Exception as e: + # Something failed. + metric = self.normalize("test_connect_fail", self.SOURCE_TYPE_NAME) + self.gauge(metric, 1, tags=tags) + + self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL) + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'test_data', + 'msg_title': 'Cannot get/set/delete', + 'msg_text': str(e), + 'aggregation_key': aggregation_key + }) + + raise + + # Connection is ok. + self.service_check(self.SERVICE_CHECK, AgentCheck.OK) + + # Called by datadog as the starting point for this check. 
+ def check(self, instance): + host = instance.get('host', self.DEFAULT_HOST) + port = int(instance.get('port', self.DEFAULT_PORT)) + stats_port = int(instance.get('stats_port', self.DEFAULT_STATS_PORT)) + + tags = {} + for item in instance.get('tags', []): + k, v = item.split(":", 1) + tags[k] = v + + tags["host"] = host + ":" + str(port) + + aggregation_key = hashlib.md5(host+":"+str(port)).hexdigest() + + self._get_metrics(host, port, stats_port, tags, aggregation_key) + diff --git a/files/redis-labs-enterprise-cluster.py b/files/redis-labs-enterprise-cluster.py new file mode 100644 index 00000000..75dd8750 --- /dev/null +++ b/files/redis-labs-enterprise-cluster.py @@ -0,0 +1,195 @@ +""" +To test this, run: +'sudo -u dd-agent dd-agent check redis-labs-enterprise-cluster' + +When ready: +- place this file in /etc/dd-agent/checks.d/redis-labs-enterprise-cluster.py +- put the config in /etc/dd-agent/conf.d/redis-labs-enterprise-cluster.yaml +- service datadog-agent restart +""" + +import base64 +import json +import ssl +import socket +import time +import urllib2 + +from checks import AgentCheck + +GIG = 1024 * 1024 * 1024 + + +class RedisLabsEnterpriseClusterCheck(AgentCheck): + SOURCE_TYPE_NAME = 'rlec' + SERVICE_CHECK = 'rlec.can_connect' + + DEFAULT_HOST = '127.0.0.1' + DEFAULT_PORT = 9443 + + stats_endpoints = [ + 'bdbs', + 'nodes', + 'bdbs/stats/last', + 'shards/stats/last', + 'nodes/stats/last', + 'cluster/stats/last', + # 'cluster/actions', + ] + + def metric_name(self, metric): + return self.normalize(metric.lower(), self.SOURCE_TYPE_NAME) + + def _get_raw_stats(self, host, port, username, password): + data = {} + for endpoint in self.stats_endpoints: + url = 'https://%s:%s/v1/%s' % (host, port, endpoint) + + req = urllib2.Request(url) + + base64string = base64.encodestring('%s:%s' % (username, password)) + base64string = base64string[:-1] + req.add_header("Authorization", "Basic %s" % base64string) + + context = ssl._create_unverified_context() + response 
= urllib2.urlopen(req, context=context) + + data[endpoint] = json.loads(response.read()) + + return data + + def gauge(self, metric, value, *args, **kwargs): + metric = self.metric_name(metric) + return super(RedisLabsEnterpriseClusterCheck, self).gauge( + metric, value, *args, **kwargs + ) + + def _get_metrics_dbs(self, raw_stats): + bdb_stats_map = raw_stats['bdbs/stats/last'] + for item in raw_stats['bdbs']: + uid = item['uid'] + str_uid = str(uid) + bdb_stats = bdb_stats_map[str_uid] + + name = item['name'] + + tags = [ + 'db_name:%s' % name + ] + + mem_gigs = int(item['memory_size']) / GIG + used_gigs = int(bdb_stats['used_memory']) / GIG + + self.gauge('db.total_size_in_gigs', mem_gigs, tags=tags) + self.gauge('db.num_shards', item['shards_count'], tags=tags) + self.gauge('db.read_hits', bdb_stats['read_hits'], tags=tags) + self.gauge('db.read_misses', bdb_stats['read_misses'], tags=tags) + self.gauge('db.write_hits', bdb_stats['write_hits'], tags=tags) + self.gauge('db.write_misses', bdb_stats['write_misses'], tags=tags) + self.gauge('db.num_connections', bdb_stats['conns'], tags=tags) + self.gauge('db.num_keys', bdb_stats['no_of_keys'], tags=tags) + self.gauge('db.bytes_added', bdb_stats['ingress_bytes'], tags=tags) + self.gauge('db.bytes_read', bdb_stats['egress_bytes'], tags=tags) + self.gauge('db.count_evicted', bdb_stats['evicted_objects'], + tags=tags) + self.gauge('db.count_expired', bdb_stats['expired_objects'], + tags=tags) + self.gauge('db.ops_per_sec', + bdb_stats['instantaneous_ops_per_sec'], tags=tags) + self.gauge('db.used_memory_in_gigs', used_gigs, tags=tags) + + def _get_metrics_nodes(self, raw_stats): + node_stats_map = raw_stats['nodes/stats/last'] + for node in raw_stats['nodes']: + # Get the node ip. Don't send these stats if it doesn't match + # the node we're on. 
+ ip = node['addr'] + if ip != socket.gethostbyname(socket.gethostname()): + continue + + uid = node['uid'] + node_stats = node_stats_map[str(uid)] + + tags = [ + 'node_ip:%s' % ip, + 'node_uid:%s' % uid + ] + + is_active = 1 if (node['status'] == 'active') else 0 + ephemeral_gigs = int(node_stats['ephemeral_storage_free']) / GIG + persistent_gigs = int(node_stats['persistent_storage_free']) / GIG + memory_gigs = int(node_stats['free_memory']) / GIG + + self.gauge('node.shard_count', node['shard_count'], tags=tags) + self.gauge('node.active', is_active, tags=tags) + self.gauge('node.connections', node_stats['conns'], tags=tags) + self.gauge('node.aof_rewrites', node_stats['cur_aof_rewrites'], + tags=tags) + self.gauge('node.ephemeral_free_space_gigs', ephemeral_gigs, + tags=tags) + self.gauge('node.persistent_free_space_gigs', persistent_gigs, + tags=tags) + self.gauge('node.free_memory_gigs', memory_gigs, tags=tags) + self.gauge('node.requests', node_stats['total_req'], tags=tags) + + def _get_metrics_shards(self, raw_stats): + """ + At this time it looks like this isn't useful info + + shards_stats_map = raw_stats['shards/stats/last'] + """ + + def _get_metrics_cluster(self, raw_stats): + stats = raw_stats['cluster/stats/last'] + + tags = [] + + ephemeral_gigs = int(stats['ephemeral_storage_free']) / GIG + persistent_gigs = int(stats['persistent_storage_free']) / GIG + memory_gigs = int(stats['free_memory']) / GIG + + self.gauge('cluster.connections', stats['conns'], tags=tags) + self.gauge('cluster.ephemeral_free_space_gigs', ephemeral_gigs, + tags=tags) + self.gauge('cluster.persistent_free_space_gigs', persistent_gigs, + tags=tags) + self.gauge('cluster.free_memory_gigs', memory_gigs, tags=tags) + self.gauge('cluster.requests', stats['total_req'], tags=tags) + self.gauge('cluster.bytes_added', stats['ingress_bytes'], tags=tags) + self.gauge('cluster.bytes_read', stats['egress_bytes'], tags=tags) + self.gauge('cluster.cpu_idle', stats['cpu_idle'], tags=tags) + 
+ def _get_metrics(self, host, port, username, password, tags): + try: + raw_stats = self._get_raw_stats(host, port, username, password) + except Exception: + self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL) + self.increment(self.metric_name('node.get_stats.failure'), 1, tags = [ + 'host:%s' % host, + ]) + + raise + + # Send all stats + self._get_metrics_dbs(raw_stats) + self._get_metrics_nodes(raw_stats) + self._get_metrics_shards(raw_stats) + self._get_metrics_cluster(raw_stats) + + # Connection is ok. + self.service_check(self.SERVICE_CHECK, AgentCheck.OK) + + # Called by datadog as the starting point for this check. + def check(self, instance): + host = instance.get('host', self.DEFAULT_HOST) + port = int(instance.get('port', self.DEFAULT_PORT)) + username = instance['username'] + password = instance['password'] + + tags = {} + for item in instance.get('tags', []): + k, v = item.split(":", 1) + tags[k] = v + + self._get_metrics(host, port, username, password, tags) + diff --git a/files/uwsgi.py b/files/uwsgi.py new file mode 100755 index 00000000..81d698f3 --- /dev/null +++ b/files/uwsgi.py @@ -0,0 +1,128 @@ +""" + +To test this, run 'sudo -u dd-agent dd-agent check uwsgi' + +When ready: +- place this file in /etc/dd-agent/checks.d/uwsgi.py +- put the config file in /etc/dd-agent/conf.d/uwsgi.yaml +- service datadog-agent restart +""" + +import hashlib +import glob +import json +import os +import socket +from stat import ST_CTIME +import time + +from checks import AgentCheck + + +class UwsgiCheck(AgentCheck): + SOURCE_TYPE_NAME = 'uwsgi' + SERVICE_CHECK = 'uwsgi.can_connect' + + def metric_name(self, metric): + return self.normalize(metric.lower(), self.SOURCE_TYPE_NAME) + + def gauge(self, metric, value, *args, **kwargs): + metric = self.metric_name(metric) + return super(UwsgiCheck, self).gauge( + metric, value, *args, **kwargs + ) + + def histogram(self, metric, value, *args, **kwargs): + metric = self.metric_name(metric) + return 
super(UwsgiCheck, self).histogram( + metric, value, *args, **kwargs + ) + + def _get_raw_stats(self): + chosen_socket = None + latest_ctime = 0 + files = glob.glob('/tmp/uwsgi_stats_*.socket') + for fname in files: + stats = os.stat(fname) + + if stats[ST_CTIME] > latest_ctime: + latest_ctime = stats[ST_CTIME] + chosen_socket = fname + + if not chosen_socket: + raise RuntimeError("Cannot find uwsgi stats socket file") + + sock_obj = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock_obj.connect(chosen_socket) + + json_str = '' + while True: + data = sock_obj.recv(4096) + if len(data) < 1: + break + json_str += data.decode('utf8') + + return json.loads(json_str) + + def _send_stats(self, data): + self._send_stats_workers(data) + + def _send_stats_workers(self, data): + code_dir = data['cwd'] + master_pid = data['pid'] + global_tags = [ + 'code_dir:%s' % code_dir, + 'master_pid:%s' % master_pid, + ] + + self.gauge('listen_queue', data['listen_queue'], tags=global_tags) + self.gauge('listen_queue_errors', data['listen_queue_errors'], tags=global_tags) + + for worker in data['workers']: + worker_tags = [ + 'pid:%s' % worker['pid'], + 'worker_id:%s' % worker['id'], + ] + tags = worker_tags + global_tags + + self.gauge('worker.accepting', worker['accepting'], tags=tags) + self.gauge('worker.status.%s' % worker['status'], 1, tags=tags) + self.gauge('worker.running_time', worker['running_time'], + tags=tags) + self.gauge('worker.data_transmitted', worker['tx'], tags=tags) + self.gauge('worker.address_space', worker['vsz'], tags=tags) + self.gauge('worker.rss_memory', worker['rss'], tags=tags) + self.gauge('worker.respawn_count', worker['respawn_count'], + tags=tags) + self.gauge('worker.exceptions_count', worker['exceptions'], + tags=tags) + self.gauge('worker.harakiri_count', worker['harakiri_count'], + tags=tags) + self.histogram('worker.avg_response_time_ms', + worker['avg_rt']/1000, tags=tags) + + def _get_metrics(self, aggregation_key): + try: + raw_stats = 
self._get_raw_stats() + except Exception as e: + self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL) + self.event({ + 'timestamp': int(time.time()), + 'event_type': 'get_stats', + 'msg_title': 'Cannot get stats', + 'msg_text': str(e), + 'aggregation_key': aggregation_key + }) + + raise + + self._send_stats(raw_stats) + + # Connection is ok. + self.service_check(self.SERVICE_CHECK, AgentCheck.OK) + + # Called by datadog as the starting point for this check. + def check(self, instance): + aggregation_key = hashlib.md5().hexdigest() + self._get_metrics(aggregation_key) + diff --git a/tasks/main.yml b/tasks/main.yml index 8a345e73..3408bc66 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -39,10 +39,10 @@ - name: Create a check agent for each Datadog check copy: - src={{ item.src }}/{{ item.name }} - dest=/etc/dd-agent/checks.d/{{ item.name }} + src={{ item.name }}.py + dest=/etc/dd-agent/checks.d/{{ item.name }}.py owner={{ datadog_user }} group={{ datadog_group }} - with_items: '{{ datadog_check_agents.keys() }}' + with_items: '{{ datadog_check_agents }}' notify: - restart datadog-agent From df9fe9a0bb2b839b77d2310ffdce89f89d19a770 Mon Sep 17 00:00:00 2001 From: Tracey McEvoy Date: Fri, 3 Feb 2017 11:36:00 +0000 Subject: [PATCH 6/8] Adding hadoop and mysql steps to datadog --- tasks/main.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tasks/main.yml b/tasks/main.yml index b3cdcaa8..eebd1eac 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -46,3 +46,18 @@ with_items: '{{ datadog_check_agents }}' notify: - restart datadog-agent + +- name: Upgrade snakebite version + pip: + name: snakebite + version: {{ datadog_snakebite_version }} + executable: /opt/datadog-agent/embedded/bin/pip + state: present + when: hadoop_server_monitoring is defined and hadoop_server_monitoring + notify: restart datadog-agent + +- mysql_db: name=datadog state=present + when: data_server_monitoring is defined and data_server_monitoring + +- mysql_user: 
name=datadog password="{{ datadog_mysql_data_password }}" priv=datadog.*:ALL state=present
+  when: data_server_monitoring is defined and data_server_monitoring
\ No newline at end of file

From 903cd04c2576ed73144925275e7695d4a49bd9b6 Mon Sep 17 00:00:00 2001
From: Tracey McEvoy
Date: Fri, 3 Feb 2017 12:09:04 +0000
Subject: [PATCH 7/8] Fixing snakebite issue

---
 tasks/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/main.yml b/tasks/main.yml
index eebd1eac..3a7de871 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -50,7 +50,7 @@
 - name: Upgrade snakebite version
   pip:
     name: snakebite
-    version: {{ datadog_snakebite_version }}
+    version: "{{ datadog_snakebite_version }}"
     executable: /opt/datadog-agent/embedded/bin/pip
     state: present
   when: hadoop_server_monitoring is defined and hadoop_server_monitoring

From 0bd54a22a4948f9a4566546ccc039a7f1d517d16 Mon Sep 17 00:00:00 2001
From: Scott Estes
Date: Thu, 9 Feb 2017 10:19:25 +0000
Subject: [PATCH 8/8] Updating repo to default to disabled

---
 templates/datadog.repo.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/datadog.repo.j2 b/templates/datadog.repo.j2
index d4f63f82..8df0ed32 100644
--- a/templates/datadog.repo.j2
+++ b/templates/datadog.repo.j2
@@ -1,6 +1,6 @@
 [datadog]
 name = Datadog, Inc.
 baseurl = https://yum.datadoghq.com/rpm/{{ ansible_userspace_architecture }}/
-enabled=1
+enabled=0
 gpgcheck=1
 gpgkey=https://yum.datadoghq.com/DATADOG_RPM_KEY.public