[postgres] Collect StatIO per table & revamp test #1415

Merged: 9 commits, Apr 7, 2015
97 changes: 68 additions & 29 deletions checks.d/postgres.py
@@ -177,7 +177,7 @@ class PostgreSql(AgentCheck):
('schemaname', 'schema')
],
'metrics': {
'pg_stat_user_tables': ('postgresql.total_tables', GAUGE),
'pg_stat_user_tables': ('postgresql.table.count', GAUGE),
Contributor:

This might break existing users' screens, alerts, etc. Renaming a metric that's out there could be considered a breaking change.

Member Author:

Agreed. I checked and no such screens and monitors exist right now. We can recheck before we release.

Contributor:

👍

Member:

@alq666 @miketheman should have added a note.
That's normal b/c this is a new metric post-5.2, so it hasn't been released to the crowds yet. That's why I had no mercy renaming it.
See: bee8eec

Contributor:

Notes are good! :) Thanks for the explanation @LeoCavaille.

},
'relation': False,
'query': """
@@ -218,6 +218,28 @@ class PostgreSql(AgentCheck):
"""
}

STATIO_METRICS = {
'descriptors': [
('relname', 'table'),
],
'metrics': {
'heap_blks_read' : ('postgresql.heap_blocks_read', RATE),
'heap_blks_hit' : ('postgresql.heap_blocks_hit', RATE),
'idx_blks_read' : ('postgresql.index_blocks_read', RATE),
'idx_blks_hit' : ('postgresql.index_blocks_hit', RATE),
'toast_blks_read' : ('postgresql.toast_blocks_read', RATE),
'toast_blks_hit' : ('postgresql.toast_blocks_hit', RATE),
'tidx_blks_read' : ('postgresql.toast_index_blocks_read', RATE),
'tidx_blks_hit' : ('postgresql.toast_index_blocks_hit', RATE),
},
'query': """
SELECT relname,
%s
FROM pg_statio_user_tables
WHERE relname = ANY(%s)""",
'relation': True,
}

def __init__(self, name, init_config, agentConfig, instances=None):
AgentCheck.__init__(self, name, init_config, agentConfig, instances)
self.dbs = {}
@@ -227,6 +249,7 @@ def __init__(self, name, init_config, agentConfig, instances=None):
self.db_instance_metrics = []
self.db_bgw_metrics = []
self.replication_metrics = {}
self.custom_metrics = {}

def _get_version(self, key, db):
if key not in self.versions:
@@ -263,20 +286,18 @@ def _get_instance_metrics(self, key, db):
metrics = self.instance_metrics.get(key)

if metrics is None:

# Hack to make sure that if we have multiple instances that connect to
# the same host, port, we don't collect metrics twice
# as it will result in https://github.com/DataDog/dd-agent/issues/1211
sub_key = key[:2]
if sub_key in self.db_instance_metrics:
self.instance_metrics[key] = {}
self.log.debug("Not collecting instance metrics for key: {0} as"\
self.instance_metrics[key] = None
self.log.debug("Not collecting instance metrics for key: {0} as"
" they are already collected by another instance".format(key))
return {}
return None

self.db_instance_metrics.append(sub_key)


if self._is_9_2_or_above(key, db):
self.instance_metrics[key] = dict(self.COMMON_METRICS, **self.NEWER_92_METRICS)
else:
@@ -293,16 +314,15 @@ def _get_bgw_metrics(self, key, db):
metrics = self.bgw_metrics.get(key)

if metrics is None:

# Hack to make sure that if we have multiple instances that connect to
# the same host, port, we don't collect metrics twice
# as it will result in https://github.com/DataDog/dd-agent/issues/1211
sub_key = key[:2]
if sub_key in self.db_bgw_metrics:
self.bgw_metrics[key] = {}
self.log.debug("Not collecting bgw metrics for key: {0} as"\
self.bgw_metrics[key] = None
self.log.debug("Not collecting bgw metrics for key: {0} as"
" they are already collected by another instance".format(key))
return {}
return None

self.db_bgw_metrics.append(sub_key)
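
To make the per-server dedup above concrete, here is a small hypothetical illustration (the tuples mirror the (host, port, dbname) key built in check(); the port and database names are taken from the CI setup below, the rest is invented):

    # Two instances that point at the same server but different databases share
    # the (host, port) sub_key, so instance-wide and bgw metrics are collected
    # only for whichever instance runs first; the other one now gets None back.
    key_a = ('localhost', 15432, 'datadog_test')
    key_b = ('localhost', 15432, 'dogs')
    assert key_a[:2] == key_b[:2]  # same sub_key -> second instance skips these metrics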

@@ -334,26 +354,38 @@ def _collect_stats(self, key, db, instance_tags, relations, custom_metrics):
If custom_metrics is not an empty list, gather custom metrics defined in postgres.yaml
"""

self.DB_METRICS['metrics'] = self._get_instance_metrics(key, db)
self.BGW_METRICS['metrics'] = self._get_bgw_metrics(key, db)
metric_scope = [
self.DB_METRICS,
self.CONNECTION_METRICS,
self.BGW_METRICS,
self.LOCK_METRICS,
self.COUNT_METRICS
self.COUNT_METRICS,
]

# These are added only once per PG server, thus the test
db_instance_metrics = self._get_instance_metrics(key, db)
bgw_instance_metrics = self._get_bgw_metrics(key, db)

if db_instance_metrics is not None:
# FIXME: constants shouldn't be modified
self.DB_METRICS['metrics'] = db_instance_metrics
Contributor:

It makes me sad to see what should be a constant being mutated :( We should clean that up in a next version of this check.

metric_scope.append(self.DB_METRICS)

if bgw_instance_metrics is not None:
# FIXME: constants shouldn't be modified
self.BGW_METRICS['metrics'] = bgw_instance_metrics
metric_scope.append(self.BGW_METRICS)

# Do we need relation-specific metrics?
if relations:
metric_scope += [
self.REL_METRICS,
self.IDX_METRICS,
self.SIZE_METRICS
self.SIZE_METRICS,
self.STATIO_METRICS
]

replication_metrics = self._get_replication_metrics(key, db)
if replication_metrics is not None:
# FIXME: constants shouldn't be modified
self.REPLICATION_METRICS['metrics'] = replication_metrics
metric_scope.append(self.REPLICATION_METRICS)

@@ -364,10 +396,8 @@ def _collect_stats(self, key, db, instance_tags, relations, custom_metrics):
for scope in full_metric_scope:
if scope == self.REPLICATION_METRICS or not self._is_above(key, db, [9,0,0]):
log_func = self.log.debug
warning_func = self.log.debug
else:
log_func = self.log.warning
warning_func = self.warning

# build query
cols = scope['metrics'].keys() # list of metrics to query, in some order
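
The rest of the query-building and execution step is collapsed in this view; a rough sketch of what it presumably does (assuming psycopg2-style parameter binding, with cursor, scope, cols and relations being the surrounding locals; not the check's verbatim code):

    if scope['relation'] and len(relations) > 0:
        # Keep the second placeholder for the driver: psycopg2 adapts the Python
        # list to an array, so "relname = ANY(%s)" matches the monitored relations.
        query = scope['query'] % (", ".join(cols), '%s')
        cursor.execute(query, (relations,))
    else:
        query = scope['query'] % (", ".join(cols))
        cursor.execute(query)
    results = cursor.fetchall()
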
@@ -394,10 +424,11 @@ def _collect_stats(self, key, db, instance_tags, relations, custom_metrics):

if scope in custom_metrics and len(results) > MAX_CUSTOM_RESULTS:
self.warning(
"Query: {0} returned more than {1} results ({2})Truncating").format(
"Query: {0} returned more than {1} results ({2}). Truncating").format(
query, MAX_CUSTOM_RESULTS, len(results))
results = results[:MAX_CUSTOM_RESULTS]

# FIXME this cramps my style
if scope == self.DB_METRICS:
self.gauge("postgresql.db.count", len(results),
tags=[t for t in instance_tags if not t.startswith("db:")])
@@ -482,7 +513,12 @@ def get_connection(self, key, host, port, user, password, dbname, use_cached=Tru
self.dbs[key] = connection
return connection

def _process_customer_metrics(self,custom_metrics):
def _get_custom_metrics(self, custom_metrics, key):
# Pre-processed cached custom_metrics
if key in self.custom_metrics:
return self.custom_metrics[key]

# Otherwise pre-process custom metrics and verify definition
required_parameters = ("descriptors", "metrics", "query", "relation")

for m in custom_metrics:
@@ -493,14 +529,17 @@ def _process_customer_metrics(self,custom_metrics):

self.log.debug("Metric: {0}".format(m))

for k, v in m['metrics'].items():
if v[1].upper() not in ['RATE','GAUGE','MONOTONIC']:
raise CheckException("Collector method {0} is not known."\
"Known methods are RATE,GAUGE,MONOTONIC".format(
v[1].upper()))
for ref, (_, mtype) in m['metrics'].iteritems():
cap_mtype = mtype.upper()
if cap_mtype not in ('RATE', 'GAUGE', 'MONOTONIC'):
raise CheckException("Collector method {0} is not known."
" Known methods are RATE, GAUGE, MONOTONIC".format(cap_mtype))

m['metrics'][k][1] = getattr(PostgreSql, v[1].upper())
self.log.debug("Method: %s" % (str(v[1])))
m['metrics'][ref][1] = getattr(PostgreSql, cap_mtype)
self.log.debug("Method: %s" % (str(mtype)))

self.custom_metrics[key] = custom_metrics
return custom_metrics
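
For illustration, a hypothetical custom_metrics entry (shown as the Python structure it becomes once postgres.yaml is parsed) that would pass the validation above; the metric name and query are invented, not part of this PR:

    custom_metrics = [{
        'descriptors': [['relname', 'table']],   # column -> tag name
        'metrics': {
            # column -> [metric name, collection method]; pre-processing replaces
            # the method string with the bound AgentCheck method
            'n_live_tup': ['postgresql.custom.live_rows', 'GAUGE'],
        },
        'query': "SELECT relname, %s FROM pg_stat_user_tables WHERE relname = ANY(%s)",
        'relation': True,
    }]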

def check(self, instance):
host = instance.get('host', '')
@@ -510,8 +549,6 @@ def check(self, instance):
tags = instance.get('tags', [])
dbname = instance.get('dbname', None)
relations = instance.get('relations', [])
custom_metrics = instance.get('custom_metrics') or []
self._process_customer_metrics(custom_metrics)

if relations and not dbname:
self.warning('"dbname" parameter must be set when using the "relations" parameter.')
@@ -521,6 +558,8 @@

key = (host, port, dbname)

custom_metrics = self._get_custom_metrics(instance.get('custom_metrics', []), key)

# Clean up tags in case there was a None entry in the instance
# e.g. if the yaml contains tags: but no actual tags
if tags is None:
21 changes: 4 additions & 17 deletions ci/postgres.rb
@@ -46,26 +46,13 @@ def pg_rootdir
sleep_for 5
sh %(#{pg_rootdir}/bin/psql\
-p 15432 -U $USER\
-c "CREATE USER datadog WITH PASSWORD 'datadog'"\
postgres)
postgres < $TRAVIS_BUILD_DIR/ci/resources/postgres/postgres.sql)
sh %(#{pg_rootdir}/bin/psql\
-p 15432 -U $USER\
-c "GRANT SELECT ON pg_stat_database TO datadog"\
postgres)
sh %(#{pg_rootdir}/bin/psql\
-p 15432 -U $USER\
-c "CREATE DATABASE datadog_test"\
postgres)
sh %(#{pg_rootdir}/bin/psql\
-p 15432 -U $USER\
-c "GRANT ALL PRIVILEGES ON DATABASE datadog_test TO datadog"\
postgres)
-p 15432 -U datadog\
datadog_test < $TRAVIS_BUILD_DIR/ci/resources/postgres/datadog_test.sql)
sh %(#{pg_rootdir}/bin/psql\
-p 15432 -U datadog\
-c "CREATE TABLE persons (personid INT, lastname VARCHAR(255), firstname VARCHAR(255), address VARCHAR(255), city VARCHAR(255))"\
datadog_test)
# For pg_stat_user_table to return stuff
sleep_for 5
dogs < $TRAVIS_BUILD_DIR/ci/resources/postgres/dogs.sql)
end

task :script => ['ci:common:script'] do
5 changes: 5 additions & 0 deletions ci/resources/postgres/datadog_test.sql
@@ -0,0 +1,5 @@
CREATE TABLE persons (personid SERIAL, lastname VARCHAR(255), firstname VARCHAR(255), address VARCHAR(255), city VARCHAR(255));
INSERT INTO persons (lastname, firstname, address, city) VALUES ('Cavaille', 'Leo', 'Midtown', 'New York'), ('Someveryveryveryveryveryveryveryveryveryverylongname', 'something', 'Avenue des Champs Elysees', 'Beautiful city of lights');
SELECT * FROM persons;
SELECT * FROM persons;
SELECT * FROM persons;
7 changes: 7 additions & 0 deletions ci/resources/postgres/dogs.sql
@@ -0,0 +1,7 @@
CREATE TABLE breed (id SERIAL, name VARCHAR(255));
CREATE TABLE kennel (id SERIAL, address VARCHAR(255));
INSERT INTO kennel (address) VALUES ('Midtown, New York'), ('Boston');
SELECT * FROM kennel;
CREATE INDEX breed_names ON breed(name);
INSERT INTO breed (name) VALUES ('Labrador Retriver'), ('German Shepherd'), ('Yorkshire Terrier'), ('Golden Retriever'), ('Bulldog');
SELECT * FROM breed WHERE name = 'Labrador';
5 changes: 5 additions & 0 deletions ci/resources/postgres/postgres.sql
@@ -0,0 +1,5 @@
CREATE USER datadog WITH PASSWORD 'datadog';
GRANT SELECT ON pg_stat_database TO datadog;
CREATE DATABASE datadog_test;
GRANT ALL PRIVILEGES ON DATABASE datadog_test TO datadog;
CREATE DATABASE dogs;
6 changes: 4 additions & 2 deletions tests/common.py
@@ -285,7 +285,7 @@ def assertMetricTagPrefix(self, metric_name, tag_prefix, count=None, at_least=1)
log.debug("Looking for a tag starting with `{0}:` on metric {1}".format(tag_prefix, metric_name))
if count is not None:
log.debug(" * should have exactly {0} data points".format(count))
if at_least is not None:
elif at_least is not None:
log.debug(" * should have at least {0} data points".format(at_least))

candidates = []
@@ -314,7 +314,7 @@ def assertMetricTag(self, metric_name, tag, count=None, at_least=1):
log.debug("Looking for tag {0} on metric {1}".format(tag, metric_name))
if count is not None:
log.debug(" * should have exactly {0} data points".format(count))
if at_least is not None:
elif at_least is not None:
log.debug(" * should have at least {0} data points".format(at_least))

candidates = []
@@ -348,6 +348,8 @@ def assertServiceCheck(self, service_check_name, status=None, tags=None,
log.debug(" * tagged with {0}".format(tags))
if count is not None:
log.debug(" * should have exactly {0} statuses".format(count))
elif at_least is not None:
log.debug(" * should have at least {0} statuses".format(count))
candidates = []
for sc in self.service_checks:
if sc['check'] == service_check_name: