diff --git a/discovery-provider/src/queries/get_health.py b/discovery-provider/src/queries/get_health.py index d56ba8b060a..73b52ac3ccd 100644 --- a/discovery-provider/src/queries/get_health.py +++ b/discovery-provider/src/queries/get_health.py @@ -406,12 +406,16 @@ def get_elasticsearch_health_info( def health_check_prometheus_exporter(): health_results, is_unhealthy = get_health({}) - PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_LATEST).save( - health_results["block_difference"] - ) + # store all top-level keys with numerical values + for key, value in health_results.items(): + if isinstance(value, (int, float)): + PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK).save( + value, {"key": key} + ) - PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK_INDEXED_BLOCK_NUM_LATEST).save( - health_results["web"]["blocknumber"] + # store a non-top-level key + PrometheusMetric(PrometheusMetricNames.HEALTH_CHECK).save( + health_results["web"]["blocknumber"], {"key": "blocknumber"} ) diff --git a/discovery-provider/src/utils/prometheus_metric.py b/discovery-provider/src/utils/prometheus_metric.py index 3a14f42d62a..5872cff8e0f 100644 --- a/discovery-provider/src/utils/prometheus_metric.py +++ b/discovery-provider/src/utils/prometheus_metric.py @@ -3,7 +3,7 @@ from time import time from typing import Callable, Dict -from prometheus_client import Gauge, Histogram +from prometheus_client import Gauge, Histogram, Summary logger = logging.getLogger(__name__) @@ -99,8 +99,7 @@ class PrometheusMetricNames: CELERY_TASK_DURATION_SECONDS = "celery_task_duration_seconds" CELERY_TASK_LAST_DURATION_SECONDS = "celery_task_last_duration_seconds" FLASK_ROUTE_DURATION_SECONDS = "flask_route_duration_seconds" - HEALTH_CHECK_BLOCK_DIFFERENCE_LATEST = "health_check_block_difference_latest" - HEALTH_CHECK_INDEXED_BLOCK_NUM_LATEST = "health_check_indexed_block_num_latest" + HEALTH_CHECK = "health_check" INDEX_BLOCKS_DURATION_SECONDS = "index_blocks_duration_seconds" INDEX_METRICS_DURATION_SECONDS = "index_metrics_duration_seconds" INDEX_TRENDING_DURATION_SECONDS = "index_trending_duration_seconds" @@ -128,6 +127,8 @@ class PrometheusMetricNames: * When looking at the raw /prometheus_metrics endpoint for `audius_dn_update_aggregate_table_latency_seconds_bucket`, you can see how a single metric explodes into multiple statistical helpers. +* Prometheus Summaries: Prometheus Summaries will export a single metric across all pids + which is useful for point-in-time collection. Labels: @@ -149,6 +150,7 @@ class PrometheusMetricNames: f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_ACTIVE_DURATION_SECONDS}", "How long the currently running celery task has been running", ("task_name",), + multiprocess_mode="liveall", ), PrometheusMetricNames.CELERY_TASK_DURATION_SECONDS: Histogram( f"{METRIC_PREFIX}_{PrometheusMetricNames.CELERY_TASK_DURATION_SECONDS}", @@ -174,13 +176,11 @@ class PrometheusMetricNames: "route", ), ), - PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_LATEST: Gauge( - f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_BLOCK_DIFFERENCE_LATEST}", - "Difference between the latest block and the latest indexed block", - ), - PrometheusMetricNames.HEALTH_CHECK_INDEXED_BLOCK_NUM_LATEST: Gauge( - f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK_INDEXED_BLOCK_NUM_LATEST}", - "Latest indexed block number", + PrometheusMetricNames.HEALTH_CHECK: Gauge( + f"{METRIC_PREFIX}_{PrometheusMetricNames.HEALTH_CHECK}", + "Metrics extracted from our health-checks, using similar keys.", + ("key",), + multiprocess_mode="liveall", ), PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS: Histogram( f"{METRIC_PREFIX}_{PrometheusMetricNames.INDEX_BLOCKS_DURATION_SECONDS}", @@ -271,6 +271,8 @@ def save(self, value, labels=None): this_metric.observe(value, labels) elif isinstance(this_metric, Gauge): this_metric.set(value) + elif isinstance(this_metric, Summary): + this_metric.observe(value) @classmethod def register_collector(cls, name, collector_func):