57 changes: 57 additions & 0 deletions docker/telemetry/alertmanager/alertmanager.yml
@@ -0,0 +1,57 @@
global:
resolve_timeout: 5m
# The files from which notification templates are read.
templates:
- '/etc/alertmanager/template/*.tmpl'

# The root route on which each incoming alert enters.
route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
#
# To aggregate by all possible labels use '...' as the sole label name.
# This effectively disables aggregation entirely, passing through all
# alerts as-is. This is unlikely to be what you want, unless you have
# a very low alert volume or your upstream notification system performs
# its own grouping. Example: group_by: [...]
group_by: ['alertname', 'job', 'instance']

# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' before sending the initial notification.
# This ensures that alerts for the same group that start firing shortly
# after one another are batched together in the first notification.
group_wait: 30s

# After the first notification has been sent, wait 'group_interval' before sending
# a batch of new alerts that started firing for that group.
group_interval: 5m

# If an alert has already been sent successfully, wait 'repeat_interval'
# before resending it.
repeat_interval: 3h

# A default receiver
receiver: webhook_receiver

# Inhibition rules allow muting a set of alerts while another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_matchers: [severity="critical"]
target_matchers: [severity="warning"]
# Apply inhibition if the alertname is the same.
# CAUTION:
# If all label names listed in `equal` are missing
# from both the source and target alerts,
# the inhibition rule will apply!
equal: [alertname, job, instance]


receivers:
- name: 'webhook_receiver'
webhook_configs:
- url: '${your_webhook_url}'
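
# Reviewer sketch, not part of this PR: if critical alerts should reach a
# separate endpoint (for example an on-call pager), a child route keyed on
# severity could be nested under `route:`. The receiver name 'pager_webhook'
# and its URL variable are placeholders, not settings that exist in this repo.
#
# route:
#   receiver: webhook_receiver
#   routes:
#     - matchers:
#         - severity="critical"
#       receiver: pager_webhook
#       repeat_interval: 1h
# receivers:
#   - name: 'pager_webhook'
#     webhook_configs:
#       - url: '${your_pager_webhook_url}'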

13 changes: 13 additions & 0 deletions docker/telemetry/docker-compose.yaml
@@ -29,11 +29,24 @@ services:
- "--enable-feature=otlp-write-receiver"
volumes:
- ./prometheus/prometheus.yml:/prometheus/prometheus.yml
- ./prometheus/rules:/prometheus/rules
- ${DATA_PATH}/prometheus/data:/prometheus
depends_on:
- otel-collector
extra_hosts:
- "host.docker.internal:host-gateway"
alertmanager:
image: prom/alertmanager
ports:
- "9087:9087"
volumes:
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
- ${DATA_PATH}/alertmanager/data:/alertmanager
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--storage.path=/alertmanager"
- "--web.listen-address=:9087"
extra_hosts:
- "host.docker.internal:host-gateway"
otel-collector:
image: otel/opentelemetry-collector-contrib
volumes:
12 changes: 5 additions & 7 deletions docker/telemetry/prometheus/prometheus.yml
@@ -1,20 +1,18 @@
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
scrape_interval: 30s # Set the scrape interval to every 30 seconds. Default is every 1 minute.
evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
- targets: ["host.docker.internal:9087"]

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "/prometheus/rules/alert_rules_template.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
@@ -33,7 +31,7 @@ scrape_configs:
group: 'prometheus'

- job_name: "kafka"
scrape_interval: 5s
scrape_interval: 30s
honor_labels: true
static_configs:
- targets: ["host.docker.internal:8890"]
122 changes: 122 additions & 0 deletions docker/telemetry/prometheus/rules/alert_rules_template.yml
@@ -0,0 +1,122 @@
# This is the alert rules template for AutoMQ. Please adjust the alert thresholds and evaluation periods to your needs
# before applying it to your production environment.
groups:
- name: kafka_alerts
rules:
- alert: ActiveControllerCount
expr: sum(kafka_controller_active_count) by (job) != 1
for: 1m
labels:
severity: critical
annotations:
summary: "Illegal kafka active controller number for cluster {{ $labels.job }}"
description: "Current number of active controller is {{ $value }}"

- alert: KafkaClusterHighBytesInPerSec
expr: sum(rate(kafka_broker_network_io_bytes_total{direction="in"}[1m])) by (job) > 50 * 1024 * 1024
for: 1m
labels:
severity: warning
annotations:
summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}"
description: "The number of bytes per second received by Kafka cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaClusterHighBytesOutPerSec
expr: sum(rate(kafka_broker_network_io_bytes_total{direction="out"}[1m])) by (job) > 50 * 1024 * 1024
for: 1m
labels:
severity: warning
annotations:
summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}"
description: "The number of bytes per second fetched from Kafka cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaBrokerHighBytesInPerSec
expr: rate(kafka_broker_network_io_bytes_total{direction="in"}[1m]) > 20 * 1024 * 1024
for: 1m
labels:
severity: warning
annotations:
summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
description: "The number of bytes per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaBrokerHighBytesOutPerSec
expr: rate(kafka_broker_network_io_bytes_total{direction="out"}[1m]) > 20 * 1024 * 1024
for: 1m
labels:
severity: warning
annotations:
summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
description: "The number of bytes per second fetched from Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaBrokerHighProduceRequestRate
expr: sum(rate(kafka_request_count_total{type="Produce"}[1m])) by (job, instance) > 1000
for: 1m
labels:
severity: warning
annotations:
summary: "High Kafka produce request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
description: "The number of produce requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaBrokerHighFetchRequestRate
expr: sum(rate(kafka_request_count_total{type="Fetch"}[1m])) by (job, instance) > 1000
for: 1m
labels:
severity: warning
annotations:
summary: "High Kafka fetch request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
description: "The number of fetch requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaBrokerHighProduceLatency
expr: kafka_request_time_99p_milliseconds{type="Produce"} > 100
for: 1m
labels:
severity: warning
annotations:
summary: "High Kafka produce request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
description: "The 99th percentile of produce request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaBrokerHighFetchLatency
expr: kafka_request_time_99p_milliseconds{type="Fetch"} > 1000
for: 1m
labels:
severity: warning
annotations:
summary: "High Kafka fetch request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
description: "The 99th percentile of fetch request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaBrokerHighErrorRequestRate
expr: sum(rate(kafka_request_error_count_total{error!="NONE"}[1m])) by (job, instance, error) > 0.1
for: 2m
labels:
severity: critical
annotations:
summary: "High Kafka error request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
description: "The error request rate of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaBrokerHighPartitionCount
expr: kafka_partition_count > 5000
for: 1m
labels:
severity: critical
annotations:
summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many partitions: {{ $value }}."
description: "The partition count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaBrokerHighConnectionCount
expr: sum(kafka_server_connection_count) by (job, instance) > 1000
for: 1m
labels:
severity: critical
annotations:
summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many connections: {{ $value }}."
description: "The connection count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

- alert: KafkaGroupHighConsumerLag
expr: sum(max(kafka_log_end_offset) by (job, topic, partition)) by (job, topic)
- on (job, topic) group_right () sum(max(kafka_group_commit_offset) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic) > 10000
for: 1m
labels:
severity: warning
annotations:
summary: "High group consumer lag {{ printf \"%0.f\" $value }} for consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }}."
description: "The consumer lag of consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }} is exceeding threshold."