Skip to content

Update exporter dashboards #158

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions kustomize/monitoring/alertmanager-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ data:
alertmanager.yml: |
###
#
# Copyright 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
# Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###

Expand Down Expand Up @@ -31,7 +31,7 @@ data:
receivers:
- name: 'default-receiver'
email_configs:
- to: 'example@yourcompany.com'
- to: 'example@crunchydata.com'
send_resolved: true

## Examples of alternative alert receivers. See the documentation for more information on how to configure these fully.
Expand Down
58 changes: 48 additions & 10 deletions kustomize/monitoring/alertmanager-rules-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@ data:
summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'


########## SYSTEM RULES ##########
- alert: ExporterDown
expr: avg_over_time(up[5m]) < 0.5
for: 10s
labels:
service: system
severity: critical
severity_num: 300
annotations:
description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
summary: 'Prometheus Exporter Service Down'


########## POSTGRESQL RULES ##########
- alert: PGIsUp
expr: pg_up < 1
Expand Down Expand Up @@ -173,6 +186,27 @@ data:
description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
summary: 'PGSQL Instance connections'

- alert: DiskFillPredict
expr: predict_linear(ccp_nodemx_data_disk_available_bytes{mount_point!~"tmpfs"}[1h], 24 * 3600) < 0 and 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 70
for: 5m
labels:
service: postgresql
severity: warning
severity_num: 200
annotations:
summary: 'Disk predicted to be full in 24 hours'
description: 'Disk on {{ $labels.pg_cluster }}:{{ $labels.kubernetes_pod_name }} is predicted to fill in 24 hrs based on current usage'

- alert: PGClusterRoleChange
expr: count by (pg_cluster) (ccp_is_in_recovery_status != ignoring(instance,ip,pod,role) (ccp_is_in_recovery_status offset 5m)) >= 1
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
summary: '{{ $labels.pg_cluster }} has had a switchover/failover event. Please check this cluster for more details'

- alert: PGDiskSize
expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75
for: 60s
Expand All @@ -196,7 +230,7 @@ data:
summary: 'PGSQL Instance size critical'

- alert: PGReplicationByteLag
expr: ccp_replication_status_byte_lag > 5.24288e+07
expr: ccp_replication_lag_size_bytes > 5.24288e+07
for: 60s
labels:
service: postgresql
Expand All @@ -207,7 +241,7 @@ data:
summary: 'PGSQL Instance replica lag warning'

- alert: PGReplicationByteLag
expr: ccp_replication_status_byte_lag > 1.048576e+08
expr: ccp_replication_lag_size_bytes > 1.048576e+08
for: 60s
labels:
service: postgresql
Expand Down Expand Up @@ -313,12 +347,15 @@ data:
# Otherwise (if not set), the rule will be applied to all stanzas returned on the target system.
#
# Relevant metric names are:
# ccp_backrest_last_full_time_since_completion_seconds
# ccp_backrest_last_incr_time_since_completion_seconds
# ccp_backrest_last_diff_time_since_completion_seconds
# ccp_backrest_last_full_backup_time_since_completion_seconds
# ccp_backrest_last_incr_backup_time_since_completion_seconds
# ccp_backrest_last_diff_backup_time_since_completion_seconds
#
# To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day.
# Further adjustment may be needed depending on your backup runtimes/schedule.
#
# - alert: PGBackRestLastCompletedFull_main
# expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 604800
# expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
# for: 60s
# labels:
# service: postgresql
Expand All @@ -328,7 +365,7 @@ data:
# summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
#
# - alert: PGBackRestLastCompletedIncr_main
# expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 86400
# expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
# for: 60s
# labels:
# service: postgresql
Expand All @@ -340,14 +377,14 @@ data:
#
# Runtime monitoring is handled with a single metric:
#
# ccp_backrest_last_runtime_backup_runtime_seconds
# ccp_backrest_last_info_backup_runtime_seconds
#
# Runtime monitoring rules should have the "backup_type" label set.
# Otherwise, the rule will apply to the last run of every backup type returned (full, diff, incr).
# The "stanza" label should also be set if the expected runtime differs between stanzas.
#
# - alert: PGBackRestLastRuntimeFull_main
# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
# for: 60s
# labels:
# service: postgresql
Expand All @@ -357,7 +394,7 @@ data:
# summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
#
# - alert: PGBackRestLastRuntimeDiff_main
# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
# for: 60s
# labels:
# service: postgresql
Expand All @@ -382,6 +419,7 @@ data:
# severity_num: 300
# annotations:
# description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'

kind: ConfigMap
metadata:
labels:
Expand Down
4 changes: 2 additions & 2 deletions kustomize/monitoring/crunchy_grafana_dashboards.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
###
#
# Copyright 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
# Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###
apiVersion: 1
Expand All @@ -13,4 +13,4 @@ providers:
disableDeletion: false
updateIntervalSeconds: 3 #how often Grafana will scan for changed dashboards
options:
path: $GF_PATHS_PROVISIONING/dashboards
path: /etc/grafana/provisioning/dashboards
4 changes: 2 additions & 2 deletions kustomize/monitoring/dashboards/pgbackrest.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1624546649377,
"iteration": 1625069660860,
"links": [
{
"asDropdown": false,
Expand Down Expand Up @@ -664,7 +664,7 @@
]
},
"time": {
"from": "now-30m",
"from": "now-2w",
"to": "now"
},
"timepicker": {
Expand Down
4 changes: 2 additions & 2 deletions kustomize/monitoring/dashboards/pod_details.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@
}
]
},
"editable": true,
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1624647381559,
"iteration": 1625069717503,
"links": [
{
"icon": "external link",
Expand Down
2 changes: 1 addition & 1 deletion kustomize/monitoring/dashboards/postgres_overview.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1624491413218,
"iteration": 1625069480601,
"links": [],
"panels": [
{
Expand Down
6 changes: 3 additions & 3 deletions kustomize/monitoring/dashboards/postgresql_details.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@
}
]
},
"editable": true,
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1624495934950,
"iteration": 1625069813048,
"links": [
{
"asDropdown": false,
Expand Down Expand Up @@ -2143,6 +2143,6 @@
},
"timezone": "browser",
"title": "PostgreSQLDetails",
"uid": "pc4NNgknk",
"uid": "fMip0cuMk",
"version": 1
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@
}
]
},
"editable": true,
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1624491530019,
"iteration": 1625069909806,
"links": [
{
"asDropdown": false,
Expand Down Expand Up @@ -626,7 +626,7 @@
]
},
"time": {
"from": "now-30m",
"from": "now-1h",
"to": "now"
},
"timepicker": {
Expand Down
2 changes: 1 addition & 1 deletion kustomize/monitoring/dashboards/prometheus_alerts.json
Original file line number Diff line number Diff line change
Expand Up @@ -938,7 +938,7 @@
"list": []
},
"time": {
"from": "now-30m",
"from": "now-1h",
"to": "now"
},
"timepicker": {
Expand Down
2 changes: 1 addition & 1 deletion kustomize/monitoring/dashboards/query_statistics.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1624501789811,
"iteration": 1625070004605,
"links": [
{
"icon": "external link",
Expand Down
2 changes: 1 addition & 1 deletion kustomize/monitoring/deploy-grafana.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ spec:
value: crunchy-prometheus
- name: PROM_PORT
value: "9090"
image: grafana/grafana:7.4.5
image: grafana/grafana:8.5.10
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 3
Expand Down
2 changes: 1 addition & 1 deletion kustomize/monitoring/deploy-prometheus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
name: crunchy-prometheus
spec:
containers:
- image: prom/prometheus:v2.27.1
- image: prom/prometheus:v2.33.5
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 3
Expand Down