CrunchyData · benjaminjb · Sep 30, 2022 · Sep 29, 2022
diff --git a/kustomize/monitoring/alertmanager-config.yaml b/kustomize/monitoring/alertmanager-config.yaml
@@ -3,7 +3,7 @@ data:
   alertmanager.yml: |
     ###
     #
-    # Copyright 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
+    # Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
     #
     ###
 
@@ -31,7 +31,7 @@ data:
     receivers:
     - name: 'default-receiver'
       email_configs:
-      - to: 'example@yourcompany.com'
+      - to: 'example@crunchydata.com'
         send_resolved: true
 
     ## Examples of alternative alert receivers. See documentation for more info on how to configure these fully

diff --git a/kustomize/monitoring/alertmanager-rules-config.yaml b/kustomize/monitoring/alertmanager-rules-config.yaml
@@ -23,6 +23,19 @@ data:
           summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
 
 
+    ########## SYSTEM RULES ##########
+      - alert: ExporterDown
+        expr: avg_over_time(up[5m]) < 0.5
+        for: 10s
+        labels:
+          service: system
+          severity: critical
+          severity_num: 300
+        annotations:
+          description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
+          summary: 'Prometheus Exporter Service Down'
+
+
     ########## POSTGRESQL RULES ##########
       - alert: PGIsUp
         expr: pg_up < 1
@@ -173,6 +186,27 @@ data:
           description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
           summary: 'PGSQL Instance connections'
 
+      - alert: DiskFillPredict
+        expr: predict_linear(ccp_nodemx_data_disk_available_bytes{mount_point!~"tmpfs"}[1h], 24 * 3600) < 0 and 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 70
+        for: 5m
+        labels:
+          service: postgresql
+          severity: warning
+          severity_num: 200
+        annotations:
+          summary: 'Disk predicted to be full in 24 hours'
+          description: 'Disk on {{ $labels.pg_cluster }}:{{ $labels.kubernetes_pod_name }} is predicted to fill in 24 hrs based on current usage'
+
+      - alert: PGClusterRoleChange
+        expr: count by (pg_cluster) (ccp_is_in_recovery_status != ignoring(instance,ip,pod,role) (ccp_is_in_recovery_status offset 5m)) >= 1
+        for: 60s
+        labels:
+          service: postgresql
+          severity: critical
+          severity_num: 300
+        annotations:
+          summary: '{{ $labels.pg_cluster }} has had a switchover/failover event. Please check this cluster for more details'
+
       - alert: PGDiskSize
         expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75
         for: 60s
@@ -196,7 +230,7 @@ data:
           summary: 'PGSQL Instance size critical'
 
       - alert: PGReplicationByteLag
-        expr: ccp_replication_status_byte_lag > 5.24288e+07
+        expr: ccp_replication_lag_size_bytes > 5.24288e+07
         for: 60s
         labels:
           service: postgresql
@@ -207,7 +241,7 @@ data:
           summary: 'PGSQL Instance replica lag warning'
 
       - alert: PGReplicationByteLag
-        expr: ccp_replication_status_byte_lag > 1.048576e+08
+        expr: ccp_replication_lag_size_bytes > 1.048576e+08
         for: 60s
         labels:
           service: postgresql
@@ -313,12 +347,15 @@ data:
     #   Otherwise rule will be applied to all stanzas returned on target system if not set.
     #
     # Relevant metric names are:
-    #   ccp_backrest_last_full_time_since_completion_seconds
-    #   ccp_backrest_last_incr_time_since_completion_seconds
-    #   ccp_backrest_last_diff_time_since_completion_seconds
+    #   ccp_backrest_last_full_backup_time_since_completion_seconds
+    #   ccp_backrest_last_incr_backup_time_since_completion_seconds
+    #   ccp_backrest_last_diff_backup_time_since_completion_seconds
+    #
+    # To avoid false positives on backup time alerts, 12 hours (43200 seconds) are added to each threshold as a buffer in case the backup runtime varies from day to day.
+    #    For example, a weekly full backup threshold becomes 604800 + 43200 = 648000 seconds. Further adjustment may be needed depending on your backup runtimes/schedule.
     #
     #  - alert: PGBackRestLastCompletedFull_main
-    #    expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 604800
+    #    expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
     #    for: 60s
     #    labels:
     #       service: postgresql
@@ -328,7 +365,7 @@ data:
     #       summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
     #
     #  - alert: PGBackRestLastCompletedIncr_main
-    #    expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 86400
+    #    expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
     #    for: 60s
     #    labels:
     #       service: postgresql
@@ -340,14 +377,14 @@ data:
     #
     # Runtime monitoring is handled with a single metric:
     #
-    #   ccp_backrest_last_runtime_backup_runtime_seconds
+    #   ccp_backrest_last_info_backup_runtime_seconds
     #
     # Runtime monitoring should have the "backup_type" label set.
     #   Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr)
     # Stanza should also be set if runtimes per stanza have different expected times
     #
     #  - alert: PGBackRestLastRuntimeFull_main
-    #    expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
+    #    expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
     #    for: 60s
     #    labels:
     #       service: postgresql
@@ -357,7 +394,7 @@ data:
     #       summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
     #
     #  - alert: PGBackRestLastRuntimeDiff_main
-    #    expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
+    #    expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
     #    for: 60s
     #    labels:
     #       service: postgresql
@@ -382,6 +419,7 @@ data:
     #      severity_num: 300
     #    annotations:
     #      description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'
+
 kind: ConfigMap
 metadata:
   labels:

diff --git a/kustomize/monitoring/crunchy_grafana_dashboards.yml b/kustomize/monitoring/crunchy_grafana_dashboards.yml
@@ -1,6 +1,6 @@
 ###
 #
-# Copyright 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
+# Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
 #
 ###
 apiVersion: 1
@@ -13,4 +13,4 @@ providers:
   disableDeletion: false
   updateIntervalSeconds: 3 #how often Grafana will scan for changed dashboards
   options:
-    path: $GF_PATHS_PROVISIONING/dashboards
+    path: /etc/grafana/provisioning/dashboards
diff --git a/kustomize/monitoring/dashboards/pgbackrest.json b/kustomize/monitoring/dashboards/pgbackrest.json
@@ -52,7 +52,7 @@
   "gnetId": null,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1624546649377,
+  "iteration": 1625069660860,
   "links": [
     {
       "asDropdown": false,
@@ -664,7 +664,7 @@
     ]
   },
   "time": {
-    "from": "now-30m",
+    "from": "now-2w",
     "to": "now"
   },
   "timepicker": {

diff --git a/kustomize/monitoring/dashboards/pod_details.json b/kustomize/monitoring/dashboards/pod_details.json
@@ -42,11 +42,11 @@
       }
     ]
   },
-  "editable": true,
+  "editable": false,
   "gnetId": null,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1624647381559,
+  "iteration": 1625069717503,
   "links": [
     {
       "icon": "external link",

diff --git a/kustomize/monitoring/dashboards/postgres_overview.json b/kustomize/monitoring/dashboards/postgres_overview.json
@@ -46,7 +46,7 @@
   "gnetId": null,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1624491413218,
+  "iteration": 1625069480601,
   "links": [],
   "panels": [
     {

diff --git a/kustomize/monitoring/dashboards/postgresql_details.json b/kustomize/monitoring/dashboards/postgresql_details.json
@@ -54,11 +54,11 @@
       }
     ]
   },
-  "editable": true,
+  "editable": false,
   "gnetId": null,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1624495934950,
+  "iteration": 1625069813048,
   "links": [
     {
       "asDropdown": false,
@@ -2143,6 +2143,6 @@
   },
   "timezone": "browser",
   "title": "PostgreSQLDetails",
-  "uid": "pc4NNgknk",
+  "uid": "fMip0cuMk",
   "version": 1
 }
diff --git a/kustomize/monitoring/dashboards/postgresql_service_health.json b/kustomize/monitoring/dashboards/postgresql_service_health.json
@@ -42,11 +42,11 @@
       }
     ]
   },
-  "editable": true,
+  "editable": false,
   "gnetId": null,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1624491530019,
+  "iteration": 1625069909806,
   "links": [
     {
       "asDropdown": false,
@@ -626,7 +626,7 @@
     ]
   },
   "time": {
-    "from": "now-30m",
+    "from": "now-1h",
     "to": "now"
   },
   "timepicker": {

diff --git a/kustomize/monitoring/dashboards/prometheus_alerts.json b/kustomize/monitoring/dashboards/prometheus_alerts.json
@@ -938,7 +938,7 @@
     "list": []
   },
   "time": {
-    "from": "now-30m",
+    "from": "now-1h",
     "to": "now"
   },
   "timepicker": {

diff --git a/kustomize/monitoring/dashboards/query_statistics.json b/kustomize/monitoring/dashboards/query_statistics.json
@@ -59,7 +59,7 @@
   "gnetId": null,
   "graphTooltip": 0,
   "id": null,
-  "iteration": 1624501789811,
+  "iteration": 1625070004605,
   "links": [
     {
       "icon": "external link",

diff --git a/kustomize/monitoring/deploy-grafana.yaml b/kustomize/monitoring/deploy-grafana.yaml
@@ -37,7 +37,7 @@ spec:
           value: crunchy-prometheus
         - name: PROM_PORT
           value: "9090"
-        image: grafana/grafana:7.4.5
+        image: grafana/grafana:8.5.10
         imagePullPolicy: IfNotPresent
         livenessProbe:
           failureThreshold: 3

diff --git a/kustomize/monitoring/deploy-prometheus.yaml b/kustomize/monitoring/deploy-prometheus.yaml
@@ -27,7 +27,7 @@ spec:
         name: crunchy-prometheus
     spec:
       containers:
-      - image: prom/prometheus:v2.27.1
+      - image: prom/prometheus:v2.33.5
         imagePullPolicy: IfNotPresent
         livenessProbe:
           failureThreshold: 3