Merge pull request #526 from 3scale/sop-annotations-in-alerts

Sop annotations in alerts

eguzki committed Jun 2, 2022
2 parents b5eb7b9 + 418fe08 commit fef16ae
Showing 13 changed files with 73 additions and 0 deletions.
5 changes: 5 additions & 0 deletions doc/prometheusrules/apicast.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleApicastJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*/apicast-production|.*/apicast-staging",namespace="__NAMESPACE__"} == 0
for: 1m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleApicastRequestTime
annotations:
description: High number of request taking more than a second to be processed
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_request_time.adoc
summary: Request on instance {{ $labels.instance }} is taking more than one second to process the requests
expr: sum(rate(total_response_time_seconds_bucket{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[1m])) - sum(rate(upstream_response_time_seconds_bucket{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[1m])) > 1
for: 2m
@@ -31,6 +33,7 @@ spec:
- alert: ThreescaleApicastHttp4xxErrorRate
annotations:
description: The number of request with 4XX is bigger than the 5% of total request.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_http_4xx_error_rate.adoc
summary: APICast high HTTP 4XX error rate (instance {{ $labels.instance }})
expr: sum(rate(apicast_status{namespace='__NAMESPACE__', status=~"^4.."}[1m])) / sum(rate(apicast_status{namespace='__NAMESPACE__'}[1m])) * 100 > 5
for: 5m
@@ -42,6 +45,7 @@ spec:
APIcast p99 latency is higher than 5 seconds
VALUE = {{ $value }}
LABELS: {{ $labels }}
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_apicast_latency.adoc
summary: APICast latency high (instance {{ $labels.instance }})
expr: histogram_quantile(0.99, sum(rate(total_response_time_seconds_bucket{namespace='__NAMESPACE__',}[30m])) by (le)) > 5
for: 5m
@@ -50,6 +54,7 @@ spec:
- alert: ThreescaleApicastWorkerRestart
annotations:
description: A new thread has been started. This could indicate that a worker process has died due to the memory limits being exceeded. Please investigate the memory pressure on pod (instance {{ $labels.instance }})
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_worker_restart.adoc
summary: A new worker process in Nginx has been started
expr: changes(worker_process{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[5m]) > 0
for: 5m
2 changes: 2 additions & 0 deletions doc/prometheusrules/backend-listener.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleBackendListener5XXRequestsHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_listener_5xx_requests_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes
expr: sum(rate(apisonator_listener_response_codes{job=~"backend.*",namespace="__NAMESPACE__",resp_code="5xx"}[5m])) by (namespace,job,resp_code) > 5000
for: 5m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleBackendListenerJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*backend-listener.*",namespace="__NAMESPACE__"} == 0
for: 1m
2 changes: 2 additions & 0 deletions doc/prometheusrules/backend-worker.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleBackendWorkerJobsCountRunningHigh
annotations:
description: '{{$labels.container_name}} replica controller on {{$labels.namespace}} project: Has more than 1000 jobs processed in the last 5 minutes'
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_worker_jobs_count_running_high.adoc
summary: '{{$labels.container_name}} replica controller on {{$labels.namespace}}: Has more than 10000 jobs processed in the last 5 minutes'
expr: sum(avg_over_time(apisonator_worker_job_count{job=~"backend.*",namespace="__NAMESPACE__"} [5m])) by (namespace,job) > 10000
for: 5m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleBackendWorkerJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*backend-worker.*",namespace="__NAMESPACE__"} == 0
for: 1m
2 changes: 2 additions & 0 deletions doc/prometheusrules/system-app.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleSystemApp5XXRequestsHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/system_app_5xx_requests_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute
expr: sum(rate(rails_requests_total{namespace="__NAMESPACE__",pod=~"system-app-[a-z0-9]+-[a-z0-9]+",status=~"5[0-9]*"}[1m])) by (namespace,job) > 50
for: 1m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleSystemAppJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*system-app.*",namespace="__NAMESPACE__"} == 0
for: 1m
1 change: 1 addition & 0 deletions doc/prometheusrules/system-sidekiq.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleSystemSidekiqJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*system-sidekiq.*",namespace="__NAMESPACE__"} == 0
for: 1m
7 changes: 7 additions & 0 deletions doc/prometheusrules/threescale-kube-state-metrics.yaml
@@ -12,48 +12,55 @@ spec:
- alert: ThreescalePodCrashLooping
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_crash_looping.adoc
expr: rate(kube_pod_container_status_restarts_total{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[15m]) * 60 * 5 > 0
for: 5m
labels:
severity: critical
- alert: ThreescalePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 5 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc
expr: sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{namespace="__NAMESPACE__",owner_kind!="Job"})) > 0
for: 5m
labels:
severity: critical
- alert: ThreescaleReplicationControllerReplicasMismatch
annotations:
message: ReplicationController {{ $labels.namespace }}/{{ $labels.replicationcontroller }} has not matched the expected number of replicas for longer than 5 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/replication_controller_replicas_mismatch.adoc
expr: kube_replicationcontroller_spec_replicas {namespace="__NAMESPACE__",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"} != kube_replicationcontroller_status_ready_replicas {namespace="__NAMESPACE__",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}
for: 5m
labels:
severity: critical
- alert: ThreescaleContainerWaiting
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has been in waiting state for longer than 1 hour.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_waiting.adoc
expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) > 0
for: 1h
labels:
severity: warning
- alert: ThreescaleContainerCPUHigh
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has High CPU usage for longer than 15 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_high.adoc
expr: sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by (namespace, container, pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by (namespace, container, pod) * 100 > 90
for: 15m
labels:
severity: warning
- alert: ThreescaleContainerMemoryHigh
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has High Memory usage for longer than 15 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_memory_high.adoc
expr: sum(container_memory_usage_bytes{namespace="__NAMESPACE__",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by(namespace, container, pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by(namespace, container, pod) * 100 > 90
for: 15m
labels:
severity: warning
- alert: ThreescaleContainerCPUThrottlingHigh
annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_throttling_high.adoc
expr: sum(increase(container_cpu_cfs_throttled_periods_total{namespace="__NAMESPACE__",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)" }[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[5m])) by (container, pod, namespace) > ( 25 / 100 )
for: 15m
labels:
4 changes: 4 additions & 0 deletions doc/prometheusrules/zync-que.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleZyncQueJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*/zync-que",namespace="__NAMESPACE__"} == 0
for: 1m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleZyncQueScheduledJobCountHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has scheduled job count over 100
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_scheduled_job_count_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has scheduled job count over 100
expr: max(que_jobs_scheduled_total{pod=~'zync-que.*',type='scheduled',namespace="__NAMESPACE__"}) by (namespace,job,exported_job) > 250
for: 1m
@@ -31,6 +33,7 @@ spec:
- alert: ThreescaleZyncQueFailedJobCountHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has failed job count over 100
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_failed_job_count_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has failed job count over 100
expr: max(que_jobs_scheduled_total{pod=~'zync-que.*',type='failed',namespace="__NAMESPACE__"}) by (namespace,job,exported_job) > 250
for: 1m
@@ -39,6 +42,7 @@ spec:
- alert: ThreescaleZyncQueReadyJobCountHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has ready job count over 100
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_ready_job_count_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has ready job count over 100
expr: max(que_jobs_scheduled_total{pod=~'zync-que.*',type='ready',namespace="__NAMESPACE__"}) by (namespace,job,exported_job) > 250
for: 1m
2 changes: 2 additions & 0 deletions doc/prometheusrules/zync.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleZyncJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*/zync",namespace="__NAMESPACE__"} == 0
for: 1m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleZync5XXRequestsHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_5xx_requests_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute
expr: sum(rate(rails_requests_total{namespace="__NAMESPACE__",pod=~"zync-[a-z0-9]+-[a-z0-9]+",status=~"5[0-9]*"}[1m])) by (namespace,job) > 50
for: 1m
5 changes: 5 additions & 0 deletions pkg/3scale/amp/component/apicast_monitoring.go
@@ -106,6 +106,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastJobDown",
Annotations: map[string]string{
"sop_url": ThreescalePrometheusJobDownURL,
"summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
"description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
},
@@ -118,6 +119,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastRequestTime",
Annotations: map[string]string{
"sop_url": ThreescaleApicastRequestTimeURL,
"summary": "Request on instance {{ $labels.instance }} is taking more than one second to process the requests",
"description": "High number of request taking more than a second to be processed",
},
@@ -130,6 +132,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastHttp4xxErrorRate",
Annotations: map[string]string{
"sop_url": ThreescaleApicastHttp4xxErrorRateURL,
"summary": "APICast high HTTP 4XX error rate (instance {{ $labels.instance }})",
"description": "The number of request with 4XX is bigger than the 5% of total request.",
},
@@ -142,6 +145,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastLatencyHigh",
Annotations: map[string]string{
"sop_url": ThreescaleApicastLatencyHighURL,
"summary": "APICast latency high (instance {{ $labels.instance }})",
"description": "APIcast p99 latency is higher than 5 seconds\n VALUE = {{ $value }}\n LABELS: {{ $labels }}",
},
@@ -154,6 +158,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastWorkerRestart",
Annotations: map[string]string{
"sop_url": ThreescaleApicastWorkerRestartURL,
"summary": "A new worker process in Nginx has been started",
"description": "A new thread has been started. This could indicate that a worker process has died due to the memory limits being exceeded. Please investigate the memory pressure on pod (instance {{ $labels.instance }})",
},
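
The Threescale*URL values referenced by the Go rule builders above are defined in one of the changed files that is not shown in this excerpt. A minimal sketch of how those constants could be declared follows; the file name and the shared base-URL helper are assumptions, while the constant names and target URLs are taken from this diff.

// monitoring_constants.go (hypothetical file name): SOP URL constants used by
// the PrometheusRule builders. Each URL also appears verbatim in the rendered
// doc/prometheusrules/*.yaml files above.
package component

const (
	// Assumed shared prefix; the real code may simply repeat the full URLs.
	threescaleSopsAlertsBaseURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts"

	ThreescalePrometheusJobDownURL                 = threescaleSopsAlertsBaseURL + "/prometheus_job_down.adoc"
	ThreescaleApicastRequestTimeURL                = threescaleSopsAlertsBaseURL + "/apicast_request_time.adoc"
	ThreescaleApicastHttp4xxErrorRateURL           = threescaleSopsAlertsBaseURL + "/apicast_http_4xx_error_rate.adoc"
	ThreescaleApicastLatencyHighURL                = threescaleSopsAlertsBaseURL + "/apicast_apicast_latency.adoc"
	ThreescaleApicastWorkerRestartURL              = threescaleSopsAlertsBaseURL + "/apicast_worker_restart.adoc"
	ThreescaleBackendWorkerJobsCountRunningHighURL = threescaleSopsAlertsBaseURL + "/backend_worker_jobs_count_running_high.adoc"
	ThreescaleBackendListener5XXRequestsHighURL    = threescaleSopsAlertsBaseURL + "/backend_listener_5xx_requests_high.adoc"
)
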
4 changes: 4 additions & 0 deletions pkg/3scale/amp/component/backend_monitoring.go
@@ -86,6 +86,7 @@ func (backend *Backend) BackendWorkerPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleBackendWorkerJobsCountRunningHigh",
Annotations: map[string]string{
"sop_url": ThreescaleBackendWorkerJobsCountRunningHighURL,
"summary": "{{$labels.container_name}} replica controller on {{$labels.namespace}}: Has more than 10000 jobs processed in the last 5 minutes",
"description": "{{$labels.container_name}} replica controller on {{$labels.namespace}} project: Has more than 1000 jobs processed in the last 5 minutes",
},
@@ -98,6 +99,7 @@ func (backend *Backend) BackendWorkerPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleBackendWorkerJobDown",
Annotations: map[string]string{
"sop_url": ThreescalePrometheusJobDownURL,
"summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
"description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
},
@@ -132,6 +134,7 @@ func (backend *Backend) BackendListenerPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleBackendListener5XXRequestsHigh",
Annotations: map[string]string{
"sop_url": ThreescaleBackendListener5XXRequestsHighURL,
"summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes",
"description": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes",
},
@@ -144,6 +147,7 @@ func (backend *Backend) BackendListenerPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleBackendListenerJobDown",
Annotations: map[string]string{
"sop_url": ThreescalePrometheusJobDownURL,
"summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
"description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
},
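
None of the hunks shown here adds an automated guard that every alerting rule carries the new annotation. A hedged sketch of such a check as a Go test helper is below; the file name, package placement, import path and helper name are assumptions, not part of this commit.

// prometheusrules_sop_url_test.go (hypothetical): fail if any generated
// alerting rule is missing the sop_url annotation added by this change.
package component

import (
	"testing"

	// Adjust the import path to the prometheus-operator module this repo vendors.
	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
)

func assertSopURLAnnotations(t *testing.T, rule *monitoringv1.PrometheusRule) {
	t.Helper()
	for _, group := range rule.Spec.Groups {
		for _, r := range group.Rules {
			if r.Alert == "" {
				continue // recording rules do not need a SOP link
			}
			if r.Annotations["sop_url"] == "" {
				t.Errorf("alert %q in group %q has no sop_url annotation", r.Alert, group.Name)
			}
		}
	}
}

Each component's rule test could then call, for example, assertSopURLAnnotations(t, apicast.ApicastPrometheusRules()).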
