From a0cfa5f5bcb0f590b7debad553fce13895ae1e13 Mon Sep 17 00:00:00 2001 From: Eguzki Astiz Lezaun Date: Tue, 17 Nov 2020 15:44:33 +0100 Subject: [PATCH 1/7] add SOP url annotation to prometheus rules: zync and zync-que --- pkg/3scale/amp/component/generic_monitoring.go | 9 +++++++++ pkg/3scale/amp/component/zync_monitoring.go | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/pkg/3scale/amp/component/generic_monitoring.go b/pkg/3scale/amp/component/generic_monitoring.go index a077cf498..e29a90e27 100644 --- a/pkg/3scale/amp/component/generic_monitoring.go +++ b/pkg/3scale/amp/component/generic_monitoring.go @@ -11,6 +11,15 @@ import ( "k8s.io/apimachinery/pkg/util/intstr" ) +// Add alert sop urls here +const ( + ThreescalePodNotReadyURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc" + ThreescaleZync5XXRequestsHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_5xx_requests_high.adoc" + ThreescaleZyncQueScheduledJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_scheduled_job_count_high.adoc" + ThreescaleZyncQueFailedJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_failed_job_count_high.adoc" + ThreescaleZyncQueReadyJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_ready_job_count_high.adoc" +) + func KubernetesResourcesByNamespaceGrafanaDashboard(sumRate, ns, appLabel string) *grafanav1alpha1.GrafanaDashboard { data := &struct { Namespace, SumRate string diff --git a/pkg/3scale/amp/component/zync_monitoring.go b/pkg/3scale/amp/component/zync_monitoring.go index fc18c244f..87fb4f0b2 100644 --- a/pkg/3scale/amp/component/zync_monitoring.go +++ b/pkg/3scale/amp/component/zync_monitoring.go @@ -87,6 +87,7 @@ func (zync *Zync) ZyncPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleZyncJobDown", Annotations: map[string]string{ + "sop_url": ThreescalePodNotReadyURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, @@ -99,6 +100,7 @@ func (zync *Zync) ZyncPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleZync5XXRequestsHigh", Annotations: map[string]string{ + "sop_url": ThreescaleZync5XXRequestsHighURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute", }, @@ -133,6 +135,7 @@ func (zync *Zync) ZyncQuePrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleZyncQueJobDown", Annotations: map[string]string{ + "sop_url": ThreescalePodNotReadyURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, @@ -145,6 +148,7 @@ func (zync *Zync) ZyncQuePrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleZyncQueScheduledJobCountHigh", Annotations: map[string]string{ + "sop_url": ThreescaleZyncQueScheduledJobCountHighURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has scheduled job count over 100", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} has scheduled job count over 100", }, @@ -157,6 +161,7 @@ func (zync *Zync) ZyncQuePrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleZyncQueFailedJobCountHigh", Annotations: map[string]string{ + "sop_url": ThreescaleZyncQueFailedJobCountHighURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has failed job count over 100", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} has failed job count over 100", }, @@ -169,6 +174,7 @@ func (zync *Zync) ZyncQuePrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleZyncQueReadyJobCountHigh", Annotations: map[string]string{ + "sop_url": ThreescaleZyncQueReadyJobCountHighURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has ready job count over 100", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} has ready job count over 100", }, From f3ba7aab74d17a9630e275f31d6e9f70847cc76c Mon Sep 17 00:00:00 2001 From: Eguzki Astiz Lezaun Date: Tue, 17 Nov 2020 15:53:46 +0100 Subject: [PATCH 2/7] add SOP url annotation to prometheus rules: backend and generic rules --- .../amp/component/backend_monitoring.go | 4 +++ .../amp/component/generic_monitoring.go | 26 +++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/pkg/3scale/amp/component/backend_monitoring.go b/pkg/3scale/amp/component/backend_monitoring.go index 3c50edf07..39fc996cb 100644 --- a/pkg/3scale/amp/component/backend_monitoring.go +++ b/pkg/3scale/amp/component/backend_monitoring.go @@ -86,6 +86,7 @@ func (backend *Backend) BackendWorkerPrometheusRules() *monitoringv1.PrometheusR { Alert: "ThreescaleBackendWorkerJobsCountRunningHigh", Annotations: map[string]string{ + "sop_url": ThreescaleBackendWorkerJobsCountRunningHighURL, "summary": "{{$labels.container_name}} replica controller on {{$labels.namespace}}: Has more than 10000 jobs processed in the last 5 minutes", "description": "{{$labels.container_name}} replica controller on {{$labels.namespace}} project: Has more than 1000 jobs processed in the last 5 minutes", }, @@ -98,6 +99,7 @@ func (backend *Backend) BackendWorkerPrometheusRules() *monitoringv1.PrometheusR { Alert: "ThreescaleBackendWorkerJobDown", Annotations: map[string]string{ + "sop_url": ThreescalePodNotReadyURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, @@ -132,6 +134,7 @@ func (backend *Backend) BackendListenerPrometheusRules() *monitoringv1.Prometheu { Alert: "ThreescaleBackendListener5XXRequestsHigh", Annotations: map[string]string{ + "sop_url": ThreescaleBackendListener5XXRequestsHighURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes", }, @@ -144,6 +147,7 @@ func (backend *Backend) BackendListenerPrometheusRules() *monitoringv1.Prometheu { Alert: "ThreescaleBackendListenerJobDown", Annotations: map[string]string{ + "sop_url": ThreescalePodNotReadyURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, diff --git a/pkg/3scale/amp/component/generic_monitoring.go b/pkg/3scale/amp/component/generic_monitoring.go index e29a90e27..adf90b147 100644 --- a/pkg/3scale/amp/component/generic_monitoring.go +++ b/pkg/3scale/amp/component/generic_monitoring.go @@ -13,11 +13,20 @@ import ( // Add alert sop urls here const ( - ThreescalePodNotReadyURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc" - ThreescaleZync5XXRequestsHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_5xx_requests_high.adoc" - ThreescaleZyncQueScheduledJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_scheduled_job_count_high.adoc" - ThreescaleZyncQueFailedJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_failed_job_count_high.adoc" - ThreescaleZyncQueReadyJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_ready_job_count_high.adoc" + ThreescalePodNotReadyURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc" + ThreescaleZync5XXRequestsHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_5xx_requests_high.adoc" + ThreescaleZyncQueScheduledJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_scheduled_job_count_high.adoc" + ThreescaleZyncQueFailedJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_failed_job_count_high.adoc" + ThreescaleZyncQueReadyJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_ready_job_count_high.adoc" + ThreescaleBackendWorkerJobsCountRunningHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_worker_jobs_count_running_high.adoc" + ThreescaleBackendListener5XXRequestsHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_listener_5xx_requests_high.adoc" + ThreescalePodCrashLoopingURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_crash_looping.adoc" + ThreescalePodNotReadyURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc" + ThreescaleReplicationControllerReplicasMismatchURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/replication_controller_replicas_mismatch.adoc" + ThreescaleContainerWaitingURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_waiting.adoc" + ThreescaleContainerCPUHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_high.adoc" + ThreescaleContainerMemoryHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_memory_high.adoc" + ThreescaleContainerMemoryHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_throttling_high.adoc" ) func KubernetesResourcesByNamespaceGrafanaDashboard(sumRate, ns, appLabel string) *grafanav1alpha1.GrafanaDashboard { @@ -80,6 +89,7 @@ func KubeStateMetricsPrometheusRules(sumRate, ns, appLabel string) *monitoringv1 { Alert: "ThreescalePodCrashLooping", Annotations: map[string]string{ + "sop_url": ThreescalePodCrashLoopingURL, "message": `Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.`, }, Expr: intstr.FromString(fmt.Sprintf(`rate(kube_pod_container_status_restarts_total{namespace="%s",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[15m]) * 60 * 5 > 0`, ns)), @@ -91,6 +101,7 @@ func KubeStateMetricsPrometheusRules(sumRate, ns, appLabel string) *monitoringv1 { Alert: "ThreescalePodNotReady", Annotations: map[string]string{ + "sop_url": ThreescalePodNotReadyURL, "message": `Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 5 minutes.`, }, Expr: intstr.FromString(fmt.Sprintf(`sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{namespace="%s",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{namespace="%s",owner_kind!="Job"})) > 0`, ns, ns)), @@ -102,6 +113,7 @@ func KubeStateMetricsPrometheusRules(sumRate, ns, appLabel string) *monitoringv1 { Alert: "ThreescaleReplicationControllerReplicasMismatch", Annotations: map[string]string{ + "sop_url": ThreescaleReplicationControllerReplicasMismatchURL, "message": `ReplicationController {{ $labels.namespace }}/{{ $labels.replicationcontroller }} has not matched the expected number of replicas for longer than 5 minutes.`, }, Expr: intstr.FromString(fmt.Sprintf(`kube_replicationcontroller_spec_replicas {namespace="%s",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"} != kube_replicationcontroller_status_ready_replicas {namespace="%s",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}`, ns, ns)), @@ -113,6 +125,7 @@ func KubeStateMetricsPrometheusRules(sumRate, ns, appLabel string) *monitoringv1 { Alert: "ThreescaleContainerWaiting", Annotations: map[string]string{ + "sop_url": ThreescaleContainerWaitingURL, "message": `Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has been in waiting state for longer than 1 hour.`, }, Expr: intstr.FromString(fmt.Sprintf(`sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace="%s",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) > 0`, ns)), @@ -124,6 +137,7 @@ func KubeStateMetricsPrometheusRules(sumRate, ns, appLabel string) *monitoringv1 { Alert: "ThreescaleContainerCPUHigh", Annotations: map[string]string{ + "sop_url": ThreescaleContainerCPUHighURL, "message": `Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has High CPU usage for longer than 15 minutes.`, }, Expr: intstr.FromString(fmt.Sprintf(`sum(node_namespace_pod_container:container_cpu_usage_seconds_total:%s{namespace="%s",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by (namespace, container, pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace="%s",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by (namespace, container, pod) * 100 > 90`, sumRate, ns, ns)), @@ -135,6 +149,7 @@ func KubeStateMetricsPrometheusRules(sumRate, ns, appLabel string) *monitoringv1 { Alert: "ThreescaleContainerMemoryHigh", Annotations: map[string]string{ + "sop_url": ThreescaleContainerMemoryHighURL, "message": `Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has High Memory usage for longer than 15 minutes.`, }, Expr: intstr.FromString(fmt.Sprintf(`sum(container_memory_usage_bytes{namespace="%s",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by(namespace, container, pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="%s",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by(namespace, container, pod) * 100 > 90`, ns, ns)), @@ -146,6 +161,7 @@ func KubeStateMetricsPrometheusRules(sumRate, ns, appLabel string) *monitoringv1 { Alert: "ThreescaleContainerCPUThrottlingHigh", Annotations: map[string]string{ + "sop_url": ThreescaleContainerMemoryHighURL, "message": `{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.`, }, Expr: intstr.FromString(fmt.Sprintf(`sum(increase(container_cpu_cfs_throttled_periods_total{namespace="%s",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)" }[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{namespace="%s",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[5m])) by (container, pod, namespace) > ( 25 / 100 )`, ns, ns)), From 613f5af79a6bbe9094a56066ff2c6d151f786bf5 Mon Sep 17 00:00:00 2001 From: Eguzki Astiz Lezaun Date: Tue, 17 Nov 2020 15:59:14 +0100 Subject: [PATCH 3/7] add SOP url annotation to prometheus rules: apicast rules --- pkg/3scale/amp/component/apicast_monitoring.go | 5 +++++ pkg/3scale/amp/component/generic_monitoring.go | 9 ++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pkg/3scale/amp/component/apicast_monitoring.go b/pkg/3scale/amp/component/apicast_monitoring.go index 03270bfb9..5e5bace80 100644 --- a/pkg/3scale/amp/component/apicast_monitoring.go +++ b/pkg/3scale/amp/component/apicast_monitoring.go @@ -106,6 +106,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleApicastJobDown", Annotations: map[string]string{ + "sop_url": ThreescalePodNotReadyURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, @@ -118,6 +119,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleApicastRequestTime", Annotations: map[string]string{ + "sop_url": ThreescaleApicastRequestTimeURL, "summary": "Request on instance {{ $labels.instance }} is taking more than one second to process the requests", "description": "High number of request taking more than a second to be processed", }, @@ -130,6 +132,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleApicastHttp4xxErrorRate", Annotations: map[string]string{ + "sop_url": ThreescaleApicastHttp4xxErrorRateURL, "summary": "APICast high HTTP 4XX error rate (instance {{ $labels.instance }})", "description": "The number of request with 4XX is bigger than the 5% of total request.", }, @@ -142,6 +145,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleApicastLatencyHigh", Annotations: map[string]string{ + "sop_url": ThreescaleApicastLatencyHighURL, "summary": "APICast latency high (instance {{ $labels.instance }})", "description": "APIcast p99 latency is higher than 5 seconds\n VALUE = {{ $value }}\n LABELS: {{ $labels }}", }, @@ -154,6 +158,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleApicastWorkerRestart", Annotations: map[string]string{ + "sop_url": ThreescaleApicastWorkerRestartURL, "summary": "A new worker process in Nginx has been started", "description": "A new thread has been started. This could indicate that a worker process has died due to the memory limits being exceeded. Please investigate the memory pressure on pod (instance {{ $labels.instance }})", }, diff --git a/pkg/3scale/amp/component/generic_monitoring.go b/pkg/3scale/amp/component/generic_monitoring.go index adf90b147..a710eef37 100644 --- a/pkg/3scale/amp/component/generic_monitoring.go +++ b/pkg/3scale/amp/component/generic_monitoring.go @@ -21,12 +21,15 @@ const ( ThreescaleBackendWorkerJobsCountRunningHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_worker_jobs_count_running_high.adoc" ThreescaleBackendListener5XXRequestsHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_listener_5xx_requests_high.adoc" ThreescalePodCrashLoopingURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_crash_looping.adoc" - ThreescalePodNotReadyURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc" ThreescaleReplicationControllerReplicasMismatchURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/replication_controller_replicas_mismatch.adoc" ThreescaleContainerWaitingURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_waiting.adoc" ThreescaleContainerCPUHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_high.adoc" ThreescaleContainerMemoryHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_memory_high.adoc" - ThreescaleContainerMemoryHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_throttling_high.adoc" + ThreescaleContainerCPUThrottlingHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_throttling_high.adoc" + ThreescaleApicastRequestTimeURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_request_time.adoc" + ThreescaleApicastHttp4xxErrorRateURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_http_4xx_error_rate.adoc" + ThreescaleApicastLatencyHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_apicast_latency.adoc" + ThreescaleApicastWorkerRestartURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_worker_restart.adoc" ) func KubernetesResourcesByNamespaceGrafanaDashboard(sumRate, ns, appLabel string) *grafanav1alpha1.GrafanaDashboard { @@ -161,7 +164,7 @@ func KubeStateMetricsPrometheusRules(sumRate, ns, appLabel string) *monitoringv1 { Alert: "ThreescaleContainerCPUThrottlingHigh", Annotations: map[string]string{ - "sop_url": ThreescaleContainerMemoryHighURL, + "sop_url": ThreescaleContainerCPUThrottlingHighURL, "message": `{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.`, }, Expr: intstr.FromString(fmt.Sprintf(`sum(increase(container_cpu_cfs_throttled_periods_total{namespace="%s",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)" }[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{namespace="%s",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[5m])) by (container, pod, namespace) > ( 25 / 100 )`, ns, ns)), From c2704df2104f8a31ea0c10960e8ca1b4ef1d5fe0 Mon Sep 17 00:00:00 2001 From: Eguzki Astiz Lezaun Date: Tue, 17 Nov 2020 16:08:30 +0100 Subject: [PATCH 4/7] sop url: fix job down urls --- pkg/3scale/amp/component/apicast_monitoring.go | 2 +- pkg/3scale/amp/component/backend_monitoring.go | 4 ++-- pkg/3scale/amp/component/generic_monitoring.go | 1 + pkg/3scale/amp/component/system_monitoring.go | 1 + pkg/3scale/amp/component/zync_monitoring.go | 4 ++-- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pkg/3scale/amp/component/apicast_monitoring.go b/pkg/3scale/amp/component/apicast_monitoring.go index 5e5bace80..e5fea84ba 100644 --- a/pkg/3scale/amp/component/apicast_monitoring.go +++ b/pkg/3scale/amp/component/apicast_monitoring.go @@ -106,7 +106,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleApicastJobDown", Annotations: map[string]string{ - "sop_url": ThreescalePodNotReadyURL, + "sop_url": ThreescalePrometheusJobDownURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, diff --git a/pkg/3scale/amp/component/backend_monitoring.go b/pkg/3scale/amp/component/backend_monitoring.go index 39fc996cb..0bb799870 100644 --- a/pkg/3scale/amp/component/backend_monitoring.go +++ b/pkg/3scale/amp/component/backend_monitoring.go @@ -99,7 +99,7 @@ func (backend *Backend) BackendWorkerPrometheusRules() *monitoringv1.PrometheusR { Alert: "ThreescaleBackendWorkerJobDown", Annotations: map[string]string{ - "sop_url": ThreescalePodNotReadyURL, + "sop_url": ThreescalePrometheusJobDownURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, @@ -147,7 +147,7 @@ func (backend *Backend) BackendListenerPrometheusRules() *monitoringv1.Prometheu { Alert: "ThreescaleBackendListenerJobDown", Annotations: map[string]string{ - "sop_url": ThreescalePodNotReadyURL, + "sop_url": ThreescalePrometheusJobDownURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, diff --git a/pkg/3scale/amp/component/generic_monitoring.go b/pkg/3scale/amp/component/generic_monitoring.go index a710eef37..465d3fe8f 100644 --- a/pkg/3scale/amp/component/generic_monitoring.go +++ b/pkg/3scale/amp/component/generic_monitoring.go @@ -13,6 +13,7 @@ import ( // Add alert sop urls here const ( + ThreescalePrometheusJobDownURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc" ThreescalePodNotReadyURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc" ThreescaleZync5XXRequestsHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_5xx_requests_high.adoc" ThreescaleZyncQueScheduledJobCountHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_scheduled_job_count_high.adoc" diff --git a/pkg/3scale/amp/component/system_monitoring.go b/pkg/3scale/amp/component/system_monitoring.go index 9e7647e53..b963a6c67 100644 --- a/pkg/3scale/amp/component/system_monitoring.go +++ b/pkg/3scale/amp/component/system_monitoring.go @@ -114,6 +114,7 @@ func (system *System) SystemAppPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleSystemApp5XXRequestsHigh", Annotations: map[string]string{ + "sop_url": ThreescaleApicastLatencyHighURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute", }, diff --git a/pkg/3scale/amp/component/zync_monitoring.go b/pkg/3scale/amp/component/zync_monitoring.go index 87fb4f0b2..a21174282 100644 --- a/pkg/3scale/amp/component/zync_monitoring.go +++ b/pkg/3scale/amp/component/zync_monitoring.go @@ -87,7 +87,7 @@ func (zync *Zync) ZyncPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleZyncJobDown", Annotations: map[string]string{ - "sop_url": ThreescalePodNotReadyURL, + "sop_url": ThreescalePrometheusJobDownURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, @@ -135,7 +135,7 @@ func (zync *Zync) ZyncQuePrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleZyncQueJobDown", Annotations: map[string]string{ - "sop_url": ThreescalePodNotReadyURL, + "sop_url": ThreescalePrometheusJobDownURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, From eb1330537132f0006beee6856f0ac887d1d24b7e Mon Sep 17 00:00:00 2001 From: Eguzki Astiz Lezaun Date: Tue, 17 Nov 2020 17:54:14 +0100 Subject: [PATCH 5/7] [WIP] add SOP url annotation to prometheus rules: system rules --- pkg/3scale/amp/component/system_monitoring.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/3scale/amp/component/system_monitoring.go b/pkg/3scale/amp/component/system_monitoring.go index b963a6c67..9a1103284 100644 --- a/pkg/3scale/amp/component/system_monitoring.go +++ b/pkg/3scale/amp/component/system_monitoring.go @@ -114,7 +114,6 @@ func (system *System) SystemAppPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleSystemApp5XXRequestsHigh", Annotations: map[string]string{ - "sop_url": ThreescaleApicastLatencyHighURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute", }, @@ -127,6 +126,7 @@ func (system *System) SystemAppPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleSystemAppJobDown", Annotations: map[string]string{ + "sop_url": ThreescalePrometheusJobDownURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, @@ -161,6 +161,7 @@ func (system *System) SystemSidekiqPrometheusRules() *monitoringv1.PrometheusRul { Alert: "ThreescaleSystemSidekiqJobDown", Annotations: map[string]string{ + "sop_url": ThreescalePrometheusJobDownURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN", }, From 1354a70d3241c3136b10f4156b45610162dd5934 Mon Sep 17 00:00:00 2001 From: Eguzki Astiz Lezaun Date: Thu, 2 Jun 2022 11:02:03 +0200 Subject: [PATCH 6/7] add SOP url annotation to prometheus rules: system --- pkg/3scale/amp/component/generic_monitoring.go | 1 + pkg/3scale/amp/component/system_monitoring.go | 1 + 2 files changed, 2 insertions(+) diff --git a/pkg/3scale/amp/component/generic_monitoring.go b/pkg/3scale/amp/component/generic_monitoring.go index 465d3fe8f..6d75cc709 100644 --- a/pkg/3scale/amp/component/generic_monitoring.go +++ b/pkg/3scale/amp/component/generic_monitoring.go @@ -31,6 +31,7 @@ const ( ThreescaleApicastHttp4xxErrorRateURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_http_4xx_error_rate.adoc" ThreescaleApicastLatencyHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_apicast_latency.adoc" ThreescaleApicastWorkerRestartURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_worker_restart.adoc" + ThreescaleSystemApp5XXRequestsHighURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/system_app_5xx_requests_high.adoc" ) func KubernetesResourcesByNamespaceGrafanaDashboard(sumRate, ns, appLabel string) *grafanav1alpha1.GrafanaDashboard { diff --git a/pkg/3scale/amp/component/system_monitoring.go b/pkg/3scale/amp/component/system_monitoring.go index 9a1103284..13b524a11 100644 --- a/pkg/3scale/amp/component/system_monitoring.go +++ b/pkg/3scale/amp/component/system_monitoring.go @@ -114,6 +114,7 @@ func (system *System) SystemAppPrometheusRules() *monitoringv1.PrometheusRule { { Alert: "ThreescaleSystemApp5XXRequestsHigh", Annotations: map[string]string{ + "sop_url": ThreescaleSystemApp5XXRequestsHighURL, "summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute", "description": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute", }, From 418fe081c907450037efb2e9d92fc9b4f69a9c2d Mon Sep 17 00:00:00 2001 From: Eguzki Astiz Lezaun Date: Thu, 2 Jun 2022 11:11:58 +0200 Subject: [PATCH 7/7] doc/prometheusrules: update --- doc/prometheusrules/apicast.yaml | 5 +++++ doc/prometheusrules/backend-listener.yaml | 2 ++ doc/prometheusrules/backend-worker.yaml | 2 ++ doc/prometheusrules/system-app.yaml | 2 ++ doc/prometheusrules/system-sidekiq.yaml | 1 + doc/prometheusrules/threescale-kube-state-metrics.yaml | 7 +++++++ doc/prometheusrules/zync-que.yaml | 4 ++++ doc/prometheusrules/zync.yaml | 2 ++ 8 files changed, 25 insertions(+) diff --git a/doc/prometheusrules/apicast.yaml b/doc/prometheusrules/apicast.yaml index 25903a78a..ad440ac8c 100644 --- a/doc/prometheusrules/apicast.yaml +++ b/doc/prometheusrules/apicast.yaml @@ -15,6 +15,7 @@ spec: - alert: ThreescaleApicastJobDown annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN expr: up{job=~".*/apicast-production|.*/apicast-staging",namespace="__NAMESPACE__"} == 0 for: 1m @@ -23,6 +24,7 @@ spec: - alert: ThreescaleApicastRequestTime annotations: description: High number of request taking more than a second to be processed + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_request_time.adoc summary: Request on instance {{ $labels.instance }} is taking more than one second to process the requests expr: sum(rate(total_response_time_seconds_bucket{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[1m])) - sum(rate(upstream_response_time_seconds_bucket{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[1m])) > 1 for: 2m @@ -31,6 +33,7 @@ spec: - alert: ThreescaleApicastHttp4xxErrorRate annotations: description: The number of request with 4XX is bigger than the 5% of total request. + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_http_4xx_error_rate.adoc summary: APICast high HTTP 4XX error rate (instance {{ $labels.instance }}) expr: sum(rate(apicast_status{namespace='__NAMESPACE__', status=~"^4.."}[1m])) / sum(rate(apicast_status{namespace='__NAMESPACE__'}[1m])) * 100 > 5 for: 5m @@ -42,6 +45,7 @@ spec: APIcast p99 latency is higher than 5 seconds VALUE = {{ $value }} LABELS: {{ $labels }} + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_apicast_latency.adoc summary: APICast latency high (instance {{ $labels.instance }}) expr: histogram_quantile(0.99, sum(rate(total_response_time_seconds_bucket{namespace='__NAMESPACE__',}[30m])) by (le)) > 5 for: 5m @@ -50,6 +54,7 @@ spec: - alert: ThreescaleApicastWorkerRestart annotations: description: A new thread has been started. This could indicate that a worker process has died due to the memory limits being exceeded. Please investigate the memory pressure on pod (instance {{ $labels.instance }}) + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_worker_restart.adoc summary: A new worker process in Nginx has been started expr: changes(worker_process{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[5m]) > 0 for: 5m diff --git a/doc/prometheusrules/backend-listener.yaml b/doc/prometheusrules/backend-listener.yaml index 0d39724e9..c2d05c092 100644 --- a/doc/prometheusrules/backend-listener.yaml +++ b/doc/prometheusrules/backend-listener.yaml @@ -15,6 +15,7 @@ spec: - alert: ThreescaleBackendListener5XXRequestsHigh annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_listener_5xx_requests_high.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes expr: sum(rate(apisonator_listener_response_codes{job=~"backend.*",namespace="__NAMESPACE__",resp_code="5xx"}[5m])) by (namespace,job,resp_code) > 5000 for: 5m @@ -23,6 +24,7 @@ spec: - alert: ThreescaleBackendListenerJobDown annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN expr: up{job=~".*backend-listener.*",namespace="__NAMESPACE__"} == 0 for: 1m diff --git a/doc/prometheusrules/backend-worker.yaml b/doc/prometheusrules/backend-worker.yaml index 73f203279..527f35945 100644 --- a/doc/prometheusrules/backend-worker.yaml +++ b/doc/prometheusrules/backend-worker.yaml @@ -15,6 +15,7 @@ spec: - alert: ThreescaleBackendWorkerJobsCountRunningHigh annotations: description: '{{$labels.container_name}} replica controller on {{$labels.namespace}} project: Has more than 1000 jobs processed in the last 5 minutes' + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_worker_jobs_count_running_high.adoc summary: '{{$labels.container_name}} replica controller on {{$labels.namespace}}: Has more than 10000 jobs processed in the last 5 minutes' expr: sum(avg_over_time(apisonator_worker_job_count{job=~"backend.*",namespace="__NAMESPACE__"} [5m])) by (namespace,job) > 10000 for: 5m @@ -23,6 +24,7 @@ spec: - alert: ThreescaleBackendWorkerJobDown annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN expr: up{job=~".*backend-worker.*",namespace="__NAMESPACE__"} == 0 for: 1m diff --git a/doc/prometheusrules/system-app.yaml b/doc/prometheusrules/system-app.yaml index 07b8e9931..25f757f64 100644 --- a/doc/prometheusrules/system-app.yaml +++ b/doc/prometheusrules/system-app.yaml @@ -15,6 +15,7 @@ spec: - alert: ThreescaleSystemApp5XXRequestsHigh annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/system_app_5xx_requests_high.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute expr: sum(rate(rails_requests_total{namespace="__NAMESPACE__",pod=~"system-app-[a-z0-9]+-[a-z0-9]+",status=~"5[0-9]*"}[1m])) by (namespace,job) > 50 for: 1m @@ -23,6 +24,7 @@ spec: - alert: ThreescaleSystemAppJobDown annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN expr: up{job=~".*system-app.*",namespace="__NAMESPACE__"} == 0 for: 1m diff --git a/doc/prometheusrules/system-sidekiq.yaml b/doc/prometheusrules/system-sidekiq.yaml index 6e8a871e7..2c901afa4 100644 --- a/doc/prometheusrules/system-sidekiq.yaml +++ b/doc/prometheusrules/system-sidekiq.yaml @@ -15,6 +15,7 @@ spec: - alert: ThreescaleSystemSidekiqJobDown annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN expr: up{job=~".*system-sidekiq.*",namespace="__NAMESPACE__"} == 0 for: 1m diff --git a/doc/prometheusrules/threescale-kube-state-metrics.yaml b/doc/prometheusrules/threescale-kube-state-metrics.yaml index f18cb6c8b..b9b58d992 100644 --- a/doc/prometheusrules/threescale-kube-state-metrics.yaml +++ b/doc/prometheusrules/threescale-kube-state-metrics.yaml @@ -12,6 +12,7 @@ spec: - alert: ThreescalePodCrashLooping annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_crash_looping.adoc expr: rate(kube_pod_container_status_restarts_total{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[15m]) * 60 * 5 > 0 for: 5m labels: @@ -19,6 +20,7 @@ spec: - alert: ThreescalePodNotReady annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 5 minutes. + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc expr: sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{namespace="__NAMESPACE__",owner_kind!="Job"})) > 0 for: 5m labels: @@ -26,6 +28,7 @@ spec: - alert: ThreescaleReplicationControllerReplicasMismatch annotations: message: ReplicationController {{ $labels.namespace }}/{{ $labels.replicationcontroller }} has not matched the expected number of replicas for longer than 5 minutes. + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/replication_controller_replicas_mismatch.adoc expr: kube_replicationcontroller_spec_replicas {namespace="__NAMESPACE__",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"} != kube_replicationcontroller_status_ready_replicas {namespace="__NAMESPACE__",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"} for: 5m labels: @@ -33,6 +36,7 @@ spec: - alert: ThreescaleContainerWaiting annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has been in waiting state for longer than 1 hour. + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_waiting.adoc expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) > 0 for: 1h labels: @@ -40,6 +44,7 @@ spec: - alert: ThreescaleContainerCPUHigh annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has High CPU usage for longer than 15 minutes. + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_high.adoc expr: sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by (namespace, container, pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by (namespace, container, pod) * 100 > 90 for: 15m labels: @@ -47,6 +52,7 @@ spec: - alert: ThreescaleContainerMemoryHigh annotations: message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has High Memory usage for longer than 15 minutes. + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_memory_high.adoc expr: sum(container_memory_usage_bytes{namespace="__NAMESPACE__",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by(namespace, container, pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by(namespace, container, pod) * 100 > 90 for: 15m labels: @@ -54,6 +60,7 @@ spec: - alert: ThreescaleContainerCPUThrottlingHigh annotations: message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_throttling_high.adoc expr: sum(increase(container_cpu_cfs_throttled_periods_total{namespace="__NAMESPACE__",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)" }[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[5m])) by (container, pod, namespace) > ( 25 / 100 ) for: 15m labels: diff --git a/doc/prometheusrules/zync-que.yaml b/doc/prometheusrules/zync-que.yaml index abc6f2337..2f31ca718 100644 --- a/doc/prometheusrules/zync-que.yaml +++ b/doc/prometheusrules/zync-que.yaml @@ -15,6 +15,7 @@ spec: - alert: ThreescaleZyncQueJobDown annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN expr: up{job=~".*/zync-que",namespace="__NAMESPACE__"} == 0 for: 1m @@ -23,6 +24,7 @@ spec: - alert: ThreescaleZyncQueScheduledJobCountHigh annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} has scheduled job count over 100 + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_scheduled_job_count_high.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} has scheduled job count over 100 expr: max(que_jobs_scheduled_total{pod=~'zync-que.*',type='scheduled',namespace="__NAMESPACE__"}) by (namespace,job,exported_job) > 250 for: 1m @@ -31,6 +33,7 @@ spec: - alert: ThreescaleZyncQueFailedJobCountHigh annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} has failed job count over 100 + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_failed_job_count_high.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} has failed job count over 100 expr: max(que_jobs_scheduled_total{pod=~'zync-que.*',type='failed',namespace="__NAMESPACE__"}) by (namespace,job,exported_job) > 250 for: 1m @@ -39,6 +42,7 @@ spec: - alert: ThreescaleZyncQueReadyJobCountHigh annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} has ready job count over 100 + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_ready_job_count_high.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} has ready job count over 100 expr: max(que_jobs_scheduled_total{pod=~'zync-que.*',type='ready',namespace="__NAMESPACE__"}) by (namespace,job,exported_job) > 250 for: 1m diff --git a/doc/prometheusrules/zync.yaml b/doc/prometheusrules/zync.yaml index a8a1401a6..0769d608e 100644 --- a/doc/prometheusrules/zync.yaml +++ b/doc/prometheusrules/zync.yaml @@ -15,6 +15,7 @@ spec: - alert: ThreescaleZyncJobDown annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN expr: up{job=~".*/zync",namespace="__NAMESPACE__"} == 0 for: 1m @@ -23,6 +24,7 @@ spec: - alert: ThreescaleZync5XXRequestsHigh annotations: description: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute + sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_5xx_requests_high.adoc summary: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute expr: sum(rate(rails_requests_total{namespace="__NAMESPACE__",pod=~"zync-[a-z0-9]+-[a-z0-9]+",status=~"5[0-9]*"}[1m])) by (namespace,job) > 50 for: 1m