Merge pull request #526 from 3scale/sop-annotations-in-alerts

Sop annotations in alerts

eguzki committed Jun 2, 2022
2 parents b5eb7b9 + 418fe08 commit fef16ae
Showing 13 changed files with 73 additions and 0 deletions.
5 changes: 5 additions & 0 deletions doc/prometheusrules/apicast.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleApicastJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*/apicast-production|.*/apicast-staging",namespace="__NAMESPACE__"} == 0
for: 1m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleApicastRequestTime
annotations:
description: High number of request taking more than a second to be processed
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_request_time.adoc
summary: Request on instance {{ $labels.instance }} is taking more than one second to process the requests
expr: sum(rate(total_response_time_seconds_bucket{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[1m])) - sum(rate(upstream_response_time_seconds_bucket{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[1m])) > 1
for: 2m
@@ -31,6 +33,7 @@ spec:
- alert: ThreescaleApicastHttp4xxErrorRate
annotations:
description: The number of request with 4XX is bigger than the 5% of total request.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_http_4xx_error_rate.adoc
summary: APICast high HTTP 4XX error rate (instance {{ $labels.instance }})
expr: sum(rate(apicast_status{namespace='__NAMESPACE__', status=~"^4.."}[1m])) / sum(rate(apicast_status{namespace='__NAMESPACE__'}[1m])) * 100 > 5
for: 5m
@@ -42,6 +45,7 @@ spec:
APIcast p99 latency is higher than 5 seconds
VALUE = {{ $value }}
LABELS: {{ $labels }}
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_apicast_latency.adoc
summary: APICast latency high (instance {{ $labels.instance }})
expr: histogram_quantile(0.99, sum(rate(total_response_time_seconds_bucket{namespace='__NAMESPACE__',}[30m])) by (le)) > 5
for: 5m
@@ -50,6 +54,7 @@ spec:
- alert: ThreescaleApicastWorkerRestart
annotations:
description: A new thread has been started. This could indicate that a worker process has died due to the memory limits being exceeded. Please investigate the memory pressure on pod (instance {{ $labels.instance }})
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/apicast_worker_restart.adoc
summary: A new worker process in Nginx has been started
expr: changes(worker_process{namespace='__NAMESPACE__', pod=~'apicast-production.*'}[5m]) > 0
for: 5m
2 changes: 2 additions & 0 deletions doc/prometheusrules/backend-listener.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleBackendListener5XXRequestsHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_listener_5xx_requests_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes
expr: sum(rate(apisonator_listener_response_codes{job=~"backend.*",namespace="__NAMESPACE__",resp_code="5xx"}[5m])) by (namespace,job,resp_code) > 5000
for: 5m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleBackendListenerJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*backend-listener.*",namespace="__NAMESPACE__"} == 0
for: 1m
2 changes: 2 additions & 0 deletions doc/prometheusrules/backend-worker.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleBackendWorkerJobsCountRunningHigh
annotations:
description: '{{$labels.container_name}} replica controller on {{$labels.namespace}} project: Has more than 1000 jobs processed in the last 5 minutes'
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/backend_worker_jobs_count_running_high.adoc
summary: '{{$labels.container_name}} replica controller on {{$labels.namespace}}: Has more than 10000 jobs processed in the last 5 minutes'
expr: sum(avg_over_time(apisonator_worker_job_count{job=~"backend.*",namespace="__NAMESPACE__"} [5m])) by (namespace,job) > 10000
for: 5m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleBackendWorkerJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*backend-worker.*",namespace="__NAMESPACE__"} == 0
for: 1m
2 changes: 2 additions & 0 deletions doc/prometheusrules/system-app.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleSystemApp5XXRequestsHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/system_app_5xx_requests_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute
expr: sum(rate(rails_requests_total{namespace="__NAMESPACE__",pod=~"system-app-[a-z0-9]+-[a-z0-9]+",status=~"5[0-9]*"}[1m])) by (namespace,job) > 50
for: 1m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleSystemAppJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*system-app.*",namespace="__NAMESPACE__"} == 0
for: 1m
1 change: 1 addition & 0 deletions doc/prometheusrules/system-sidekiq.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleSystemSidekiqJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*system-sidekiq.*",namespace="__NAMESPACE__"} == 0
for: 1m
7 changes: 7 additions & 0 deletions doc/prometheusrules/threescale-kube-state-metrics.yaml
@@ -12,48 +12,55 @@ spec:
- alert: ThreescalePodCrashLooping
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_crash_looping.adoc
expr: rate(kube_pod_container_status_restarts_total{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[15m]) * 60 * 5 > 0
for: 5m
labels:
severity: critical
- alert: ThreescalePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 5 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/pod_not_ready.adoc
expr: sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{namespace="__NAMESPACE__",owner_kind!="Job"})) > 0
for: 5m
labels:
severity: critical
- alert: ThreescaleReplicationControllerReplicasMismatch
annotations:
message: ReplicationController {{ $labels.namespace }}/{{ $labels.replicationcontroller }} has not matched the expected number of replicas for longer than 5 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/replication_controller_replicas_mismatch.adoc
expr: kube_replicationcontroller_spec_replicas {namespace="__NAMESPACE__",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"} != kube_replicationcontroller_status_ready_replicas {namespace="__NAMESPACE__",replicationcontroller=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}
for: 5m
labels:
severity: critical
- alert: ThreescaleContainerWaiting
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has been in waiting state for longer than 1 hour.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_waiting.adoc
expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) > 0
for: 1h
labels:
severity: warning
- alert: ThreescaleContainerCPUHigh
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has High CPU usage for longer than 15 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_high.adoc
expr: sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by (namespace, container, pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by (namespace, container, pod) * 100 > 90
for: 15m
labels:
severity: warning
- alert: ThreescaleContainerMemoryHigh
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has High Memory usage for longer than 15 minutes.
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_memory_high.adoc
expr: sum(container_memory_usage_bytes{namespace="__NAMESPACE__",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by(namespace, container, pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}) by(namespace, container, pod) * 100 > 90
for: 15m
labels:
severity: warning
- alert: ThreescaleContainerCPUThrottlingHigh
annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/container_cpu_throttling_high.adoc
expr: sum(increase(container_cpu_cfs_throttled_periods_total{namespace="__NAMESPACE__",container!="",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)" }[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total{namespace="__NAMESPACE__",pod=~"(apicast-.*|backend-.*|system-.*|zync-.*)"}[5m])) by (container, pod, namespace) > ( 25 / 100 )
for: 15m
labels:
4 changes: 4 additions & 0 deletions doc/prometheusrules/zync-que.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleZyncQueJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*/zync-que",namespace="__NAMESPACE__"} == 0
for: 1m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleZyncQueScheduledJobCountHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has scheduled job count over 100
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_scheduled_job_count_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has scheduled job count over 100
expr: max(que_jobs_scheduled_total{pod=~'zync-que.*',type='scheduled',namespace="__NAMESPACE__"}) by (namespace,job,exported_job) > 250
for: 1m
@@ -31,6 +33,7 @@ spec:
- alert: ThreescaleZyncQueFailedJobCountHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has failed job count over 100
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_failed_job_count_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has failed job count over 100
expr: max(que_jobs_scheduled_total{pod=~'zync-que.*',type='failed',namespace="__NAMESPACE__"}) by (namespace,job,exported_job) > 250
for: 1m
@@ -39,6 +42,7 @@ spec:
- alert: ThreescaleZyncQueReadyJobCountHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has ready job count over 100
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_que_ready_job_count_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has ready job count over 100
expr: max(que_jobs_scheduled_total{pod=~'zync-que.*',type='ready',namespace="__NAMESPACE__"}) by (namespace,job,exported_job) > 250
for: 1m
2 changes: 2 additions & 0 deletions doc/prometheusrules/zync.yaml
@@ -15,6 +15,7 @@ spec:
- alert: ThreescaleZyncJobDown
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/prometheus_job_down.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN
expr: up{job=~".*/zync",namespace="__NAMESPACE__"} == 0
for: 1m
@@ -23,6 +24,7 @@ spec:
- alert: ThreescaleZync5XXRequestsHigh
annotations:
description: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute
sop_url: https://github.com/3scale/3scale-Operations/blob/master/sops/alerts/zync_5xx_requests_high.adoc
summary: Job {{ $labels.job }} on {{ $labels.namespace }} has more than 50 HTTP 5xx requests in the last minute
expr: sum(rate(rails_requests_total{namespace="__NAMESPACE__",pod=~"zync-[a-z0-9]+-[a-z0-9]+",status=~"5[0-9]*"}[1m])) by (namespace,job) > 50
for: 1m
5 changes: 5 additions & 0 deletions pkg/3scale/amp/component/apicast_monitoring.go
@@ -106,6 +106,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastJobDown",
Annotations: map[string]string{
"sop_url": ThreescalePrometheusJobDownURL,
"summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
"description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
},
@@ -118,6 +119,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastRequestTime",
Annotations: map[string]string{
"sop_url": ThreescaleApicastRequestTimeURL,
"summary": "Request on instance {{ $labels.instance }} is taking more than one second to process the requests",
"description": "High number of request taking more than a second to be processed",
},
@@ -130,6 +132,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastHttp4xxErrorRate",
Annotations: map[string]string{
"sop_url": ThreescaleApicastHttp4xxErrorRateURL,
"summary": "APICast high HTTP 4XX error rate (instance {{ $labels.instance }})",
"description": "The number of request with 4XX is bigger than the 5% of total request.",
},
@@ -142,6 +145,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastLatencyHigh",
Annotations: map[string]string{
"sop_url": ThreescaleApicastLatencyHighURL,
"summary": "APICast latency high (instance {{ $labels.instance }})",
"description": "APIcast p99 latency is higher than 5 seconds\n VALUE = {{ $value }}\n LABELS: {{ $labels }}",
},
@@ -154,6 +158,7 @@ func (apicast *Apicast) ApicastPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleApicastWorkerRestart",
Annotations: map[string]string{
"sop_url": ThreescaleApicastWorkerRestartURL,
"summary": "A new worker process in Nginx has been started",
"description": "A new thread has been started. This could indicate that a worker process has died due to the memory limits being exceeded. Please investigate the memory pressure on pod (instance {{ $labels.instance }})",
},
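
The Threescale*URL values referenced by the Go rule builders above are defined in one of the changed files that is not shown in this excerpt. A minimal sketch of how those constants could be declared follows; the file name and the shared base-URL helper are assumptions, while the constant names and target URLs are taken from this diff.

// monitoring_constants.go (hypothetical file name): SOP URL constants used by
// the PrometheusRule builders. Each URL also appears verbatim in the rendered
// doc/prometheusrules/*.yaml files above.
package component

const (
	// Assumed shared prefix; the real code may simply repeat the full URLs.
	threescaleSopsAlertsBaseURL = "https://github.com/3scale/3scale-Operations/blob/master/sops/alerts"

	ThreescalePrometheusJobDownURL                 = threescaleSopsAlertsBaseURL + "/prometheus_job_down.adoc"
	ThreescaleApicastRequestTimeURL                = threescaleSopsAlertsBaseURL + "/apicast_request_time.adoc"
	ThreescaleApicastHttp4xxErrorRateURL           = threescaleSopsAlertsBaseURL + "/apicast_http_4xx_error_rate.adoc"
	ThreescaleApicastLatencyHighURL                = threescaleSopsAlertsBaseURL + "/apicast_apicast_latency.adoc"
	ThreescaleApicastWorkerRestartURL              = threescaleSopsAlertsBaseURL + "/apicast_worker_restart.adoc"
	ThreescaleBackendWorkerJobsCountRunningHighURL = threescaleSopsAlertsBaseURL + "/backend_worker_jobs_count_running_high.adoc"
	ThreescaleBackendListener5XXRequestsHighURL    = threescaleSopsAlertsBaseURL + "/backend_listener_5xx_requests_high.adoc"
)
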
4 changes: 4 additions & 0 deletions pkg/3scale/amp/component/backend_monitoring.go
@@ -86,6 +86,7 @@ func (backend *Backend) BackendWorkerPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleBackendWorkerJobsCountRunningHigh",
Annotations: map[string]string{
"sop_url": ThreescaleBackendWorkerJobsCountRunningHighURL,
"summary": "{{$labels.container_name}} replica controller on {{$labels.namespace}}: Has more than 10000 jobs processed in the last 5 minutes",
"description": "{{$labels.container_name}} replica controller on {{$labels.namespace}} project: Has more than 1000 jobs processed in the last 5 minutes",
},
@@ -98,6 +99,7 @@ func (backend *Backend) BackendWorkerPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleBackendWorkerJobDown",
Annotations: map[string]string{
"sop_url": ThreescalePrometheusJobDownURL,
"summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
"description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
},
@@ -132,6 +134,7 @@ func (backend *Backend) BackendListenerPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleBackendListener5XXRequestsHigh",
Annotations: map[string]string{
"sop_url": ThreescaleBackendListener5XXRequestsHighURL,
"summary": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes",
"description": "Job {{ $labels.job }} on {{ $labels.namespace }} has more than 5000 HTTP 5xx requests in the last 5 minutes",
},
@@ -144,6 +147,7 @@ func (backend *Backend) BackendListenerPrometheusRules() *monitoringv1.PrometheusRule {
{
Alert: "ThreescaleBackendListenerJobDown",
Annotations: map[string]string{
"sop_url": ThreescalePrometheusJobDownURL,
"summary": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
"description": "Job {{ $labels.job }} on {{ $labels.namespace }} is DOWN",
},
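
None of the hunks shown here adds an automated guard that every alerting rule carries the new annotation. A hedged sketch of such a check as a Go test helper is below; the file name, package placement, import path and helper name are assumptions, not part of this commit.

// prometheusrules_sop_url_test.go (hypothetical): fail if any generated
// alerting rule is missing the sop_url annotation added by this change.
package component

import (
	"testing"

	// Adjust the import path to the prometheus-operator module this repo vendors.
	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
)

func assertSopURLAnnotations(t *testing.T, rule *monitoringv1.PrometheusRule) {
	t.Helper()
	for _, group := range rule.Spec.Groups {
		for _, r := range group.Rules {
			if r.Alert == "" {
				continue // recording rules do not need a SOP link
			}
			if r.Annotations["sop_url"] == "" {
				t.Errorf("alert %q in group %q has no sop_url annotation", r.Alert, group.Name)
			}
		}
	}
}

Each component's rule test could then call, for example, assertSopURLAnnotations(t, apicast.ApicastPrometheusRules()).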
