-
Notifications
You must be signed in to change notification settings - Fork 243
/
fr-alerts.yaml
137 lines (137 loc) · 4.68 KB
/
fr-alerts.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
{{ define "fr.rules.yaml.tpl" }}
# Alerting rules emitted by the "fr.rules.yaml.tpl" named template.
# Rule groups in this template: cluster.rules, am.rules, ds.rules, ig.rules
# and idm.rules. Each rule has an alert name, an expression, an optional
# "for" hold time, a severity label and a description annotation.
groups:
# cluster.rules: node memory, scrape-target availability, CPU and disk IO.
- name: cluster.rules
  rules:
  # Free memory below 100000000 bytes for 5 minutes.
  # The alert is named after node_memory_MemFree_bytes, but the expression
  # used to query only the unsuffixed node_memory_MemFree, which newer
  # node_exporter releases no longer export. Both names are accepted here so
  # the rule works on either exporter generation.
  - alert: node_memory_MemFree_bytes
    expr: (node_memory_MemFree_bytes or node_memory_MemFree) < 100000000
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "Node memory running low"
  # A scrape target has reported up == 0 for 1 minute.
  - alert: up
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "Service is down for more than 1 minute"
  # NOTE(review): instance:node_cpu:rate:sum is a recording rule defined
  # outside this file. The threshold of 80 assumes it yields a percentage;
  # confirm its units, since a sum of per-CPU rates would be in cores.
  - alert: instance:node_cpu:rate:sum
    expr: instance:node_cpu:rate:sum > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "Instance cpu above 80% for over 5 mins"
  # Per-second growth of the disk IO time counter above 0.2 for 5 minutes.
  # NOTE(review): the description mentions 300ms while the threshold is 0.2;
  # confirm which of the two is intended.
  - alert: node_disk_io_time_seconds_total
    expr: rate(node_disk_io_time_seconds_total[5m]) > 0.2
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "Disk IO time over 300ms for 5 mins"
# am.rules: alerts built on the am_cts_* metrics.
- name: am.rules
  rules:
  # Ratio of the task-seconds rate to the task-count rate, per instance,
  # over a 90s window; must stay above 0.1 for 3 minutes.
  - alert: am_cts_task_seconds_total
    expr: >-
      sum by (instance) (rate(am_cts_task_seconds_total[90s]))
      / sum by (instance) (rate(am_cts_task_count[90s]))
      > 0.1
    for: 3m
    labels:
      severity: warning
    annotations:
      description: 'Average CTS Task Service Time'

  # The three rules below have no "for" clause.
  - alert: am_cts_reaper_search_seconds
    expr: 'am_cts_reaper_search_seconds > 0.25'
    labels:
      severity: warning
    annotations:
      description: 'CTS Overall Average search service time'

  # Any non-zero instantaneous rate on the reaper_type "search" series.
  - alert: am_cts_reaper_deletion_total
    expr: 'irate(am_cts_reaper_deletion_total{reaper_type=~"search"}[1m]) > 0'
    labels:
      severity: warning
    annotations:
      description: 'CTS Search based Deletion Throughput'

  # Any non-zero 5m rate on the outcome "failure" series.
  - alert: am_cts_task_count
    expr: 'rate(am_cts_task_count{outcome=~"failure"}[5m]) > 0'
    labels:
      severity: warning
    annotations:
      description: 'CTS Delete task throughput failure'
# ds.rules: disk space, replication delay and LDAP response time.
- name: ds.rules
  rules:
  # Free disk space below 5000000000 bytes for 5 minutes.
  - alert: ds_disk_free_space_bytes
    expr: ds_disk_free_space_bytes < 5000000000
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "Free disk space below 5GB, please free up some space now"
      summary: "Free disk space running low"
  # Largest reported replication delay above 0.5 for 5 minutes. max() with
  # no grouping collapses all series into one, so the alert carries no
  # per-replica labels.
  - alert: ds_replication_replica_remote_replicas_current_delay_seconds
    expr: max(ds_replication_replica_remote_replicas_current_delay_seconds) > 0.5
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "DS Max replication latency"
  # Average time per LDAP request, per (job, ldap_handler, type).
  # The previous expression filtered on job=~"$ds_instance" and
  # ldap_handler=~"$ldap_handler". Those are dashboard template variables
  # that are never substituted in a rule file, so the matchers selected no
  # series. It also had no comparison. Both are corrected here.
  # NOTE(review): 0.1 is a starting threshold chosen for this fix - tune it
  # to the response time this deployment should alert on.
  - alert: ds_connection_handlers_ldap_requests_seconds_total
    expr: >-
      sum by (job, ldap_handler, type)
      (irate(ds_connection_handlers_ldap_requests_seconds_total[1m]))
      / sum by (job, ldap_handler, type)
      (irate(ds_connection_handlers_ldap_requests_count[1m]))
      > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "DS Current response times"
# ig.rules: response time and response errors for the default route.
- name: ig.rules
  rules:
  # Response time of the default route above 0.4 for 5 minutes.
  - alert: ig_route_response_time_seconds
    expr: ig_route_response_time_seconds{route="default",name="default",router="gateway._router"} > 0.4
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "Rate (calls/seconds) of responses with their associated times in milliseconds"
  # Errors are still being produced on the default route.
  # The metric is a cumulative *_total series, so comparing its raw value
  # with 0 stays true after the first error. rate() over 5m makes the alert
  # clear once errors stop, as the failure alert in am.rules already does.
  - alert: ig_route_response_error_total
    expr: rate(ig_route_response_error_total{route="default",name="default",router="gateway._router"}[5m]) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      description: "count of all responses which generated an exception"
# idm.rules: thresholds on the idm_repo_* metrics for repo_type "jdbc".
# Every rule here must hold for 3 minutes and is labelled as a warning.
- name: idm.rules
  rules:
  # read / cluster_states
  - alert: idm_repo_seconds
    expr: 'idm_repo_seconds{operation="read",repo_type="jdbc",resource_mapping="cluster_states"} > 0.1'
    for: 3m
    labels:
      severity: warning
    annotations:
      description: 'read operation to a JDBC datasource'

  # read / reconprogressstate
  - alert: idm_repo_seconds_2
    expr: 'idm_repo_seconds{operation="read",repo_type="jdbc",resource_mapping="reconprogressstate"} > 0.1'
    for: 3m
    labels:
      severity: warning
    annotations:
      description: 'read operation to a JDBC datasource, recon progress'

  # update / cluster_states
  - alert: idm_repo_seconds_3
    expr: 'idm_repo_seconds{operation="update",repo_type="jdbc",resource_mapping="cluster_states"} > 0.1'
    for: 3m
    labels:
      severity: warning
    annotations:
      description: 'update operation to a JDBC datasource'

  # update / reconprogressstate
  - alert: idm_repo_seconds_4
    expr: 'idm_repo_seconds{operation="update",repo_type="jdbc",resource_mapping="reconprogressstate"} > 0.1'
    for: 3m
    labels:
      severity: warning
    annotations:
      description: 'update operation to a JDBC datasource, recon progress'

  # Connection acquisition metric above 0.005.
  - alert: idm_repo_get_connection_seconds
    expr: 'idm_repo_get_connection_seconds{repo_type="jdbc"} > 0.005'
    for: 3m
    labels:
      severity: warning
    annotations:
      description: 'Rate of successful/unsuccessful retrieval of a repo connection'
{{ end }}