forked from kubernetes/kubernetes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
metrics_util.go
369 lines (323 loc) · 10.6 KB
/
metrics_util.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
import (
"bytes"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
"os"
"sort"
"strconv"
"strings"
"time"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/pkg/util/sets"
"github.com/prometheus/common/expfmt"
"github.com/prometheus/common/model"
)
// Latency thresholds that the checks in this file compare measured values against.
const (
// podStartupThreshold is the upper bound VerifyPodStartupLatency applies to each
// of the 50th, 90th and 99th percentiles of pod startup latency.
podStartupThreshold time.Duration = 5 * time.Second
// listPodLatencyThreshold is the 99th percentile bound HighLatencyRequests uses
// for LIST calls on pods, instead of the cluster-size based threshold.
listPodLatencyThreshold time.Duration = 2 * time.Second
// The three values below are returned by apiCallLatencyThreshold for clusters of
// at most 250 nodes, at most 500 nodes, and more than 500 nodes respectively.
apiCallLatencySmallThreshold time.Duration = 250 * time.Millisecond
apiCallLatencyMediumThreshold time.Duration = 500 * time.Millisecond
apiCallLatencyLargeThreshold time.Duration = 1 * time.Second
)
// Dashboard metrics

// LatencyMetric holds the 50th, 90th and 99th percentile of a latency
// distribution. The fields are serialized to JSON under the keys "Perc50",
// "Perc90" and "Perc99".
type LatencyMetric struct {
Perc50 time.Duration `json:"Perc50"`
Perc90 time.Duration `json:"Perc90"`
Perc99 time.Duration `json:"Perc99"`
}
// PodStartupLatency wraps the latency percentiles of pod startup; it is the
// input checked by VerifyPodStartupLatency and is serialized to JSON under the
// key "latency".
type PodStartupLatency struct {
Latency LatencyMetric `json:"latency"`
}
// APICall holds the latency percentiles recorded for one (resource, verb) pair
// of API requests, e.g. resource "namespaces" with verb "LIST".
type APICall struct {
Resource string `json:"resource"`
Verb string `json:"verb"`
Latency LatencyMetric `json:"latency"`
}
// APIResponsiveness is a collection of per-call latency records. It implements
// sort.Interface (ordered by 99th percentile latency) and is serialized to JSON
// under the key "apicalls".
type APIResponsiveness struct {
APICalls []APICall `json:"apicalls"`
}
// Len reports how many API calls are recorded; part of sort.Interface.
func (a APIResponsiveness) Len() int {
	return len(a.APICalls)
}

// Swap exchanges the API calls at positions i and j; part of sort.Interface.
func (a APIResponsiveness) Swap(i, j int) {
	calls := a.APICalls
	calls[i], calls[j] = calls[j], calls[i]
}

// Less reports whether the API call at position i has a strictly lower 99th
// percentile latency than the one at position j; part of sort.Interface.
func (a APIResponsiveness) Less(i, j int) bool {
	left, right := a.APICalls[i].Latency, a.APICalls[j].Latency
	return left.Perc99 < right.Perc99
}
// addMetric records latency as the given quantile of the (resource, verb) API
// call. An existing entry for that pair is updated in place; otherwise a new
// entry is appended.
//
// quantile is a fraction with 0 <= quantile <= 1 (e.g. 0.95 is the 95th
// percentile, 0.5 is the median). Only 0.5, 0.9 and 0.99 are supported.
func (a *APIResponsiveness) addMetric(resource, verb string, quantile float64, latency time.Duration) {
	for idx := range a.APICalls {
		existing := a.APICalls[idx]
		if existing.Resource != resource || existing.Verb != verb {
			continue
		}
		a.APICalls[idx] = setQuantile(existing, quantile, latency)
		return
	}
	fresh := APICall{Resource: resource, Verb: verb}
	a.APICalls = append(a.APICalls, setQuantile(fresh, quantile, latency))
}
// setQuantile returns a copy of apicall with latency stored in the percentile
// field selected by quantile: 0.5 -> Perc50, 0.9 -> Perc90, 0.99 -> Perc99.
//
// quantile is a fraction with 0 <= quantile <= 1 (e.g. 0.5 is the median). Any
// value other than the three supported ones leaves the copy unchanged.
func setQuantile(apicall APICall, quantile float64, latency time.Duration) APICall {
	// apicall was passed by value, so writing through this pointer only
	// touches the local copy that is returned.
	percentiles := &apicall.Latency
	if quantile == 0.5 {
		percentiles.Perc50 = latency
	} else if quantile == 0.9 {
		percentiles.Perc90 = latency
	} else if quantile == 0.99 {
		percentiles.Perc99 = latency
	}
	return apicall
}
// readLatencyMetrics fetches the metrics text via getMetrics, parses it with
// extractMetricSamples and gathers every apiserver_request_latencies_summary
// sample into an APIResponsiveness, one entry per (resource, verb) pair.
//
// Samples for the "events" resource and for the WATCHLIST/PROXY/proxy verbs
// are skipped. Sample values are converted to whole microseconds.
//
// On failure the error is returned together with whatever had been collected
// up to that point.
func readLatencyMetrics(c *client.Client) (APIResponsiveness, error) {
	var result APIResponsiveness

	body, err := getMetrics(c)
	if err != nil {
		return result, err
	}
	samples, err := extractMetricSamples(body)
	if err != nil {
		return result, err
	}

	skippedResources := sets.NewString("events")
	// TODO: figure out why we're getting non-capitalized proxy and fix this.
	skippedVerbs := sets.NewString("WATCHLIST", "PROXY", "proxy")

	for _, sample := range samples {
		metric := sample.Metric
		// Example line:
		// apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908
		if metric[model.MetricNameLabel] != "apiserver_request_latencies_summary" {
			continue
		}

		resource, verb := string(metric["resource"]), string(metric["verb"])
		if skippedResources.Has(resource) || skippedVerbs.Has(verb) {
			continue
		}

		quantile, parseErr := strconv.ParseFloat(string(metric[model.QuantileLabel]), 64)
		if parseErr != nil {
			return result, parseErr
		}

		// The fractional part of the sample value is dropped by the conversion.
		micros := int64(sample.Value)
		result.addMetric(resource, verb, quantile, time.Duration(micros)*time.Microsecond)
	}
	return result, nil
}
// apiCallLatencyThreshold returns the latency threshold for API calls for a
// cluster of numNodes nodes. The general goal is 1s, but smaller clusters are
// held to tighter limits so that regressions are noticed:
//
//	numNodes <= 250  -> apiCallLatencySmallThreshold
//	numNodes <= 500  -> apiCallLatencyMediumThreshold
//	otherwise        -> apiCallLatencyLargeThreshold
func apiCallLatencyThreshold(numNodes int) time.Duration {
	switch {
	case numNodes <= 250:
		return apiCallLatencySmallThreshold
	case numNodes <= 500:
		return apiCallLatencyMediumThreshold
	default:
		return apiCallLatencyLargeThreshold
	}
}
// HighLatencyRequests logs the five request types with the highest 99th
// percentile latency, plus every further request type whose 99th percentile
// exceeds its threshold (those lines are prefixed with "WARNING "), and then
// logs all collected latencies as JSON.
//
// The threshold depends on the number of nodes in the cluster (see
// apiCallLatencyThreshold), except for LIST on pods, which uses
// listPodLatencyThreshold.
//
// It returns the number of request types above their threshold, or an error
// if listing nodes or reading the latency metrics fails.
func HighLatencyRequests(c *client.Client) (int, error) {
	nodeList, err := c.Nodes().List(labels.Everything(), fields.Everything())
	if err != nil {
		return 0, err
	}
	metrics, err := readLatencyMetrics(c)
	if err != nil {
		return 0, err
	}
	// Highest 99th percentile first.
	sort.Sort(sort.Reverse(metrics))

	const topCount = 5
	defaultThreshold := apiCallLatencyThreshold(len(nodeList.Items))
	badMetrics := 0
	for index, metric := range metrics.APICalls {
		threshold := defaultThreshold
		if metric.Resource == "pods" && metric.Verb == "LIST" {
			threshold = listPodLatencyThreshold
		}

		isBad := metric.Latency.Perc99 > threshold
		if isBad {
			badMetrics++
		}

		// The first topCount entries are always logged; later ones only when bad.
		if index >= topCount && !isBad {
			continue
		}
		prefix := ""
		if isBad {
			prefix = "WARNING "
		}
		Logf("%vTop latency metric: %+v", prefix, metric)
	}

	Logf("API calls latencies: %s", prettyPrintJSON(metrics))
	return badMetrics, nil
}
// VerifyPodStartupLatency logs the given pod startup latency as JSON and
// verifies that its 50th, 90th and 99th percentiles are all within
// podStartupThreshold.
//
// It returns nil when every percentile is within the threshold, and otherwise
// an error naming the first (lowest) percentile that exceeds it.
func VerifyPodStartupLatency(latency PodStartupLatency) error {
	Logf("Pod startup latency: %s", prettyPrintJSON(latency))
	if latency.Latency.Perc50 > podStartupThreshold {
		return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.Latency.Perc50)
	}
	if latency.Latency.Perc90 > podStartupThreshold {
		return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.Latency.Perc90)
	}
	if latency.Latency.Perc99 > podStartupThreshold {
		// The message used to read "99th percentil"; spelled consistently with
		// the other two percentiles now.
		return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.Latency.Perc99)
	}
	return nil
}
// resetMetrics resets the latency metrics in the apiserver by issuing a GET
// on /resetMetrics. It returns the request error, if any, or an error when
// the response body is anything other than "metrics reset\n".
func resetMetrics(c *client.Client) error {
	Logf("Resetting latency metrics in apiserver...")
	raw, err := c.Get().AbsPath("/resetMetrics").DoRaw()
	if err != nil {
		return err
	}
	if response := string(raw); response != "metrics reset\n" {
		return fmt.Errorf("Unexpected response: %q", response)
	}
	return nil
}
// getMetrics issues a GET on /metrics through the given client and returns
// the raw response body as a string, or "" and the error when the request
// fails.
func getMetrics(c *client.Client) (string, error) {
	raw, err := c.Get().AbsPath("/metrics").DoRaw()
	if err == nil {
		return string(raw), nil
	}
	return "", err
}
// prettyPrintJSON renders metrics as JSON indented by one space per nesting
// level, followed by a trailing newline. It returns the empty string when the
// value cannot be encoded or the encoded form cannot be indented.
func prettyPrintJSON(metrics interface{}) string {
	compact, err := json.Marshal(metrics)
	if err != nil {
		return ""
	}
	var pretty bytes.Buffer
	if err := json.Indent(&pretty, compact, "", " "); err != nil {
		return ""
	}
	// A stream encoder terminates each value with a newline; keep that shape.
	pretty.WriteByte('\n')
	return pretty.String()
}
// getDebugInfo downloads the "block", "goroutine", "heap" and "threadcreate"
// pprof profiles (with ?debug=2) and returns them keyed by profile name.
//
// The client is only used to build each URL; the download itself goes through
// http.Get. Failures are logged as warnings rather than returned: a profile
// that cannot be fetched is left out of the map, and a profile whose body
// cannot be fully read is stored with whatever was read. The returned error
// is always nil.
func getDebugInfo(c *client.Client) (map[string]string, error) {
	profiles := []string{"block", "goroutine", "heap", "threadcreate"}
	data := make(map[string]string)
	for _, profile := range profiles {
		url := c.Get().AbsPath(fmt.Sprintf("debug/pprof/%s", profile)).URL().String() + "?debug=2"
		resp, err := http.Get(url)
		if err != nil {
			Logf("Warning: Error trying to fetch %s debug data: %v", profile, err)
			continue
		}
		content, readErr := ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if readErr != nil {
			Logf("Warning: Error trying to read %s debug data: %v", profile, readErr)
		}
		data[profile] = string(content)
	}
	return data, nil
}
// writePerfData dumps the /metrics output and the debug profiles returned by
// getDebugInfo into dirName, one file per data set:
//
//	<dirName>/metrics_<postfix>.txt
//	<dirName>/<profile>_<postfix>.txt
//
// It stops at the first failure and returns an error describing it. Every
// file it opens is closed before it returns, including on the error paths.
func writePerfData(c *client.Client, dirName string, postfix string) error {
	fname := fmt.Sprintf("%s/metrics_%s.txt", dirName, postfix)
	handler, err := os.Create(fname)
	if err != nil {
		return fmt.Errorf("Error creating file '%s': %v", fname, err)
	}
	metrics, err := getMetrics(c)
	if err != nil {
		// Release the descriptor before bailing out; the retrieval error is
		// the one worth reporting, so a close error is deliberately dropped.
		handler.Close()
		return fmt.Errorf("Error retrieving metrics: %v", err)
	}
	if err := writeStringAndClose(handler, "metrics", metrics); err != nil {
		return err
	}

	debug, err := getDebugInfo(c)
	if err != nil {
		return fmt.Errorf("Error retrieving debug information: %v", err)
	}
	for key, value := range debug {
		fname := fmt.Sprintf("%s/%s_%s.txt", dirName, key, postfix)
		handler, err := os.Create(fname)
		if err != nil {
			return fmt.Errorf("Error creating file '%s': %v", fname, err)
		}
		if err := writeStringAndClose(handler, key, value); err != nil {
			return err
		}
	}
	return nil
}

// writeStringAndClose writes contents to f and closes f whether or not the
// write succeeded. what names the data set in the write error message. A
// write error takes precedence over a close error.
func writeStringAndClose(f *os.File, what, contents string) error {
	if _, err := f.WriteString(contents); err != nil {
		// Best-effort close so the descriptor is not leaked.
		f.Close()
		return fmt.Errorf("Error writing %s: %v", what, err)
	}
	if err := f.Close(); err != nil {
		return fmt.Errorf("Error closing '%s': %v", f.Name(), err)
	}
	return nil
}
// extractMetricSamples parses the prometheus metric samples from the input
// string, which is decoded as the prometheus text format. It returns all
// decoded samples, or nil and the error when decoding fails.
func extractMetricSamples(metricsBlob string) ([]*model.Sample, error) {
	textDecoder, err := expfmt.NewDecoder(strings.NewReader(metricsBlob), expfmt.FmtText)
	if err != nil {
		return nil, err
	}
	decoder := expfmt.SampleDecoder{
		Dec:  textDecoder,
		Opts: &expfmt.DecodeOptions{},
	}

	var samples []*model.Sample
	for {
		var batch model.Vector
		switch err = decoder.Decode(&batch); err {
		case nil:
			samples = append(samples, batch...)
		case io.EOF:
			// Expected loop termination condition.
			return samples, nil
		default:
			return nil, err
		}
	}
}
// logSuspiciousLatency logs metrics/docker errors from all nodes that had slow startup times
// and then logs an approximate throughput figure in pods/min.
//
// For every entry of latencyData whose latency exceeds NodeStartupThreshold,
// HighLatencyKubeletOperations is invoked for that entry's node. The
// throughput is nodeCount divided by the latency (in minutes) of the last
// element of latencyDataLag.
// NOTE(review): using the last element presumably relies on latencyDataLag
// being sorted by ascending latency; confirm against callers.
//
// If latencyDataLag is nil then it will be populated from latencyData. When no
// latency data is available at all, the throughput line is skipped instead of
// panicking on an out-of-range index.
func logSuspiciousLatency(latencyData []podLatencyData, latencyDataLag []podLatencyData, nodeCount int, c *client.Client) {
	if latencyDataLag == nil {
		latencyDataLag = latencyData
	}
	for _, l := range latencyData {
		if l.Latency > NodeStartupThreshold {
			HighLatencyKubeletOperations(c, 1*time.Second, l.Node)
		}
	}
	if len(latencyDataLag) == 0 {
		Logf("Approx throughput: unknown (no latency data)")
		return
	}
	slowest := latencyDataLag[len(latencyDataLag)-1]
	Logf("Approx throughput: %v pods/min",
		float64(nodeCount)/(slowest.Latency.Minutes()))
}
// testMaximumLatencyValue verifies that the highest latency value does not
// exceed max. The slice is expected to be sorted, so only its last element,
// which is then the highest, is inspected. When that latency is above max,
// Failf is called with name, max and the full list of latencies.
func testMaximumLatencyValue(latencies []podLatencyData, max time.Duration, name string) {
	last := len(latencies) - 1
	if worst := latencies[last]; worst.Latency > max {
		Failf("%s were not all under %s: %#v", name, max.String(), latencies)
	}
}
// printLatencies logs the last tenth of latencies (the elements from index
// len*9/10 onwards) under the given header, followed by the 50th, 90th and
// 99th percentiles computed by extractLatencyMetrics.
func printLatencies(latencies []podLatencyData, header string) {
	summary := extractLatencyMetrics(latencies)
	tailStart := (len(latencies) * 9) / 10
	Logf("10%% %s: %v", header, latencies[tailStart:])
	Logf("perc50: %v, perc90: %v, perc99: %v", summary.Perc50, summary.Perc90, summary.Perc99)
}