-
Notifications
You must be signed in to change notification settings - Fork 1
/
metrics.go
405 lines (343 loc) · 18.6 KB
/
metrics.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
package server
// metrics module provides various metrics about our server
//
// Copyright (c) 2020 - Valentin Kuznetsov <vkuznet AT gmail dot com>
import (
"fmt"
"os"
"runtime"
"time"
"github.com/shirou/gopsutil/cpu"
"github.com/shirou/gopsutil/load"
"github.com/shirou/gopsutil/mem"
"github.com/shirou/gopsutil/net"
"github.com/shirou/gopsutil/process"
)
// Package-level metric state shared between the request handlers and the
// reporting helpers in this file.
var (
	// MetricsLastUpdateTime records when the metrics were last refreshed.
	MetricsLastUpdateTime time.Time

	// RPS is the server throughput in requests per second, as maintained
	// by updateRPS.
	RPS float64

	// RPSPhysical is the physical-core variant of the throughput figure,
	// as maintained by updateRPS.
	RPSPhysical float64

	// RPSLogical is the logical-core variant of the throughput figure,
	// as maintained by updateRPS.
	RPSLogical float64

	// NumPhysicalCores is the number of physical CPU cores of the node.
	NumPhysicalCores int

	// NumLogicalCores is the number of logical CPU cores of the node.
	NumLogicalCores int

	// AvgGetRequestTime is the average GET request time in seconds.
	AvgGetRequestTime float64

	// AvgPostRequestTime is the average POST request time in seconds.
	AvgPostRequestTime float64

	// AvgPutRequestTime is the average PUT request time in seconds.
	AvgPutRequestTime float64
)
// RequestStats is a snapshot of the request counters and core counts taken
// at a given moment. metrics() keeps one instance (rstat) from its previous
// invocation and compares the live counters against it.
type RequestStats struct {
	TotalGetRequests uint64 // value of the global GET counter at snapshot time
	TotalPostRequests uint64 // value of the global POST counter at snapshot time
	TotalPutRequests uint64 // value of the global PUT counter at snapshot time
	Time time.Time // when the snapshot was taken
	NumPhysicalCores int // value of the global NumPhysicalCores at snapshot time
	NumLogicalCores int // value of the global NumLogicalCores at snapshot time
}
// Update overwrites every field of the snapshot with the current values of
// the package-level request counters and core counts, and stamps it with
// the current time.
func (r *RequestStats) Update() {
	*r = RequestStats{
		TotalGetRequests:  TotalGetRequests,
		TotalPostRequests: TotalPostRequests,
		TotalPutRequests:  TotalPutRequests,
		Time:              time.Now(),
		NumPhysicalCores:  NumPhysicalCores,
		NumLogicalCores:   NumLogicalCores,
	}
}
// rstat holds the request snapshot taken at the end of the previous
// metrics() call; it is nil until metrics() runs for the first time.
var rstat *RequestStats
// Memory holds the usage figures of one memory area (virtual or swap).
// metrics() copies the values from the corresponding gopsutil result.
type Memory struct {
	Total uint64 `json:"total"` // total amount (exported by promMetrics as bytes)
	Free uint64 `json:"free"` // free amount (exported by promMetrics as bytes)
	Used uint64 `json:"used"` // used amount (exported by promMetrics as bytes)
	UsedPercent float64 `json:"usedPercent"` // used share as a percentage
}
// Mem groups the virtual and swap memory figures of the server.
type Mem struct {
	Virtual Memory `json:"virtual"` // virtual memory metrics from gopsutils
	Swap Memory `json:"swap"` // swap memory metrics from gopsutils
}
// Metrics is the full set of server metrics returned by metrics() and
// rendered by promMetrics(). The JSON tags define its serialized field names.
//
// NOTE(review): metrics() in this file does not assign MaxDBConnections,
// MaxIdleConnections or any of the Migration* fields; unless they are filled
// in elsewhere they are reported as zero.
type Metrics struct {
	CPU []float64 `json:"cpu"` // cpu metrics from gopsutils (one value per CPU)
	CpuPercent float64 `json:"cpu_pct"` // cpu percent of this process
	Connections []net.ConnectionStat `json:"connections"` // connections metrics from gopsutils
	Load load.AvgStat `json:"load"` // load metrics from gopsutils
	Memory Mem `json:"memory"` // memory metrics from gopsutils
	OpenFiles []process.OpenFilesStat `json:"openFiles"` // open files metrics from gopsutils
	GoRoutines uint64 `json:"goroutines"` // total number of go routines at run-time
	Uptime float64 `json:"uptime"` // uptime of the server in seconds
	GetRequests uint64 `json:"getRequests"` // total number of get requests across all services
	PostRequests uint64 `json:"postRequests"` // total number of post requests across all services
	PutRequests uint64 `json:"putRequests"` // total number of put requests across all services
	AvgGetTime float64 `json:"avgGetTime"` // avg GET request time
	AvgPostTime float64 `json:"avgPostTime"` // avg POST request time
	AvgPutTime float64 `json:"avgPutTime"` // avg PUT request time
	RPS float64 `json:"rps"` // throughput req/sec
	RPSPhysical float64 `json:"rpsPhysical"` // throughput req/sec using physical cpu
	RPSLogical float64 `json:"rpsLogical"` // throughput req/sec using logical cpu
	ProcFS ProcFS `json:"procfs"` // metrics from prometheus procfs
	MaxDBConnections uint64 `json:"maxDBConnections"` // max number of DB connections
	MaxIdleConnections uint64 `json:"maxIdleConnections"` // max number of idle DB connections
	// Migration server metrics
	MigrationRequests uint64 `json:"migrationRequests"` // total number of migration requests across all services
	MigrationPending uint64 `json:"migrationPending"` // total number of pending migration requests across all services
	MigrationInProgress uint64 `json:"migrationInProgress"` // total number of in progress migration requests across all services
	MigrationFailed uint64 `json:"migrationFailed"` // total number of failed migration requests across all services
	MigrationTermFailed uint64 `json:"migrationTermFailed"` // total number of term failed migration requests across all services
	MigrationCompleted uint64 `json:"migrationCompleted"` // total number of completed migration requests across all services
	MigrationQueued uint64 `json:"migrationQueued"` // total number of queued migration requests across all services
	MigrationExistInDB uint64 `json:"migrationExistInDB"` // total number of exist in db migration requests across all services
}
// metrics collects a point-in-time snapshot of host, process and request
// statistics and returns it as a Metrics value.
//
// Host and process figures come from gopsutil and are best effort: when a
// probe fails, the corresponding fields keep their zero value instead of
// dereferencing a nil result. Request throughput is computed against the
// snapshot stored in rstat by the previous call, which is then refreshed.
//
// ProcFSMetrics, StartTime and the Total*Requests counters are defined
// elsewhere in the package.
func metrics() Metrics {
	if rstat == nil {
		rstat = &RequestStats{}
		rstat.Time = time.Now()
	}

	snapshot := Metrics{}
	snapshot.GoRoutines = uint64(runtime.NumGoroutine())

	// Memory: only read the result when the probe succeeded, otherwise the
	// returned value must not be touched.
	var virt, swap Memory
	if m, err := mem.VirtualMemory(); err == nil {
		virt = Memory{Total: m.Total, Free: m.Free, Used: m.Used, UsedPercent: m.UsedPercent}
	}
	if s, err := mem.SwapMemory(); err == nil {
		swap = Memory{Total: s.Total, Free: s.Free, Used: s.Used, UsedPercent: s.UsedPercent}
	}
	snapshot.Memory = Mem{Virtual: virt, Swap: swap}

	// Load average: the result is a pointer, so guard against nil too.
	if l, err := load.Avg(); err == nil && l != nil {
		snapshot.Load = *l
	}

	// Per-CPU usage. The error is deliberately ignored: the slice is stored
	// exactly as returned, so a failed probe reports whatever gopsutil
	// handed back.
	snapshot.CPU, _ = cpu.Percent(time.Millisecond, true)

	// Per-process figures are only available when the process handle could
	// be created; every call on the handle stays inside this guard.
	proc, perr := process.NewProcess(int32(os.Getpid()))
	if perr == nil {
		if conn, err := proc.Connections(); err == nil {
			snapshot.Connections = conn
		}
		if openFiles, err := proc.OpenFiles(); err == nil {
			snapshot.OpenFiles = openFiles
		}
		// NOTE(review): with a non-zero interval gopsutil is documented to
		// sample over that interval, i.e. this call presumably blocks for
		// about one second - confirm that latency is acceptable.
		if cpuPct, err := proc.Percent(time.Second); err == nil {
			snapshot.CpuPercent = cpuPct
		}
	}

	snapshot.ProcFS = ProcFSMetrics()
	snapshot.Uptime = time.Since(StartTime).Seconds()
	snapshot.AvgGetTime = AvgGetRequestTime
	snapshot.AvgPostTime = AvgPostRequestTime
	snapshot.AvgPutTime = AvgPutRequestTime
	snapshot.GetRequests = TotalGetRequests
	snapshot.PostRequests = TotalPostRequests
	snapshot.PutRequests = TotalPutRequests

	// Throughput since the previous call. A zero time lapse would produce
	// NaN/Inf, which cannot be JSON-encoded, so the rates stay at zero then.
	lapse := time.Since(rstat.Time).Seconds()
	if lapse > 0 {
		total := float64(TotalGetRequests + TotalPostRequests + TotalPutRequests)
		previous := float64(rstat.TotalGetRequests + rstat.TotalPostRequests + rstat.TotalPutRequests)
		snapshot.RPS = (total - previous) / lapse
		// NOTE(review): these two values are derived from the change in the
		// core counts, not from the request counters, so they are zero
		// whenever the core counts are unchanged. The intended formula
		// (RPS scaled by the number of cores?) should be confirmed; the
		// existing computation is kept as is.
		snapshot.RPSLogical = float64(rstat.NumLogicalCores-NumLogicalCores) / lapse
		snapshot.RPSPhysical = float64(rstat.NumPhysicalCores-NumPhysicalCores) / lapse
	}
	rstat.Update()
	return snapshot
}
// promMetrics renders the current server metrics in the Prometheus text
// exposition format. Every metric name is built as "<prefix>_<suffix>".
//
// Each metric is emitted exactly once as a HELP line, a TYPE line and a
// sample line (the per-CPU metric emits one labelled sample per CPU).
// Metrics without a description keep a bare "# HELP <name>" line.
//
// NOTE(review): several migration metrics (pending, in_progress, queued)
// are declared as counters; if their values can decrease they should be
// gauges. Their types are left unchanged here.
func promMetrics(prefix string) string {
	data := metrics()

	// promSample describes one single-valued metric.
	type promSample struct {
		name  string      // suffix appended to prefix + "_"
		help  string      // HELP text; empty means no description
		kind  string      // Prometheus metric type
		value interface{} // sample value, formatted with %v
	}

	// classify the connections of this process by state
	var estCon, lisCon uint64
	for _, c := range data.Connections {
		switch c.Status {
		case "ESTABLISHED":
			estCon++
		case "LISTEN":
			lisCon++
		}
	}
	totCon := uint64(len(data.Connections))

	samples := []promSample{
		// connections
		{"total_connections", "", "gauge", totCon},
		{"established_connections", "", "gauge", estCon},
		{"listen_connections", "", "gauge", lisCon},
		// procfs metrics
		{"procfs_cputotal", "", "gauge", data.ProcFS.CpuTotal},
		{"procfs_vsize", "", "gauge", data.ProcFS.Vsize},
		{"procfs_rss", "", "gauge", data.ProcFS.Rss},
		{"procfs_openfds", "", "gauge", data.ProcFS.OpenFDs},
		{"procfs_maxfds", "", "gauge", data.ProcFS.MaxFDs},
		{"procfs_maxvsize", "", "gauge", data.ProcFS.MaxVsize},
		// procfs /proc/stat metrics
		{"procfs_sumusercpus", "", "gauge", data.ProcFS.SumUserCPUs},
		{"procfs_sumsystemcpus", "", "gauge", data.ProcFS.SumSystemCPUs},
		// cpu percent
		{"cpu_pct", "", "gauge", data.CpuPercent},
		// load
		{"load1", "", "gauge", data.Load.Load1},
		{"load5", "", "gauge", data.Load.Load5},
		{"load15", "", "gauge", data.Load.Load15},
		// memory virtual
		{"mem_virt_total", "reports total virtual memory in bytes", "gauge", data.Memory.Virtual.Total},
		{"mem_virt_free", "reports free virtual memory in bytes", "gauge", data.Memory.Virtual.Free},
		{"mem_virt_used", "reports used virtual memory in bytes", "gauge", data.Memory.Virtual.Used},
		{"mem_virt_pct", "reports percentage of virtual memory", "gauge", data.Memory.Virtual.UsedPercent},
		// memory swap
		{"mem_swap_total", "reports total swap memory in bytes", "gauge", data.Memory.Swap.Total},
		{"mem_swap_free", "reports free swap memory in bytes", "gauge", data.Memory.Swap.Free},
		{"mem_swap_used", "reports used swap memory in bytes", "gauge", data.Memory.Swap.Used},
		{"mem_swap_pct", "reports percentage swap memory", "gauge", data.Memory.Swap.UsedPercent},
		// open files
		{"open_files", "reports total number of open file descriptors", "gauge", len(data.OpenFiles)},
		// go routines: the number goes up and down, hence a gauge
		{"goroutines", "reports total number of go routines", "gauge", data.GoRoutines},
		// uptime
		{"uptime", "reports server uptime in seconds", "counter", data.Uptime},
		// total requests
		{"get_requests", "reports total number of HTTP GET requests", "counter", data.GetRequests},
		{"post_requests", "reports total number of HTTP POST requests", "counter", data.PostRequests},
		{"put_requests", "reports total number of HTTP PUT requests", "counter", data.PutRequests},
		// throughput and average request times
		{"rps", "reports request per second average", "gauge", data.RPS},
		{"avg_get_time", "reports average get request time", "gauge", data.AvgGetTime},
		{"avg_post_time", "reports average post request time", "gauge", data.AvgPostTime},
		{"avg_put_time", "reports average put request time", "gauge", data.AvgPutTime},
		{"rps_physical_cpu", "reports request per second average weighted by physical CPU cores", "gauge", data.RPSPhysical},
		{"rps_logical_cpu", "reports request per second average weighted by logical CPU cores", "gauge", data.RPSLogical},
		// migration server metrics
		{"requests", "reports total number of migration requests", "counter", data.MigrationRequests},
		{"pending", "reports total number of pending migration requests", "counter", data.MigrationPending},
		{"in_progress", "reports total number of in progress migration requests", "counter", data.MigrationInProgress},
		{"failed", "reports total number of failed migration requests", "counter", data.MigrationFailed},
		{"term_failed", "reports total number of term failed migration requests", "counter", data.MigrationTermFailed},
		{"completed", "reports total number of completed migration requests", "counter", data.MigrationCompleted},
		{"queued", "reports total number of queued migration requests", "counter", data.MigrationQueued},
		{"exist_in_db", "reports total number of exist in db migration requests", "counter", data.MigrationExistInDB},
	}

	// Accumulate into a byte buffer rather than re-allocating a string on
	// every line.
	var buf []byte

	// cpu info: one labelled sample per CPU
	buf = append(buf, fmt.Sprintf("# HELP %s_cpu percentage of cpu used per CPU\n", prefix)...)
	buf = append(buf, fmt.Sprintf("# TYPE %s_cpu gauge\n", prefix)...)
	for i, v := range data.CPU {
		buf = append(buf, fmt.Sprintf("%s_cpu{core=\"%d\"} %v\n", prefix, i, v)...)
	}

	for _, s := range samples {
		name := prefix + "_" + s.name
		if s.help == "" {
			buf = append(buf, fmt.Sprintf("# HELP %s\n", name)...)
		} else {
			buf = append(buf, fmt.Sprintf("# HELP %s %s\n", name, s.help)...)
		}
		buf = append(buf, fmt.Sprintf("# TYPE %s %s\n", name, s.kind)...)
		buf = append(buf, fmt.Sprintf("%s %v\n", name, s.value)...)
	}
	return string(buf)
}
// updateRPS periodically refreshes the package-level RPS, RPSLogical and
// RPSPhysical values. It never returns and is therefore meant to run in its
// own goroutine.
//
// Once a minute it divides the growth of the request counters by the time
// that actually elapsed since the previous sample, so RPS is a true
// per-second rate (dividing a one-minute delta by 3600 understated it 60x).
func updateRPS() {
	prevTotal := float64(TotalGetRequests + TotalPostRequests + TotalPutRequests)
	prevLogical := float64(NumLogicalCores)
	prevPhysical := float64(NumPhysicalCores)
	prevTime := time.Now()
	for {
		time.Sleep(1 * time.Minute)

		// time.Since uses the monotonic clock reading, so after the sleep
		// the elapsed time is always positive.
		elapsed := time.Since(prevTime).Seconds()
		total := float64(TotalGetRequests + TotalPostRequests + TotalPutRequests)
		logical := float64(NumLogicalCores)
		physical := float64(NumPhysicalCores)

		RPS = (total - prevTotal) / elapsed
		// NOTE(review): these two are derived from the change in the core
		// counts rather than from the request counters, so they are zero
		// whenever the core counts are unchanged. The intended formula
		// should be confirmed; the existing computation is kept as is.
		RPSLogical = (logical - prevLogical) / elapsed
		RPSPhysical = (physical - prevPhysical) / elapsed

		prevTotal, prevLogical, prevPhysical = total, logical, physical
		prevTime = time.Now()
	}
}
// helper function to update avg get request time
func updateGetRequestTime(time0 time.Time) {
AvgGetRequestTime += time.Since(time0).Seconds() / float64(TotalGetRequests)
}
// helper function to update avg post request time
func updatePostRequestTime(time0 time.Time) {
AvgPostRequestTime += time.Since(time0).Seconds() / float64(TotalPostRequests)
}
// helper function to update avg put request time
func updatePutRequestTime(time0 time.Time) {
AvgPutRequestTime += time.Since(time0).Seconds() / float64(TotalPutRequests)
}