Commit 288d38b

fix memory leak in runtime.node.heap.* metrics (#5476)
* fix memory leak in runtime.node.heap metrics
* switch to promise-based timers for test
* add benchmark
1 parent d1d7ccf commit 288d38b

5 files changed, +125 -74 lines changed

Lines changed: 5 additions & 0 deletions (new file: benchmark description)

@@ -0,0 +1,5 @@
+This benchmark runs the tracer with runtime metrics and an accelerated flush. While
+this can catch code regressions, it's mostly meant to catch things like memory
+leaks where metrics would start piling up. This can be hard to catch in tests,
+but it would cause the app to become slower and slower over time, which would
+be visible in the benchmark.
Lines changed: 5 additions & 0 deletions (new file: benchmark entry point)

@@ -0,0 +1,5 @@
+'use strict'
+
+require('../../..').init()
+
+setTimeout(() => {}, 1000)
Lines changed: 22 additions & 0 deletions (new file: benchmark configuration)

@@ -0,0 +1,22 @@
+{
+  "name": "runtime-metrics",
+  "run": "node index.js",
+  "run_with_affinity": "bash -c \"taskset -c $CPU_AFFINITY node index.js\"",
+  "cachegrind": false,
+  "iterations": 5,
+  "instructions": true,
+  "variants": {
+    "control": {
+      "env": {
+        "DD_RUNTIME_METRICS_ENABLED": "false"
+      }
+    },
+    "with-runtime-metrics": {
+      "baseline": "control",
+      "env": {
+        "DD_RUNTIME_METRICS_ENABLED": "true",
+        "DD_RUNTIME_METRICS_FLUSH_INTERVAL": "20"
+      }
+    }
+  }
+}
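
Taken together with the entry point above, the with-runtime-metrics variant initializes the tracer, flushes runtime metrics every 20 ms through the DD_RUNTIME_METRICS_FLUSH_INTERVAL override added in runtime_metrics.js below, and keeps the process alive for roughly one second, which works out to on the order of 1000 / 20 = 50 flushes per iteration, enough for any per-flush accumulation to register. The control variant runs the same script with runtime metrics disabled and serves as the baseline, so a leak shows up as extra instructions and slower iterations in the comparison.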

packages/dd-trace/src/runtime_metrics/runtime_metrics.js

Lines changed: 7 additions & 6 deletions
@@ -10,7 +10,8 @@ const Histogram = require('../histogram')
 const { performance, PerformanceObserver } = require('perf_hooks')

 const { NODE_MAJOR, NODE_MINOR } = require('../../../../version')
-const INTERVAL = 10 * 1000
+const { DD_RUNTIME_METRICS_FLUSH_INTERVAL = '10000' } = process.env
+const INTERVAL = parseInt(DD_RUNTIME_METRICS_FLUSH_INTERVAL, 10)

 // Node >=16 has PerformanceObserver with `gc` type, but <16.7 had a critical bug.
 // See: https://github.com/nodejs/node/issues/39548
@@ -270,12 +271,12 @@ function captureNativeMetrics () {
   })

   for (let i = 0, l = spaces.length; i < l; i++) {
-    const tags = [`heap_space:${spaces[i].space_name}`]
+    const tag = `heap_space:${spaces[i].space_name}`

-    client.gauge('runtime.node.heap.size.by.space', spaces[i].space_size, tags)
-    client.gauge('runtime.node.heap.used_size.by.space', spaces[i].space_used_size, tags)
-    client.gauge('runtime.node.heap.available_size.by.space', spaces[i].space_available_size, tags)
-    client.gauge('runtime.node.heap.physical_size.by.space', spaces[i].physical_space_size, tags)
+    client.gauge('runtime.node.heap.size.by.space', spaces[i].space_size, tag)
+    client.gauge('runtime.node.heap.used_size.by.space', spaces[i].space_used_size, tag)
+    client.gauge('runtime.node.heap.available_size.by.space', spaces[i].space_available_size, tag)
+    client.gauge('runtime.node.heap.physical_size.by.space', spaces[i].physical_space_size, tag)
   }
 }
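
The second hunk is the leak fix itself: the heap-space gauges were passing a freshly allocated tags array on every capture, while the other runtime metrics pass a stable string. One plausible way that leaks, sketched with a hypothetical client that keeps one entry per metric and tag between flushes (FakeMetricsClient is illustrative only, not the tracer's real client):

// FakeMetricsClient is a hypothetical illustration, not the tracer's client.
// Assume it keeps one entry per metric and tag between flushes, keyed by
// whatever tag value the caller passes in.
class FakeMetricsClient {
  constructor () {
    this.gauges = new Map() // metric name -> Map(tag -> latest value)
  }

  gauge (name, value, tag) {
    if (!this.gauges.has(name)) this.gauges.set(name, new Map())
    this.gauges.get(name).set(tag, value)
  }
}

const client = new FakeMetricsClient()

// Simulate three capture intervals for the same heap space.
for (let i = 0; i < 3; i++) {
  // Old code: a new array every interval. A Map keys arrays by reference, so
  // every capture adds another entry that is never overwritten or released.
  client.gauge('runtime.node.heap.size.by.space', 100 + i, ['heap_space:new_space'])

  // New code: the same string every interval. Strings are compared by value,
  // so the single entry is simply updated in place.
  client.gauge('runtime.node.heap.used_size.by.space', 50 + i, 'heap_space:new_space')
}

console.log(client.gauges.get('runtime.node.heap.size.by.space').size) // 3 and growing
console.log(client.gauges.get('runtime.node.heap.used_size.by.space').size) // stays 1

Whether or not the real client stores tags exactly this way, reusing a single string per heap space removes the per-capture allocation and guarantees the same key is seen on every flush.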

packages/dd-trace/test/runtime_metrics.spec.js

Lines changed: 86 additions & 68 deletions
@@ -146,7 +146,7 @@ suiteDescribe('runtimeMetrics', () => {
       }
     }

-    setImmediate = globalThis.setImmediate
+    setImmediate = require('timers/promises').setImmediate
     clock = sinon.useFakeTimers()

     runtimeMetrics.start(config)
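
The swap above replaces the callback-based global setImmediate with the promise-returning version from timers/promises, which is what lets the rewritten tests below be plain async functions. A standalone sketch of the pattern they rely on (the helper name is made up, not code copied from the spec):

const { setImmediate } = require('timers/promises')

// Await two event-loop turns so the PerformanceObserver GC callback can run,
// then advance sinon's fake clock far enough to trigger a metrics flush.
async function waitForGcObserverThenFlush (clock) {
  await setImmediate()
  await setImmediate()
  clock.tick(10000)
}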
@@ -188,79 +188,97 @@
       })
     })

-    it('should start collecting runtimeMetrics every 10 seconds', (done) => {
+    it('should start collecting runtimeMetrics every 10 seconds', async () => {
       runtimeMetrics.stop()
       runtimeMetrics.start(config)

       global.gc()

-      setImmediate(() => setImmediate(() => { // Wait for GC observer to trigger.
-        clock.tick(10000)
+      // Wait for GC observer to trigger.
+      await setImmediate()
+      await setImmediate()
+
+      clock.tick(10000)
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.cpu.user')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.cpu.system')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.cpu.total')
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.mem.rss')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.mem.heap_total')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.mem.heap_used')
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.process.uptime')
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_heap_size')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_heap_size_executable')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_physical_size')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_available_size')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_heap_size')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.heap_size_limit')
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.malloced_memory')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.peak_malloced_memory')
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.max', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.min', sinon.match.number)
+      expect(client.increment).to.have.been.calledWith('runtime.node.event_loop.delay.sum', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.avg', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.median', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.95percentile', sinon.match.number)
+      expect(client.increment).to.have.been.calledWith('runtime.node.event_loop.delay.count', sinon.match.number)
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.utilization')
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.max', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.min', sinon.match.number)
+      expect(client.increment).to.have.been.calledWith('runtime.node.gc.pause.sum', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.avg', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.median', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.95percentile', sinon.match.number)
+      expect(client.increment).to.have.been.calledWith('runtime.node.gc.pause.count', sinon.match.number)
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.max', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.min', sinon.match.number)
+      expect(client.increment).to.have.been.calledWith('runtime.node.gc.pause.by.type.sum', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.avg', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.median', sinon.match.number)
+      expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.95percentile', sinon.match.number)
+      expect(client.increment).to.have.been.calledWith('runtime.node.gc.pause.by.type.count', sinon.match.number)
+      expect(client.increment).to.have.been.calledWith(
+        'runtime.node.gc.pause.by.type.count', sinon.match.any, sinon.match(val => {
+          return val && /^gc_type:[a-z_]+$/.test(val[0])
+        })
+      )
+
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.size.by.space')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.used_size.by.space')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.available_size.by.space')
+      expect(client.gauge).to.have.been.calledWith('runtime.node.heap.physical_size.by.space')
+
+      expect(client.flush).to.have.been.called
+    })
+
+    it('should collect individual metrics only once every 10 seconds', async () => {
+      runtimeMetrics.stop()
+      runtimeMetrics.start(config)
+
+      global.gc()
+
+      // Wait for GC observer to trigger.
+      await setImmediate()
+      await setImmediate()
+
+      clock.tick(60 * 60 * 1000)

-      try {
-        expect(client.gauge).to.have.been.calledWith('runtime.node.cpu.user')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.cpu.system')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.cpu.total')
-
-        expect(client.gauge).to.have.been.calledWith('runtime.node.mem.rss')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.mem.heap_total')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.mem.heap_used')
-
-        expect(client.gauge).to.have.been.calledWith('runtime.node.process.uptime')
-
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_heap_size')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_heap_size_executable')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_physical_size')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_available_size')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.total_heap_size')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.heap_size_limit')
-
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.malloced_memory')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.peak_malloced_memory')
-
-        expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.max', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.min', sinon.match.number)
-        expect(client.increment).to.have.been.calledWith('runtime.node.event_loop.delay.sum', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.avg', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.median', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.delay.95percentile', sinon.match.number)
-        expect(client.increment).to.have.been.calledWith('runtime.node.event_loop.delay.count', sinon.match.number)
-
-        expect(client.gauge).to.have.been.calledWith('runtime.node.event_loop.utilization')
-
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.max', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.min', sinon.match.number)
-        expect(client.increment).to.have.been.calledWith('runtime.node.gc.pause.sum', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.avg', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.median', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.95percentile', sinon.match.number)
-        expect(client.increment).to.have.been.calledWith('runtime.node.gc.pause.count', sinon.match.number)
-
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.max', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.min', sinon.match.number)
-        expect(client.increment).to.have.been.calledWith('runtime.node.gc.pause.by.type.sum', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.avg', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.median', sinon.match.number)
-        expect(client.gauge).to.have.been.calledWith('runtime.node.gc.pause.by.type.95percentile', sinon.match.number)
-        expect(client.increment).to.have.been.calledWith('runtime.node.gc.pause.by.type.count', sinon.match.number)
-        expect(client.increment).to.have.been.calledWith(
-          'runtime.node.gc.pause.by.type.count', sinon.match.any, sinon.match(val => {
-            return val && /^gc_type:[a-z_]+$/.test(val[0])
-          })
-        )
-
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.size.by.space')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.used_size.by.space')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.available_size.by.space')
-        expect(client.gauge).to.have.been.calledWith('runtime.node.heap.physical_size.by.space')
-
-        expect(client.flush).to.have.been.called
-
-        done()
-      } catch (e) {
-        done(e)
-      }
-    }))
+      // If a metric is leaking, it will leak exponentially because it will
+      // be sent one more time each flush, in addition to the previous
+      // flushes that also had the metric multiple times in them, so after
+      // 1 hour even if a single metric is leaking it would get over
+      // 64980 calls on its own without any other metric. A slightly lower
+      // value is used here to be on the safer side.
+      expect(client.gauge.callCount).to.be.lt(60000)
+      expect(client.increment.callCount).to.be.lt(60000)
     })
   })

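The comment in the second test compresses a bit of arithmetic: with the default 10 s interval, one hour of fake-timer time produces 360 flushes, and a metric that gains one extra copy per flush is reported 1 + 2 + ... + 360 = 64980 times in total (triangular rather than literally exponential growth), which is why the assertions use the slightly lower 60000 threshold. A standalone sketch of that calculation, not code from the repo:

const flushInterval = 10000 // default flush interval in ms
const flushes = (60 * 60 * 1000) / flushInterval // 360 flushes in one hour of fake time

// A healthy metric is reported exactly once per flush.
const healthyCalls = flushes // 360

// A leaking metric is reported one more time on each successive flush,
// so the total over the hour is the triangular number 1 + 2 + ... + 360.
const leakingCalls = flushes * (flushes + 1) / 2 // 64980

console.log(healthyCalls, leakingCalls)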
