profiler: randomize when execution traces are collected
We currently record execution traces at a fixed interval. This means
that apps which are deployed across several instances simultaneously
will have time periods where no instances have execution trace data.
It also biases us against activity that occurs at a frequency harmonic
with the trace collection frequency. To fully address this we would
need to decouple execution trace collection from the normal profiling
cycle. But as a first step, we should give every profiling cycle a
chance of recording data. This commit does that: each profiling cycle
we record an execution trace with probability (profiling period) /
(trace period). This way we still maintain the same desired average
data rate of ~one trace every 15 minutes by default.
nsrip-dd committed Nov 29, 2023
1 parent a6e5ba9 commit a04d37d
Showing 2 changed files with 51 additions and 20 deletions.
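
For concreteness: assuming the profiler's usual one-minute period and the 15-minute trace period mentioned above, the per-cycle probability is 60s/900s = 1/15, so on average one cycle in fifteen records a trace, the same rate as the old fixed schedule. A standalone sketch (not part of the commit; the period values are assumptions) simulating the coin flip:

package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	profilePeriod := 60 * time.Second
	tracePeriod := 15 * time.Minute
	// Each profiling cycle records a trace with this probability.
	prob := float64(profilePeriod) / float64(tracePeriod) // 1/15 ≈ 0.067

	const cycles = 1_000_000
	traces := 0
	for i := 0; i < cycles; i++ {
		if rand.Float64() < prob {
			traces++
		}
	}
	// Expect roughly cycles/15 ≈ 66,667 traces: one per 15 cycles on
	// average, i.e. ~one per 15 minutes of wall time.
	fmt.Printf("recorded %d traces in %d cycles (expected ~%d)\n",
		traces, cycles, cycles/15)
}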
profiler/profiler.go (10 additions, 1 deletion)

@@ -9,6 +9,7 @@ import (
"errors"
"fmt"
"io"
"math/rand"
"os"
"path/filepath"
"runtime"
@@ -88,7 +89,15 @@ type profiler struct {

 func (p *profiler) shouldTrace() bool {
 	p.cfg.traceConfig.Refresh()
-	return p.cfg.traceConfig.Enabled && time.Since(p.lastTrace) > p.cfg.traceConfig.Period
+	if !p.cfg.traceConfig.Enabled {
+		return false
+	}
+	if p.cfg.traceConfig.Period < p.cfg.period {
+		return true
+	}
+	// Randomly record a trace with probability (profile period) / (trace period)
+	prob := float64(p.cfg.period) / float64(p.cfg.traceConfig.Period)
+	return rand.Float64() < prob
 }

// testHooks are functions that are replaced during testing which would normally
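One way to see why this decorrelates instances: under a fixed schedule, co-deployed instances all trace at the same times, while under the per-cycle coin flip the gap between traces is geometrically distributed, varying around the old fixed interval. A standalone sketch (not part of the commit; p = 1/15 is the assumed default ratio) measuring the gaps:

package main

import (
	"fmt"
	"math/rand"
)

func main() {
	const p = 1.0 / 15.0 // profiling period / trace period

	// Measure the gaps, in profiling cycles, between recorded traces.
	var gaps []int
	cur := 0
	for i := 0; i < 1_000_000; i++ {
		cur++
		if rand.Float64() < p {
			gaps = append(gaps, cur)
			cur = 0
		}
	}
	sum := 0
	for _, g := range gaps {
		sum += g
	}
	// The mean gap comes out near 15 cycles (the old fixed interval),
	// but individual gaps range widely, which is what spreads trace
	// collection times across a fleet.
	fmt.Printf("traces=%d mean gap=%.2f cycles\n",
		len(gaps), float64(sum)/float64(len(gaps)))
}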
profiler/profiler_test.go (41 additions, 19 deletions)

@@ -361,6 +361,7 @@ func TestAllUploaded(t *testing.T) {
 	defer server.Close()
 
 	t.Setenv("DD_PROFILING_WAIT_PROFILE", "1")
+	t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "10ms") // match profile period
 	Start(
 		WithAgentAddr(server.Listener.Addr().String()),
 		WithProfileTypes(
@@ -384,7 +385,7 @@
 		"goroutines.pprof",
 		"goroutineswait.pprof",
 	}
-	if executionTraceEnabledDefault && seq == 0 {
+	if executionTraceEnabledDefault {
 		expected = append(expected, "go.trace")
 	}
 	assert.ElementsMatch(t, expected, profile.event.Attachments)
@@ -462,31 +463,57 @@ func TestImmediateProfile(t *testing.T) {
 	}
 }
 
-func TestExecutionTrace(t *testing.T) {
+func TestExecutionTraceCPUProfileRate(t *testing.T) {
 	got := make(chan profileMeta)
 	server := httptest.NewServer(&mockBackend{t: t, profiles: got})
 	defer server.Close()
 
 	// cpuProfileRate is picked randomly so we can check for it in the trace
 	// data to reduce the chance that it occurs in the trace data for some other
-	// reduce. In theory we could use the entire int64 space, but when we do
+	// reason. In theory we could use the entire int64 space, but when we do
 	// this the runtime can crash with the error shown below.
 	//
 	//	runtime: kevent on fd 3 failed with 60
 	//	fatal error: runtime: netpoll failed
 	cpuProfileRate := int(9999 + rand.Int63n(9999))
 
 	t.Setenv("DD_PROFILING_EXECUTION_TRACE_ENABLED", "true")
-	t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "3s")
+	t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "10ms")
 	err := Start(
 		WithAgentAddr(server.Listener.Addr().String()),
 		WithProfileTypes(CPUProfile),
-		WithPeriod(1*time.Second),
+		WithPeriod(10*time.Millisecond),
 		CPUProfileRate(int(cpuProfileRate)),
 	)
 	require.NoError(t, err)
 	defer Stop()
 
+	m := <-got
+	assertContainsCPUProfileRateLog(t, m.attachments["go.trace"], cpuProfileRate)
+}
+
+// assertContainsCPUProfileRateLog checks for the presence of the log written by
+// traceLogCPUProfileRate. It's a bit hacky, but probably good enough for now :).
+func assertContainsCPUProfileRateLog(t *testing.T, traceData []byte, cpuProfileRate int) {
+	assert.True(t, bytes.Contains(traceData, []byte("cpuProfileRate")))
+	assert.True(t, bytes.Contains(traceData, []byte(fmt.Sprintf("%d", cpuProfileRate))))
+}
+
+func TestExecutionTraceRandom(t *testing.T) {
+	got := make(chan profileMeta)
+	server := httptest.NewServer(&mockBackend{t: t, profiles: got})
+	defer server.Close()
+
+	t.Setenv("DD_PROFILING_EXECUTION_TRACE_ENABLED", "true")
+	t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "150ms")
+	err := Start(
+		WithAgentAddr(server.Listener.Addr().String()),
+		WithProfileTypes(),
+		WithPeriod(10*time.Millisecond),
+	)
+	require.NoError(t, err)
+	defer Stop()
+
 	contains := func(haystack []string, needle string) bool {
 		for _, s := range haystack {
 			if s == needle {
@@ -496,27 +523,22 @@ func TestExecutionTrace(t *testing.T) {
 		return false
 	}
 	seenTraces := 0
-	for i := 0; i < 4; i++ {
+	for i := 0; i < 100; i++ {
 		m := <-got
 		t.Log(m.event.Attachments, m.tags)
 		if contains(m.event.Attachments, "go.trace") && contains(m.tags, "go_execution_traced:yes") {
 			seenTraces++
-			assertContainsCPUProfileRateLog(t, m.attachments["go.trace"], cpuProfileRate)
 		}
 	}
-	// With a trace frequency of 3 seconds and a profiling period of 1
-	// second, we should see 2 traces after 4 profile collections: one at
-	// the start, and one 3 seconds later.
-	if seenTraces != 2 {
-		t.Errorf("wanted %d traces, got %d", 2, seenTraces)
-	}
-}
-
-// assertContainsCPUProfileRateLog checks for the presence of the log written by
-// traceLogCPUProfileRate. It's a bit hacky, but probably good enough for now :).
-func assertContainsCPUProfileRateLog(t *testing.T, traceData []byte, cpuProfileRate int) {
-	assert.True(t, bytes.Contains(traceData, []byte("cpuProfileRate")))
-	assert.True(t, bytes.Contains(traceData, []byte(fmt.Sprintf("%d", cpuProfileRate))))
+	// Given 100 profiles with a probability of (10ms/150ms) = 1/15, we expect
+	// an average of 6 profiles with traces. There's a ~99.9% probability of this
+	// number being between 0 and 15. Call that good enough.
+	// TODO(nick): tighten this up further? We could do more trials at the expense
+	// of making this test much slower.
+	if seenTraces > 15 {
+		t.Errorf("saw %d traces, expected between 0 and 15", seenTraces)
+	}
 }
 
 // TestEndpointCounts verifies that the unit of work feature works end to end.
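The "~99.9%" figure in the new test comment checks out: the number of traced profiles follows Binomial(100, 1/15), and the chance of seeing at most 15 is roughly 0.999. A standalone sketch (not part of the commit) computing the exact tail:

package main

import (
	"fmt"
	"math"
)

// pmf returns P(X = k) for X ~ Binomial(n, p), computed with log-gamma
// to avoid overflowing the binomial coefficient.
func pmf(n, k int, p float64) float64 {
	lg := func(x int) float64 {
		v, _ := math.Lgamma(float64(x))
		return v
	}
	logC := lg(n+1) - lg(k+1) - lg(n-k+1)
	return math.Exp(logC + float64(k)*math.Log(p) + float64(n-k)*math.Log(1-p))
}

func main() {
	n, p := 100, 1.0/15.0
	cdf := 0.0
	for k := 0; k <= 15; k++ {
		cdf += pmf(n, k, p)
	}
	// Prints ≈ 0.999, matching the "~99.9%" in the test comment, so the
	// seenTraces > 15 check should flake only about once per thousand runs.
	fmt.Printf("P(X <= 15) = %.6f\n", cdf)
}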
