profiler: randomize when execution traces are collected
We currently record execution traces at a fixed interval. This means
that apps which are deployed across several instances simultaneously
will have time periods where no instances have execution trace data.
This also biases us against activity that occurs at a frequency
harmonic with the trace collection frequency. To fully address this we
would need to decouple execution trace collection from the normal
profiling cycle. But as a first step, we should give every profiling
cycle a chance of recording data. This commit does that: each profiling
cycle, we record an execution trace with probability (profiling period)
/ (trace period). This way we still maintain the same desired average
data rate of ~one trace every 15 minutes by default. We have one
special case, though: we want a trace for the first profiling cycle to
capture startup activity, since that's usually quite different from
normal program activity.
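
As a rough illustration, here is a minimal, self-contained sketch of
the sampling decision described above (the helper shouldTraceThisCycle
and the simulation in main are illustrative, not the profiler's actual
API; the real change is in profiler/profiler.go below):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// shouldTraceThisCycle models the decision: trace with probability
// (profile period) / (trace period), and always trace on the first
// cycle to capture startup activity. Multiplying instead of dividing
// avoids any division by zero if the trace period is misconfigured.
func shouldTraceThisCycle(firstCycle bool, profilePeriod, tracePeriod time.Duration) bool {
	return firstCycle || rand.Float64()*float64(tracePeriod) < float64(profilePeriod)
}

func main() {
	// With a 1-minute profile period and a 15-minute trace period,
	// roughly 1 in 15 cycles should record a trace.
	traced := 0
	const cycles = 15000
	for i := 0; i < cycles; i++ {
		if shouldTraceThisCycle(i == 0, time.Minute, 15*time.Minute) {
			traced++
		}
	}
	fmt.Printf("traced %d of %d cycles (expect ~%d)\n", traced, cycles, cycles/15)
}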

Inline the shouldTrace function into the one place it should actually be
used. Prior to this commit, shouldTrace was called from two places: once
to decide whether to trace, and once as a double-check right before
starting the trace. The double-check is not particularly helpful, and if
we're making a decision randomly, then checking twice means we'd have to
win the "coin toss" twice to record data. So we inline the logic into a
single place, right before scheduling a trace.
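
To see why the double-check matters once the decision is random: two
independent checks at probability p both pass with probability p*p, so
the default 1-in-15 rate would silently drop to roughly 1 in 225. A
tiny sketch of the arithmetic (values assumed from the 15-minute
default):

package main

import "fmt"

func main() {
	p := 1.0 / 15.0
	fmt.Printf("single check: %.4f\n", p)   // ≈ 0.0667, ~one trace per 15 cycles
	fmt.Printf("double check: %.4f\n", p*p) // ≈ 0.0044, ~one trace per 225 cycles
}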

This is tested by doing many "trials" of recording profiles with
different execution trace configurations, seeing how many traces we
get, and making assertions about the number we see based on the
expected probability of tracing. On the one hand, the actual change is
fairly simple, so perhaps this level of testing is overkill. We are
also deliberately introducing a "flaky" test. On the other hand, the
first draft of this change had a bug from calling shouldTrace twice,
and the added test catches that quite consistently. The test could be
even stronger, but perhaps this is good enough to start.
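
To make the test's tolerance concrete, here is a sketch of the bound
it computes, assuming 100 uploads at the default 1/15 rate (mirroring
the doTrial logic in the test below):

package main

import (
	"fmt"
	"math"
)

func main() {
	const count, rate, tolerance = 100.0, 1.0 / 15.0, 2.0
	mean := count * rate                           // ≈ 6.7 expected traces
	stdDev := math.Sqrt(count * rate * (1 - rate)) // ≈ 2.5
	lower := int(mean - tolerance*stdDev)          // 1
	upper := int(mean + tolerance*stdDev)          // 11
	fmt.Printf("accept between %d and %d traces out of %.0f\n", lower, upper, count)
}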
nsrip-dd committed Dec 15, 2023
1 parent 5a92f7b commit b684ba8
Showing 3 changed files with 133 additions and 39 deletions.
3 changes: 0 additions & 3 deletions profiler/profile.go
@@ -186,9 +186,6 @@ var profileTypes = map[ProfileType]profileType{
		Name:     "execution-trace",
		Filename: "go.trace",
		Collect: func(p *profiler) ([]byte, error) {
-			if !p.shouldTrace() {
-				return nil, errors.New("started tracing erroneously, indicating a bug in the profiler")
-			}
			p.lastTrace = time.Now()
			buf := new(bytes.Buffer)
			lt := newLimitedTraceCollector(buf, int64(p.cfg.traceConfig.Limit))
24 changes: 18 additions & 6 deletions profiler/profiler.go
@@ -9,6 +9,7 @@ import (
	"errors"
	"fmt"
	"io"
+	"math/rand"
	"os"
	"path/filepath"
	"runtime"
@@ -86,11 +87,6 @@ type profiler struct {
	lastTrace time.Time
}

-func (p *profiler) shouldTrace() bool {
-	p.cfg.traceConfig.Refresh()
-	return p.cfg.traceConfig.Enabled && time.Since(p.lastTrace) > p.cfg.traceConfig.Period
-}
-
// testHooks are functions that are replaced during testing which would normally
// depend on accessing runtime state that is not needed/available for the test
type testHooks struct {
@@ -303,10 +299,26 @@ func (p *profiler) collect(ticker <-chan time.Time) {
			// finished (because p.pendingProfiles will have been
			// incremented to count every non-CPU profile before CPU
			// profiling starts)
+
			profileTypes := p.enabledProfileTypes()
-			if p.shouldTrace() {
+
+			// Decide whether we should record an execution trace
+			p.cfg.traceConfig.Refresh()
+			// Randomly record a trace with probability (profile period) / (trace period).
+			// Note that if the trace period is equal to or less than the profile period,
+			// we will always record a trace
+			// We do multiplication here instead of division to defensively guard against
+			// division by 0
+			shouldTraceRandomly := rand.Float64()*float64(p.cfg.traceConfig.Period) < float64(p.cfg.period)
+			// As a special case, we want to trace during the first
+			// profiling cycle since startup activity is generally much
+			// different than regular operation
+			firstCycle := bat.seq == 0
+			shouldTrace := p.cfg.traceConfig.Enabled && (shouldTraceRandomly || firstCycle)
+			if shouldTrace {
				profileTypes = append(profileTypes, executionTrace)
			}
+
			for _, t := range profileTypes {
				if t != CPUProfile {
					p.pendingProfiles.Add(1)
145 changes: 115 additions & 30 deletions profiler/profiler_test.go
@@ -11,6 +11,7 @@ import (
	"encoding/json"
	"fmt"
	"io"
+	"math"
	"math/rand"
	"net"
	"net/http"
@@ -376,6 +377,7 @@ func TestAllUploaded(t *testing.T) {
	// TODO: Further check that the uploaded profiles are all valid

	t.Setenv("DD_PROFILING_WAIT_PROFILE", "1")
+	t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "10ms") // match profile period
	// The channel is buffered with 2 entries so we can check that the
	// second batch of profiles is correct in case the profiler gets in a
	// bad state after the first round of profiling.
@@ -400,7 +402,7 @@ func TestAllUploaded(t *testing.T) {
		"goroutines.pprof",
		"goroutineswait.pprof",
	}
-	if executionTraceEnabledDefault && seq == 0 {
+	if executionTraceEnabledDefault {
		expected = append(expected, "go.trace")
	}
	assert.ElementsMatch(t, expected, profile.event.Attachments)
@@ -455,47 +457,24 @@ func TestImmediateProfile(t *testing.T) {
	}
}

-func TestExecutionTrace(t *testing.T) {
+func TestExecutionTraceCPUProfileRate(t *testing.T) {
	// cpuProfileRate is picked randomly so we can check for it in the trace
	// data to reduce the chance that it occurs in the trace data for some other
-	// reduce. In theory we could use the entire int64 space, but when we do
+	// reason. In theory we could use the entire int64 space, but when we do
	// this the runtime can crash with the error shown below.
	//
	// runtime: kevent on fd 3 failed with 60
	// fatal error: runtime: netpoll failed
	cpuProfileRate := int(9999 + rand.Int63n(9999))

	t.Setenv("DD_PROFILING_EXECUTION_TRACE_ENABLED", "true")
-	t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "3s")
-	profiles := startTestProfiler(t, 1,
+	t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "10ms")
+	profile := <-startTestProfiler(t, 1,
+		WithPeriod(10*time.Millisecond),
		WithProfileTypes(CPUProfile),
-		WithPeriod(1*time.Second),
		CPUProfileRate(int(cpuProfileRate)),
	)
-
-	contains := func(haystack []string, needle string) bool {
-		for _, s := range haystack {
-			if s == needle {
-				return true
-			}
-		}
-		return false
-	}
-	seenTraces := 0
-	for i := 0; i < 4; i++ {
-		m := <-profiles
-		t.Log(m.event.Attachments, m.tags)
-		if contains(m.event.Attachments, "go.trace") && contains(m.tags, "go_execution_traced:yes") {
-			seenTraces++
-			assertContainsCPUProfileRateLog(t, m.attachments["go.trace"], cpuProfileRate)
-		}
-	}
-	// With a trace frequency of 3 seconds and a profiling period of 1
-	// second, we should see 2 traces after 4 profile collections: one at
-	// the start, and one 3 seconds later.
-	if seenTraces != 2 {
-		t.Errorf("wanted %d traces, got %d", 2, seenTraces)
-	}
+	assertContainsCPUProfileRateLog(t, profile.attachments["go.trace"], cpuProfileRate)
}

// assertContainsCPUProfileRateLog checks for the presence of the log written by
@@ -505,6 +484,112 @@ func assertContainsCPUProfileRateLog(t *testing.T, traceData []byte, cpuProfileR
	assert.True(t, bytes.Contains(traceData, []byte(fmt.Sprintf("%d", cpuProfileRate))))
}

+func sliceContains[T comparable](haystack []T, needle T) bool {
+	for _, s := range haystack {
+		if s == needle {
+			return true
+		}
+	}
+	return false
+}
+
+func TestExecutionTraceMisconfiguration(t *testing.T) {
+	rl := new(log.RecordLogger)
+	defer log.UseLogger(rl)()
+	// Test the profiler with an execution trace period of 0.
+	// This is considered misconfiguration and tracing shouldn't be enabled.
+	//
+	// This test is partly defensive in nature: when doing randomized traces,
+	// recording with probability (period / execution trace period),
+	// depending on how it's implemented we may divide by 0. The Go
+	// spec says that implementations _will_ panic for integer division by
+	// 0, and _may_ choose to panic for floating point division by 0.
+	// See go.dev/ref/spec#Arithmetic_operators, and go.dev/issue/43577
+	t.Setenv("DD_PROFILING_EXECUTION_TRACE_ENABLED", "true")
+	t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "0ms")
+	profile := doOneShortProfileUpload(t,
+		WithProfileTypes(),
+		WithPeriod(10*time.Millisecond),
+	)
+	assert.NotContains(t, profile.event.Attachments, "go.trace")
+	log.Flush()
+	for _, m := range rl.Logs() {
+		if strings.Contains(m, "Invalid execution trace config") {
+			return
+		}
+	}
+	t.Log(rl.Logs())
+	t.Error("did not warn on invalid trace config")
+}
+
+func TestExecutionTraceRandom(t *testing.T) {
+	collectTraces := func(t *testing.T, profilePeriod, tracePeriod time.Duration, count int) int {
+		t.Setenv("DD_PROFILING_EXECUTION_TRACE_ENABLED", "true")
+		t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", tracePeriod.String())
+		profiles := startTestProfiler(t, 10,
+			WithProfileTypes(),
+			WithPeriod(profilePeriod),
+		)
+
+		seenTraces := 0
+		for i := 0; i < count; i++ {
+			profile := <-profiles
+			if sliceContains(profile.event.Attachments, "go.trace") && sliceContains(profile.tags, "go_execution_traced:yes") {
+				seenTraces++
+			} else if i == 0 {
+				t.Error("did not see a trace in the first upload")
+			}
+		}
+		return seenTraces
+	}
+
+	doTrial := func(t *testing.T, rate, tolerance float64) bool {
+		profileDurationMs := 10.0
+		traceDurationMs := profileDurationMs / rate
+		const count = 100
+		seen := collectTraces(t,
+			time.Duration(profileDurationMs)*time.Millisecond,
+			time.Duration(traceDurationMs)*time.Millisecond,
+			count,
+		)
+		// We're simulating Bernoulli trials with the given rate, which
+		// should follow a binomial distribution. Assert that we're
+		// within the given number of standard deviations of the
+		// expected mean
+		stdDev := math.Sqrt(count * rate * (1 - rate))
+		mean := count * rate
+		lower, upper := int(mean-tolerance*stdDev), int(mean+tolerance*stdDev)
+		if seen >= lower && seen <= upper {
+			return true
+		}
+		t.Logf("observed %d traces, outside the desired bound of [%d, %d]", seen, lower, upper)
+		return false
+	}
+
+	sampleRates := []float64{
+		1.0 / 15.0, // our default rate
+		0.5,        // higher to catch failures from under-sampling
+		1.0,        // record all the time
+	}
+	for _, rate := range sampleRates {
+		name := fmt.Sprintf("rate-%f", rate)
+		t.Run(name, func(t *testing.T) {
+			// We should be within 2 standard deviations ~95% of the time
+			// with a correct implementation. If we do this twice, then
+			// we have a ~99.8% chance of succeeding with a correct
+			// implementation. We keep a reasonably tight tolerance
+			// to ensure that an incorrect implementation is more likely
+			// to fail both times
+			for i := 0; i < 2; i++ {
+				if doTrial(t, rate, 2.0) {
+					return
+				}
+			}
+			t.Error("failed after retry")
+		})
+	}
+}
+
// TestEndpointCounts verifies that the unit of work feature works end to end.
func TestEndpointCounts(t *testing.T) {
	for _, enabled := range []bool{true, false} {
