Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

profiler: randomize when execution traces are collected #2401

Merged
merged 2 commits into from
Dec 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions profiler/profile.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,6 @@ var profileTypes = map[ProfileType]profileType{
Name: "execution-trace",
Filename: "go.trace",
Collect: func(p *profiler) ([]byte, error) {
if !p.shouldTrace() {
return nil, errors.New("started tracing erroneously, indicating a bug in the profiler")
}
p.lastTrace = time.Now()
buf := new(bytes.Buffer)
lt := newLimitedTraceCollector(buf, int64(p.cfg.traceConfig.Limit))
Expand Down
24 changes: 18 additions & 6 deletions profiler/profiler.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"errors"
"fmt"
"io"
"math/rand"
"os"
"path/filepath"
"runtime"
Expand Down Expand Up @@ -86,11 +87,6 @@ type profiler struct {
lastTrace time.Time
}

func (p *profiler) shouldTrace() bool {
p.cfg.traceConfig.Refresh()
return p.cfg.traceConfig.Enabled && time.Since(p.lastTrace) > p.cfg.traceConfig.Period
}

// testHooks are functions that are replaced during testing which would normally
// depend on accessing runtime state that is not needed/available for the test
type testHooks struct {
Expand Down Expand Up @@ -303,10 +299,26 @@ func (p *profiler) collect(ticker <-chan time.Time) {
// finished (because p.pendingProfiles will have been
// incremented to count every non-CPU profile before CPU
// profiling starts)

profileTypes := p.enabledProfileTypes()
if p.shouldTrace() {

// Decide whether we should record an execution trace
p.cfg.traceConfig.Refresh()
// Randomly record a trace with probability (profile period) / (trace period).
// Note that if the trace period is equal to or less than the profile period,
// we will always record a trace
// We do multiplication here instead of division to defensively guard against
// division by 0
shouldTraceRandomly := rand.Float64()*float64(p.cfg.traceConfig.Period) < float64(p.cfg.period)
// As a special case, we want to trace during the first
// profiling cycle since startup activity is generally much
// different than regular operation
firstCycle := bat.seq == 0
shouldTrace := p.cfg.traceConfig.Enabled && (shouldTraceRandomly || firstCycle)
if shouldTrace {
profileTypes = append(profileTypes, executionTrace)
}

for _, t := range profileTypes {
if t != CPUProfile {
p.pendingProfiles.Add(1)
Expand Down
145 changes: 115 additions & 30 deletions profiler/profiler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"encoding/json"
"fmt"
"io"
"math"
"math/rand"
"net"
"net/http"
Expand Down Expand Up @@ -376,6 +377,7 @@ func TestAllUploaded(t *testing.T) {
// TODO: Further check that the uploaded profiles are all valid

t.Setenv("DD_PROFILING_WAIT_PROFILE", "1")
t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "10ms") // match profile period
// The channel is buffered with 2 entries so we can check that the
// second batch of profiles is correct in case the profiler gets in a
// bad state after the first round of profiling.
Expand All @@ -400,7 +402,7 @@ func TestAllUploaded(t *testing.T) {
"goroutines.pprof",
"goroutineswait.pprof",
}
if executionTraceEnabledDefault && seq == 0 {
if executionTraceEnabledDefault {
expected = append(expected, "go.trace")
}
assert.ElementsMatch(t, expected, profile.event.Attachments)
Expand Down Expand Up @@ -455,47 +457,24 @@ func TestImmediateProfile(t *testing.T) {
}
}

func TestExecutionTrace(t *testing.T) {
func TestExecutionTraceCPUProfileRate(t *testing.T) {
// cpuProfileRate is picked randomly so we can check for it in the trace
// data to reduce the chance that it occurs in the trace data for some other
// reduce. In theory we could use the entire int64 space, but when we do
// reason. In theory we could use the entire int64 space, but when we do
// this the runtime can crash with the error shown below.
//
// runtime: kevent on fd 3 failed with 60
// fatal error: runtime: netpoll failed
cpuProfileRate := int(9999 + rand.Int63n(9999))

t.Setenv("DD_PROFILING_EXECUTION_TRACE_ENABLED", "true")
t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "3s")
profiles := startTestProfiler(t, 1,
t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "10ms")
profile := <-startTestProfiler(t, 1,
WithPeriod(10*time.Millisecond),
WithProfileTypes(CPUProfile),
WithPeriod(1*time.Second),
CPUProfileRate(int(cpuProfileRate)),
)

contains := func(haystack []string, needle string) bool {
for _, s := range haystack {
if s == needle {
return true
}
}
return false
}
seenTraces := 0
for i := 0; i < 4; i++ {
m := <-profiles
t.Log(m.event.Attachments, m.tags)
if contains(m.event.Attachments, "go.trace") && contains(m.tags, "go_execution_traced:yes") {
seenTraces++
assertContainsCPUProfileRateLog(t, m.attachments["go.trace"], cpuProfileRate)
}
}
// With a trace frequency of 3 seconds and a profiling period of 1
// second, we should see 2 traces after 4 profile collections: one at
// the start, and one 3 seconds later.
if seenTraces != 2 {
t.Errorf("wanted %d traces, got %d", 2, seenTraces)
}
assertContainsCPUProfileRateLog(t, profile.attachments["go.trace"], cpuProfileRate)
}

// assertContainsCPUProfileRateLog checks for the presence of the log written by
Expand All @@ -505,6 +484,112 @@ func assertContainsCPUProfileRateLog(t *testing.T, traceData []byte, cpuProfileR
assert.True(t, bytes.Contains(traceData, []byte(fmt.Sprintf("%d", cpuProfileRate))))
}

func sliceContains[T comparable](haystack []T, needle T) bool {
for _, s := range haystack {
if s == needle {
return true
}
}
return false
}
nsrip-dd marked this conversation as resolved.
Show resolved Hide resolved

func TestExecutionTraceMisconfiguration(t *testing.T) {
rl := new(log.RecordLogger)
defer log.UseLogger(rl)()
// Test the profiler with an execution trace period of 0.
// This is considered misconfiguration and tracing shouldn't be enabled.
//
// This test is partly defensive in nature: when doing randomized traces,
// recording with probability (period / execution trace period),
// depending on how it's implemented we may divide by 0. The Go
// spec says that implementations _will_ panic for integer division by
// 0, and _may_ choose to panic for floating point division by 0.
// See go.dev/ref/spec#Arithmetic_operators, and go.dev/issue/43577
t.Setenv("DD_PROFILING_EXECUTION_TRACE_ENABLED", "true")
t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", "0ms")
profile := doOneShortProfileUpload(t,
WithProfileTypes(),
WithPeriod(10*time.Millisecond),
)
assert.NotContains(t, profile.event.Attachments, "go.trace")
log.Flush()
for _, m := range rl.Logs() {
if strings.Contains(m, "Invalid execution trace config") {
return
}
}
t.Log(rl.Logs())
t.Error("did not warn on invalid trace config")
}

func TestExecutionTraceRandom(t *testing.T) {
collectTraces := func(t *testing.T, profilePeriod, tracePeriod time.Duration, count int) int {
t.Setenv("DD_PROFILING_EXECUTION_TRACE_ENABLED", "true")
t.Setenv("DD_PROFILING_EXECUTION_TRACE_PERIOD", tracePeriod.String())
profiles := startTestProfiler(t, 10,
WithProfileTypes(),
WithPeriod(profilePeriod),
)

seenTraces := 0
for i := 0; i < count; i++ {
profile := <-profiles
if sliceContains(profile.event.Attachments, "go.trace") && sliceContains(profile.tags, "go_execution_traced:yes") {
seenTraces++
} else if i == 0 {
t.Error("did not see a trace in the first upload")
}
}
return seenTraces
}

doTrial := func(t *testing.T, rate, tolerance float64) bool {
profileDurationMs := 10.0
traceDurationMs := profileDurationMs / rate
const count = 100
seen := collectTraces(t,
time.Duration(profileDurationMs)*time.Millisecond,
time.Duration(traceDurationMs)*time.Millisecond,
count,
)
// We're simulating Bernoulli trials with the given rate, which
// should follow a binomial distribution. Assert that we're
// within the given number of standard deviations of the
// expected mean
stdDev := math.Sqrt(count * rate * (1 - rate))
mean := count * rate
lower, upper := int(mean-tolerance*stdDev), int(mean+tolerance*stdDev)
if seen >= lower && seen <= upper {
return true
}
t.Logf("observed %d traces, outside the desired bound of [%d, %d]", seen, lower, upper)
return false
}

sampleRates := []float64{
1.0 / 15.0, // our default rate
0.5, // higher to catch failures from under-sampling
1.0, // record all the time
}
for _, rate := range sampleRates {
name := fmt.Sprintf("rate-%f", rate)
t.Run(name, func(t *testing.T) {
// We should be within 2 standard deviations ~95% of the time
// with a correct implementation. If we do this twice, then
// we have a ~99.999% chance of succeeding with a correct
// implementation. We keep a reasonably tight tolerance
// to ensure that an incorrect implementation is more likely
// to fail both times
for i := 0; i < 2; i++ {
if doTrial(t, rate, 2.0) {
return
}
}
t.Error("failed after retry")
})
}
}

// TestEndpointCounts verfies that the unit of work feature works end to end.
func TestEndpointCounts(t *testing.T) {
for _, enabled := range []bool{true, false} {
Expand Down
Loading