Skip to content

Commit

Permalink
profiler: goroutineswait saw-stop mechanism
Browse files Browse the repository at this point in the history
This is a safety mechanism for the new goroutineWaitProfile that
automatically disables the profile if the number of goroutines is too
high in order to avoid excessive stop-the-world pauses.

The default limit is 1000 which is somewhat arbitrary and should limit
STW pauses to ~30ms. It can be overwritten with
DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES.

WARNING: The goroutineWaitProfile is still experimental and not meant to
be used by users outside of datadog.

Fixes PROF-3234
  • Loading branch information
felixge committed Jun 23, 2021
1 parent 796ebfc commit 1201075
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 26 deletions.
62 changes: 36 additions & 26 deletions profiler/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
"unicode"
Expand Down Expand Up @@ -82,21 +83,22 @@ type config struct {
agentless bool
// targetURL is the upload destination URL. It will be set by the profiler on start to either apiURL or agentURL
// based on the other options.
targetURL string
apiURL string // apiURL is the Datadog intake API URL
agentURL string // agentURL is the Datadog agent profiling URL
service, env string
hostname string
statsd StatsdClient
httpClient *http.Client
tags []string
types map[ProfileType]struct{}
period time.Duration
cpuDuration time.Duration
uploadTimeout time.Duration
mutexFraction int
blockRate int
outputDir string
targetURL string
apiURL string // apiURL is the Datadog intake API URL
agentURL string // agentURL is the Datadog agent profiling URL
service, env string
hostname string
statsd StatsdClient
httpClient *http.Client
tags []string
types map[ProfileType]struct{}
period time.Duration
cpuDuration time.Duration
uploadTimeout time.Duration
maxGoroutinesWait int
mutexFraction int
blockRate int
outputDir string
}

func urlForSite(site string) (string, error) {
Expand Down Expand Up @@ -127,17 +129,18 @@ func (c *config) addProfileType(t ProfileType) {

func defaultConfig() (*config, error) {
c := config{
env: defaultEnv,
apiURL: defaultAPIURL,
service: filepath.Base(os.Args[0]),
statsd: &statsd.NoOpClient{},
httpClient: defaultClient,
period: DefaultPeriod,
cpuDuration: DefaultDuration,
blockRate: DefaultBlockRate,
mutexFraction: DefaultMutexFraction,
uploadTimeout: DefaultUploadTimeout,
tags: []string{fmt.Sprintf("pid:%d", os.Getpid())},
env: defaultEnv,
apiURL: defaultAPIURL,
service: filepath.Base(os.Args[0]),
statsd: &statsd.NoOpClient{},
httpClient: defaultClient,
period: DefaultPeriod,
cpuDuration: DefaultDuration,
blockRate: DefaultBlockRate,
mutexFraction: DefaultMutexFraction,
uploadTimeout: DefaultUploadTimeout,
maxGoroutinesWait: 1000, // arbitrary value, should limit STW to ~30ms
tags: []string{fmt.Sprintf("pid:%d", os.Getpid())},
}
for _, t := range defaultProfileTypes {
c.addProfileType(t)
Expand Down Expand Up @@ -206,6 +209,13 @@ func defaultConfig() (*config, error) {
if v := os.Getenv("DD_PROFILING_OUTPUT_DIR"); v != "" {
withOutputDir(v)(&c)
}
if v := os.Getenv("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES"); v != "" {
n, err := strconv.Atoi(v)
if err != nil {
return nil, fmt.Errorf("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES: %s", err)
}
c.maxGoroutinesWait = n
}
return &c, nil
}

Expand Down
5 changes: 5 additions & 0 deletions profiler/profile.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"errors"
"fmt"
"io"
"runtime"
"runtime/pprof"
"time"

Expand Down Expand Up @@ -231,6 +232,10 @@ func goroutineProfile(cfg *config) (*profile, error) {
}

func goroutineWaitProfile(cfg *config) (*profile, error) {
if n := runtime.NumGoroutine(); n > cfg.maxGoroutinesWait {
return nil, fmt.Errorf("%d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d", n, cfg.maxGoroutinesWait)
}

var (
text = &bytes.Buffer{}
pprof = &bytes.Buffer{}
Expand Down
48 changes: 48 additions & 0 deletions profiler/profile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@ package profiler

import (
"bytes"
"fmt"
"io"
"io/ioutil"
"os"
"strconv"
"testing"
"time"

Expand Down Expand Up @@ -166,6 +169,51 @@ main.main()
"main.indirectShortSleepLoop2",
})
})

t.Run("goroutineswait DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES", func(t *testing.T) {
// spawGoroutines spawns n goroutines and then returns a func to stop them.
spawnGoroutines := func(n int) func() {
launched := make(chan struct{})
stopped := make(chan struct{})
for i := 0; i < n; i++ {
go func() {
launched <- struct{}{}
stopped <- struct{}{}
}()
<-launched
}
return func() {
for i := 0; i < n; i++ {
<-stopped
}
}
}

goroutines := 100
limit := 10

stop := spawnGoroutines(goroutines)
defer stop()
envVar := "DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES"
oldVal := os.Getenv(envVar)
os.Setenv(envVar, strconv.Itoa(limit))
defer os.Setenv(envVar, oldVal)

defer func(old func(_ string, _ io.Writer, _ int) error) { lookupProfile = old }(lookupProfile)
lookupProfile = func(_ string, w io.Writer, _ int) error {
_, err := w.Write([]byte(""))
return err
}

p, err := unstartedProfiler()
require.NoError(t, err)
_, err = p.runProfile(expGoroutineWaitProfile)
var errRoutines, errLimit int
msg := "%d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d"
fmt.Sscanf(err.Error(), msg, &errRoutines, &errLimit)
require.GreaterOrEqual(t, errRoutines, goroutines)
require.Equal(t, limit, errLimit)
})
}

func Test_goroutineDebug2ToPprof_CrashSafety(t *testing.T) {
Expand Down

0 comments on commit 1201075

Please sign in to comment.