From 120107553b70579f83811c9f12b2a2ddd059852f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Geisend=C3=B6rfer?= Date: Tue, 1 Jun 2021 15:13:04 +0200 Subject: [PATCH 1/5] profiler: goroutineswait saw-stop mechanism This is a safety mechanism for the new goroutineWaitProfile that automatically disables the profile if the number of goroutines is too high in order to avoid excessive stop-the-world pauses. The default limit is 1000 which is somewhat arbitrary and should limit STW pauses to ~30ms. It can be overwritten with DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES. WARNING: The goroutineWaitProfile is still experimental and not meant to be used by users outside of datadog. Fixes PROF-3234 --- profiler/options.go | 62 +++++++++++++++++++++++----------------- profiler/profile.go | 5 ++++ profiler/profile_test.go | 48 +++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 26 deletions(-) diff --git a/profiler/options.go b/profiler/options.go index ef6fcf3da5..2092c41304 100644 --- a/profiler/options.go +++ b/profiler/options.go @@ -14,6 +14,7 @@ import ( "os" "path/filepath" "runtime" + "strconv" "strings" "time" "unicode" @@ -82,21 +83,22 @@ type config struct { agentless bool // targetURL is the upload destination URL. It will be set by the profiler on start to either apiURL or agentURL // based on the other options. - targetURL string - apiURL string // apiURL is the Datadog intake API URL - agentURL string // agentURL is the Datadog agent profiling URL - service, env string - hostname string - statsd StatsdClient - httpClient *http.Client - tags []string - types map[ProfileType]struct{} - period time.Duration - cpuDuration time.Duration - uploadTimeout time.Duration - mutexFraction int - blockRate int - outputDir string + targetURL string + apiURL string // apiURL is the Datadog intake API URL + agentURL string // agentURL is the Datadog agent profiling URL + service, env string + hostname string + statsd StatsdClient + httpClient *http.Client + tags []string + types map[ProfileType]struct{} + period time.Duration + cpuDuration time.Duration + uploadTimeout time.Duration + maxGoroutinesWait int + mutexFraction int + blockRate int + outputDir string } func urlForSite(site string) (string, error) { @@ -127,17 +129,18 @@ func (c *config) addProfileType(t ProfileType) { func defaultConfig() (*config, error) { c := config{ - env: defaultEnv, - apiURL: defaultAPIURL, - service: filepath.Base(os.Args[0]), - statsd: &statsd.NoOpClient{}, - httpClient: defaultClient, - period: DefaultPeriod, - cpuDuration: DefaultDuration, - blockRate: DefaultBlockRate, - mutexFraction: DefaultMutexFraction, - uploadTimeout: DefaultUploadTimeout, - tags: []string{fmt.Sprintf("pid:%d", os.Getpid())}, + env: defaultEnv, + apiURL: defaultAPIURL, + service: filepath.Base(os.Args[0]), + statsd: &statsd.NoOpClient{}, + httpClient: defaultClient, + period: DefaultPeriod, + cpuDuration: DefaultDuration, + blockRate: DefaultBlockRate, + mutexFraction: DefaultMutexFraction, + uploadTimeout: DefaultUploadTimeout, + maxGoroutinesWait: 1000, // arbitrary value, should limit STW to ~30ms + tags: []string{fmt.Sprintf("pid:%d", os.Getpid())}, } for _, t := range defaultProfileTypes { c.addProfileType(t) @@ -206,6 +209,13 @@ func defaultConfig() (*config, error) { if v := os.Getenv("DD_PROFILING_OUTPUT_DIR"); v != "" { withOutputDir(v)(&c) } + if v := os.Getenv("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES"); v != "" { + n, err := strconv.Atoi(v) + if err != nil { + return nil, fmt.Errorf("DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES: %s", err) + } + c.maxGoroutinesWait = n + } return &c, nil } diff --git a/profiler/profile.go b/profiler/profile.go index f1eb098150..354050df72 100644 --- a/profiler/profile.go +++ b/profiler/profile.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "io" + "runtime" "runtime/pprof" "time" @@ -231,6 +232,10 @@ func goroutineProfile(cfg *config) (*profile, error) { } func goroutineWaitProfile(cfg *config) (*profile, error) { + if n := runtime.NumGoroutine(); n > cfg.maxGoroutinesWait { + return nil, fmt.Errorf("%d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d", n, cfg.maxGoroutinesWait) + } + var ( text = &bytes.Buffer{} pprof = &bytes.Buffer{} diff --git a/profiler/profile_test.go b/profiler/profile_test.go index 579375fa58..93efef311b 100644 --- a/profiler/profile_test.go +++ b/profiler/profile_test.go @@ -7,8 +7,11 @@ package profiler import ( "bytes" + "fmt" "io" "io/ioutil" + "os" + "strconv" "testing" "time" @@ -166,6 +169,51 @@ main.main() "main.indirectShortSleepLoop2", }) }) + + t.Run("goroutineswait DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES", func(t *testing.T) { + // spawGoroutines spawns n goroutines and then returns a func to stop them. + spawnGoroutines := func(n int) func() { + launched := make(chan struct{}) + stopped := make(chan struct{}) + for i := 0; i < n; i++ { + go func() { + launched <- struct{}{} + stopped <- struct{}{} + }() + <-launched + } + return func() { + for i := 0; i < n; i++ { + <-stopped + } + } + } + + goroutines := 100 + limit := 10 + + stop := spawnGoroutines(goroutines) + defer stop() + envVar := "DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES" + oldVal := os.Getenv(envVar) + os.Setenv(envVar, strconv.Itoa(limit)) + defer os.Setenv(envVar, oldVal) + + defer func(old func(_ string, _ io.Writer, _ int) error) { lookupProfile = old }(lookupProfile) + lookupProfile = func(_ string, w io.Writer, _ int) error { + _, err := w.Write([]byte("")) + return err + } + + p, err := unstartedProfiler() + require.NoError(t, err) + _, err = p.runProfile(expGoroutineWaitProfile) + var errRoutines, errLimit int + msg := "%d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d" + fmt.Sscanf(err.Error(), msg, &errRoutines, &errLimit) + require.GreaterOrEqual(t, errRoutines, goroutines) + require.Equal(t, limit, errLimit) + }) } func Test_goroutineDebug2ToPprof_CrashSafety(t *testing.T) { From 768a8b32218fcd178245bdd1d293f976083c295b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Geisend=C3=B6rfer?= Date: Wed, 23 Jun 2021 11:35:54 +0200 Subject: [PATCH 2/5] Rename test --- profiler/profile_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/profiler/profile_test.go b/profiler/profile_test.go index 93efef311b..c2e4c52340 100644 --- a/profiler/profile_test.go +++ b/profiler/profile_test.go @@ -170,7 +170,7 @@ main.main() }) }) - t.Run("goroutineswait DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES", func(t *testing.T) { + t.Run("goroutineswaitLimit", func(t *testing.T) { // spawGoroutines spawns n goroutines and then returns a func to stop them. spawnGoroutines := func(n int) func() { launched := make(chan struct{}) From 2538e333f2e905aaa34474f0653c9695dcae1e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Geisend=C3=B6rfer?= Date: Wed, 23 Jun 2021 11:39:40 +0200 Subject: [PATCH 3/5] Improve error message --- profiler/profile.go | 2 +- profiler/profile_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/profiler/profile.go b/profiler/profile.go index 354050df72..c58a680aab 100644 --- a/profiler/profile.go +++ b/profiler/profile.go @@ -233,7 +233,7 @@ func goroutineProfile(cfg *config) (*profile, error) { func goroutineWaitProfile(cfg *config) (*profile, error) { if n := runtime.NumGoroutine(); n > cfg.maxGoroutinesWait { - return nil, fmt.Errorf("%d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d", n, cfg.maxGoroutinesWait) + return nil, fmt.Errorf("skipping goroutines wait profile: %d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d", n, cfg.maxGoroutinesWait) } var ( diff --git a/profiler/profile_test.go b/profiler/profile_test.go index c2e4c52340..7bc8f8ea81 100644 --- a/profiler/profile_test.go +++ b/profiler/profile_test.go @@ -209,7 +209,7 @@ main.main() require.NoError(t, err) _, err = p.runProfile(expGoroutineWaitProfile) var errRoutines, errLimit int - msg := "%d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d" + msg := "skipping goroutines wait profile: %d goroutines exceeds DD_PROFILING_WAIT_PROFILE_MAX_GOROUTINES limit of %d" fmt.Sscanf(err.Error(), msg, &errRoutines, &errLimit) require.GreaterOrEqual(t, errRoutines, goroutines) require.Equal(t, limit, errLimit) From 2e9505385671b046eeabd3f308ecdfcb2b1270e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Geisend=C3=B6rfer?= Date: Thu, 24 Jun 2021 10:41:06 +0200 Subject: [PATCH 4/5] Clarify intend for extra channel --- profiler/profile_test.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/profiler/profile_test.go b/profiler/profile_test.go index 7bc8f8ea81..4630328a13 100644 --- a/profiler/profile_test.go +++ b/profiler/profile_test.go @@ -171,16 +171,19 @@ main.main() }) t.Run("goroutineswaitLimit", func(t *testing.T) { - // spawGoroutines spawns n goroutines and then returns a func to stop them. + // spawGoroutines spawns n goroutines, waits for them to start executing, + // and then returns a func to stop them. For more details about `executing` + // see: + // https://github.com/DataDog/dd-trace-go/pull/942#discussion_r656924335 spawnGoroutines := func(n int) func() { - launched := make(chan struct{}) + executing := make(chan struct{}) stopped := make(chan struct{}) for i := 0; i < n; i++ { go func() { - launched <- struct{}{} + executing <- struct{}{} stopped <- struct{}{} }() - <-launched + <-executing } return func() { for i := 0; i < n; i++ { From 1a856f47004037b38ef3e657831a64fa136a25f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20Geisend=C3=B6rfer?= Date: Thu, 24 Jun 2021 10:43:48 +0200 Subject: [PATCH 5/5] Rename stopped to stopping The new name is technically more accurate as the goroutine may not have stopped yet when a message appears in the channel. --- profiler/profile_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/profiler/profile_test.go b/profiler/profile_test.go index 4630328a13..74174f59f0 100644 --- a/profiler/profile_test.go +++ b/profiler/profile_test.go @@ -177,17 +177,17 @@ main.main() // https://github.com/DataDog/dd-trace-go/pull/942#discussion_r656924335 spawnGoroutines := func(n int) func() { executing := make(chan struct{}) - stopped := make(chan struct{}) + stopping := make(chan struct{}) for i := 0; i < n; i++ { go func() { executing <- struct{}{} - stopped <- struct{}{} + stopping <- struct{}{} }() <-executing } return func() { for i := 0; i < n; i++ { - <-stopped + <-stopping } } }