From 67b9633fa9e4b2684b4fad43c172819c10142743 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Fri, 5 Jul 2024 16:46:52 +0100 Subject: [PATCH 1/9] [PROF-10123] Add placeholder for missing allocations in allocation profiler **What does this PR do?** This PR fixes a corner case issue with the allocation profiler: if too many objects get allocated, we "clamp" the weight of every individual sample (to avoid introducing bias) to a maximum value. But we previously did not in any way represent the clamping on the profile itself, which meant that the total number of allocations reported by the profiler would be missing any object counts that were clamped. In practice, we don't expect this to happen very often, as we have a high clamping value `MAX_ALLOC_WEIGHT = 65535` (it's in `collectors_cpu_and_wall_time_worker.c`). This PR then introduces an extra placeholder sample, when needed, to represent allocations that went over the clamped value. Thus the total amount of objects allocated reported by the profiler now becomes accurate, even in the presence of clamping. **Motivation:** We're considering lowering the maximum clamping value, which would mean it would be hit more often, and thus it's important to fix this issue before we do that. **Additional Notes:** N/A **How to test the change?** I've added test coverage for this change. I also used the following snippet to quickly experiment while I was developing this PR: ```ruby Datadog::Profiling.wait_until_running allocator = Thread.new { loop { BasicObject.new } } sleep 10 allocator.kill; allocator.join puts "Allocated #{GC.stat(:total_allocated_objects)} objects total" ``` --- .../collectors_cpu_and_wall_time_worker.c | 8 ++++- .../collectors_thread_context.c | 31 +++++++++++++++++++ .../collectors_thread_context.h | 1 + .../cpu_and_wall_time_worker_spec.rb | 25 +++++++++++++++ .../collectors/thread_context_spec.rb | 29 +++++++++++++++++ 5 files changed, 93 insertions(+), 1 deletion(-) diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index dff70397c16..c132fd2eb40 100644 --- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -1140,9 +1140,15 @@ static VALUE rescued_sample_allocation(VALUE tracepoint_data) { discrete_dynamic_sampler_events_since_last_sample(&state->allocation_sampler) : // if we aren't, then we're sampling every event 1; - // TODO: Signal in the profile that clamping happened? + + // To control bias from sampling, we clamp the maximum weight attributed to a single allocation sample. This avoids + // assigning a very large number to a sample, if for instance the dynamic sampling mechanism chose a really big interval. unsigned int weight = allocations_since_last_sample > MAX_ALLOC_WEIGHT ? MAX_ALLOC_WEIGHT : (unsigned int) allocations_since_last_sample; thread_context_collector_sample_allocation(state->thread_context_collector_instance, weight, new_object); + // ...but we still represent the missing allocations in the profile, thus the data will account for all allocations. + if (weight < allocations_since_last_sample) { + thread_context_collector_sample_missed_allocations(state->thread_context_collector_instance, allocations_since_last_sample - weight); + } // Return a dummy VALUE because we're called from rb_rescue2 which requires it return Qnil; diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 19b9b74109e..dad402523a6 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -231,6 +231,7 @@ static void ddtrace_otel_trace_identifiers_for( VALUE active_span, VALUE otel_values ); +static VALUE _native_sample_missed_allocations(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE missed_allocations); void collectors_thread_context_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); @@ -261,6 +262,7 @@ void collectors_thread_context_init(VALUE profiling_module) { rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1); rb_define_singleton_method(testing_module, "_native_gc_tracking", _native_gc_tracking, 1); rb_define_singleton_method(testing_module, "_native_new_empty_thread", _native_new_empty_thread, 0); + rb_define_singleton_method(testing_module, "_native_sample_missed_allocations", _native_sample_missed_allocations, 2); at_active_span_id = rb_intern_const("@active_span"); at_active_trace_id = rb_intern_const("@active_trace"); @@ -1400,3 +1402,32 @@ static void ddtrace_otel_trace_identifiers_for( *active_trace = current_trace; *numeric_span_id = resolved_numeric_span_id; } + +void thread_context_collector_sample_missed_allocations(VALUE self_instance, unsigned int missed_allocations) { + struct thread_context_collector_state *state; + TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); + + ddog_prof_Label labels[] = { + {.key = DDOG_CHARSLICE_C("thread id"), .str = DDOG_CHARSLICE_C("MA")}, + {.key = DDOG_CHARSLICE_C("thread name"), .str = DDOG_CHARSLICE_C("Missing Allocations")}, + {.key = DDOG_CHARSLICE_C("allocation class"), .str = DDOG_CHARSLICE_C("(Missing Allocations)")}, + }; + ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = sizeof(labels) / sizeof(labels[0])}; + + record_placeholder_stack( + state->sampling_buffer, + state->recorder_instance, + (sample_values) {.alloc_samples = missed_allocations}, + (sample_labels) { + .labels = slice_labels, + .state_label = NULL, + .end_timestamp_ns = 0, // For now we're not collecting timestamps for allocation events + }, + DDOG_CHARSLICE_C("Missing Allocations") + ); +} + +static VALUE _native_sample_missed_allocations(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE missed_allocations) { + thread_context_collector_sample_missed_allocations(collector_instance, NUM2UINT(missed_allocations)); + return Qtrue; +} diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.h b/ext/datadog_profiling_native_extension/collectors_thread_context.h index 6299d96b43e..8566b16398d 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.h +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.h @@ -9,6 +9,7 @@ void thread_context_collector_sample( VALUE profiler_overhead_stack_thread ); void thread_context_collector_sample_allocation(VALUE self_instance, unsigned int sample_weight, VALUE new_object); +void thread_context_collector_sample_missed_allocations(VALUE self_instance, unsigned int missed_allocations); VALUE thread_context_collector_sample_after_gc(VALUE self_instance); void thread_context_collector_on_gc_start(VALUE self_instance); bool thread_context_collector_on_gc_finish(VALUE self_instance); diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 633e1271ac6..0bfc3c5ff9b 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -507,6 +507,7 @@ context 'with dynamic_sampling_rate_enabled' do let(:options) { { dynamic_sampling_rate_enabled: true } } + it 'keeps statistics on how allocation sampling is doing' do stub_const('CpuAndWallTimeWorkerSpec::TestStruct', Struct.new(:foo)) @@ -536,6 +537,30 @@ one_second_in_ns = 1_000_000_000 expect(sampling_time_ns_max).to be < one_second_in_ns, "A single sample should not take longer than 1s, #{stats}" end + + # When large numbers of objects are allocated, the dynamic sampling rate kicks in, and we don't sample every + # object. + # We then assign a weight to every sample to compensate for this; to avoid bias, we have a limit on this weight, + # and we clamp it if it goes over the limit. + # But the total amount of allocations recorded should match the number we observed, and thus we record the + # remainder above the clamped value as a separate "Missing Allocations" step. + it 'records missed allocations when weights are clamped' do + start + + thread_that_allocates_as_fast_as_possible = Thread.new { loop { BasicObject.new } } + + allocation_samples = try_wait_until do + samples = samples_from_pprof(recorder.serialize!).select { |it| it.values[:'alloc-samples'] > 0 } + samples if samples.any? { |it| it.labels[:'thread name'] == 'Missing Allocations' } + end + + thread_that_allocates_as_fast_as_possible.kill + thread_that_allocates_as_fast_as_possible.join + + cpu_and_wall_time_worker.stop + + expect(allocation_samples).to_not be_empty + end end context 'when sampling optimized Ruby strings' do diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index 44323ce1665..3f76d6f1a4b 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -79,6 +79,10 @@ def sample_allocation(weight:, new_object: Object.new) described_class::Testing._native_sample_allocation(cpu_and_wall_time_collector, weight, new_object) end + def sample_missed_allocations(missed_allocations) + described_class::Testing._native_sample_missed_allocations(cpu_and_wall_time_collector, missed_allocations) + end + def thread_list described_class::Testing._native_thread_list end @@ -1210,6 +1214,31 @@ def self.otel_sdk_available? end end + describe '#sample_missed_allocations' do + let(:single_sample) do + expect(samples.size).to be 1 + samples.first + end + before { sample_missed_allocations(123) } + + it 'records the number of missed allocations' do + expect(single_sample.values).to include('alloc-samples': 123) + end + + it 'attributes the missed allocations to a "Missing Allocations" thread' do + expect(single_sample.labels).to include('thread id': 'MA', 'thread name': 'Missing Allocations') + end + + it 'attributes the missed allocations to a "(Missing Allocations)" allocation class' do + expect(single_sample.labels).to include('allocation class': '(Missing Allocations)') + end + + it 'includes a placeholder stack attributed to "Missing Allocations"' do + expect(single_sample.locations.size).to be 1 + expect(single_sample.locations.first.path).to eq 'Missing Allocations' + end + end + describe '#thread_list' do it "returns the same as Ruby's Thread.list" do expect(thread_list).to eq Thread.list From 1446109ea58be649fdc0740f59a1c36a2a38c9f1 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 8 Jul 2024 09:30:11 +0100 Subject: [PATCH 2/9] Replace "Missing Allocations" with "Skipped Samples" for placeholder name This was a great suggestion from the PR review. --- .../collectors_cpu_and_wall_time_worker.c | 4 ++-- .../collectors_thread_context.c | 18 +++++++-------- .../collectors_thread_context.h | 2 +- .../cpu_and_wall_time_worker_spec.rb | 6 ++--- .../collectors/thread_context_spec.rb | 22 +++++++++---------- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index c132fd2eb40..e27f8b53894 100644 --- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -1145,9 +1145,9 @@ static VALUE rescued_sample_allocation(VALUE tracepoint_data) { // assigning a very large number to a sample, if for instance the dynamic sampling mechanism chose a really big interval. unsigned int weight = allocations_since_last_sample > MAX_ALLOC_WEIGHT ? MAX_ALLOC_WEIGHT : (unsigned int) allocations_since_last_sample; thread_context_collector_sample_allocation(state->thread_context_collector_instance, weight, new_object); - // ...but we still represent the missing allocations in the profile, thus the data will account for all allocations. + // ...but we still represent the skipped samples in the profile, thus the data will account for all allocations. if (weight < allocations_since_last_sample) { - thread_context_collector_sample_missed_allocations(state->thread_context_collector_instance, allocations_since_last_sample - weight); + thread_context_collector_sample_skipped_allocation_samples(state->thread_context_collector_instance, allocations_since_last_sample - weight); } // Return a dummy VALUE because we're called from rb_rescue2 which requires it diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index dad402523a6..6f12c7e64a0 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -231,7 +231,7 @@ static void ddtrace_otel_trace_identifiers_for( VALUE active_span, VALUE otel_values ); -static VALUE _native_sample_missed_allocations(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE missed_allocations); +static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE missed_allocations); void collectors_thread_context_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); @@ -262,7 +262,7 @@ void collectors_thread_context_init(VALUE profiling_module) { rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1); rb_define_singleton_method(testing_module, "_native_gc_tracking", _native_gc_tracking, 1); rb_define_singleton_method(testing_module, "_native_new_empty_thread", _native_new_empty_thread, 0); - rb_define_singleton_method(testing_module, "_native_sample_missed_allocations", _native_sample_missed_allocations, 2); + rb_define_singleton_method(testing_module, "_native_sample_skipped_allocation_samples", _native_sample_skipped_allocation_samples, 2); at_active_span_id = rb_intern_const("@active_span"); at_active_trace_id = rb_intern_const("@active_trace"); @@ -1403,14 +1403,14 @@ static void ddtrace_otel_trace_identifiers_for( *numeric_span_id = resolved_numeric_span_id; } -void thread_context_collector_sample_missed_allocations(VALUE self_instance, unsigned int missed_allocations) { +void thread_context_collector_sample_skipped_allocation_samples(VALUE self_instance, unsigned int missed_allocations) { struct thread_context_collector_state *state; TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); ddog_prof_Label labels[] = { - {.key = DDOG_CHARSLICE_C("thread id"), .str = DDOG_CHARSLICE_C("MA")}, - {.key = DDOG_CHARSLICE_C("thread name"), .str = DDOG_CHARSLICE_C("Missing Allocations")}, - {.key = DDOG_CHARSLICE_C("allocation class"), .str = DDOG_CHARSLICE_C("(Missing Allocations)")}, + {.key = DDOG_CHARSLICE_C("thread id"), .str = DDOG_CHARSLICE_C("SA")}, + {.key = DDOG_CHARSLICE_C("thread name"), .str = DDOG_CHARSLICE_C("Skipped Samples")}, + {.key = DDOG_CHARSLICE_C("allocation class"), .str = DDOG_CHARSLICE_C("(Skipped Samples)")}, }; ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = sizeof(labels) / sizeof(labels[0])}; @@ -1423,11 +1423,11 @@ void thread_context_collector_sample_missed_allocations(VALUE self_instance, uns .state_label = NULL, .end_timestamp_ns = 0, // For now we're not collecting timestamps for allocation events }, - DDOG_CHARSLICE_C("Missing Allocations") + DDOG_CHARSLICE_C("Skipped Samples") ); } -static VALUE _native_sample_missed_allocations(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE missed_allocations) { - thread_context_collector_sample_missed_allocations(collector_instance, NUM2UINT(missed_allocations)); +static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE missed_allocations) { + thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(missed_allocations)); return Qtrue; } diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.h b/ext/datadog_profiling_native_extension/collectors_thread_context.h index 8566b16398d..16e9405ca3b 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.h +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.h @@ -9,7 +9,7 @@ void thread_context_collector_sample( VALUE profiler_overhead_stack_thread ); void thread_context_collector_sample_allocation(VALUE self_instance, unsigned int sample_weight, VALUE new_object); -void thread_context_collector_sample_missed_allocations(VALUE self_instance, unsigned int missed_allocations); +void thread_context_collector_sample_skipped_allocation_samples(VALUE self_instance, unsigned int missed_allocations); VALUE thread_context_collector_sample_after_gc(VALUE self_instance); void thread_context_collector_on_gc_start(VALUE self_instance); bool thread_context_collector_on_gc_finish(VALUE self_instance); diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 0bfc3c5ff9b..954749500b9 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -543,15 +543,15 @@ # We then assign a weight to every sample to compensate for this; to avoid bias, we have a limit on this weight, # and we clamp it if it goes over the limit. # But the total amount of allocations recorded should match the number we observed, and thus we record the - # remainder above the clamped value as a separate "Missing Allocations" step. - it 'records missed allocations when weights are clamped' do + # remainder above the clamped value as a separate "Skipped Samples" step. + it 'records skipped allocation samples when weights are clamped' do start thread_that_allocates_as_fast_as_possible = Thread.new { loop { BasicObject.new } } allocation_samples = try_wait_until do samples = samples_from_pprof(recorder.serialize!).select { |it| it.values[:'alloc-samples'] > 0 } - samples if samples.any? { |it| it.labels[:'thread name'] == 'Missing Allocations' } + samples if samples.any? { |it| it.labels[:'thread name'] == 'Skipped Samples' } end thread_that_allocates_as_fast_as_possible.kill diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index 3f76d6f1a4b..09fc04e19b3 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -79,8 +79,8 @@ def sample_allocation(weight:, new_object: Object.new) described_class::Testing._native_sample_allocation(cpu_and_wall_time_collector, weight, new_object) end - def sample_missed_allocations(missed_allocations) - described_class::Testing._native_sample_missed_allocations(cpu_and_wall_time_collector, missed_allocations) + def sample_skipped_allocation_samples(missed_allocations) + described_class::Testing._native_sample_skipped_allocation_samples(cpu_and_wall_time_collector, missed_allocations) end def thread_list @@ -1214,28 +1214,28 @@ def self.otel_sdk_available? end end - describe '#sample_missed_allocations' do + describe '#sample_skipped_allocation_samples' do let(:single_sample) do expect(samples.size).to be 1 samples.first end - before { sample_missed_allocations(123) } + before { sample_skipped_allocation_samples(123) } - it 'records the number of missed allocations' do + it 'records the number of skipped allocations' do expect(single_sample.values).to include('alloc-samples': 123) end - it 'attributes the missed allocations to a "Missing Allocations" thread' do - expect(single_sample.labels).to include('thread id': 'MA', 'thread name': 'Missing Allocations') + it 'attributes the missed allocations to a "Skipped Samples" thread' do + expect(single_sample.labels).to include('thread id': 'SA', 'thread name': 'Skipped Samples') end - it 'attributes the missed allocations to a "(Missing Allocations)" allocation class' do - expect(single_sample.labels).to include('allocation class': '(Missing Allocations)') + it 'attributes the missed allocations to a "(Skipped Samples)" allocation class' do + expect(single_sample.labels).to include('allocation class': '(Skipped Samples)') end - it 'includes a placeholder stack attributed to "Missing Allocations"' do + it 'includes a placeholder stack attributed to "Skipped Samples"' do expect(single_sample.locations.size).to be 1 - expect(single_sample.locations.first.path).to eq 'Missing Allocations' + expect(single_sample.locations.first.path).to eq 'Skipped Samples' end end From 8c9939fe687c65e34b52faf2d275f2a8375727d9 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 8 Jul 2024 09:30:43 +0100 Subject: [PATCH 3/9] Add workaround for buggy gcc warning We've seen this in a few other places in the codebase, so let's use the usual workaround. --- .../collectors_thread_context.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 6f12c7e64a0..8704c42ecb4 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -1408,9 +1408,10 @@ void thread_context_collector_sample_skipped_allocation_samples(VALUE self_insta TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); ddog_prof_Label labels[] = { - {.key = DDOG_CHARSLICE_C("thread id"), .str = DDOG_CHARSLICE_C("SA")}, - {.key = DDOG_CHARSLICE_C("thread name"), .str = DDOG_CHARSLICE_C("Skipped Samples")}, - {.key = DDOG_CHARSLICE_C("allocation class"), .str = DDOG_CHARSLICE_C("(Skipped Samples)")}, + // Providing .num = 0 should not be needed but the tracer-2.7 docker image ships a buggy gcc that complains about this + {.key = DDOG_CHARSLICE_C("thread id"), .str = DDOG_CHARSLICE_C("SA"), .num = 0}, + {.key = DDOG_CHARSLICE_C("thread name"), .str = DDOG_CHARSLICE_C("Skipped Samples"), .num = 0}, + {.key = DDOG_CHARSLICE_C("allocation class"), .str = DDOG_CHARSLICE_C("(Skipped Samples)"), .num = 0}, }; ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = sizeof(labels) / sizeof(labels[0])}; From 2d682c3a086d52e5692bd5ed5fc47415ba70f4cf Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 8 Jul 2024 10:08:11 +0100 Subject: [PATCH 4/9] Minor: Rename a few left-behind items to the new "skipped samples" name --- .../collectors_thread_context.c | 10 +++++----- .../collectors_thread_context.h | 2 +- .../profiling/collectors/thread_context_spec.rb | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 8704c42ecb4..bb78763be68 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -231,7 +231,7 @@ static void ddtrace_otel_trace_identifiers_for( VALUE active_span, VALUE otel_values ); -static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE missed_allocations); +static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples); void collectors_thread_context_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); @@ -1403,7 +1403,7 @@ static void ddtrace_otel_trace_identifiers_for( *numeric_span_id = resolved_numeric_span_id; } -void thread_context_collector_sample_skipped_allocation_samples(VALUE self_instance, unsigned int missed_allocations) { +void thread_context_collector_sample_skipped_allocation_samples(VALUE self_instance, unsigned int skipped_samples) { struct thread_context_collector_state *state; TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); @@ -1418,7 +1418,7 @@ void thread_context_collector_sample_skipped_allocation_samples(VALUE self_insta record_placeholder_stack( state->sampling_buffer, state->recorder_instance, - (sample_values) {.alloc_samples = missed_allocations}, + (sample_values) {.alloc_samples = skipped_samples}, (sample_labels) { .labels = slice_labels, .state_label = NULL, @@ -1428,7 +1428,7 @@ void thread_context_collector_sample_skipped_allocation_samples(VALUE self_insta ); } -static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE missed_allocations) { - thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(missed_allocations)); +static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples) { + thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(skipped_samples)); return Qtrue; } diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.h b/ext/datadog_profiling_native_extension/collectors_thread_context.h index 16e9405ca3b..073c1caa911 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.h +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.h @@ -9,7 +9,7 @@ void thread_context_collector_sample( VALUE profiler_overhead_stack_thread ); void thread_context_collector_sample_allocation(VALUE self_instance, unsigned int sample_weight, VALUE new_object); -void thread_context_collector_sample_skipped_allocation_samples(VALUE self_instance, unsigned int missed_allocations); +void thread_context_collector_sample_skipped_allocation_samples(VALUE self_instance, unsigned int skipped_samples); VALUE thread_context_collector_sample_after_gc(VALUE self_instance); void thread_context_collector_on_gc_start(VALUE self_instance); bool thread_context_collector_on_gc_finish(VALUE self_instance); diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index 09fc04e19b3..67dd0bfa43f 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -79,8 +79,8 @@ def sample_allocation(weight:, new_object: Object.new) described_class::Testing._native_sample_allocation(cpu_and_wall_time_collector, weight, new_object) end - def sample_skipped_allocation_samples(missed_allocations) - described_class::Testing._native_sample_skipped_allocation_samples(cpu_and_wall_time_collector, missed_allocations) + def sample_skipped_allocation_samples(skipped_samples) + described_class::Testing._native_sample_skipped_allocation_samples(cpu_and_wall_time_collector, skipped_samples) end def thread_list @@ -1225,11 +1225,11 @@ def self.otel_sdk_available? expect(single_sample.values).to include('alloc-samples': 123) end - it 'attributes the missed allocations to a "Skipped Samples" thread' do + it 'attributes the skipped samples to a "Skipped Samples" thread' do expect(single_sample.labels).to include('thread id': 'SA', 'thread name': 'Skipped Samples') end - it 'attributes the missed allocations to a "(Skipped Samples)" allocation class' do + it 'attributes the skipped samples to a "(Skipped Samples)" allocation class' do expect(single_sample.labels).to include('allocation class': '(Skipped Samples)') end From d19cfce3be7e1f7ba7c6979b138e02b9dc91ce3f Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 8 Jul 2024 11:24:58 +0100 Subject: [PATCH 5/9] Avoid deadlocking in profiler when `end_heap_allocation_recording` raises Due to a bug in the integration between how the heap sampler works and the new "missing allocations/skipped samples" placeholder, we could cause one of the heap sampler sanity checks to fail, and thus raise an exception. This exception would be correctly logged, e.g. something like > W, [2024-07-08T11:24:21.212654 #226523] WARN -- datadog: [datadog] > (lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb:72:in > `rescue in block (2 levels) in start') CpuAndWallTimeWorker thread > error. Cause: RuntimeError Ended a heap recording that was not > started Location: missing-testcase.rb:2:in `new' The exception would also cause the profiler to try to shut down. But this is where we would run into issues: the scheduler still tries to report one last profile before shutting down, and because the above exception left the stack recorder with the active slot mutex locked, this would make the scheduler deadlock forever, thus preventing the Ruby VM from shutting down. Part 1 on fixing this issue is to correctly unlock the mutex, thus getting rid of the deadlock. Part 2 (next commit) will be to not report a profile when the profiler stopped with a failure. This seems like a prudent way to go, since the profile may be in an inconsistent state. (Out of curiosity, both 1 or 2 would suffice to avoid the issue, but I think both seem useful individually to avoid future issues.) --- .../heap_recorder.c | 37 ++++++++++++++++--- .../heap_recorder.h | 4 +- .../stack_recorder.c | 10 ++++- spec/datadog/profiling/stack_recorder_spec.rb | 27 ++++++++++++++ 4 files changed, 71 insertions(+), 7 deletions(-) diff --git a/ext/datadog_profiling_native_extension/heap_recorder.c b/ext/datadog_profiling_native_extension/heap_recorder.c index 9702a32bbb8..d96ba5b544c 100644 --- a/ext/datadog_profiling_native_extension/heap_recorder.c +++ b/ext/datadog_profiling_native_extension/heap_recorder.c @@ -166,6 +166,12 @@ struct heap_recorder { size_t objects_frozen; } stats_last_update; }; + +struct end_heap_allocation_args { + struct heap_recorder *heap_recorder; + ddog_prof_Slice_Location locations; +}; + static heap_record* get_or_create_heap_record(heap_recorder*, ddog_prof_Slice_Location); static void cleanup_heap_record_if_unused(heap_recorder*, heap_record*); static void on_committed_object_record_cleanup(heap_recorder *heap_recorder, object_record *record); @@ -176,6 +182,7 @@ static int st_object_records_iterate(st_data_t, st_data_t, st_data_t); static int st_object_records_debug(st_data_t key, st_data_t value, st_data_t extra); static int update_object_record_entry(st_data_t*, st_data_t*, st_data_t, int); static void commit_recording(heap_recorder*, heap_record*, recording); +static VALUE end_heap_allocation_recording(VALUE end_heap_allocation_args); // ========================== // Heap Recorder External API @@ -340,9 +347,28 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj }; } -void end_heap_allocation_recording(struct heap_recorder *heap_recorder, ddog_prof_Slice_Location locations) { +// end_heap_allocation_recording_with_rb_protect gets called while the stack_recorder is holding one of the profile +// locks. To enable us to correctly unlock the profile on exception, we wrap the call to end_heap_allocation_recording +// with an rb_protect. +__attribute__((warn_unused_result)) +int end_heap_allocation_recording_with_rb_protect(struct heap_recorder *heap_recorder, ddog_prof_Slice_Location locations) { + int exception_state; + struct end_heap_allocation_args end_heap_allocation_args = { + .heap_recorder = heap_recorder, + .locations = locations, + }; + rb_protect(end_heap_allocation_recording, (VALUE) &end_heap_allocation_args, &exception_state); + return exception_state; +} + +static VALUE end_heap_allocation_recording(VALUE end_heap_allocation_args) { + struct end_heap_allocation_args *args = (struct end_heap_allocation_args *) end_heap_allocation_args; + + struct heap_recorder *heap_recorder = args->heap_recorder; + ddog_prof_Slice_Location locations = args->locations; + if (heap_recorder == NULL) { - return; + return Qnil; } recording active_recording = heap_recorder->active_recording; @@ -356,15 +382,16 @@ void end_heap_allocation_recording(struct heap_recorder *heap_recorder, ddog_pro // data required for committing though. heap_recorder->active_recording = (recording) {0}; - if (active_recording.object_record == &SKIPPED_RECORD) { - // special marker when we decided to skip due to sampling - return; + if (active_recording.object_record == &SKIPPED_RECORD) { // special marker when we decided to skip due to sampling + return Qnil; } heap_record *heap_record = get_or_create_heap_record(heap_recorder, locations); // And then commit the new allocation. commit_recording(heap_recorder, heap_record, active_recording); + + return Qnil; } void heap_recorder_prepare_iteration(heap_recorder *heap_recorder) { diff --git a/ext/datadog_profiling_native_extension/heap_recorder.h b/ext/datadog_profiling_native_extension/heap_recorder.h index 4d8a7aa1e89..6647d2e7432 100644 --- a/ext/datadog_profiling_native_extension/heap_recorder.h +++ b/ext/datadog_profiling_native_extension/heap_recorder.h @@ -114,7 +114,9 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj // @param locations The stacktrace representing the location of the allocation. // // WARN: It is illegal to call this without previously having called ::start_heap_allocation_recording. -void end_heap_allocation_recording(heap_recorder *heap_recorder, ddog_prof_Slice_Location locations); +// WARN: This method rescues exceptions with `rb_protect`, returning the exception state integer for the caller to handle. +__attribute__((warn_unused_result)) +int end_heap_allocation_recording_with_rb_protect(heap_recorder *heap_recorder, ddog_prof_Slice_Location locations); // Update the heap recorder to reflect the latest state of the VM and prepare internal structures // for efficient iteration. diff --git a/ext/datadog_profiling_native_extension/stack_recorder.c b/ext/datadog_profiling_native_extension/stack_recorder.c index 169a73765f1..b543f058a1f 100644 --- a/ext/datadog_profiling_native_extension/stack_recorder.c +++ b/ext/datadog_profiling_native_extension/stack_recorder.c @@ -610,7 +610,15 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, // FIXME: Heap sampling currently has to be done in 2 parts because the construction of locations is happening // very late in the allocation-sampling path (which is shared with the cpu sampling path). This can // be fixed with some refactoring but for now this leads to a less impactful change. - end_heap_allocation_recording(state->heap_recorder, locations); + // + // NOTE: The heap recorder is allowed to raise exceptions if something's wrong. But we also need to handle it + // on this side to make sure we properly unlock the active slot mutex on our way out. Otherwise, this would + // later lead to deadlocks (since the active slot mutex is not expected to be locked forever). + int exception_state = end_heap_allocation_recording_with_rb_protect(state->heap_recorder, locations); + if (exception_state) { + sampler_unlock_active_profile(active_slot); + rb_jump_tag(exception_state); + } } ddog_prof_Profile_Result result = ddog_prof_Profile_add( diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 5fcf7b9ab25..499fb1c0967 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -767,6 +767,33 @@ def create_obj_in_recycled_slot(should_sample_original: false) expect(relevant_sample).to be nil end end + + # NOTE: This is a regression test that exceptions in end_heap_allocation_recording_with_rb_protect are safely + # handled by the stack_recorder. + context 'when the heap sampler raises an exception during _native_sample' do + it 'propagates the exception' do + expect do + Datadog::Profiling::Collectors::Stack::Testing + ._native_sample(Thread.current, stack_recorder, metric_values, labels, numeric_labels, 400, false) + end.to raise_error(RuntimeError, /Ended a heap recording/) + end + + it 'does not keep the active slot mutex locked' do + expect(active_slot).to be 1 + expect(slot_one_mutex_locked?).to be false + expect(slot_two_mutex_locked?).to be true + + begin + Datadog::Profiling::Collectors::Stack::Testing + ._native_sample(Thread.current, stack_recorder, metric_values, labels, numeric_labels, 400, false) + rescue # rubocop:disable Lint/SuppressedException + end + + expect(active_slot).to be 1 + expect(slot_one_mutex_locked?).to be false + expect(slot_two_mutex_locked?).to be true + end + end end end From 366df1628d38f17b6137b090b8d556d36ff82c8e Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 8 Jul 2024 12:52:18 +0100 Subject: [PATCH 6/9] Do not try to flush profile when profiler failed When the worker component fails, we also stop the scheduler. But, up until now, if there was enough data to be reported, we would still try to serialize and report a last profile. This is not an entirely great idea, as the profile may not be in a consistent state. So, instead, this commit adds a flag to mark this situation and the scheduler will use it to skip any attempts to report data. --- lib/datadog/profiling/profiler.rb | 1 + lib/datadog/profiling/scheduler.rb | 12 ++++++++++-- sig/datadog/profiling/scheduler.rbs | 1 + spec/datadog/profiling/profiler_spec.rb | 7 +++++++ spec/datadog/profiling/scheduler_spec.rb | 9 +++++++++ 5 files changed, 28 insertions(+), 2 deletions(-) diff --git a/lib/datadog/profiling/profiler.rb b/lib/datadog/profiling/profiler.rb index 3231eabc330..8d6578a8fc8 100644 --- a/lib/datadog/profiling/profiler.rb +++ b/lib/datadog/profiling/profiler.rb @@ -59,6 +59,7 @@ def component_failed(failed_component) # we're operating in a degraded state and crash tracking may still be helpful. if failed_component == :worker + scheduler.mark_profiler_failed stop_scheduler elsif failed_component == :scheduler stop_worker diff --git a/lib/datadog/profiling/scheduler.rb b/lib/datadog/profiling/scheduler.rb index 96da490b4e2..73cc94fc27a 100644 --- a/lib/datadog/profiling/scheduler.rb +++ b/lib/datadog/profiling/scheduler.rb @@ -22,7 +22,8 @@ class Scheduler < Core::Worker attr_reader \ :exporter, - :transport + :transport, + :profiler_failed public @@ -34,6 +35,7 @@ def initialize( ) @exporter = exporter @transport = transport + @profiler_failed = false # Workers::Async::Thread settings self.fork_policy = fork_policy @@ -80,8 +82,14 @@ def loop_wait_before_first_iteration? true end + # This is called by the Profiler class whenever an issue happened in the profiler. This makes sure that even + # if there is data to be flushed, we don't try to flush it. + def mark_profiler_failed + @profiler_failed = true + end + def work_pending? - exporter.can_flush? + !profiler_failed && exporter.can_flush? end def reset_after_fork diff --git a/sig/datadog/profiling/scheduler.rbs b/sig/datadog/profiling/scheduler.rbs index fabd51ecabb..8a7e9765ed6 100644 --- a/sig/datadog/profiling/scheduler.rbs +++ b/sig/datadog/profiling/scheduler.rbs @@ -14,6 +14,7 @@ module Datadog def start: (?on_failure_proc: ::Proc?) -> void def reset_after_fork: () -> void + def mark_profiler_failed: () -> true end end end diff --git a/spec/datadog/profiling/profiler_spec.rb b/spec/datadog/profiling/profiler_spec.rb index 98ee1722754..ccdbf7b69ff 100644 --- a/spec/datadog/profiling/profiler_spec.rb +++ b/spec/datadog/profiling/profiler_spec.rb @@ -123,6 +123,7 @@ before do allow(scheduler).to receive(:enabled=) allow(scheduler).to receive(:stop) + allow(scheduler).to receive(:mark_profiler_failed) end it 'logs the issue' do @@ -131,6 +132,12 @@ worker_on_failure end + it 'marks the profiler as having failed in the scheduler' do + expect(scheduler).to receive(:mark_profiler_failed) + + worker_on_failure + end + it 'stops the scheduler' do expect(scheduler).to receive(:enabled=).with(false) expect(scheduler).to receive(:stop).with(true) diff --git a/spec/datadog/profiling/scheduler_spec.rb b/spec/datadog/profiling/scheduler_spec.rb index 3a33b9037ff..fa3ea06f38f 100644 --- a/spec/datadog/profiling/scheduler_spec.rb +++ b/spec/datadog/profiling/scheduler_spec.rb @@ -249,6 +249,15 @@ it { is_expected.to be false } end + + context 'when the profiler was marked as failed' do + before do + scheduler.mark_profiler_failed + expect(exporter).to_not receive(:can_flush?) + end + + it { is_expected.to be false } + end end describe '#reset_after_fork' do From 6f8f756dc165377c25d1984a36e82373e8af7e7f Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 8 Jul 2024 15:41:41 +0100 Subject: [PATCH 7/9] Fix placeholder for skipped samples in allocation profiler breaking heap profiler I missed it on my original commits, but the placeholder for skipped samples broke the heap profiler, as it started failing with: > W, [2024-07-08T15:41:27.128532 #366230] WARN -- datadog: [datadog] > CpuAndWallTimeWorker thread error. Cause: RuntimeError Ended a heap > recording that was not started Location: > missing-testcase.rb:2:in `new' > W, [2024-07-08T15:41:27.128583 #366230] WARN -- datadog: [datadog] > Detected issue with profiler (worker component), stopping profiling. > See previous log messages for details. This is triggered by how our current heap profiler integrates with the rest of the profiler: it expects to be called twice for each allocation sample -- once before the stack is collected, and once after. The implementation of the placeholder for skipped samples correctly did not call the heap profiler before the placeholder stack got collected BUT we were still calling the heap profiler after. This caused its internal consistency checks to fail. The fix for this is to not call the heap profiler after the placeholder sample. I chose to add a new argument to `record_sample` which is not particularly pretty, but I guess until we come up with a better way to have the heap profiler integrate with the rest of the components, it'll do. --- .../collectors_stack.c | 6 ++++-- .../stack_recorder.c | 4 ++-- .../stack_recorder.h | 2 +- .../profiling/collectors/thread_context_spec.rb | 10 ++++++++++ 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index 7cb073928e9..90e6f2c15be 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -264,7 +264,8 @@ void sample_thread( recorder_instance, (ddog_prof_Slice_Location) {.ptr = buffer->locations, .len = captured_frames}, values, - labels + labels, + /* placeholder: */ false ); } @@ -339,7 +340,8 @@ void record_placeholder_stack( recorder_instance, (ddog_prof_Slice_Location) {.ptr = buffer->locations, .len = 1}, values, - labels + labels, + /* placeholder: */ true ); } diff --git a/ext/datadog_profiling_native_extension/stack_recorder.c b/ext/datadog_profiling_native_extension/stack_recorder.c index b543f058a1f..52322e95716 100644 --- a/ext/datadog_profiling_native_extension/stack_recorder.c +++ b/ext/datadog_profiling_native_extension/stack_recorder.c @@ -586,7 +586,7 @@ static VALUE ruby_time_from(ddog_Timespec ddprof_time) { return rb_time_timespec_new(&time, utc); } -void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels) { +void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels, bool placeholder) { struct stack_recorder_state *state; TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); @@ -605,7 +605,7 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, metric_values[position_for[ALLOC_SAMPLES_VALUE_ID]] = values.alloc_samples; metric_values[position_for[TIMELINE_VALUE_ID]] = values.timeline_wall_time_ns; - if (values.alloc_samples != 0) { + if (!placeholder && values.alloc_samples > 0) { // If we got an allocation sample end the heap allocation recording to commit the heap sample. // FIXME: Heap sampling currently has to be done in 2 parts because the construction of locations is happening // very late in the allocation-sampling path (which is shared with the cpu sampling path). This can diff --git a/ext/datadog_profiling_native_extension/stack_recorder.h b/ext/datadog_profiling_native_extension/stack_recorder.h index cfb93f7a25f..949b737ca71 100644 --- a/ext/datadog_profiling_native_extension/stack_recorder.h +++ b/ext/datadog_profiling_native_extension/stack_recorder.h @@ -21,7 +21,7 @@ typedef struct sample_labels { int64_t end_timestamp_ns; } sample_labels; -void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels); +void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels, bool placeholder); void record_endpoint(VALUE recorder_instance, uint64_t local_root_span_id, ddog_CharSlice endpoint); void track_object(VALUE recorder_instance, VALUE new_object, unsigned int sample_weight, ddog_CharSlice *alloc_class); VALUE enforce_recorder_instance(VALUE object); diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index 67dd0bfa43f..44de2feb97d 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -1237,6 +1237,16 @@ def self.otel_sdk_available? expect(single_sample.locations.size).to be 1 expect(single_sample.locations.first.path).to eq 'Skipped Samples' end + + context 'when heap sampling is enabled' do + let(:recorder) { build_stack_recorder(heap_samples_enabled: true) } + + it 'records only the number of skipped allocations, and does not record any heap samples' do + GC.start # Force any incorrect heap samples to have age > 1 + + expect(single_sample.values).to include('alloc-samples': 123, 'heap-live-samples': 0) + end + end end describe '#thread_list' do From 5da21c1d3b7b77c75b111adbaea3a457ce0a289c Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 8 Jul 2024 17:27:25 +0100 Subject: [PATCH 8/9] Disable heap profiling spec on legacy rubies Oooops! This spec definitely should not be running on legacy Rubies. It actually started to flake with a recent change, and it took me a while to actually remember that it shouldn't even be running in the first place. --- .../profiling/collectors/cpu_and_wall_time_worker_spec.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 954749500b9..16b7e133b85 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -654,6 +654,7 @@ let(:options) { { dynamic_sampling_rate_enabled: false } } before do + skip 'Heap profiling is only supported on Ruby >= 2.7' if RUBY_VERSION < '2.7' allow(Datadog.logger).to receive(:warn) expect(Datadog.logger).to receive(:warn).with(/dynamic sampling rate disabled/) end From abdb5235d2f6ab09e730fcc1d7ad92111276e787 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 9 Jul 2024 11:41:01 +0100 Subject: [PATCH 9/9] Tweak specs to try to avoid flakiness I saw some flakiness in CI, so this is a very soft attempt at avoiding it; let's see if it's enough or we'll need more changes. --- .../profiling/collectors/cpu_and_wall_time_worker_spec.rb | 4 ++++ spec/datadog/profiling/stack_recorder_spec.rb | 1 + 2 files changed, 5 insertions(+) diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 16b7e133b85..55a80406c92 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -549,6 +549,8 @@ thread_that_allocates_as_fast_as_possible = Thread.new { loop { BasicObject.new } } + Thread.pass + allocation_samples = try_wait_until do samples = samples_from_pprof(recorder.serialize!).select { |it| it.values[:'alloc-samples'] > 0 } samples if samples.any? { |it| it.labels[:'thread name'] == 'Skipped Samples' } @@ -557,6 +559,8 @@ thread_that_allocates_as_fast_as_possible.kill thread_that_allocates_as_fast_as_possible.join + GC.start + cpu_and_wall_time_worker.stop expect(allocation_samples).to_not be_empty diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 499fb1c0967..c9a2efe432e 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -399,6 +399,7 @@ def sample_allocation(obj) end before do + GC.start allocations = [a_string, an_array, "a fearsome interpolated string: #{sample_rate}", (-10..-1).to_a, a_hash, { 'z' => -1, 'y' => '-2', 'x' => false }, Object.new] @num_allocations = 0