From cbee551914893c82df09287286d7bd1b3a90b7d3 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Feb 2024 18:53:39 +0000 Subject: [PATCH 1/2] Fix "datadog/profiling/collectors/info.rb:62: warning: instance variable @profiler_info not initialized" --- lib/datadog/profiling/collectors/info.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/datadog/profiling/collectors/info.rb b/lib/datadog/profiling/collectors/info.rb index c8c257f90e3..49cbf776ff4 100644 --- a/lib/datadog/profiling/collectors/info.rb +++ b/lib/datadog/profiling/collectors/info.rb @@ -14,6 +14,7 @@ module Collectors # gathering and easily support more flexible/dynamic info collection in the future. class Info def initialize(settings) + @profiler_info = nil @info = { platform: collect_platform_info, runtime: collect_runtime_info, From 7843f7ccf2ca9ca4e6d6bef48f15edbfa911f2c7 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 20 Feb 2024 14:38:33 +0000 Subject: [PATCH 2/2] [PROF-9163] Fix missing profiling code hotspots when using ddtrace+otel **What does this PR do?** This PR fixes the profiler not sending code hotspots data when ddtrace is being used with OpenTelemetry (e.g. [when following these instructions](https://docs.datadoghq.com/tracing/trace_collection/custom_instrumentation/otel_instrumentation/ruby/)). Note that in particular this only works in apps that use ```ruby require 'datadog/opentelemetry' ``` **Motivation:** Since in this mode, otel data goes through ddtrace, code hotspots should be correctly supported. The only reason this didn't work is that otel spans were handled slightly differently, and the profiler was not correctly accounting for this. **Additional Notes:** N/A **How to test the change?** I've updated the `Rakefile` to run the profiler test suite with and without opentelemetry, and added test coverage for this. To test it manually, you can add `gem 'opentelemetry-sdk'` to your Gemfile, and the specs will run. Here's also a trivial app you can use for testing: ```ruby require 'opentelemetry/sdk' require 'datadog/opentelemetry' OpenTelemetry::SDK.configure MyAppTracer = OpenTelemetry.tracer_provider.tracer('otel-testing') MyAppTracer.in_span("do_work") do |span| sleep 2 end ``` --- Rakefile | 1 + .../collectors_thread_context.c | 94 ++++++++++- .../collectors/thread_context_spec.rb | 158 +++++++++++++++++- 3 files changed, 243 insertions(+), 10 deletions(-) diff --git a/Rakefile b/Rakefile index a29cb33859e..2391e14b9c8 100644 --- a/Rakefile +++ b/Rakefile @@ -18,6 +18,7 @@ TEST_METADATA = { '' => '✅ 2.1 / ✅ 2.2 / ✅ 2.3 / ✅ 2.4 / ✅ 2.5 / ✅ 2.6 / ✅ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ✅ jruby' }, 'profiling:main' => { + 'opentelemetry' => '❌ 2.1 / ❌ 2.2 / ❌ 2.3 / ❌ 2.4 / ❌ 2.5 / ✅ 2.6 / ✅ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ❌ jruby', '' => '✅ 2.1 / ✅ 2.2 / ✅ 2.3 / ✅ 2.4 / ✅ 2.5 / ✅ 2.6 / ✅ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ✅ jruby' }, 'profiling:ractors' => { diff --git a/ext/ddtrace_profiling_native_extension/collectors_thread_context.c b/ext/ddtrace_profiling_native_extension/collectors_thread_context.c index cbfb06c3e58..fb59ccf70ee 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_thread_context.c +++ b/ext/ddtrace_profiling_native_extension/collectors_thread_context.c @@ -82,6 +82,9 @@ static ID at_id_id; // id of :@id in Ruby static ID at_resource_id; // id of :@resource in Ruby static ID at_root_span_id; // id of :@root_span in Ruby static ID at_type_id; // id of :@type in Ruby +static ID at_otel_values_id; // id of :@otel_values in Ruby +static ID at_parent_span_id_id; // id of :@parent_span_id in Ruby +static ID at_datadog_trace_id; // id of :@datadog_trace in Ruby // Contains state for a single ThreadContext instance struct thread_context_collector_state { @@ -114,6 +117,8 @@ struct thread_context_collector_state { monotonic_to_system_epoch_state time_converter_state; // Used to identify the main thread, to give it a fallback name VALUE main_thread; + // Used when extracting trace identifiers from otel spans. Lazily initialized. + VALUE otel_current_span_key; struct stats { // Track how many garbage collection samples we've taken. @@ -218,6 +223,14 @@ static VALUE thread_list(struct thread_context_collector_state *state); static VALUE _native_sample_allocation(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE sample_weight, VALUE new_object); static VALUE _native_new_empty_thread(VALUE self); static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type); +static void ddtrace_otel_trace_identifiers_for( + struct thread_context_collector_state *state, + VALUE *active_trace, + VALUE *root_span, + VALUE *numeric_span_id, + VALUE active_span, + VALUE otel_values +); void collectors_thread_context_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); @@ -255,6 +268,9 @@ void collectors_thread_context_init(VALUE profiling_module) { at_resource_id = rb_intern_const("@resource"); at_root_span_id = rb_intern_const("@root_span"); at_type_id = rb_intern_const("@type"); + at_otel_values_id = rb_intern_const("@otel_values"); + at_parent_span_id_id = rb_intern_const("@parent_span_id"); + at_datadog_trace_id = rb_intern_const("@datadog_trace"); gc_profiling_init(); } @@ -282,6 +298,7 @@ static void thread_context_collector_typed_data_mark(void *state_ptr) { st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_mark, 0 /* unused */); rb_gc_mark(state->thread_list_buffer); rb_gc_mark(state->main_thread); + rb_gc_mark(state->otel_current_span_key); } static void thread_context_collector_typed_data_free(void *state_ptr) { @@ -334,6 +351,7 @@ static VALUE _native_new(VALUE klass) { state->allocation_type_enabled = true; state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER; state->main_thread = rb_thread_main(); + state->otel_current_span_key = Qnil; state->gc_tracking.wall_time_at_previous_gc_ns = INVALID_TIME; state->gc_tracking.wall_time_at_last_flushed_gc_event_ns = 0; @@ -917,6 +935,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc )); rb_str_concat(result, rb_sprintf(" main_thread=%"PRIsVALUE, state->main_thread)); rb_str_concat(result, rb_sprintf(" gc_tracking=%"PRIsVALUE, gc_tracking_as_ruby_hash(state))); + rb_str_concat(result, rb_sprintf(" otel_current_span_key=%"PRIsVALUE, state->otel_current_span_key)); return result; } @@ -1104,10 +1123,19 @@ static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE root_span = rb_ivar_get(active_trace, at_root_span_id /* @root_span */); VALUE active_span = rb_ivar_get(active_trace, at_active_span_id /* @active_span */); - if (root_span == Qnil || active_span == Qnil) return; + // Note: On Ruby 3.x `rb_attr_get` is exactly the same as `rb_ivar_get`. For Ruby 2.x, the difference is that + // `rb_ivar_get` can trigger "warning: instance variable @otel_values not initialized" if warnings are enabled and + // opentelemetry is not in use, whereas `rb_attr_get` does the lookup without generating the warning. + VALUE otel_values = rb_attr_get(active_trace, at_otel_values_id /* @otel_values */); + + VALUE numeric_span_id = Qnil; + + if (otel_values != Qnil) ddtrace_otel_trace_identifiers_for(state, &active_trace, &root_span, &numeric_span_id, active_span, otel_values); + + if (root_span == Qnil || (active_span == Qnil && numeric_span_id == Qnil)) return; VALUE numeric_local_root_span_id = rb_ivar_get(root_span, at_id_id /* @id */); - VALUE numeric_span_id = rb_ivar_get(active_span, at_id_id /* @id */); + if (active_span != Qnil && numeric_span_id == Qnil) numeric_span_id = rb_ivar_get(active_span, at_id_id /* @id */); if (numeric_local_root_span_id == Qnil || numeric_span_id == Qnil) return; trace_identifiers_result->local_root_span_id = NUM2ULL(numeric_local_root_span_id); @@ -1299,3 +1327,65 @@ static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) { default: return DDOG_CHARSLICE_C("(VM Internal, Missing class)"); } } + +static VALUE get_otel_current_span_key(struct thread_context_collector_state *state) { + if (state->otel_current_span_key == Qnil) { + VALUE datadog_module = rb_const_get(rb_cObject, rb_intern("Datadog")); + VALUE opentelemetry_module = rb_const_get(datadog_module, rb_intern("OpenTelemetry")); + VALUE api_module = rb_const_get(opentelemetry_module, rb_intern("API")); + VALUE context_module = rb_const_get(api_module, rb_intern_const("Context")); + VALUE current_span_key = rb_const_get(context_module, rb_intern_const("CURRENT_SPAN_KEY")); + + if (current_span_key == Qnil) { + rb_raise(rb_eRuntimeError, "Unexpected: Missing Datadog::OpenTelemetry::API::Context::CURRENT_SPAN_KEY"); + } + + state->otel_current_span_key = current_span_key; + } + + return state->otel_current_span_key; +} + +// This method gets used when ddtrace is being used indirectly via the otel APIs. Information gets stored slightly +// differently, and this codepath handles it. +static void ddtrace_otel_trace_identifiers_for( + struct thread_context_collector_state *state, + VALUE *active_trace, + VALUE *root_span, + VALUE *numeric_span_id, + VALUE active_span, + VALUE otel_values +) { + VALUE resolved_numeric_span_id = + active_span == Qnil ? + // For traces started from otel spans, the span id will be empty, and the @parent_span_id has the right value + rb_ivar_get(*active_trace, at_parent_span_id_id /* @parent_span_id */) : + // Regular span created by ddtrace + rb_ivar_get(active_span, at_id_id /* @id */); + + if (resolved_numeric_span_id == Qnil) return; + + VALUE otel_current_span_key = get_otel_current_span_key(state); + VALUE current_trace = *active_trace; + + // ddtrace uses a different structure when spans are created from otel, where each otel span will have a unique ddtrace + // trace and span representing it. Each ddtrace trace is then connected to the previous otel span, forming a linked + // list. The local root span is going to be the trace/span we find at the end of this linked list. + while (otel_values != Qnil) { + VALUE otel_span = rb_hash_lookup(otel_values, otel_current_span_key); + if (otel_span == Qnil) break; + VALUE next_trace = rb_ivar_get(otel_span, at_datadog_trace_id); + if (next_trace == Qnil) break; + + current_trace = next_trace; + otel_values = rb_ivar_get(current_trace, at_otel_values_id /* @otel_values */); + } + + // We found the last trace in the linked list. This contains the local root span + VALUE resolved_root_span = rb_ivar_get(current_trace, at_root_span_id /* @root_span */); + if (resolved_root_span == Qnil) return; + + *root_span = resolved_root_span; + *active_trace = current_trace; + *numeric_span_id = resolved_numeric_span_id; +} diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index 6ebfd62cea1..9f39a5b36bb 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -377,19 +377,13 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current end before do - # Sanity checks expect(@t1_span_id).to be > 0 expect(@t1_local_root_span_id).to be > 0 - expect(@t1_span_id).to_not be @t1_local_root_span_id - end - - it 'samples successfully' do - sample - - expect(t1_sample).to_not be_nil end it 'includes "local root span id" and "span id" labels in the samples' do + expect(@t1_span_id).to_not be @t1_local_root_span_id + sample expect(t1_sample.labels).to include( @@ -527,6 +521,154 @@ def another_way_of_calling_sample(profiler_overhead_stack_thread: Thread.current it_behaves_like 'samples with code hotspots information' end + + def self.otel_sdk_available? + begin + require 'opentelemetry/sdk' + true + rescue LoadError + false + end + end + + context 'when trace comes from otel sdk', if: otel_sdk_available? do + let(:otel_tracer) do + require 'datadog/opentelemetry' + + OpenTelemetry::SDK.configure + OpenTelemetry.tracer_provider.tracer('ddtrace-profiling-test') + end + + let(:t1) do + Thread.new(ready_queue, otel_tracer) do |ready_queue, otel_tracer| + otel_tracer.in_span('profiler.test') do + @t1_span_id = Datadog::Tracing.correlation.span_id + @t1_local_root_span_id = Datadog::Tracing.correlation.span_id + ready_queue << true + sleep + end + end + end + + it 'includes "local root span id" and "span id" labels in the samples' do + sample + + expect(t1_sample.labels).to include( + :'local root span id' => @t1_local_root_span_id.to_i, + :'span id' => @t1_span_id.to_i, + ) + end + + it 'does not include the "trace endpoint" label' do + sample + + expect(t1_sample.labels).to_not include(:'trace endpoint' => anything) + end + + context 'when there are multiple otel spans nested' do + let(:t1) do + Thread.new(ready_queue, otel_tracer) do |ready_queue, otel_tracer| + otel_tracer.in_span('profiler.test') do + @t1_local_root_span_id = Datadog::Tracing.correlation.span_id + otel_tracer.in_span('profiler.test.nested.1') do + otel_tracer.in_span('profiler.test.nested.2') do + otel_tracer.in_span('profiler.test.nested.3') do + @t1_span_id = Datadog::Tracing.correlation.span_id + ready_queue << true + sleep + end + end + end + end + end + end + + it 'includes "local root span id" and "span id" labels in the samples' do + sample + + expect(t1_sample.labels).to include( + :'local root span id' => @t1_local_root_span_id.to_i, + :'span id' => @t1_span_id.to_i, + ) + end + end + + context 'mixing of otel sdk and ddtrace' do + context 'when top-level span is started from ddtrace' do + let(:t1) do + Thread.new(ready_queue, otel_tracer) do |ready_queue, otel_tracer| + Datadog::Tracing.trace('profiler.test', type: :web) do |_span, trace| + trace.resource = 'example_resource' + + @t1_local_root_span_id = Datadog::Tracing.correlation.span_id + otel_tracer.in_span('profiler.test.nested.1') do + Datadog::Tracing.trace('profiler.test.nested.2') do + otel_tracer.in_span('profiler.test.nested.3') do + Datadog::Tracing.trace('profiler.test.nested.4') do + @t1_span_id = Datadog::Tracing.correlation.span_id + ready_queue << true + sleep + end + end + end + end + end + end + end + + it 'uses the local root span id from the top-level span, and the span id from the leaf span' do + sample + + expect(t1_sample.labels).to include( + :'local root span id' => @t1_local_root_span_id.to_i, + :'span id' => @t1_span_id.to_i, + ) + end + + it 'includes the "trace endpoint" label in the samples with the trace resource' do + sample + + expect(t1_sample.labels).to include(:'trace endpoint' => 'example_resource') + end + end + + context 'when top-level span is started from otel' do + let(:t1) do + Thread.new(ready_queue, otel_tracer) do |ready_queue, otel_tracer| + otel_tracer.in_span('profiler.test') do + @t1_local_root_span_id = Datadog::Tracing.correlation.span_id + otel_tracer.in_span('profiler.test.nested.1') do + Datadog::Tracing.trace('profiler.test.nested.2') do + otel_tracer.in_span('profiler.test.nested.3') do + Datadog::Tracing.trace('profiler.test.nested.4') do + @t1_span_id = Datadog::Tracing.correlation.span_id + ready_queue << true + sleep + end + end + end + end + end + end + end + + it 'uses the local root span id from the top-level span, and the span id from the leaf span' do + sample + + expect(t1_sample.labels).to include( + :'local root span id' => @t1_local_root_span_id.to_i, + :'span id' => @t1_span_id.to_i, + ) + end + end + end + end + + context 'when trace comes from otel sdk (warning)', unless: otel_sdk_available? do + it 'is not being tested' do + skip 'Skipping OpenTelemetry tests because `opentelemetry-sdk` gem is not available' + end + end end end end