diff --git a/ext/ddtrace_profiling_native_extension/collectors_stack.c b/ext/ddtrace_profiling_native_extension/collectors_stack.c index 7611f13e41e..f8b9ca84f50 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_stack.c +++ b/ext/ddtrace_profiling_native_extension/collectors_stack.c @@ -11,9 +11,6 @@ // Gathers stack traces from running threads, storing them in a StackRecorder instance // This file implements the native bits of the Datadog::Profiling::Collectors::Stack class -#define MAX_FRAMES_LIMIT 10000 -#define MAX_FRAMES_LIMIT_AS_STRING "10000" - static VALUE missing_string = Qnil; // Used as scratch space during sampling diff --git a/ext/ddtrace_profiling_native_extension/collectors_stack.h b/ext/ddtrace_profiling_native_extension/collectors_stack.h index e4175ae35e8..f983c8e6f39 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_stack.h +++ b/ext/ddtrace_profiling_native_extension/collectors_stack.h @@ -4,6 +4,9 @@ #include "stack_recorder.h" +#define MAX_FRAMES_LIMIT 10000 +#define MAX_FRAMES_LIMIT_AS_STRING "10000" + typedef struct sampling_buffer sampling_buffer; typedef enum { SAMPLE_REGULAR, SAMPLE_IN_GC } sample_type; diff --git a/ext/ddtrace_profiling_native_extension/heap_recorder.c b/ext/ddtrace_profiling_native_extension/heap_recorder.c index 627567c3880..fea01cc8e3c 100644 --- a/ext/ddtrace_profiling_native_extension/heap_recorder.c +++ b/ext/ddtrace_profiling_native_extension/heap_recorder.c @@ -1,23 +1,136 @@ #include "heap_recorder.h" #include <pthread.h> #include "ruby/st.h" -#include "ruby/util.h" #include "ruby_helpers.h" #include <errno.h> +#include "collectors_stack.h" + +// A compact representation of a stacktrace frame for a heap allocation. +typedef struct { + char *name; + char *filename; + int32_t line; +} heap_frame; +static st_index_t heap_frame_hash(heap_frame*, st_index_t seed); + +// A compact representation of a stacktrace for a heap allocation. +// +// We could use a ddog_prof_Slice_Location instead but it has a lot of +// unused fields. Because we have to keep these stacks around for at +// least the lifetime of the objects allocated therein, we would be +// incurring a non-negligible memory overhead for little purpose. +typedef struct { + uint16_t frames_len; + heap_frame frames[]; +} heap_stack; +static heap_stack* heap_stack_new(ddog_prof_Slice_Location); +static void heap_stack_free(heap_stack*); +static st_index_t heap_stack_hash(heap_stack*, st_index_t); + +#if MAX_FRAMES_LIMIT > UINT16_MAX + #error Frames len type not compatible with MAX_FRAMES_LIMIT +#endif + +enum heap_record_key_type { + HEAP_STACK, + LOCATION_SLICE +}; +// This struct allows us to use two different types of stacks when +// interacting with a heap_record hash. +// +// The idea is that we'll always want to use heap_stack-keys when +// adding new entries to the hash since that's the compact stack +// representation we rely on internally. +// +// However, when querying for an existing heap record, we'd save a +// lot of allocations if we could query with the +// ddog_prof_Slice_Location we receive in our external API. +// +// To allow this interchange, we need a union and need to ensure +// that, whatever the shape of the union, the heap_record_key_cmp_st +// and heap_record_key_hash_st functions return the same results for +// equivalent stacktraces.
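// Illustrative sketch (not part of the original change): the requirement described above is the
// generic contract of Ruby's st.h hash tables -- the compare callback (0 means "equal") and the
// hash callback registered in a struct st_hash_type must agree with each other for every key
// shape they can be handed. A minimal string-keyed instance of that contract, where example_cmp,
// example_hash and example_type are hypothetical names:
static int example_cmp(st_data_t a, st_data_t b) {
  return strcmp((const char *) a, (const char *) b);
}
static st_index_t example_hash(st_data_t key) {
  const char *str = (const char *) key;
  return st_hash(str, strlen(str), 0);
}
static const struct st_hash_type example_type = { example_cmp, example_hash };
// A table built with st_init_table(&example_type) then treats equal strings stored at different
// addresses as the same key, just like heap_records must treat a heap_stack key and a
// location-slice key for the same stacktrace as equal.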
+typedef struct { + enum heap_record_key_type type; + union { + // key never owns this if set + heap_stack *heap_stack; + // key never owns this if set + ddog_prof_Slice_Location *location_slice; + }; +} heap_record_key; +static heap_record_key* heap_record_key_new(heap_stack*); +static void heap_record_key_free(heap_record_key*); +static int heap_record_key_cmp_st(st_data_t, st_data_t); +static st_index_t heap_record_key_hash_st(st_data_t); +static const struct st_hash_type st_hash_type_heap_record_key = { + heap_record_key_cmp_st, + heap_record_key_hash_st, +}; + +// Need to implement these functions to support the location-slice based keys +static st_index_t ddog_location_hash(ddog_prof_Location, st_index_t seed); +static st_index_t ddog_location_slice_hash(ddog_prof_Slice_Location, st_index_t seed); + +// A heap record is used for deduping heap allocation stacktraces across multiple +// objects sharing the same allocation location. +typedef struct { + // How many objects are currently tracked by the heap recorder for this heap record. + uint32_t num_tracked_objects; + // stack is owned by the associated record and gets cleaned up alongside it + heap_stack *stack; +} heap_record; +static heap_record* heap_record_new(heap_stack*); +static void heap_record_free(heap_record*); + +// An object record is used for storing data about currently tracked live objects +typedef struct { + long obj_id; + heap_record *heap_record; + live_object_data object_data; +} object_record; +static object_record* object_record_new(long, heap_record*, live_object_data); +static void object_record_free(object_record*); // Allows storing data passed to ::start_heap_allocation_recording to make it accessible to // ::end_heap_allocation_recording. // -// obj != Qnil flags this struct as holding a valid partial heap recording. +// obj_id != 0 flags this struct as holding a valid partial heap recording. typedef struct { - VALUE obj; + long obj_id; live_object_data object_data; } partial_heap_recording; struct heap_recorder { + // Map[key: heap_record_key*, record: heap_record*] + // NOTE: We always use heap_record_key.type == HEAP_STACK for storage but support lookups + // via heap_record_key.type == LOCATION_SLICE to allow for allocation-free fast-paths. + // NOTE: This table is currently only protected by the GVL since we never iterate on it + // outside the GVL. + st_table *heap_records; + + // Map[obj_id: long, record: object_record*] + st_table *object_records; + + // Lock protecting writes to object_records. + // NOTE: heap_records is currently not protected by this one since we do not iterate on + // heap records outside the GVL. + pthread_mutex_t records_mutex; + // Data for a heap recording that was started but not yet ended partial_heap_recording active_recording; + + // Reusable location array, implementing a flyweight pattern for things like iteration. 
+ ddog_prof_Location *reusable_locations; }; +static heap_record* get_or_create_heap_record(heap_recorder*, ddog_prof_Slice_Location); +static void cleanup_heap_record_if_unused(heap_recorder*, heap_record*); +static int st_heap_record_entry_free(st_data_t, st_data_t, st_data_t); +static int st_object_record_entry_free(st_data_t, st_data_t, st_data_t); +static int st_object_record_entry_free_if_invalid(st_data_t, st_data_t, st_data_t); +static int st_object_records_iterate(st_data_t, st_data_t, st_data_t); +static int update_object_record_entry(st_data_t*, st_data_t*, st_data_t, int); +static void commit_allocation(heap_recorder*, heap_record*, long, live_object_data); // ========================== // Heap Recorder External API @@ -29,34 +142,66 @@ struct heap_recorder { // // ========================== heap_recorder* heap_recorder_new(void) { - heap_recorder* recorder = ruby_xmalloc(sizeof(heap_recorder)); + heap_recorder *recorder = ruby_xcalloc(1, sizeof(heap_recorder)); + recorder->records_mutex = (pthread_mutex_t) PTHREAD_MUTEX_INITIALIZER; + recorder->heap_records = st_init_table(&st_hash_type_heap_record_key); + recorder->object_records = st_init_numtable(); + recorder->reusable_locations = ruby_xcalloc(MAX_FRAMES_LIMIT, sizeof(ddog_prof_Location)); recorder->active_recording = (partial_heap_recording) { - .obj = Qnil, + .obj_id = 0, // 0 is actually the obj_id of false, but we'll never track that one in heap so we use + // it as invalid/unset value. .object_data = {0}, }; return recorder; } -void heap_recorder_free(struct heap_recorder* recorder) { - if (recorder == NULL) { +void heap_recorder_free(heap_recorder *heap_recorder) { + if (heap_recorder == NULL) { return; } - ruby_xfree(recorder); -} + st_foreach(heap_recorder->object_records, st_object_record_entry_free, 0); + st_free_table(heap_recorder->object_records); + + st_foreach(heap_recorder->heap_records, st_heap_record_entry_free, 0); + st_free_table(heap_recorder->heap_records); -// TODO: Remove when things get implemented -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" + pthread_mutex_destroy(&heap_recorder->records_mutex); + ruby_xfree(heap_recorder->reusable_locations); + + ruby_xfree(heap_recorder); +} + +// WARN: Assumes this gets called before profiler is reinitialized on the fork void heap_recorder_after_fork(heap_recorder *heap_recorder) { if (heap_recorder == NULL) { return; } - // TODO: Implement + // When forking, the child process gets a copy of the entire state of the parent process, minus + // threads. + // + // This means anything the heap recorder is tracking will still be alive after the fork and + // should thus be kept. Because this heap recorder implementation does not rely on free + // tracepoints to track liveness, any frees that happen until we fully reinitialize, will + // simply be noticed on next heap_recorder_flush. + // + // There is one small caveat though: fork only preserves one thread and in a Ruby app, that + // will be the thread holding on to the GVL. Since we support iteration on the heap recorder + // outside of the GVL (which implies acquiring the records_mutex lock), this means the child + // process may be in this weird state of having a records_mutex lock stuck in a locked + // state and that state having been caused by a thread that no longer exists. + // + // We can't blindly unlock records_mutex from the thread calling heap_recorder_after_fork + // as unlocking mutexes a thread doesn't own is undefined behaviour. 
What we can do is + // create a new lock and start using it from now on-forward. This is fine because at this + // point in the fork-handling logic, all tracepoints are disabled and no-one should be + // iterating on the recorder state so there are no writers/readers that may race with + // this reinitialization. + heap_recorder->records_mutex = (pthread_mutex_t) PTHREAD_MUTEX_INITIALIZER; } void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj, unsigned int weight) { @@ -64,8 +209,13 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj return; } + VALUE ruby_obj_id = rb_obj_id(new_obj); + if (!FIXNUM_P(ruby_obj_id)) { + rb_raise(rb_eRuntimeError, "Detected a bignum object id. These are not supported by heap profiling."); + } + heap_recorder->active_recording = (partial_heap_recording) { - .obj = new_obj, + .obj_id = FIX2LONG(ruby_obj_id), .object_data = (live_object_data) { .weight = weight, }, @@ -79,17 +229,34 @@ void end_heap_allocation_recording(struct heap_recorder *heap_recorder, ddog_pro partial_heap_recording *active_recording = &heap_recorder->active_recording; - VALUE new_obj = active_recording->obj; - if (new_obj == Qnil) { + long obj_id = active_recording->obj_id; + if (obj_id == 0) { // Recording ended without having been started? rb_raise(rb_eRuntimeError, "Ended a heap recording that was not started"); } // From now on, mark active recording as invalid so we can short-circuit at any point and // not end up with a still active recording. new_obj still holds the object for this recording - active_recording->obj = Qnil; + active_recording->obj_id = 0; - // TODO: Implement + // NOTE: This is the only path where we lookup/mutate the heap_records hash. Since this + // runs under the GVL, we can afford to interact with heap_records without getting + // the lock below. + heap_record *heap_record = get_or_create_heap_record(heap_recorder, locations); + + int error = pthread_mutex_trylock(&heap_recorder->records_mutex); + if (error == EBUSY) { + // We weren't able to get a lock + // TODO: Add some queuing system so we can do something other than drop this data. + cleanup_heap_record_if_unused(heap_recorder, heap_record); + return; + } + if (error) ENFORCE_SUCCESS_GVL(error); + + // And then commit the new allocation. + commit_allocation(heap_recorder, heap_record, obj_id, active_recording->object_data); + + ENFORCE_SUCCESS_GVL(pthread_mutex_unlock(&heap_recorder->records_mutex)); } void heap_recorder_flush(heap_recorder *heap_recorder) { @@ -97,9 +264,19 @@ void heap_recorder_flush(heap_recorder *heap_recorder) { return; } - // TODO: Implement + st_foreach(heap_recorder->object_records, st_object_record_entry_free_if_invalid, (st_data_t) heap_recorder); } +// Internal data we need while performing iteration over live objects. +typedef struct { + // The callback we need to call for each object. + bool (*for_each_callback)(heap_recorder_iteration_data stack_data, void *extra_arg); + // The extra arg to pass as the second parameter to the callback. + void *for_each_callback_extra_arg; + // A reference to the heap recorder so we can access extra stuff like reusable_locations. + heap_recorder *heap_recorder; +} iteration_context; + // WARN: If with_gvl = False, NO HEAP ALLOCATIONS, EXCEPTIONS or RUBY CALLS ARE ALLOWED. 
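// Illustrative sketch (not part of the original change): the locking strategy used by
// end_heap_allocation_recording above, reduced to its essentials. pthread_mutex_trylock returns
// EBUSY instead of blocking when another thread already holds records_mutex, and in that case the
// sample is dropped rather than stalling the allocating Ruby thread. try_commit is a hypothetical
// stand-in for the commit_allocation step:
static bool try_commit(pthread_mutex_t *records_mutex) {
  int error = pthread_mutex_trylock(records_mutex);
  if (error == EBUSY) return false; // contended: drop the data instead of waiting
  if (error != 0) return false;     // unexpected failure (the real code raises here)
  // ... mutate the tables protected by records_mutex ...
  pthread_mutex_unlock(records_mutex);
  return true;
}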
void heap_recorder_for_each_live_object( heap_recorder *heap_recorder, @@ -110,8 +287,455 @@ void heap_recorder_for_each_live_object( return; } - // TODO: Implement + ENFORCE_SUCCESS_HELPER(pthread_mutex_lock(&heap_recorder->records_mutex), with_gvl); + iteration_context context; + context.for_each_callback = for_each_callback; + context.for_each_callback_extra_arg = for_each_callback_extra_arg; + context.heap_recorder = heap_recorder; + st_foreach(heap_recorder->object_records, st_object_records_iterate, (st_data_t) &context); + ENFORCE_SUCCESS_HELPER(pthread_mutex_unlock(&heap_recorder->records_mutex), with_gvl); } -// TODO: Remove when things get implemented -#pragma GCC diagnostic pop +void heap_recorder_testonly_assert_hash_matches(ddog_prof_Slice_Location locations) { + heap_stack *stack = heap_stack_new(locations); + heap_record_key stack_based_key = (heap_record_key) { + .type = HEAP_STACK, + .heap_stack = stack, + }; + heap_record_key location_based_key = (heap_record_key) { + .type = LOCATION_SLICE, + .location_slice = &locations, + }; + + st_index_t stack_hash = heap_record_key_hash_st((st_data_t) &stack_based_key); + st_index_t location_hash = heap_record_key_hash_st((st_data_t) &location_based_key); + + heap_stack_free(stack); + + if (stack_hash != location_hash) { + rb_raise(rb_eRuntimeError, "Heap record key hashes built from the same locations differ. stack_based_hash=%"PRI_VALUE_PREFIX"u location_based_hash=%"PRI_VALUE_PREFIX"u", stack_hash, location_hash); + } +} + +// ========================== +// Heap Recorder Internal API +// ========================== +static int st_heap_record_entry_free(st_data_t key, st_data_t value, DDTRACE_UNUSED st_data_t extra_arg) { + heap_record_key *record_key = (heap_record_key*) key; + heap_record_key_free(record_key); + heap_record_free((heap_record *) value); + return ST_DELETE; +} + +static int st_object_record_entry_free(DDTRACE_UNUSED st_data_t key, st_data_t value, DDTRACE_UNUSED st_data_t extra_arg) { + object_record_free((object_record *) value); + return ST_DELETE; +} + +static int st_object_record_entry_free_if_invalid(st_data_t key, st_data_t value, st_data_t extra_arg) { + long obj_id = (long) key; + object_record *record = (object_record*) value; + heap_recorder *recorder = (heap_recorder*) extra_arg; + + if (!ruby_ref_from_id(LONG2NUM(obj_id), NULL)) { + // Id no longer associated with a valid ref. Need to clean things up! + + // Starting with the associated heap record. There will now be one less tracked object pointing to it + heap_record *heap_record = record->heap_record; + heap_record->num_tracked_objects--; + + // One less object using this heap record, it may have become unused... + cleanup_heap_record_if_unused(recorder, heap_record); + + object_record_free(record); + return ST_DELETE; + } + + return ST_CONTINUE; +} + +// WARN: This can get called outside the GVL. NO HEAP ALLOCATIONS OR EXCEPTIONS ARE ALLOWED. 
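// Illustrative sketch (not part of the original change): the sweep in
// st_object_record_entry_free_if_invalid above relies on the st_foreach protocol, where the
// callback's return value decides the fate of the visited entry -- ST_DELETE removes it in place,
// ST_CONTINUE keeps it, and ST_STOP aborts the walk. A minimal predicate-based sweep, assuming a
// hypothetical bool is_stale(void *record) predicate:
static int sweep_if_stale(DDTRACE_UNUSED st_data_t key, st_data_t value, DDTRACE_UNUSED st_data_t extra_arg) {
  return is_stale((void *) value) ? ST_DELETE : ST_CONTINUE;
}
// Invoked as: st_foreach(table, sweep_if_stale, 0);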
+static int st_object_records_iterate(DDTRACE_UNUSED st_data_t key, st_data_t value, st_data_t extra) { + object_record *record = (object_record*) value; + const heap_stack *stack = record->heap_record->stack; + iteration_context *context = (iteration_context*) extra; + + ddog_prof_Location *locations = context->heap_recorder->reusable_locations; + + for (uint16_t i = 0; i < stack->frames_len; i++) { + const heap_frame *frame = &stack->frames[i]; + ddog_prof_Location *location = &locations[i]; + location->function.name.ptr = frame->name; + location->function.name.len = strlen(frame->name); + location->function.filename.ptr = frame->filename; + location->function.filename.len = strlen(frame->filename); + location->line = frame->line; + } + + heap_recorder_iteration_data iteration_data; + iteration_data.object_data = record->object_data; + iteration_data.locations = (ddog_prof_Slice_Location) {.ptr = locations, .len = stack->frames_len}; + + if (!context->for_each_callback(iteration_data, context->for_each_callback_extra_arg)) { + return ST_STOP; + } + + return ST_CONTINUE; +} + +// Struct holding data required for an update operation on heap_records +typedef struct { + // [in] The new object record we want to add. + // NOTE: Transfer of ownership is assumed, do not re-use it after call to ::update_object_record_entry + object_record *new_object_record; + + // [in] The heap recorder where the update is happening. + heap_recorder *heap_recorder; +} object_record_update_data; + +static int update_object_record_entry(DDTRACE_UNUSED st_data_t *key, st_data_t *value, st_data_t data, int existing) { + object_record_update_data *update_data = (object_record_update_data*) data; + if (existing) { + rb_raise(rb_eRuntimeError, "Object ids are supposed to be unique. We got 2 allocation recordings with the same id"); + } + // Always carry on with the update, we want the new record to be there at the end + (*value) = (st_data_t) update_data->new_object_record; + return ST_CONTINUE; +} + +// WARN: Expects records_mutex to be held +static void commit_allocation(heap_recorder *heap_recorder, heap_record *heap_record, long obj_id, live_object_data object_data) { + // Update object_records + object_record_update_data update_data = (object_record_update_data) { + .heap_recorder = heap_recorder, + .new_object_record = object_record_new(obj_id, heap_record, object_data), + }; + if (!st_update(heap_recorder->object_records, obj_id, update_object_record_entry, (st_data_t) &update_data)) { + // We are sure there was no previous record for this id so let the heap record know there now is one + // extra record associated with this stack. + if (heap_record->num_tracked_objects == UINT32_MAX) { + rb_raise(rb_eRuntimeError, "Reached maximum number of tracked objects for heap record"); + } + heap_record->num_tracked_objects++; + }; +} + +// Struct holding data required for an update operation on heap_records +typedef struct { + // [in] The locations we did this update with + ddog_prof_Slice_Location locations; + // [out] Pointer that will be updated to the updated heap record to prevent having to do + // another lookup to access the updated heap record. + heap_record **record; +} heap_record_update_data; + +// This function assumes ownership of stack_data is passed on to it so it'll either transfer ownership or clean-up. 
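// Illustrative sketch (not part of the original change): commit_allocation above and the function
// below both lean on st_update, which performs a single lookup and hands the slot to a callback
// together with an `existing` flag, avoiding a separate lookup-then-insert pair. A minimal
// insert-or-increment counter using the same pattern, where increment_entry and counts are
// hypothetical names:
static int increment_entry(DDTRACE_UNUSED st_data_t *key, st_data_t *value, DDTRACE_UNUSED st_data_t arg, int existing) {
  (*value) = existing ? (*value) + 1 : 1;
  return ST_CONTINUE; // keep the entry in the table
}
// Invoked as: st_update(counts, (st_data_t) obj_id, increment_entry, 0);
// st_update's own return value reports whether the key was already present.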
+static int update_heap_record_entry_with_new_allocation(st_data_t *key, st_data_t *value, st_data_t data, int existing) { + heap_record_update_data *update_data = (heap_record_update_data*) data; + + if (!existing) { + // there was no matching heap record so lets create a new one... + // we need to initialize a heap_record_key with a new stack and use that for the key storage. We can't use the + // locations-based key we used for the update call because we don't own its lifecycle. So we create a new + // heap stack and will pass ownership of it to the heap_record. + heap_stack *stack = heap_stack_new(update_data->locations); + (*key) = (st_data_t) heap_record_key_new(stack); + (*value) = (st_data_t) heap_record_new(stack); + } + + heap_record *record = (heap_record*) (*value); + (*update_data->record) = record; + + return ST_CONTINUE; +} + +static heap_record* get_or_create_heap_record(heap_recorder *heap_recorder, ddog_prof_Slice_Location locations) { + // For performance reasons we use a stack-allocated location-slice based key. This allows us + // to do allocation-free lookups and reuse of a matching existing heap record. + // NOTE: If we end up creating a new record, we'll create a heap-allocated key we own and use that for storage + // instead of this one. + heap_record_key lookup_key = (heap_record_key) { + .type = LOCATION_SLICE, + .location_slice = &locations, + }; + + heap_record *heap_record = NULL; + heap_record_update_data update_data = (heap_record_update_data) { + .locations = locations, + .record = &heap_record, + }; + st_update(heap_recorder->heap_records, (st_data_t) &lookup_key, update_heap_record_entry_with_new_allocation, (st_data_t) &update_data); + + return heap_record; +} + +static void cleanup_heap_record_if_unused(heap_recorder *heap_recorder, heap_record *heap_record) { + if (heap_record->num_tracked_objects > 0) { + // still being used! do nothing... + return; + } + + heap_record_key heap_key = (heap_record_key) { + .type = HEAP_STACK, + .heap_stack = heap_record->stack, + }; + // We need to access the deleted key to free it since we gave ownership of the keys to the hash. + // st_delete will change this pointer to point to the removed key if one is found. 
+ heap_record_key *deleted_key = &heap_key; + if (!st_delete(heap_recorder->heap_records, (st_data_t*) &deleted_key, NULL)) { + rb_raise(rb_eRuntimeError, "Attempted to cleanup an untracked heap_record"); + }; + heap_record_key_free(deleted_key); + heap_record_free(heap_record); +} + +// =============== +// Heap Record API +// =============== +heap_record* heap_record_new(heap_stack *stack) { + heap_record *record = ruby_xcalloc(1, sizeof(heap_record)); + record->num_tracked_objects = 0; + record->stack = stack; + return record; +} + +void heap_record_free(heap_record *record) { + heap_stack_free(record->stack); + ruby_xfree(record); +} + + +// ================= +// Object Record API +// ================= +object_record* object_record_new(long obj_id, heap_record *heap_record, live_object_data object_data) { + object_record *record = ruby_xcalloc(1, sizeof(object_record)); + record->obj_id = obj_id; + record->heap_record = heap_record; + record->object_data = object_data; + return record; +} + +void object_record_free(object_record *record) { + ruby_xfree(record); +} + +// ============== +// Heap Frame API +// ============== +int heap_frame_cmp(heap_frame *f1, heap_frame *f2) { + int line_diff = (int) (f1->line - f2->line); + if (line_diff != 0) { + return line_diff; + } + int cmp = strcmp(f1->name, f2->name); + if (cmp != 0) { + return cmp; + } + return strcmp(f1->filename, f2->filename); +} + +// TODO: Research potential performance improvements around hashing stuff here +// once we have a benchmarking suite. +// Example: Each call to st_hash is calling murmur_finish and we may want +// to only finish once per structure, not per field? +// Example: There may be a more efficient hashing for line that is not the +// generic st_hash algorithm? + +// WARN: Must be kept in-sync with ::char_slice_hash +st_index_t string_hash(char *str, st_index_t seed) { + return st_hash(str, strlen(str), seed); +} + +// WARN: Must be kept in-sync with ::string_hash +st_index_t char_slice_hash(ddog_CharSlice char_slice, st_index_t seed) { + return st_hash(char_slice.ptr, char_slice.len, seed); +} + +// WARN: Must be kept in-sync with ::ddog_location_hash +st_index_t heap_frame_hash(heap_frame *frame, st_index_t seed) { + st_index_t hash = string_hash(frame->name, seed); + hash = string_hash(frame->filename, hash); + hash = st_hash(&frame->line, sizeof(frame->line), hash); + return hash; +} + +// WARN: Must be kept in-sync with ::heap_frame_hash +st_index_t ddog_location_hash(ddog_prof_Location location, st_index_t seed) { + st_index_t hash = char_slice_hash(location.function.name, seed); + hash = char_slice_hash(location.function.filename, hash); + // Convert ddog_prof line type to the same type we use for our heap_frames to + // ensure we have compatible hashes + int32_t line_as_int32 = (int32_t) location.line; + hash = st_hash(&line_as_int32, sizeof(line_as_int32), hash); + return hash; +} + +// ============== +// Heap Stack API +// ============== +heap_stack* heap_stack_new(ddog_prof_Slice_Location locations) { + uint16_t frames_len = locations.len; + if (frames_len > MAX_FRAMES_LIMIT) { + // This should not be happening anyway since MAX_FRAMES_LIMIT should be shared with + // the stacktrace construction mechanism. If it happens, lets just raise. This should + // be safe since only allocate with the GVL anyway. 
+ rb_raise(rb_eRuntimeError, "Found stack with more than %d frames (%d)", MAX_FRAMES_LIMIT, frames_len); + } + heap_stack *stack = ruby_xcalloc(1, sizeof(heap_stack) + frames_len * sizeof(heap_frame)); + stack->frames_len = frames_len; + for (uint16_t i = 0; i < stack->frames_len; i++) { + const ddog_prof_Location *location = &locations.ptr[i]; + stack->frames[i] = (heap_frame) { + .name = ruby_strndup(location->function.name.ptr, location->function.name.len), + .filename = ruby_strndup(location->function.filename.ptr, location->function.filename.len), + // ddog_prof_Location is a int64_t. We don't expect to have to profile files with more than + // 2M lines so this cast should be fairly safe? + .line = (int32_t) location->line, + }; + } + return stack; +} + +void heap_stack_free(heap_stack *stack) { + for (uint64_t i = 0; i < stack->frames_len; i++) { + heap_frame *frame = &stack->frames[i]; + ruby_xfree(frame->name); + ruby_xfree(frame->filename); + } + ruby_xfree(stack); +} + +// WARN: Must be kept in-sync with ::ddog_location_slice_hash +st_index_t heap_stack_hash(heap_stack *stack, st_index_t seed) { + st_index_t hash = seed; + for (uint64_t i = 0; i < stack->frames_len; i++) { + hash = heap_frame_hash(&stack->frames[i], hash); + } + return hash; +} + +// WARN: Must be kept in-sync with ::heap_stack_hash +st_index_t ddog_location_slice_hash(ddog_prof_Slice_Location locations, st_index_t seed) { + st_index_t hash = seed; + for (uint64_t i = 0; i < locations.len; i++) { + hash = ddog_location_hash(locations.ptr[i], hash); + } + return hash; +} + +// =================== +// Heap Record Key API +// =================== +heap_record_key* heap_record_key_new(heap_stack *stack) { + heap_record_key *key = ruby_xmalloc(sizeof(heap_record_key)); + key->type = HEAP_STACK; + key->heap_stack = stack; + return key; +} + +void heap_record_key_free(heap_record_key *key) { + ruby_xfree(key); +} + +static inline size_t heap_record_key_len(heap_record_key *key) { + if (key->type == HEAP_STACK) { + return key->heap_stack->frames_len; + } else { + return key->location_slice->len; + } +} + +static inline int64_t heap_record_key_entry_line(heap_record_key *key, size_t entry_i) { + if (key->type == HEAP_STACK) { + return key->heap_stack->frames[entry_i].line; + } else { + return key->location_slice->ptr[entry_i].line; + } +} + +static inline size_t heap_record_key_entry_name(heap_record_key *key, size_t entry_i, const char **name_ptr) { + if (key->type == HEAP_STACK) { + char *name = key->heap_stack->frames[entry_i].name; + (*name_ptr) = name; + return strlen(name); + } else { + ddog_CharSlice name = key->location_slice->ptr[entry_i].function.name; + (*name_ptr) = name.ptr; + return name.len; + } +} + +static inline size_t heap_record_key_entry_filename(heap_record_key *key, size_t entry_i, const char **filename_ptr) { + if (key->type == HEAP_STACK) { + char *filename = key->heap_stack->frames[entry_i].filename; + (*filename_ptr) = filename; + return strlen(filename); + } else { + ddog_CharSlice filename = key->location_slice->ptr[entry_i].function.filename; + (*filename_ptr) = filename.ptr; + return filename.len; + } +} + +int heap_record_key_cmp_st(st_data_t key1, st_data_t key2) { + heap_record_key *key_record1 = (heap_record_key*) key1; + heap_record_key *key_record2 = (heap_record_key*) key2; + + // Fast path, check if lengths differ + size_t key_record1_len = heap_record_key_len(key_record1); + size_t key_record2_len = heap_record_key_len(key_record2); + + if (key_record1_len != key_record2_len) { + 
return ((int) key_record1_len) - ((int) key_record2_len); + } + + // If we got this far, we have the same lengths so we need to check item-by-item + for (size_t i = 0; i < key_record1_len; i++) { + // Lines are faster to compare, let's do that first + int64_t line1 = heap_record_key_entry_line(key_record1, i); + int64_t line2 = heap_record_key_entry_line(key_record2, i); + if (line1 != line2) { + return ((int) line1) - ((int)line2); + } + + // Then come names, they are usually smaller than filenames + const char *name1, *name2; + size_t name1_len = heap_record_key_entry_name(key_record1, i, &name1); + size_t name2_len = heap_record_key_entry_name(key_record2, i, &name2); + if (name1_len != name2_len) { + return ((int) name1_len) - ((int) name2_len); + } + int name_cmp_result = strncmp(name1, name2, name1_len); + if (name_cmp_result != 0) { + return name_cmp_result; + } + + // Then come filenames + const char *filename1, *filename2; + int64_t filename1_len = heap_record_key_entry_filename(key_record1, i, &filename1); + int64_t filename2_len = heap_record_key_entry_filename(key_record2, i, &filename2); + if (filename1_len != filename2_len) { + return ((int) filename1_len) - ((int) filename2_len); + } + int filename_cmp_result = strncmp(filename1, filename2, filename1_len); + if (filename_cmp_result != 0) { + return filename_cmp_result; + } + } + + // If we survived the loop above, then everything matched + return 0; +} + +// Initial seed for hash functions +#define FNV1_32A_INIT 0x811c9dc5 + +st_index_t heap_record_key_hash_st(st_data_t key) { + heap_record_key *record_key = (heap_record_key*) key; + if (record_key->type == HEAP_STACK) { + return heap_stack_hash(record_key->heap_stack, FNV1_32A_INIT); + } else { + return ddog_location_slice_hash(*record_key->location_slice, FNV1_32A_INIT); + } +} diff --git a/ext/ddtrace_profiling_native_extension/heap_recorder.h b/ext/ddtrace_profiling_native_extension/heap_recorder.h index add47d46734..819410fa559 100644 --- a/ext/ddtrace_profiling_native_extension/heap_recorder.h +++ b/ext/ddtrace_profiling_native_extension/heap_recorder.h @@ -42,6 +42,7 @@ heap_recorder* heap_recorder_new(void); void heap_recorder_free(heap_recorder *heap_recorder); // Do any cleanup needed after forking. +// WARN: Assumes this gets called before profiler is reinitialized on the fork void heap_recorder_after_fork(heap_recorder *heap_recorder); // Start a heap allocation recording on the heap recorder for a new object. @@ -66,7 +67,8 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj // WARN: It is illegal to call this without previously having called ::start_heap_allocation_recording. void end_heap_allocation_recording(heap_recorder *heap_recorder, ddog_prof_Slice_Location locations); -// Flush any intermediate state that might be queued inside the heap recorder. +// Flush any intermediate state that might be queued inside the heap recorder or update certain +// state to reflect the latest state of the VM. // // NOTE: This should usually be called before iteration to ensure data is as little stale as possible.
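// Illustrative sketch (not part of the original change): typical serialization-time use of the
// two APIs declared in this header -- flush first so records for dead objects are swept, then
// iterate the survivors. count_cb and live_count are hypothetical names:
static bool count_cb(heap_recorder_iteration_data data, void *extra_arg) {
  size_t *live_count = (size_t *) extra_arg;
  (*live_count) += data.object_data.weight;
  return true; // returning false would stop the iteration early
}
// size_t live_count = 0;
// heap_recorder_flush(recorder);
// heap_recorder_for_each_live_object(recorder, count_cb, &live_count, /* with_gvl: */ true);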
void heap_recorder_flush(heap_recorder *heap_recorder); @@ -89,3 +91,7 @@ void heap_recorder_for_each_live_object( bool (*for_each_callback)(heap_recorder_iteration_data data, void* extra_arg), void *for_each_callback_extra_arg, bool with_gvl); + +// v--- TEST-ONLY APIs ---v + +void heap_recorder_testonly_assert_hash_matches(ddog_prof_Slice_Location locations); diff --git a/ext/ddtrace_profiling_native_extension/profiling.c b/ext/ddtrace_profiling_native_extension/profiling.c index 9dc9952b4a7..d1d39397cd2 100644 --- a/ext/ddtrace_profiling_native_extension/profiling.c +++ b/ext/ddtrace_profiling_native_extension/profiling.c @@ -41,6 +41,7 @@ void DDTRACE_EXPORT Init_ddtrace_profiling_native_extension(void) { rb_define_singleton_method(native_extension_module, "native_working?", native_working_p, 0); rb_funcall(native_extension_module, rb_intern("private_class_method"), 1, ID2SYM(rb_intern("native_working?"))); + ruby_helpers_init(); collectors_cpu_and_wall_time_worker_init(profiling_module); collectors_dynamic_sampling_rate_init(profiling_module); collectors_idle_sampling_helper_init(profiling_module); diff --git a/ext/ddtrace_profiling_native_extension/ruby_helpers.c b/ext/ddtrace_profiling_native_extension/ruby_helpers.c index b874d1f249a..99dc6097d62 100644 --- a/ext/ddtrace_profiling_native_extension/ruby_helpers.c +++ b/ext/ddtrace_profiling_native_extension/ruby_helpers.c @@ -4,6 +4,18 @@ #include "ruby_helpers.h" #include "private_vm_api_access.h" +// The following global variables are initialized at startup to save expensive lookups later. +// They are not expected to be mutated outside of init. +static VALUE module_object_space = Qnil; +static ID _id2ref_id = Qnil; + +void ruby_helpers_init(void) { + rb_global_variable(&module_object_space); + + module_object_space = rb_const_get(rb_cObject, rb_intern("ObjectSpace")); + _id2ref_id = rb_intern("_id2ref"); +} + void raise_unexpected_type( VALUE value, const char *value_name, @@ -108,3 +120,49 @@ void raise_syserr( grab_gvl_and_raise_syserr(syserr_errno, "Failure returned by '%s' at %s:%d:in `%s'", expression, file, line, function_name); } } + +char* ruby_strndup(const char *str, size_t size) { + char *dup; + + dup = xmalloc(size + 1); + memcpy(dup, str, size); + dup[size] = '\0'; + + return dup; +} + +static VALUE _id2ref(VALUE obj_id) { + // Call ::ObjectSpace._id2ref natively. It will raise if the id is no longer valid + return rb_funcall(module_object_space, _id2ref_id, 1, obj_id); +} + +static VALUE _id2ref_failure(DDTRACE_UNUSED VALUE _unused1, DDTRACE_UNUSED VALUE _unused2) { + return Qfalse; +} + +// Native wrapper to get an object ref from an id. Returns true on success and +// writes the ref to the value pointer parameter if !NULL. False if id doesn't +// reference a valid object (in which case value is not changed). +bool ruby_ref_from_id(VALUE obj_id, VALUE *value) { + // Call ::ObjectSpace._id2ref natively. 
It will raise if the id is no longer valid + // so we need to call it via rb_rescue2 + // TODO: Benchmark rb_rescue2 vs rb_protect here + VALUE result = rb_rescue2( + _id2ref, + obj_id, + _id2ref_failure, + Qnil, + rb_eRangeError, // rb_eRangeError is the error used to flag invalid ids + 0 // Required by API to be the last argument + ); + + if (result == Qfalse) { + return false; + } + + if (value != NULL) { + (*value) = result; + } + + return true; +} diff --git a/ext/ddtrace_profiling_native_extension/ruby_helpers.h b/ext/ddtrace_profiling_native_extension/ruby_helpers.h index 84889fb83dd..d0bf3cfcb0e 100644 --- a/ext/ddtrace_profiling_native_extension/ruby_helpers.h +++ b/ext/ddtrace_profiling_native_extension/ruby_helpers.h @@ -5,6 +5,10 @@ #include "helpers.h" +// Initialize internal data needed by some ruby helpers. Should be called during start, before any actual +// usage of ruby helpers. +void ruby_helpers_init(void); + // Processes any pending interruptions, including exceptions to be raised. // If there's an exception to be raised, it raises it. In that case, this function does not return. static inline VALUE process_pending_interruptions(DDTRACE_UNUSED VALUE _) { @@ -87,3 +91,18 @@ NORETURN(void raise_syserr( int line, const char *function_name )); + +// Alternative to ruby_strdup that takes a size argument. +// Similar to C's strndup but slightly less smart as size is expected to +// be smaller or equal to the real size of str (minus null termination if it +// exists). +// A new string will be returned with size+1 bytes and last byte set to '\0'. +// The returned string must be freed explicitly. +// +// WARN: Cannot be used during GC or outside the GVL. +char* ruby_strndup(const char *str, size_t size); + +// Native wrapper to get an object ref from an id. Returns true on success and +// writes the ref to the value pointer parameter if !NULL. False if id doesn't +// reference a valid object (in which case value is not changed). 
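// Illustrative sketch (not part of the original change): for the "rb_rescue2 vs rb_protect" TODO
// in the implementation above, this is roughly what the rb_protect variant would look like.
// rb_protect catches any raise and reports it through its state out-parameter, which then needs
// clearing with rb_set_errinfo. Function names below are hypothetical:
static VALUE protected_id2ref(VALUE obj_id) {
  return rb_funcall(module_object_space, _id2ref_id, 1, obj_id);
}
static bool ref_from_id_with_protect(VALUE obj_id, VALUE *value) {
  int state = 0;
  VALUE result = rb_protect(protected_id2ref, obj_id, &state);
  if (state != 0) {
    rb_set_errinfo(Qnil); // discard the pending exception (e.g. the RangeError for stale ids)
    return false;
  }
  if (value != NULL) (*value) = result;
  return true;
}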
+bool ruby_ref_from_id(VALUE obj_id, VALUE *value); diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.c b/ext/ddtrace_profiling_native_extension/stack_recorder.c index 4918f8caea8..44ec7ee70b5 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.c +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.c @@ -220,6 +220,7 @@ static void serializer_set_start_timestamp_for_next_profile(struct stack_recorde static VALUE _native_record_endpoint(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE local_root_span_id, VALUE endpoint); static void reset_profile(ddog_prof_Profile *profile, ddog_Timespec *start_time /* Can be null */); static VALUE _native_track_object(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE new_obj, VALUE weight); +static VALUE _native_check_heap_hashes(DDTRACE_UNUSED VALUE _self, VALUE locations); void stack_recorder_init(VALUE profiling_module) { @@ -245,6 +246,7 @@ void stack_recorder_init(VALUE profiling_module) { rb_define_singleton_method(testing_module, "_native_slot_two_mutex_locked?", _native_is_slot_two_mutex_locked, 1); rb_define_singleton_method(testing_module, "_native_record_endpoint", _native_record_endpoint, 3); rb_define_singleton_method(testing_module, "_native_track_object", _native_track_object, 3); + rb_define_singleton_method(testing_module, "_native_check_heap_hashes", _native_check_heap_hashes, 1); ok_symbol = ID2SYM(rb_intern_const("ok")); error_symbol = ID2SYM(rb_intern_const("error")); @@ -739,6 +741,36 @@ static VALUE _native_track_object(DDTRACE_UNUSED VALUE _self, VALUE recorder_ins return Qtrue; } +static VALUE _native_check_heap_hashes(DDTRACE_UNUSED VALUE _self, VALUE locations) { + ENFORCE_TYPE(locations, T_ARRAY); + size_t locations_len = rb_array_len(locations); + ddog_prof_Location locations_arr[locations_len]; + for (size_t i = 0; i < locations_len; i++) { + VALUE location = rb_ary_entry(locations, i); + ENFORCE_TYPE(location, T_ARRAY); + VALUE name = rb_ary_entry(location, 0); + VALUE filename = rb_ary_entry(location, 1); + VALUE line = rb_ary_entry(location, 2); + ENFORCE_TYPE(name, T_STRING); + ENFORCE_TYPE(filename, T_STRING); + ENFORCE_TYPE(line, T_FIXNUM); + locations_arr[i] = (ddog_prof_Location) { + .line = NUM2INT(line), + .function = (ddog_prof_Function) { + .name = char_slice_from_ruby_string(name), + .filename = char_slice_from_ruby_string(filename), + } + }; + } + ddog_prof_Slice_Location ddog_locations = { + .len = locations_len, + .ptr = locations_arr, + }; + heap_recorder_testonly_assert_hash_matches(ddog_locations); + + return Qnil; +} + static void reset_profile(ddog_prof_Profile *profile, ddog_Timespec *start_time /* Can be null */) { ddog_prof_Profile_Result reset_result = ddog_prof_Profile_reset(profile, start_time); if (reset_result.tag == DDOG_PROF_PROFILE_RESULT_ERR) { diff --git a/lib/datadog/profiling/component.rb b/lib/datadog/profiling/component.rb index 228fd01f7a9..93bcf749e52 100644 --- a/lib/datadog/profiling/component.rb +++ b/lib/datadog/profiling/component.rb @@ -193,17 +193,26 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) private_class_method def self.enable_heap_profiling?(settings, allocation_profiling_enabled) heap_profiling_enabled = settings.profiling.advanced.experimental_heap_enabled - if heap_profiling_enabled && !allocation_profiling_enabled - raise ArgumentError, 'Heap profiling requires allocation profiling to be enabled' - end + return false unless heap_profiling_enabled - if heap_profiling_enabled + if
RUBY_VERSION.start_with?('2.') && RUBY_VERSION < '2.7' Datadog.logger.warn( - 'Enabled experimental heap profiling. This is experimental, not recommended, and will increase overhead!' + 'Heap profiling currently relies on features introduced in Ruby 2.7 and will be forcibly disabled. '\ + 'Please upgrade to Ruby >= 2.7 in order to use this feature.' ) + return false + end + + unless allocation_profiling_enabled + raise ArgumentError, + 'Heap profiling requires allocation profiling to be enabled' end - heap_profiling_enabled + Datadog.logger.warn( + 'Enabled experimental heap profiling. This is experimental, not recommended, and will increase overhead!' + ) + + true end private_class_method def self.no_signals_workaround_enabled?(settings) # rubocop:disable Metrics/MethodLength diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index ef2bc0e7433..ce8a855dd9f 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -570,21 +570,31 @@ end it 'records live heap objects' do - pending "heap profiling isn't actually implemented just yet" - stub_const('CpuAndWallTimeWorkerSpec::TestStruct', Struct.new(:foo)) + # Warm this up to remove VM-related allocations + # TODO: Remove this when we can match on allocation class + CpuAndWallTimeWorkerSpec::TestStruct.new + start - test_num_allocated_object.times { CpuAndWallTimeWorkerSpec::TestStruct.new } + live_objects = Array.new(test_num_allocated_object) + + test_num_allocated_object.times { |i| live_objects[i] = CpuAndWallTimeWorkerSpec::TestStruct.new } allocation_line = __LINE__ - 1 cpu_and_wall_time_worker.stop - relevant_sample = samples_for_thread(samples_from_pprof(recorder.serialize!), Thread.current) - .find { |s| s.locations.first.lineno == allocation_line && s.locations.first.path == __FILE__ } + test_struct_heap_sample = lambda { |sample| + first_frame = sample.locations.first + first_frame.lineno == allocation_line && first_frame.path == __FILE__ && first_frame.base_label == 'new' && + (sample.values[:'heap-live-samples'] || 0) > 0 + } + + relevant_sample = samples_from_pprof(recorder.serialize!) 
+ .find(&test_struct_heap_sample) - expect(relevant_sample.values[':heap-live-samples']).to eq test_num_allocated_object + expect(relevant_sample.values[:'heap-live-samples']).to eq test_num_allocated_object end end diff --git a/spec/datadog/profiling/component_spec.rb b/spec/datadog/profiling/component_spec.rb index 77ac09732b4..257cc66bb38 100644 --- a/spec/datadog/profiling/component_spec.rb +++ b/spec/datadog/profiling/component_spec.rb @@ -227,11 +227,26 @@ end context 'when heap profiling is enabled' do + # Universally supported ruby version for allocation profiling by default + let(:testing_version) { '2.7.2' } + before do settings.profiling.advanced.experimental_heap_enabled = true - # Universally supported ruby version for allocation profiling, we don't want to test those - # edge cases here - stub_const('RUBY_VERSION', '2.7.2') + stub_const('RUBY_VERSION', testing_version) + end + + context 'on a Ruby older than 2.7' do + let(:testing_version) { '2.6' } + + it 'initializes StackRecorder without heap sampling support and warns' do + expect(Datadog::Profiling::StackRecorder).to receive(:new) + .with(hash_including(heap_samples_enabled: false)) + .and_call_original + + expect(Datadog.logger).to receive(:warn).with(/upgrade to Ruby >= 2.7/) + + build_profiler_component + end end context 'and allocation profiling disabled' do diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb index e02c1af213b..c11c6a8c684 100644 --- a/spec/datadog/profiling/spec_helper.rb +++ b/spec/datadog/profiling/spec_helper.rb @@ -69,7 +69,10 @@ def object_id_from(thread_id) end def samples_for_thread(samples, thread) - samples.select { |sample| object_id_from(sample.labels.fetch(:'thread id')) == thread.object_id } + samples.select do |sample| + thread_id = sample.labels[:'thread id'] + thread_id && object_id_from(thread_id) == thread.object_id + end end # We disable heap_sample collection by default in tests since it requires some extra mocking/ diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 8652ceb5e05..b4dd550bf5c 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -342,13 +342,14 @@ def sample_types_from(decoded_profile) let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b', 'state' => 'unknown' }.to_a } let(:a_string) { 'a beautiful string' } - let(:an_array) { [1..10] } + let(:an_array) { (1..10).to_a } let(:a_hash) { { 'a' => 1, 'b' => '2', 'c' => true } } let(:samples) { samples_from_pprof(encoded_pprof) } before do - allocations = [a_string, an_array, 'a fearsome string', [-10..-1], a_hash, { 'z' => -1, 'y' => '-2', 'x' => false }] + allocations = [a_string, an_array, "a fearsome interpolated string: #{sample_rate}", (-10..-1).to_a, a_hash, + { 'z' => -1, 'y' => '-2', 'x' => false }] @num_allocations = 0 allocations.each_with_index do |obj, i| # Heap sampling currently requires this 2-step process to first pass data about the allocated object... 
@@ -391,8 +392,6 @@ def sample_types_from(decoded_profile) end it 'include the stack and sample counts for the objects still left alive' do - pending 'heap_recorder implementation is currently missing' - # We sample from 2 distinct locations expect(heap_samples.size).to eq(2) @@ -553,4 +552,69 @@ def sample_types_from(decoded_profile) expect(stack_recorder.serialize.first).to be >= now end end + + describe 'Heap_recorder' do + context 'produces the same hash code for stack-based and location-based keys' do + it 'with empty stacks' do + described_class::Testing._native_check_heap_hashes([]) + end + + it 'with single-frame stacks' do + described_class::Testing._native_check_heap_hashes( + [ + ['a name', 'a filename', 123] + ] + ) + end + + it 'with multi-frame stacks' do + described_class::Testing._native_check_heap_hashes( + [ + ['a name', 'a filename', 123], + ['another name', 'another filename', 456], + ] + ) + end + + it 'with empty names' do + described_class::Testing._native_check_heap_hashes( + [ + ['', 'a filename', 123], + ] + ) + end + + it 'with empty filenames' do + described_class::Testing._native_check_heap_hashes( + [ + ['a name', '', 123], + ] + ) + end + + it 'with zero lines' do + described_class::Testing._native_check_heap_hashes( + [ + ['a name', 'a filename', 0] + ] + ) + end + + it 'with negative lines' do + described_class::Testing._native_check_heap_hashes( + [ + ['a name', 'a filename', -123] + ] + ) + end + + it 'with biiiiiiig lines' do + described_class::Testing._native_check_heap_hashes( + [ + ['a name', 'a filename', 4_000_000] + ] + ) + end + end + end end