23 changes: 23 additions & 0 deletions AGENTS.md
@@ -384,6 +384,29 @@ arm64 has a weakly-ordered memory model (unlike x86 TSO). Incorrect ordering cau
- **Architecture Support**: x64, arm64 with architecture-specific stack walking
- **Debug Symbol Handling**: Split debug information for production deployments

#### musl/aarch64/JDK11 — `start_routine_wrapper_spec` minimal-frame invariant

`start_routine_wrapper_spec` (`libraryPatcher_linux.cpp`) suffers from a known "precarious
stack guard corruption" on musl/aarch64/JDK11 (see the comment at the function definition).
The root cause is that musl places the stack canary close to the frame boundary, so any
substantial stack allocation inside `start_routine_wrapper_spec` corrupts it.

**Rule:** Any code placed inside `start_routine_wrapper_spec` that allocates meaningful stack
objects MUST be extracted into a separate `__attribute__((noinline))` helper so those objects
live in the helper's own frame, not in `start_routine_wrapper_spec`'s frame.

Existing helpers follow this pattern:
- `delete_routine_info` — isolates `SignalBlocker` (`sigset_t`, 128 bytes on musl)
- `init_tls_and_register` — same reason
- `run_with_musl_cleanup` — isolates `struct __ptcb` from `pthread_cleanup_push` (24 bytes)

**Trigger:** `pthread_cleanup_push` is a macro that declares `struct __ptcb __cb` on the
caller's stack. If called directly inside `start_routine_wrapper_spec` it re-triggers the
corruption. Always wrap it in a `noinline` helper.

This only affects the `#ifdef __aarch64__` / `#ifndef __GLIBC__` code path. Other platforms
and libc combinations do not have this constraint.
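
A minimal sketch of the required pattern (the helper and cleanup names here are
illustrative, not the actual helpers listed above):

```cpp
#include <pthread.h>

static void thread_cleanup(void *arg) { /* illustrative cleanup body */ }

// noinline keeps struct __ptcb __cb (declared by the pthread_cleanup_push
// macro) in THIS helper's frame, away from start_routine_wrapper_spec's
// stack canary. Names are illustrative.
__attribute__((noinline))
static void *run_with_cleanup(void *(*routine)(void *), void *arg) {
  void *result;
  pthread_cleanup_push(thread_cleanup, arg); // __ptcb lives in this frame
  result = routine(arg);
  pthread_cleanup_pop(0);                    // pop without running cleanup
  return result;
}
```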

## Development Guidelines

### Code Organization Principles
32 changes: 16 additions & 16 deletions ddprof-lib/src/main/cpp/callTraceStorage.cpp
@@ -56,7 +56,7 @@ int RefCountGuard::getThreadRefCountSlot() {
return -1;
}

RefCountGuard::RefCountGuard(CallTraceHashTable* resource) : _active(true), _my_slot(-1) {
RefCountGuard::RefCountGuard(void* resource) : _active(true), _my_slot(-1) {
// Get thread refcount slot using signal-safe collision resolution
_my_slot = getThreadRefCountSlot();

@@ -76,7 +76,7 @@ RefCountGuard::RefCountGuard(CallTraceHashTable* resource) : _active(true), _my_
// - Safe: we haven't "activated" protection yet
//
// After step 2, slot is fully active and protects the resource
__atomic_store_n(&refcount_slots[_my_slot].active_table, resource, __ATOMIC_RELEASE);
__atomic_store_n(&refcount_slots[_my_slot].active_ptr, resource, __ATOMIC_RELEASE);
__atomic_fetch_add(&refcount_slots[_my_slot].count, 1, __ATOMIC_RELEASE);
}

@@ -90,7 +90,7 @@ RefCountGuard::~RefCountGuard() {
// Step 2 clears the pointer (cleanup)
// No window where scanner thinks slot protects a table it doesn't
__atomic_fetch_sub(&refcount_slots[_my_slot].count, 1, __ATOMIC_RELEASE);
__atomic_store_n(&refcount_slots[_my_slot].active_table, nullptr, __ATOMIC_RELEASE);
__atomic_store_n(&refcount_slots[_my_slot].active_ptr, nullptr, __ATOMIC_RELEASE);

// Release slot ownership
__atomic_store_n(&slot_owners[_my_slot], 0, __ATOMIC_RELEASE);
@@ -106,7 +106,7 @@ RefCountGuard& RefCountGuard::operator=(RefCountGuard&& other) noexcept {
// Clean up current state with same ordering as destructor
if (_active && _my_slot >= 0) {
__atomic_fetch_sub(&refcount_slots[_my_slot].count, 1, __ATOMIC_RELEASE);
__atomic_store_n(&refcount_slots[_my_slot].active_table, nullptr, __ATOMIC_RELEASE);
__atomic_store_n(&refcount_slots[_my_slot].active_ptr, nullptr, __ATOMIC_RELEASE);
__atomic_store_n(&slot_owners[_my_slot], 0, __ATOMIC_RELEASE);
}

@@ -120,7 +120,7 @@ RefCountGuard& RefCountGuard::operator=(RefCountGuard&& other) noexcept {
return *this;
}

void RefCountGuard::waitForRefCountToClear(CallTraceHashTable* table_to_delete) {
void RefCountGuard::waitForRefCountToClear(void* table_to_delete) {
// Check refcount slots for the table we want to delete
//
// POINTER-FIRST PROTOCOL GUARANTEES:
@@ -150,7 +150,7 @@ void RefCountGuard::waitForRefCountToClear(CallTraceHashTable* table_to_delete)
}

// Count > 0, so slot is active - check which table it protects
CallTraceHashTable* table = __atomic_load_n(&refcount_slots[i].active_table, __ATOMIC_ACQUIRE);
void* table = __atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE);
if (table == table_to_delete) {
all_clear = false;
break;
@@ -176,7 +176,7 @@ void RefCountGuard::waitForRefCountToClear(CallTraceHashTable* table_to_delete)
continue;
}

CallTraceHashTable* table = __atomic_load_n(&refcount_slots[i].active_table, __ATOMIC_ACQUIRE);
void* table = __atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE);
if (table == table_to_delete) {
all_clear = false;
break;
@@ -266,15 +266,15 @@ CallTraceStorage::CallTraceStorage() : _active_storage(nullptr), _standby_storag
_preserve_set_buffer.rehash(static_cast<size_t>(1024 / 0.75f));

// Initialize triple-buffered storage
auto active_table = std::make_unique<CallTraceHashTable>();
active_table->setInstanceId(getNextInstanceId());
active_table->setParentStorage(this);
__atomic_store_n(&_active_storage, active_table.release(), __ATOMIC_RELEASE);

auto standby_table = std::make_unique<CallTraceHashTable>();
standby_table->setParentStorage(this);
standby_table->setInstanceId(getNextInstanceId());
__atomic_store_n(&_standby_storage, standby_table.release(), __ATOMIC_RELEASE);
auto active_ptr = std::make_unique<CallTraceHashTable>();
active_ptr->setInstanceId(getNextInstanceId());
active_ptr->setParentStorage(this);
__atomic_store_n(&_active_storage, active_ptr.release(), __ATOMIC_RELEASE);

auto standby_ptr = std::make_unique<CallTraceHashTable>();
standby_ptr->setParentStorage(this);
standby_ptr->setInstanceId(getNextInstanceId());
__atomic_store_n(&_standby_storage, standby_ptr.release(), __ATOMIC_RELEASE);

auto scratch_table = std::make_unique<CallTraceHashTable>();
scratch_table->setParentStorage(this);
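
Taken together, the renamed fields above still follow the pointer-first protocol. Below is a
minimal standalone sketch of that protocol with simplified types (the real `RefCountSlot` is
cache-line aligned, and slots are assigned via a signal-safe thread-ID hash per
`getThreadRefCountSlot`):

```cpp
#include <cstdint>

// Simplified stand-in for RefCountSlot; illustration only.
struct Slot {
  volatile uint32_t count; // 0 = slot inactive
  void *active_ptr;        // resource protected while count > 0
};

static Slot slots[8192];

void activate(int s, void *resource) {
  // Pointer first, then count: any scanner that observes count > 0
  // is guaranteed to also observe a valid pointer.
  __atomic_store_n(&slots[s].active_ptr, resource, __ATOMIC_RELEASE);
  __atomic_fetch_add(&slots[s].count, 1, __ATOMIC_RELEASE);
}

void deactivate(int s) {
  // Count first, then pointer: the slot never appears active while empty.
  __atomic_fetch_sub(&slots[s].count, 1, __ATOMIC_RELEASE);
  __atomic_store_n(&slots[s].active_ptr, nullptr, __ATOMIC_RELEASE);
}

// Scanner side: returns true if any slot still guards `victim`.
bool protects(void *victim) {
  for (Slot &s : slots) {
    if (__atomic_load_n(&s.count, __ATOMIC_ACQUIRE) == 0) continue; // inactive
    if (__atomic_load_n(&s.active_ptr, __ATOMIC_ACQUIRE) == victim) return true;
  }
  return false;
}
```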
77 changes: 1 addition & 76 deletions ddprof-lib/src/main/cpp/callTraceStorage.h
@@ -8,6 +8,7 @@
#define _CALLTRACESTORAGE_H

#include "callTraceHashTable.h"
#include "refCountGuard.h"
#include "spinLock.h"
#include "os.h"
#include <functional>
@@ -28,82 +29,6 @@ class CallTraceHashTable;
// Using reference parameter avoids malloc() for vector creation and copying
typedef std::function<void(std::unordered_set<u64>&)> LivenessChecker;

/**
* Cache-aligned reference counting slot for thread-local reference counting.
* Each slot occupies a full cache line (64 bytes) to eliminate false sharing.
*
* CORRECTNESS: The pointer-first protocol ensures race-free operation:
* - Constructor: Store pointer first, then increment count
* - Destructor: Decrement count first, then clear pointer
* - Scanner: Check count first (if 0, slot is inactive)
*
* This ordering ensures no window where scanner incorrectly believes a slot
* is inactive when it should be protecting a table.
*/
struct alignas(DEFAULT_CACHE_LINE_SIZE) RefCountSlot {
volatile uint32_t count; // Reference count (0 = inactive)
char _padding1[4]; // Alignment padding for pointer
CallTraceHashTable* active_table; // Which table is being referenced (8 bytes on 64-bit)
char padding[DEFAULT_CACHE_LINE_SIZE - 16]; // Remaining padding (64 - 16 = 48 bytes)

RefCountSlot() : count(0), _padding1{}, active_table(nullptr), padding{} {
static_assert(sizeof(RefCountSlot) == DEFAULT_CACHE_LINE_SIZE,
"RefCountSlot must be exactly one cache line");
}
};

/**
* RAII guard for thread-local reference counting.
*
* This class provides lock-free memory reclamation for CallTraceHashTable instances.
* Uses the pointer-first protocol to avoid race conditions during slot activation/deactivation.
*
* Performance characteristics:
* - Hot path: ~44-94 cycles
* - Thread-local cache line access (zero contention)
* - No bitmap operations required
*
* Correctness:
* - Pointer stored BEFORE count increment (activation)
* - Count decremented BEFORE pointer cleared (deactivation)
* - Scanner checks count first, ensuring consistent view
*/
class RefCountGuard {
public:
static constexpr int MAX_THREADS = 8192;
static constexpr int MAX_PROBE_DISTANCE = 32; // Maximum probing attempts

static RefCountSlot refcount_slots[MAX_THREADS];
static int slot_owners[MAX_THREADS]; // Thread ID ownership verification

private:
bool _active;
int _my_slot; // This instance's assigned slot

// Signal-safe slot assignment using thread ID hash with prime probing
static int getThreadRefCountSlot();

public:
RefCountGuard(CallTraceHashTable* resource);
~RefCountGuard();

// Non-copyable, movable for efficiency
RefCountGuard(const RefCountGuard&) = delete;
RefCountGuard& operator=(const RefCountGuard&) = delete;

RefCountGuard(RefCountGuard&& other) noexcept;
RefCountGuard& operator=(RefCountGuard&& other) noexcept;

// Check if refcount guard is active (slot allocation succeeded)
bool isActive() const { return _active; }

// Wait for reference counts pointing to specific table to clear
static void waitForRefCountToClear(CallTraceHashTable* table_to_delete);

// Wait for ALL reference counts to clear
static void waitForAllRefCountsToClear();
};

class CallTraceStorage {
public:
// Reserved trace ID for dropped samples due to contention
16 changes: 16 additions & 0 deletions ddprof-lib/src/main/cpp/dictionary.cpp
@@ -145,6 +145,22 @@ void Dictionary::collect(std::map<unsigned int, const char *> &map) {
collect(map, _table);
}

void Dictionary::mergeFrom(const Dictionary &src) {
mergeFrom(src._table);
}

void Dictionary::mergeFrom(const DictTable *table) {
for (int i = 0; i < ROWS; i++) {
const DictRow *row = &table->rows[i];
for (int j = 0; j < CELLS; j++) {
if (const char *key = row->keys[j]) {
lookup(key, strlen(key));
}
}
if (row->next) mergeFrom(row->next);
}
}

void Dictionary::collect(std::map<unsigned int, const char *> &map,
DictTable *table) {
for (int i = 0; i < ROWS; i++) {
122 changes: 122 additions & 0 deletions ddprof-lib/src/main/cpp/dictionary.h
@@ -18,6 +18,8 @@
#define _DICTIONARY_H

#include "counters.h"
#include "refCountGuard.h"
#include "tripleBuffer.h"
#include <map>
#include <stddef.h>
#include <stdlib.h>
@@ -61,6 +63,8 @@ class Dictionary {
unsigned int lookup(const char *key, size_t length, bool for_insert,
unsigned int sentinel);

void mergeFrom(const DictTable *table);

public:
Dictionary() : Dictionary(0) {}
Dictionary(int id) : _id(id) {
@@ -84,6 +88,124 @@
unsigned int bounded_lookup(const char *key, size_t length, int size_limit);

void collect(std::map<unsigned int, const char *> &map);

// Re-inserts all entries from src into this dictionary. Called from the
// dump thread during rotatePersistent(); not signal-safe (calls malloc).
void mergeFrom(const Dictionary &src);

int counterId() const { return _id; }
int size() const { return _size; }
};

// Triple-buffered wrapper for signal-handler-safe concurrent dictionary access.
//
// Three roles cycle through three Dictionary buffers:
//
// active — current writes (signal handlers + fill-path)
// dump — snapshot being read by the current dump (old active after rotate)
// scratch — two rotations behind active; ready to be cleared by clearStandby()
//
// Concurrency safety:
// lookup() and bounded_lookup() acquire a per-thread RefCountGuard on the
// active buffer pointer before touching it. rotate() and rotatePersistent()
// call RefCountGuard::waitForRefCountToClear(old_active) after advancing the
// active index, which provably drains all in-flight callers (signal handlers
// AND JNI threads) before the old buffer is handed to the dump thread.
// clearStandby() clears the scratch buffer, which was already drained by the
// rotate() two cycles earlier — no additional drain is needed.
//
// Trace-drop window: RefCountGuard uses a pointer-first activation protocol
// (see refCountGuard.h). In the theoretical window between storing the active
// pointer and incrementing the reference count a scanner could skip the slot.
// In practice signal handlers complete in microseconds and a buffer is only
// cleared after TWO dump cycles (seconds), so this window is never hit.
// Should it occur, bounded_lookup returns INT_MAX (miss) — a dropped trace or
// generic vtable frame — not a crash.
//
// Lifecycle per dump cycle:
// rotate() — advance active; drain old active via RefCountGuard
// standby()->... — dump thread reads stable snapshot
// clearStandby() — clear the scratch buffer (safe: drained two cycles ago)
//
// Memory: at most two non-empty buffers at any time (active + dump).
// Churn: entries purged after at most two full dump cycles.
//
// For profiler reset call clearAll().
class TripleBufferedDictionary {
int _counter_id;
Dictionary _a;
Dictionary _b;
Dictionary _c;
TripleBufferRotator<Dictionary> _rot;

public:
// All three buffers carry the real counter id so that insertions through
// any buffer are tracked in the named counter slot.
TripleBufferedDictionary(int id)
: _counter_id(id), _a(id), _b(id), _c(id), _rot(&_a, &_b, &_c) {}

unsigned int lookup(const char* key, size_t length) {
Dictionary* active = _rot.active();
RefCountGuard guard(active);
return active->lookup(key, length);
}

// Signal-safe: acquires a per-thread RefCountGuard then performs a
// read-only probe (size_limit=0 never calls malloc). Returns INT_MAX
// on miss; callers must tolerate misses.
unsigned int bounded_lookup(const char* key, size_t length, int size_limit) {
Dictionary* active = _rot.active();
RefCountGuard guard(active);
return active->bounded_lookup(key, length, size_limit);
}

// Returns the dump buffer for the dump thread to read.
// Safe to read after rotate() returns (all in-flight writers drained).
Dictionary* standby() {
return _rot.dumpBuffer();
}

// Advances the active buffer and drains all in-flight accesses to the
// old active via RefCountGuard before returning. After this call the
// dump thread may read standby() safely.
void rotate() {
Dictionary* old_active = _rot.active();
_rot.rotate();
RefCountGuard::waitForRefCountToClear(old_active);
}

// Variant of rotate() for persistent dictionaries (e.g. class map) whose
// entries must survive across dump cycles.
//
// Before rotating, all entries from the current active are merged into the
// current clearTarget (the future active after rotation). Signal handlers
// observe no gap: they use the old active — still live during the merge —
// and after rotate() the new active is already fully populated.
void rotatePersistent() {
Dictionary* old_active = _rot.active();
_rot.clearTarget()->mergeFrom(*old_active);
_rot.rotate();
RefCountGuard::waitForRefCountToClear(old_active);
}

// Clears the scratch buffer (two rotations behind active).
// The scratch buffer was drained by the rotate() call two cycles ago;
// no additional synchronisation is required here.
void clearStandby() {
_rot.clearTarget()->clear();
// Dictionary::clear() zeroed the shared DICTIONARY_KEYS slot. Re-set it
// to the active buffer's actual insertion count so that monitoring sees
// only live entries, not the just-cleared scratch buffer's (zero) state.
Counters::set(DICTIONARY_KEYS, _rot.active()->size(), _counter_id);
}

// Clears all three buffers. Call on profiler reset (no concurrent writers).
void clearAll() {
_a.clear();
_b.clear();
_c.clear();
_rot.reset();
}
};

#endif // _DICTIONARY_H
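
For orientation, a sketch of how one dump cycle might drive this wrapper; `dumpCycle` and the
recording step are hypothetical, while the `rotate()` / `standby()` / `clearStandby()` sequence
follows the lifecycle comment above:

```cpp
#include <map>
#include "dictionary.h"

// Hypothetical dump-thread driver for one cycle; only the
// TripleBufferedDictionary/Dictionary calls come from this header.
void dumpCycle(TripleBufferedDictionary &dict) {
  dict.rotate();                         // advance active; drain old active
  Dictionary *snapshot = dict.standby(); // old active, now a stable snapshot

  std::map<unsigned int, const char *> entries;
  snapshot->collect(entries);            // safe: no concurrent writers remain
  // ... emit `entries` into the recording (hypothetical) ...

  dict.clearStandby();                   // purge scratch, drained two cycles ago
}
```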