Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,25 @@ With separate debug symbol packages for production debugging support.

- This ensures the full build log is captured to a file and only a summary is shown in the main session.

## GitHub Operations

### Updating PR Descriptions

The `gh pr edit` command may fail with a GraphQL error about "Projects (classic)" deprecation. Use the GitHub API directly instead:

```bash
# 1. Write the PR body to a file (e.g., pr-body.md)

# 2. Convert to JSON and update via API
jq -Rs '{body: .}' pr-body.md > /tmp/pr-update.json
gh api repos/DataDog/java-profiler/pulls/<PR_NUMBER> -X PATCH --input /tmp/pr-update.json

# 3. Verify the update
gh pr view <PR_NUMBER> --json body -q '.body' | head -30
```

This workaround properly escapes the markdown content and avoids the GraphQL Projects deprecation error.

## Ground rules
- Never replace the code you work on with stubs
- Never 'fix' the tests by testing constants against constants
Expand Down
5 changes: 5 additions & 0 deletions ddprof-lib/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,11 @@ tasks.register('sourcesJar', Jar) {
archiveVersion = component_version
}

javadoc {
// Exclude classes that use internal JDK APIs not visible to javadoc
// The leading '**/' makes the pattern match BufferWriter8.java in any source directory.
exclude '**/BufferWriter8.java'
}

tasks.register('javadocJar', Jar) {
dependsOn javadoc
archiveBaseName = libraryName
Expand Down
238 changes: 238 additions & 0 deletions ddprof-lib/src/main/cpp/lockFree.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
/*
* Copyright 2025, Datadog, Inc.
* SPDX-License-Identifier: Apache-2.0
*/

#ifndef _LOCKFREE_H
#define _LOCKFREE_H

#include "common.h"

#include <atomic>
#include <cstddef>
#include <cstdint>

/**
* Lock-free atomic primitives and utilities.
*
* This header provides building blocks for lock-free data structures:
* - PaddedAtomic: Cache-line padded atomics to prevent false sharing
* - LockFreeBitset: Lock-free bitset for concurrent membership tracking
*
* For complete synchronization classes (SpinLock, mutexes), see spinLock.h
*/

// Assumed size, in bytes, of a single cache line (64 is typical for x86/ARM).
// It is used as the alignment that keeps independently updated atomics apart.
// Note: This duplicates DEFAULT_CACHE_LINE_SIZE from arch_dd.h for standalone use
constexpr std::size_t CACHE_LINE_SIZE = 64;

/**
 * A std::atomic<T> that is aligned to, and therefore occupies, its own cache line.
 *
 * Intended for arrays of atomics whose elements are updated by different
 * threads. Unpadded neighbours can land on the same cache line; a write to
 * one element then invalidates the line holding the other, so both threads
 * keep reloading it even though they never touch the same variable
 * (false sharing).
 *
 * Example usage:
 *   static PaddedAtomic<uint64_t> counters[128];  // one cache line per counter
 *   counters[i].value.fetch_add(1, std::memory_order_relaxed);
 *
 * @tparam T The value type held by the atomic (e.g., uint64_t, int, bool)
 */
template <class T>
struct alignas(CACHE_LINE_SIZE) PaddedAtomic {
    // No explicit padding member is needed: alignas rounds sizeof(PaddedAtomic)
    // up to a multiple of CACHE_LINE_SIZE, so array elements never share a line.
    std::atomic<T> value;
};

/**
 * Lock-free bitset for concurrent membership tracking.
 *
 * A fixed-size bitset that supports set, clear, and test operations built
 * solely on std::atomic<uint64_t> read-modify-write operations (no locks).
 * Every 64-bit word lives in its own cache line (PaddedAtomic) to prevent
 * false sharing between threads operating on different portions of the bitset.
 *
 * Hash-based operations use two hash functions, each addressing its own
 * logical bit array. A key is considered "set" only if both corresponding
 * bits are set. Assuming the two hashes behave independently, this reduces
 * the false positive probability from roughly p to p² compared to a
 * single-hash approach.
 *
 * Caveats:
 * - Distinct keys can hash to the same bit. clear(key) clears its two bits
 *   unconditionally, so it may also make test() return false for another key
 *   that is still logically present (false negative).
 * - set()/clear()/test() touch two words in two separate atomic operations;
 *   a concurrent reader may observe the intermediate state.
 *
 * Thread safety:
 * - No locks and no allocation are used; operations are lock-free whenever
 *   std::atomic<uint64_t> is lock-free on the target platform, which also makes
 *   them usable from signal handlers
 * - Writers use release ordering, readers use acquire ordering
 *
 * Example usage:
 *   static LockFreeBitset<8192> threadSet;
 *
 *   // Hash-based operations (for integer keys like thread IDs)
 *   threadSet.set(tid);                  // Mark thread as member
 *   if (threadSet.test(tid)) { ... }     // Check membership
 *   threadSet.clear(tid);                // Remove from set
 *
 *   // Raw bit operations (when you manage indexing yourself)
 *   threadSet.setRaw(42);                // Set bit 42
 *   threadSet.clearRaw(42);              // Clear bit 42
 *
 * @tparam NumBits Total number of bits per array; must be > 0
 *                 (should be power of 2 for efficient hashing)
 */
template<size_t NumBits>
class LockFreeBitset {
    // NumBits == 0 would mean modulo-by-zero in the hash functions and a zero-length array.
    static_assert(NumBits > 0, "LockFreeBitset requires at least one bit");

public:
    static constexpr size_t NUM_BITS = NumBits;
    static constexpr size_t BITS_PER_WORD = 64;
    static constexpr size_t NUM_WORDS = (NumBits + BITS_PER_WORD - 1) / BITS_PER_WORD;

    /**
     * Initializes the bitset with all bits cleared in both arrays.
     *
     * Uses relaxed stores: each word is cleared atomically, but the bitset as a
     * whole is not cleared as one atomic step with respect to concurrent callers.
     */
    void init() {
        for (auto& word : _words) {
            word.value.store(0, std::memory_order_relaxed);
        }
    }

    /**
     * Sets the bits for the given key using double-hash indexing.
     * Sets one bit in each of the two arrays.
     *
     * @param key Integer key to hash and set
     */
    void set(size_t key) {
        setBit(hashKey1(key), 0);  // Array 1 at even indices
        setBit(hashKey2(key), 1);  // Array 2 at odd indices
    }

    /**
     * Clears the bits for the given key using double-hash indexing.
     * Clears one bit in each of the two arrays, regardless of whether other
     * keys map to the same bits.
     *
     * @param key Integer key to hash and clear
     */
    void clear(size_t key) {
        clearBit(hashKey1(key), 0);  // Array 1 at even indices
        clearBit(hashKey2(key), 1);  // Array 2 at odd indices
    }

    /**
     * Tests if the key is set using double-hash indexing.
     * Returns true only if BOTH bits (from both hash functions) are set.
     *
     * @param key Integer key to hash and test
     * @return true if both bits are set, false otherwise
     */
    bool test(size_t key) const {
        return testBit(hashKey1(key), 0) && testBit(hashKey2(key), 1);
    }

    /**
     * Sets the bit at the given raw index in the primary array (no hashing).
     * Indices outside [0, NumBits) are ignored.
     *
     * @param bit_index Raw bit index (0 to NumBits-1)
     */
    void setRaw(size_t bit_index) {
        if (inRange(bit_index)) {
            setBit(bit_index, 0);  // Use array 1 (even indices)
        }
    }

    /**
     * Clears the bit at the given raw index in the primary array (no hashing).
     * Indices outside [0, NumBits) are ignored.
     *
     * @param bit_index Raw bit index (0 to NumBits-1)
     */
    void clearRaw(size_t bit_index) {
        if (inRange(bit_index)) {
            clearBit(bit_index, 0);  // Use array 1 (even indices)
        }
    }

    /**
     * Tests if the bit at the given raw index is set in the primary array (no hashing).
     *
     * @param bit_index Raw bit index (0 to NumBits-1)
     * @return true if the bit is set; false if it is clear or the index is out of range
     */
    bool testRaw(size_t bit_index) const {
        return inRange(bit_index) && testBit(bit_index, 0);  // Use array 1 (even indices)
    }

    /**
     * Clears all bits in both arrays (same as init()).
     */
    void clearAll() {
        init();
    }

private:
    // Multiplier for the second hash. This is an odd 64-bit constant (the same
    // value used as the seed by the FxHash family); it is NOT the FNV offset basis.
    // Declared as uint64_t so it is never truncated on targets with a 32-bit size_t.
    static constexpr uint64_t HASH2_CONSTANT = 0x517cc1b727220a95ULL;

    // Interleaved array layout.
    // Layout: [word1_0, word2_0, word1_1, word2_1, ..., word1_N-1, word2_N-1]
    // When test() accesses both hash positions and they map to similar word
    // indices, the two words are on adjacent cache lines.
    // Total memory: NUM_WORDS * 2 * 64 bytes (e.g., 256 * 2 * 64 = 32 KB for 16384 bits)
    PaddedAtomic<uint64_t> _words[NUM_WORDS * 2];

    /**
     * Returns true when bit_index addresses a bit inside the logical array.
     */
    static constexpr bool inRange(size_t bit_index) {
        return bit_index < NumBits;
    }

    /**
     * Primary hash function using Knuth multiplicative hash.
     * The result is always in [0, NumBits).
     */
    static size_t hashKey1(size_t key) {
        return (key * KNUTH_MULTIPLICATIVE_CONSTANT) % NumBits;
    }

    /**
     * Secondary hash function using the upper bits of a 64-bit multiplication.
     * While hash1 reduces the low bits of its product (via modulo), hash2
     * reduces the upper 32 bits of a product with a different multiplier, so
     * the two indices are derived from different parts of the mixed key.
     * The result is always in [0, NumBits).
     */
    static size_t hashKey2(size_t key) {
        // Compute in uint64_t: shifting a 32-bit size_t by 32 would be undefined behavior.
        const uint64_t product = static_cast<uint64_t>(key) * HASH2_CONSTANT;
        return static_cast<size_t>((product >> 32) % NumBits);
    }

    /**
     * Maps a logical bit position to its slot in the interleaved word array.
     * @param bit_index The bit index within the logical array
     * @param array_offset 0 for array1 (even indices), 1 for array2 (odd indices)
     */
    static constexpr size_t slotIndex(size_t bit_index, size_t array_offset) {
        return (bit_index / BITS_PER_WORD) * 2 + array_offset;
    }

    /**
     * Returns the single-bit mask selecting bit_index within its 64-bit word.
     */
    static constexpr uint64_t maskFor(size_t bit_index) {
        return uint64_t{1} << (bit_index % BITS_PER_WORD);
    }

    /**
     * Sets a bit in the interleaved array.
     * @param bit_index The bit index within the logical array (caller guarantees < NumBits)
     * @param array_offset 0 for array1 (even indices), 1 for array2 (odd indices)
     */
    void setBit(size_t bit_index, size_t array_offset) {
        _words[slotIndex(bit_index, array_offset)].value.fetch_or(
            maskFor(bit_index), std::memory_order_release);
    }

    /**
     * Clears a bit in the interleaved array.
     * @param bit_index The bit index within the logical array (caller guarantees < NumBits)
     * @param array_offset 0 for array1 (even indices), 1 for array2 (odd indices)
     */
    void clearBit(size_t bit_index, size_t array_offset) {
        _words[slotIndex(bit_index, array_offset)].value.fetch_and(
            ~maskFor(bit_index), std::memory_order_release);
    }

    /**
     * Tests a bit in the interleaved array.
     * @param bit_index The bit index within the logical array (caller guarantees < NumBits)
     * @param array_offset 0 for array1 (even indices), 1 for array2 (odd indices)
     * @return true if the addressed bit is currently set
     */
    bool testBit(size_t bit_index, size_t array_offset) const {
        const uint64_t word =
            _words[slotIndex(bit_index, array_offset)].value.load(std::memory_order_acquire);
        return (word & maskFor(bit_index)) != 0;
    }
};

#endif // _LOCKFREE_H
11 changes: 10 additions & 1 deletion ddprof-lib/src/main/cpp/os_linux_dd.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifdef __linux__

#include "os_dd.h"
#include "thread.h"
#include "common.h"
#include <signal.h>
#include <unistd.h>
Expand Down Expand Up @@ -292,7 +293,15 @@ static void* threadDirectoryWatcherLoop(void* arg) {
int tid = atoi(event->name);
if (tid > 0) {
if (event->mask & (IN_CREATE | IN_MOVED_TO)) {
if (g_on_new_thread) g_on_new_thread(tid);
// Small delay (20ms) to allow JVMTI ThreadStart callback to register Java threads
// This virtually eliminates the race condition between thread creation and JVMTI callback
struct timespec delay = {0, 20000000}; // 20ms
nanosleep(&delay, nullptr);

// Skip sending signal to likely Java threads
if (!ProfiledThread::isLikelyJavaThread(tid) && g_on_new_thread) {
g_on_new_thread(tid);
}
} else if (event->mask & (IN_DELETE | IN_MOVED_FROM)) {
if (g_on_dead_thread) g_on_dead_thread(tid);
}
Expand Down
3 changes: 1 addition & 2 deletions ddprof-lib/src/main/cpp/profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ void Profiler::addRuntimeStub(const void *address, int length,
}

void Profiler::onThreadStart(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread) {
ProfiledThread::initCurrentThread();
ProfiledThread *current = ProfiledThread::current();
int tid = current->tid();
if (_thread_filter.enabled()) {
Expand All @@ -120,7 +119,7 @@ void Profiler::onThreadStart(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread) {
}

void Profiler::onThreadEnd(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread) {
ProfiledThread *current = ProfiledThread::current();
ProfiledThread *current = ProfiledThread::currentSignalSafe();
int tid = -1;

if (current != nullptr) {
Expand Down
Loading
Loading