From e0fc45674d35f6f1565ad1d3feb0751b3f97ebab Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Wed, 13 May 2026 17:49:56 +0200 Subject: [PATCH] fix(profiler): close SIGVTALRM race in thread teardown (PROF-14603) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JavaThread::~JavaThread / OSThread::~OSThread crashed on JDK 25 when the ddprof pthread_create hook delivered SIGVTALRM between Profiler::unregisterThread() returning and ProfiledThread::release() acquiring its internal guard. The signal handler called currentSignalSafe() and dereferenced the now-freed ProfiledThread. Fix: extract unregister_and_release(tid) — a noinline helper that holds a SignalBlocker for the entire unregister+release sequence. Both start_routine_wrapper and start_routine_wrapper_spec invoke it; the race window is eliminated without duplicating signal-masking logic. Same SignalBlocker pattern is applied to perfEvents_linux.cpp's pthread_setspecific_hook teardown path. thread.h guards clearCurrentThreadTLS() with #ifdef UNIT_TEST so it is absent from production builds; GtestTaskBuilder.kt adds -DUNIT_TEST to the gtest compiler flags so the guarded method compiles in tests. thread_teardown_safety_ut.cpp adds an acceptance-test suite (ThreadTeardownSafetyTest T-01..T-10) covering the full teardown lifecycle under signal load. Co-Authored-By: Claude Sonnet 4.6 --- .../native/gtest/GtestTaskBuilder.kt | 3 + .../src/main/cpp/libraryPatcher_linux.cpp | 22 +- ddprof-lib/src/main/cpp/perfEvents_linux.cpp | 7 +- ddprof-lib/src/main/cpp/thread.h | 7 +- .../test/cpp/thread_teardown_safety_ut.cpp | 346 ++++++++++++++++++ 5 files changed, 377 insertions(+), 8 deletions(-) create mode 100644 ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt index 9da2e260e..cc4e80770 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt @@ -204,6 +204,9 @@ class GtestTaskBuilder( args.add("-D__musl__") } + // Mark unit-test builds so test-only production APIs are compiled in. + args.add("-DUNIT_TEST") + return args } } diff --git a/ddprof-lib/src/main/cpp/libraryPatcher_linux.cpp b/ddprof-lib/src/main/cpp/libraryPatcher_linux.cpp index 0c04f482f..ff984a33b 100644 --- a/ddprof-lib/src/main/cpp/libraryPatcher_linux.cpp +++ b/ddprof-lib/src/main/cpp/libraryPatcher_linux.cpp @@ -55,6 +55,20 @@ class RoutineInfo { } }; +// Unregister the current thread from the profiler and release its TLS under a +// single SignalBlocker to close the race window between unregisterThread() +// returning and release() acquiring its internal guard (PROF-14603). Without +// this, a SIGVTALRM delivered in that window could call currentSignalSafe() +// and dereference a now-freed ProfiledThread. Kept noinline so the +// SignalBlocker's sigset_t does not appear in the caller's stack frame on +// musl/aarch64 where the deopt blob may corrupt the wrapper's stack guard. +__attribute__((noinline)) +static void unregister_and_release(int tid) { + SignalBlocker blocker; + Profiler::unregisterThread(tid); + ProfiledThread::release(); +} + #ifdef __aarch64__ // Delete RoutineInfo with profiling signals blocked to prevent ASAN // allocator lock reentrancy. Kept noinline so SignalBlocker's sigset_t @@ -129,8 +143,9 @@ static void* start_routine_wrapper_spec(void* args) { delete_routine_info(thr); init_tls_and_register(); routine(params); - Profiler::unregisterThread(ProfiledThread::currentTid()); - ProfiledThread::release(); + // unregister_and_release() holds SignalBlocker for the duration, closing the + // race window between unregisterThread() and release() (PROF-14603). + unregister_and_release(ProfiledThread::currentTid()); // pthread_exit instead of 'return': the saved LR in this frame is corrupted // by DEOPT PACKING; returning would jump to a garbage address. pthread_exit(nullptr); @@ -184,8 +199,9 @@ static void* start_routine_wrapper(void* args) { } // RAII cleanup: reads tid from TLS in the destructor (same rationale as // start_routine_wrapper_spec: avoids storing state on a potentially corruptible frame). + // unregister_and_release() wraps the two calls under SignalBlocker (PROF-14603). struct Cleanup { - ~Cleanup() { Profiler::unregisterThread(ProfiledThread::currentTid()); ProfiledThread::release(); } + ~Cleanup() { unregister_and_release(ProfiledThread::currentTid()); } } cleanup; routine(params); return nullptr; diff --git a/ddprof-lib/src/main/cpp/perfEvents_linux.cpp b/ddprof-lib/src/main/cpp/perfEvents_linux.cpp index 63bd74da6..2edc05f46 100644 --- a/ddprof-lib/src/main/cpp/perfEvents_linux.cpp +++ b/ddprof-lib/src/main/cpp/perfEvents_linux.cpp @@ -184,8 +184,11 @@ static int pthread_setspecific_hook(pthread_key_t key, const void *value) { return result; } else { int tid = ProfiledThread::currentTid(); - Profiler::unregisterThread(tid); - ProfiledThread::release(); + { + SignalBlocker blocker; + Profiler::unregisterThread(tid); + ProfiledThread::release(); + } return pthread_setspecific(key, value); } } diff --git a/ddprof-lib/src/main/cpp/thread.h b/ddprof-lib/src/main/cpp/thread.h index 53357f87c..4428aab94 100644 --- a/ddprof-lib/src/main/cpp/thread.h +++ b/ddprof-lib/src/main/cpp/thread.h @@ -82,14 +82,15 @@ class ProfiledThread : public ThreadLocalData { static void initCurrentThread(); static void release(); - // Clears TLS without deleting the ProfiledThread. For unit tests only: - // simulates the moment inside release() after pthread_setspecific(NULL) but - // before delete, which is the race window the _thread_ptr fix covers. +#ifdef UNIT_TEST + // Simulates the moment inside release() after pthread_setspecific(NULL) but + // before delete — the race window the clearCurrentThreadTLS fix covers. static void clearCurrentThreadTLS() { if (__atomic_load_n(&_tls_key_initialized, __ATOMIC_ACQUIRE)) { pthread_setspecific(_tls_key, nullptr); } } +#endif static ProfiledThread *current(); static ProfiledThread *currentSignalSafe(); // Signal-safe version that never allocates diff --git a/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp b/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp new file mode 100644 index 000000000..b21a69dd8 --- /dev/null +++ b/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp @@ -0,0 +1,346 @@ +/* + * Copyright 2026 Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + * + * Acceptance tests for the PROF-14603 signal-race fix in thread teardown. + * + * Root cause: SIGVTALRM delivered between ProfiledThread::release() clearing TLS + * (pthread_setspecific(NULL)) and deleting the ProfiledThread object, causing + * a signal handler to call currentSignalSafe() and dereference a freed pointer. + * + * Fix coverage: + * - Signal-safe TLS accessor returns null in the race window (no crash). + * - SignalBlocker properly guards the unregister+release sequence. + * - Thread lifecycle (init/release) is race-free under concurrent signal load. + */ + +#include + +#ifdef __linux__ + +#include "guards.h" +#include "thread.h" + +#include +#include +#include +#include +#include +#include + +// Sentinel value meaning "handler has not run yet" — distinct from both nullptr +// (not registered) and any real ProfiledThread address. +static ProfiledThread* const kNotYetRun = reinterpret_cast(1); + +// Use sigaction() instead of signal() so the handler persists across platforms; +// signal() has implementation-defined reset-on-deliver (SA_RESETHAND) behaviour. +static inline void install_handler(int sig, void (*handler)(int)) { + struct sigaction sa{}; + sa.sa_handler = handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + sigaction(sig, &sa, nullptr); +} + +// ── T-01: currentSignalSafe() is non-null while live, null after release ───── + +static std::atomic g_t01_seen{nullptr}; + +static void t01_handler(int) { + g_t01_seen.store(ProfiledThread::currentSignalSafe(), std::memory_order_relaxed); +} + +static void *t01_body(void *) { + ProfiledThread::initCurrentThread(); + + install_handler(SIGVTALRM, t01_handler); + g_t01_seen.store(kNotYetRun, std::memory_order_relaxed); + pthread_kill(pthread_self(), SIGVTALRM); + EXPECT_NE(nullptr, g_t01_seen.load(std::memory_order_relaxed)) + << "currentSignalSafe() must return non-null while ProfiledThread is live"; + + ProfiledThread::release(); + + g_t01_seen.store(kNotYetRun, std::memory_order_relaxed); + pthread_kill(pthread_self(), SIGVTALRM); + EXPECT_EQ(nullptr, g_t01_seen.load(std::memory_order_relaxed)) + << "currentSignalSafe() must return null after release()"; + + signal(SIGVTALRM, SIG_IGN); + return nullptr; +} + +// Verifies the post-release null guarantee seen from a signal handler. +TEST(ThreadTeardownSafetyTest, TLSAccessibleDuringLifetimeNullAfterRelease) { + pthread_t t; + ASSERT_EQ(0, pthread_create(&t, nullptr, t01_body, nullptr)); + pthread_join(t, nullptr); +} + +// ── T-02: Signal in the TLS-clear/delete window does not crash ─────────────── + +static std::atomic g_t02_seen{nullptr}; + +static void t02_handler(int) { + g_t02_seen.store(ProfiledThread::currentSignalSafe(), std::memory_order_relaxed); +} + +static void *t02_body(void *) { + ProfiledThread::initCurrentThread(); + + install_handler(SIGVTALRM, t02_handler); + g_t02_seen.store(kNotYetRun, std::memory_order_relaxed); + + // Simulate the race window: TLS cleared but object not yet freed. + ProfiledThread::clearCurrentThreadTLS(); + + // Signal delivered in the race window must see null, not a dangling pointer. + pthread_kill(pthread_self(), SIGVTALRM); + EXPECT_EQ(nullptr, g_t02_seen.load(std::memory_order_relaxed)) + << "currentSignalSafe() must return null in the TLS-clear/delete window"; + + signal(SIGVTALRM, SIG_IGN); + // release() with TLS already null must not double-free. + ProfiledThread::release(); + return nullptr; +} + +// Regression for the primary crash path: signal fires between clearTLS and delete. +TEST(ThreadTeardownSafetyTest, SignalInTLSClearDeleteWindowDoesNotCrash) { + pthread_t t; + ASSERT_EQ(0, pthread_create(&t, nullptr, t02_body, nullptr)); + pthread_join(t, nullptr); +} + +// ── T-03: Double release() is idempotent ───────────────────────────────────── + +static void *t03_body(void *) { + ProfiledThread::initCurrentThread(); + ProfiledThread::release(); + ProfiledThread::release(); // must not crash or double-free + return nullptr; +} + +TEST(ThreadTeardownSafetyTest, DoubleReleaseIsIdempotent) { + pthread_t t; + ASSERT_EQ(0, pthread_create(&t, nullptr, t03_body, nullptr)); + pthread_join(t, nullptr); +} + +// ── T-04: Signal-safe accessor returns null without initCurrentThread() ────── + +static std::atomic g_t04_seen{nullptr}; + +static void t04_handler(int) { + g_t04_seen.store(ProfiledThread::currentSignalSafe(), std::memory_order_relaxed); +} + +static void *t04_body(void *) { + // Intentionally no initCurrentThread(). + install_handler(SIGVTALRM, t04_handler); + g_t04_seen.store(kNotYetRun, std::memory_order_relaxed); + pthread_kill(pthread_self(), SIGVTALRM); + EXPECT_EQ(nullptr, g_t04_seen.load(std::memory_order_relaxed)) + << "currentSignalSafe() must return null for unregistered threads"; + signal(SIGVTALRM, SIG_IGN); + return nullptr; +} + +TEST(ThreadTeardownSafetyTest, SignalSafeAccessorReturnsNullWithoutInit) { + pthread_t t; + ASSERT_EQ(0, pthread_create(&t, nullptr, t04_body, nullptr)); + pthread_join(t, nullptr); +} + +// ── T-05: Concurrent signals during teardown stress ────────────────────────── + +static void *t05_worker(void *) { + signal(SIGVTALRM, SIG_IGN); + signal(SIGPROF, SIG_IGN); + ProfiledThread::initCurrentThread(); + for (int i = 0; i < 10; ++i) { + pthread_kill(pthread_self(), SIGVTALRM); + } + ProfiledThread::release(); + return nullptr; +} + +// 20 threads × 5 rounds with signal spray; no crash expected. +TEST(ThreadTeardownSafetyTest, ConcurrentSignalsDuringTeardownStress) { + for (int round = 0; round < 5; ++round) { + std::vector threads(20); + for (auto &tid : threads) { + ASSERT_EQ(0, pthread_create(&tid, nullptr, t05_worker, nullptr)); + } + for (auto &tid : threads) { + pthread_join(tid, nullptr); + } + } +} + +// ── T-06: SignalBlocker masks SIGPROF + SIGVTALRM and restores on exit ──────── + +static void *t06_body(void *) { + sigset_t before, during, after; + + pthread_sigmask(SIG_SETMASK, nullptr, &before); + EXPECT_FALSE(sigismember(&before, SIGVTALRM)); + EXPECT_FALSE(sigismember(&before, SIGPROF)); + + { + SignalBlocker blocker; + pthread_sigmask(SIG_SETMASK, nullptr, &during); + EXPECT_TRUE(sigismember(&during, SIGVTALRM)) + << "SignalBlocker must block SIGVTALRM"; + EXPECT_TRUE(sigismember(&during, SIGPROF)) + << "SignalBlocker must block SIGPROF"; + } + + pthread_sigmask(SIG_SETMASK, nullptr, &after); + EXPECT_FALSE(sigismember(&after, SIGVTALRM)) + << "SignalBlocker must restore SIGVTALRM on exit"; + EXPECT_FALSE(sigismember(&after, SIGPROF)) + << "SignalBlocker must restore SIGPROF on exit"; + return nullptr; +} + +TEST(ThreadTeardownSafetyTest, SignalBlockerMasksAndRestoresProfSignals) { + pthread_t t; + ASSERT_EQ(0, pthread_create(&t, nullptr, t06_body, nullptr)); + pthread_join(t, nullptr); +} + +// ── T-07: Forced unwind with concurrent signal does not crash ───────────────── + +static std::atomic g_t07_cleanup_ran{false}; +static std::atomic g_t07_release_ran{false}; + +static void *t07_body(void *) { + g_t07_cleanup_ran.store(false, std::memory_order_relaxed); + g_t07_release_ran.store(false, std::memory_order_relaxed); + + signal(SIGVTALRM, SIG_IGN); + ProfiledThread::initCurrentThread(); + + try { + // Inject a signal before the cancellation point to exercise the combined path. + pthread_kill(pthread_self(), SIGVTALRM); + while (true) { + pthread_testcancel(); + usleep(100); + } + } catch (abi::__forced_unwind &) { + g_t07_cleanup_ran.store(true, std::memory_order_relaxed); + ProfiledThread::release(); + g_t07_release_ran.store(true, std::memory_order_relaxed); + throw; + } + ProfiledThread::release(); + return nullptr; +} + +TEST(ThreadTeardownSafetyTest, ForcedUnwindWithConcurrentSignalDoesNotCrash) { + pthread_t t; + ASSERT_EQ(0, pthread_create(&t, nullptr, t07_body, nullptr)); + usleep(5000); + pthread_cancel(t); + void *retval; + ASSERT_EQ(0, pthread_join(t, &retval)); + EXPECT_TRUE(g_t07_cleanup_ran.load()); + EXPECT_TRUE(g_t07_release_ran.load()); + EXPECT_EQ(PTHREAD_CANCELED, retval); +} + +// ── T-08: Double initCurrentThread() is idempotent ─────────────────────────── + +static void *t08_body(void *) { + ProfiledThread::initCurrentThread(); + ProfiledThread *first = ProfiledThread::currentSignalSafe(); + EXPECT_NE(nullptr, first); + + ProfiledThread::initCurrentThread(); // second call on the same thread + ProfiledThread *second = ProfiledThread::currentSignalSafe(); + EXPECT_NE(nullptr, second); + EXPECT_EQ(first, second) << "double init must not allocate a second ProfiledThread"; + + ProfiledThread::release(); + return nullptr; +} + +TEST(ThreadTeardownSafetyTest, DoubleInitIsIdempotent) { + pthread_t t; + ASSERT_EQ(0, pthread_create(&t, nullptr, t08_body, nullptr)); + pthread_join(t, nullptr); +} + +// ── T-09: High-frequency signals during thread churn ───────────────────────── + +static std::atomic g_t09_stop{false}; + +static void *t09_injector(void *) { + while (!g_t09_stop.load(std::memory_order_relaxed)) { + kill(getpid(), SIGVTALRM); + usleep(500); // ~2 kHz + } + return nullptr; +} + +static void *t09_worker(void *) { + signal(SIGVTALRM, SIG_IGN); + ProfiledThread::initCurrentThread(); + usleep(100); + ProfiledThread::release(); + return nullptr; +} + +// Mirrors DumpWhileChurningThreadsTest at native level: 100 short-lived threads +// under continuous SIGVTALRM injection must complete without crash. +TEST(ThreadTeardownSafetyTest, HighFrequencySignalsDuringThreadChurn) { + signal(SIGVTALRM, SIG_IGN); + + g_t09_stop.store(false, std::memory_order_relaxed); + pthread_t injector; + ASSERT_EQ(0, pthread_create(&injector, nullptr, t09_injector, nullptr)); + + for (int i = 0; i < 100; ++i) { + pthread_t worker; + ASSERT_EQ(0, pthread_create(&worker, nullptr, t09_worker, nullptr)); + pthread_join(worker, nullptr); + } + + g_t09_stop.store(true, std::memory_order_relaxed); + pthread_join(injector, nullptr); + signal(SIGVTALRM, SIG_DFL); +} + +// ── T-10: CriticalSection reentrancy guard prevents double-entry ────────────── + +static void *t10_body(void *) { + ProfiledThread::initCurrentThread(); + ProfiledThread *pt = ProfiledThread::currentSignalSafe(); + ASSERT_NE(nullptr, pt); + + // Outer critical section must succeed. + EXPECT_TRUE(pt->tryEnterCriticalSection()); + + // Simulated reentrancy from the same thread (e.g. nested signal handler). + EXPECT_FALSE(pt->tryEnterCriticalSection()) + << "reentrancy must be rejected while outer critical section is active"; + + pt->exitCriticalSection(); + + // After exit, entry succeeds again. + EXPECT_TRUE(pt->tryEnterCriticalSection()); + pt->exitCriticalSection(); + + ProfiledThread::release(); + return nullptr; +} + +TEST(ThreadTeardownSafetyTest, CriticalSectionReentrancyGuard) { + pthread_t t; + ASSERT_EQ(0, pthread_create(&t, nullptr, t10_body, nullptr)); + pthread_join(t, nullptr); +} + +#endif // __linux__