// LinuxStackFramesCollector.cpp
// Unless explicitly stated otherwise all files in this repository are licensed under the Apache 2 License.
// This product includes software developed at Datadog (https://www.datadoghq.com/). Copyright 2022 Datadog, Inc.
#include "LinuxStackFramesCollector.h"
#include <cassert>
#include <chrono>
#include <errno.h>
#include <iomanip>
#include <libunwind.h>
#include <mutex>
#include <ucontext.h>
#include <unordered_map>
#include "CallstackProvider.h"
#include "IConfiguration.h"
#include "Log.h"
#include "ManagedThreadInfo.h"
#include "OpSysTools.h"
#include "ProfilerSignalManager.h"
#include "ScopeFinalizer.h"
#include "StackSnapshotResultBuffer.h"
using namespace std::chrono_literals;
// Process-wide mutex taken by the sampling thread in CollectStackSampleImplementation and by the
// profiled thread in CollectStackSampleSignalHandler.
std::mutex LinuxStackFramesCollector::s_stackWalkInProgressMutex;
// Collector that currently has a signal-based stack walk in flight. It is set to `this` for the
// duration of a signal-based collection, reset to nullptr when that collection leaves its scope,
// and read by the signal handler to find the instance to report to.
LinuxStackFramesCollector* LinuxStackFramesCollector::s_pInstanceCurrentlyStackWalking = nullptr;
// Creates a collector bound to the given signal manager.
//
// signalManager:     used to register the stack-sampling signal handler and, later, to send the
//                    signal; it is dereferenced here, so it must not be null.
// configuration:     forwarded to the base class and queried for UseBacktrace2().
// callstackProvider: forwarded to the base class.
LinuxStackFramesCollector::LinuxStackFramesCollector(
    ProfilerSignalManager* signalManager,
    IConfiguration const* const configuration,
    CallstackProvider* callstackProvider) :
    StackFramesCollectorBase(configuration, callstackProvider),
    _lastStackWalkErrorCode{0},
    _stackWalkFinished{false},
    _processId{OpSysTools::GetProcId()},
    _signalManager{signalManager},
    _errorStatistics{},
    _useBacktrace2{configuration->UseBacktrace2()}
{
    // From now on the signal manager dispatches to the static handler of this class
    _signalManager->RegisterHandler(LinuxStackFramesCollector::CollectStackSampleSignalHandler);
}
// Flushes the error statistics that were accumulated but not yet logged.
LinuxStackFramesCollector::~LinuxStackFramesCollector()
{
    _errorStatistics.Log();
}
// Rate limiter for the periodic dump of the error statistics.
//
// Returns true when at least TimeIntervalInSeconds (600 s) have elapsed since the last call that
// returned true, and records the current time as the new reference point.
// Returns false otherwise, or when the current time cannot be obtained.
// The reference timestamp starts at 0, so the first call returns true as soon as the clock reads
// at least 600 s past the epoch.
// NOTE(review): the reference timestamp is a function-local static that is read and written
// without synchronization - confirm that this is never called concurrently.
bool LinuxStackFramesCollector::ShouldLogStats()
{
    static std::time_t PreviousPrintTimestamp = 0;
    static const std::int64_t TimeIntervalInSeconds = 600; // print stats every 10min

    // time() reports a failure by returning (time_t)(-1)
    const time_t currentTime = time(nullptr);
    if (currentTime == static_cast<time_t>(-1))
    {
        return false;
    }

    if (currentTime - PreviousPrintTimestamp < TimeIntervalInSeconds)
    {
        return false;
    }

    PreviousPrintTimestamp = currentTime;
    return true;
}
// Records one occurrence of errorCode in the error statistics and dumps the statistics to the
// log when ShouldLogStats() allows it. Does nothing unless debug logging is enabled.
void LinuxStackFramesCollector::UpdateErrorStats(std::int32_t errorCode)
{
    if (!Log::IsDebugEnabled())
    {
        return;
    }

    _errorStatistics.Add(errorCode);

    if (!ShouldLogStats())
    {
        return;
    }

    _errorStatistics.Log();
}
// Collects the call stack of the thread described by pThreadInfo.
//
// pThreadInfo: thread to sample; dereferenced on both paths, so it must not be null.
// pHR:         out parameter, written unconditionally: S_OK when the resulting error code is 0,
//              E_FAIL otherwise. Must not be null.
// selfCollect: true  -> the calling thread walks its own stack directly;
//              false -> a signal is sent to the target thread, which walks its own stack in
//                       CollectStackSampleSignalHandler while this thread waits (2s at most).
// Returns the result of GetStackSnapshotResult() in every case.
StackSnapshotResultBuffer* LinuxStackFramesCollector::CollectStackSampleImplementation(ManagedThreadInfo* pThreadInfo,
                                                                                     uint32_t* pHR,
                                                                                     bool selfCollect)
{
    long errorCode;

    if (selfCollect)
    {
        // In case we are self-unwinding, we do not want to be interrupted by the signal-based profilers (walltime and cpu).
        // Being interrupted could crash in libunwind (accessing a memory area which was unmapped).
        // This lock is acquired by the signal-based profiler (see StackSamplerLoop->StackSamplerLoopManager)
        pThreadInfo->GetStackWalkLock().Acquire();
        on_leave
        {
            pThreadInfo->GetStackWalkLock().Release();
        };

        // nullptr context: the walk starts from the current frame, not from a signal frame
        errorCode = CollectCallStackCurrentThread(nullptr);
    }
    else
    {
        // Without our signal handler installed, the target thread would never walk its stack
        if (!_signalManager->IsHandlerInPlace())
        {
            *pHR = E_FAIL;
            return GetStackSnapshotResult();
        }

        // The lock is declared before the on_leave below, so the static instance pointer is reset
        // while the mutex is still held.
        std::unique_lock<std::mutex> stackWalkInProgressLock(s_stackWalkInProgressMutex);

        const auto threadId = static_cast<::pid_t>(pThreadInfo->GetOsThreadId());

        // Publish this instance so that the signal handler knows whom to report to
        s_pInstanceCurrentlyStackWalking = this;

        on_leave { s_pInstanceCurrentlyStackWalking = nullptr; };

        _stackWalkFinished = false;

        errorCode = _signalManager->SendSignal(threadId);

        if (errorCode == -1)
        {
            Log::Warn("LinuxStackFramesCollector::CollectStackSampleImplementation:"
                      " Unable to send signal USR1 to thread with threadId=",
                      threadId, ". Error code: ", strerror(errno));
        }
        else
        {
            // release the lock and wait for a notification or the 2s timeout
            // NOTE(review): wait_for is called without a predicate and _stackWalkFinished is not
            // consulted in this function, so a spurious wake-up would be handled like a completed
            // walk - confirm this is intended.
            auto status = _stackWalkInProgressWaiter.wait_for(stackWalkInProgressLock, 2s);

            // The lock is reacquired, but we might have faced an issue:
            // - the thread is dead and the lock released
            // - the profiler signal handler was replaced
            if (status == std::cv_status::timeout)
            {
                _lastStackWalkErrorCode = E_ABORT;
                ;
                if (!_signalManager->CheckSignalHandler())
                {
                    _lastStackWalkErrorCode = E_FAIL;
                    Log::Info("Profiler signal handler was replaced but we failed or stopped at restoring it. We won't be able to collect callstacks.");
                    *pHR = E_FAIL;
                    return GetStackSnapshotResult();
                }
            }

            // Either the value published by NotifyStackWalkCompleted or the E_ABORT set above on timeout
            errorCode = _lastStackWalkErrorCode;
        }
    }

    // errorCode domain values
    // * < 0 : libunwind error codes
    // * > 0 : other errors (ex: failed to create frame while walking the stack)
    // * == 0 : success
    if (errorCode < 0)
    {
        UpdateErrorStats(errorCode);
    }

    *pHR = (errorCode == 0) ? S_OK : E_FAIL;

    return GetStackSnapshotResult();
}
// Publishes the outcome of a stack walk and wakes up one thread waiting on
// _stackWalkInProgressWaiter. The error code and the finished flag are stored before the
// notification is sent.
// resultErrorCode: value later read back by CollectStackSampleImplementation through
//                  _lastStackWalkErrorCode.
void LinuxStackFramesCollector::NotifyStackWalkCompleted(std::int32_t resultErrorCode)
{
    _lastStackWalkErrorCode = resultErrorCode;
    _stackWalkFinished = true;
    _stackWalkInProgressWaiter.notify_one();
}
// This symbol is defined in the Datadog.Linux.ApiWrapper. It allows us to check if the thread to be profiled
// contains a frame of a function that might cause a deadlock.
// It is declared weak: its address is compared against nullptr before it is called, because the
// definition may not be present.
extern "C" unsigned long long dd_inside_wrapped_functions() __attribute__((weak));
// Walks the stack of the calling thread, using either unw_backtrace2 or the manual
// frame-by-frame walk depending on the configuration.
//
// ctx: context handed over to the selected walker (nullptr when not called from the signal handler).
// Returns E_ABORT when dd_inside_wrapped_functions reports a non-zero value or when anything
// throws during the collection; otherwise the value returned by the selected walker.
std::int32_t LinuxStackFramesCollector::CollectCallStackCurrentThread(void* ctx)
{
    // The symbol is weak: only call it when a definition is actually available
    const bool wrapperCheckAvailable = dd_inside_wrapped_functions != nullptr;
    if (wrapperCheckAvailable && dd_inside_wrapped_functions() != 0)
    {
        return E_ABORT;
    }

    try
    {
        // Collect data for TraceContext tracking:
        TryApplyTraceContextDataFromCurrentCollectionThreadToSnapshot();

        if (_useBacktrace2)
        {
            return CollectStackWithBacktrace2(ctx);
        }

        return CollectStackManually(ctx);
    }
    catch (...)
    {
        // No exception may escape from here: report the walk as aborted instead
        return E_ABORT;
    }
}
std::int32_t LinuxStackFramesCollector::CollectStackManually(void* ctx)
{
std::int32_t resultErrorCode;
// if we are in the signal handler, ctx won't be null, so we will use the context
// This will allow us to skip the syscall frame and start from the frame before the syscall.
auto flag = UNW_INIT_SIGNAL_FRAME;
unw_context_t context;
if (ctx != nullptr)
{
context = *reinterpret_cast<unw_context_t*>(ctx);
}
else
{
// not in signal handler. Get the context and initialize the cursor form here
resultErrorCode = unw_getcontext(&context);
if (resultErrorCode != 0)
{
return E_ABORT; // unw_getcontext does not return a specific error code. Only -1
}
flag = static_cast<unw_init_local2_flags_t>(0);
}
unw_cursor_t cursor;
resultErrorCode = unw_init_local2(&cursor, &context, flag);
if (resultErrorCode < 0)
{
return resultErrorCode;
}
do
{
// After every lib call that touches non-local state, check if the StackSamplerLoopManager requested this walk to abort:
if (IsCurrentCollectionAbortRequested())
{
AddFakeFrame();
return E_ABORT;
}
unw_word_t ip;
resultErrorCode = unw_get_reg(&cursor, UNW_REG_IP, &ip);
if (resultErrorCode != 0)
{
return resultErrorCode;
}
if (!AddFrame(ip))
{
return S_FALSE;
}
resultErrorCode = unw_step(&cursor);
} while (resultErrorCode > 0);
return resultErrorCode;
}
// Fills the frame buffer returned by Data() in a single unw_backtrace2 call.
//
// ctx: reinterpreted as a unw_context_t* and passed to unw_backtrace2 together with
//      UNW_INIT_SIGNAL_FRAME.
// Returns E_FAIL when unw_backtrace2 reports 0 frames; otherwise stores the frame count with
// SetFrameCount and returns S_OK.
std::int32_t LinuxStackFramesCollector::CollectStackWithBacktrace2(void* ctx)
{
    auto* unwindContext = reinterpret_cast<unw_context_t*>(ctx);

    // Now walk the stack:
    auto frames = Data();
    const auto frameCount = unw_backtrace2((void**)frames.data(), frames.size(), unwindContext, UNW_INIT_SIGNAL_FRAME);

    if (frameCount != 0)
    {
        SetFrameCount(frameCount);
        return S_OK;
    }

    return E_FAIL;
}
// Tells whether a received signal is the one this collector is waiting for.
//
// threadId:  OS id of the thread running the signal handler.
// processId: id of the process that sent the signal.
// Returns true only when a collection is in progress, the thread being collected is threadId
// and the signal was sent by our own process.
bool LinuxStackFramesCollector::CanCollect(int32_t threadId, pid_t processId) const
{
    // on OSX, processId can be equal to 0. https://sourcegraph.com/github.com/dotnet/runtime/-/blob/src/coreclr/pal/src/exception/signal.cpp?L818:5&subtree=true
    // Since the profiler does not run on OSX, we leave it like this.
    auto* collectedThreadInfo = _pCurrentCollectionThreadInfo;

    if (collectedThreadInfo == nullptr)
    {
        return false;
    }

    if (!(collectedThreadInfo->GetOsThreadId() == threadId))
    {
        return false;
    }

    return processId == _processId;
}
// Flags the thread currently being collected, if any, as interrupted.
void LinuxStackFramesCollector::MarkAsInterrupted()
{
    auto* collectedThreadInfo = _pCurrentCollectionThreadInfo;

    if (collectedThreadInfo == nullptr)
    {
        return;
    }

    collectedThreadInfo->MarkAsInterrupted();
}
// Tells whether SIGSEGV belongs to the signal mask stored in the given context.
//
// context: pointer to the ucontext_t received by a signal handler.
// If SIGSEGV is part of the sigmask set, it means that the thread was executing
// the SIGSEGV signal handler (or someone blocks the SIGSEGV signal for this thread,
// but that is less likely).
bool IsInSigSegvHandler(void* context)
{
    const auto* signalContext = static_cast<const ucontext_t*>(context);
    const int membership = sigismember(&signalContext->uc_sigmask, SIGSEGV);

    return membership == 1;
}
// Signal handler executed on the thread that received the profiler signal.
//
// signal:  signal number (not used in this function).
// info:    signal information; si_pid is used to check who sent the signal.
// context: context of the interrupted thread; it is inspected for SIGSEGV in its signal mask
//          and handed to the stack walker.
// Returns true only when the stack of the current thread was collected and the sampling
// thread was notified; false in every other case.
bool LinuxStackFramesCollector::CollectStackSampleSignalHandler(int signal, siginfo_t* info, void* context)
{
    // This is a workaround to prevent libunwind from unwinding 2 signal frames and potentially crashing.
    // Current crash occurs in libcoreclr.so, while reading the Elf header.
    if (IsInSigSegvHandler(context))
    {
        return false;
    }

    // Libunwind can overwrite the value of errno - save it beforehand and restore it at the end
    auto oldErrno = errno;

    bool success = false;

    // First check without the lock: nothing to do when no collection is in progress
    LinuxStackFramesCollector* pCollectorInstance = s_pInstanceCurrentlyStackWalking;

    if (pCollectorInstance != nullptr)
    {
        std::unique_lock<std::mutex> stackWalkInProgressLock(s_stackWalkInProgressMutex);

        // Read the pointer again now that the lock is held: it may have been reset in between
        pCollectorInstance = s_pInstanceCurrentlyStackWalking;

        // sampling in progress
        if (pCollectorInstance != nullptr)
        {
            pCollectorInstance->MarkAsInterrupted();

            // There can be a race:
            // The sampling thread has sent the signal and is waiting, but another SIGUSR1 signal was sent
            // by another thread and is handled before the one sent by the sampling thread.
            if (pCollectorInstance->CanCollect(OpSysTools::GetThreadId(), info->si_pid))
            {
                // In case it's the thread we want to sample, just get its callstack
                auto resultErrorCode = pCollectorInstance->CollectCallStackCurrentThread(context);

                // release the lock
                stackWalkInProgressLock.unlock();
                pCollectorInstance->NotifyStackWalkCompleted(resultErrorCode);
                success = true;
            }
        }
        // no need to release the lock and notify. The sampling thread must wait until its signal is handled correctly
    }

    errno = oldErrno;
    return success;
}
// Counts one more occurrence of errorCode; an entry is created on first use.
void LinuxStackFramesCollector::ErrorStatistics::Add(std::int32_t errorCode)
{
    ++_stats[errorCode];
}
// Writes the recorded error codes to the log as a table (number of occurrences, libunwind error
// message, numeric code), then clears the statistics. Does nothing when no error was recorded.
void LinuxStackFramesCollector::ErrorStatistics::Log()
{
    if (_stats.empty())
    {
        return;
    }

    std::stringstream table;

    // Header row
    table << std::setfill(' ') << std::setw(13) << "# occurrences"
          << " | "
          << "Error message\n";

    // One row per distinct error code
    for (auto& entry : _stats)
    {
        auto& errorCode = entry.first;
        auto& occurrences = entry.second;

        table << std::setfill(' ') << std::setw(10) << occurrences << " | " << unw_strerror(errorCode) << " (" << errorCode << ")\n";
    }

    Log::Info("LinuxStackFramesCollector::CollectStackSampleImplementation: The sampler thread encoutered errors in the interval\n",
              table.str());

    // Start a new interval
    _stats.clear();
}