Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add frame tags and send snapshots on addThrowable #6824

Merged
merged 2 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 4 additions & 1 deletion dd-java-agent/agent-debugger/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@ excludedClassesCoverage += [
'com.datadog.debugger.agent.DebuggerAgent',
// too old for this coverage (JDK 1.2)
'antlr.*',
'com.datadog.debugger.util.MoshiSnapshotHelper' // only static classes
// only static classes
'com.datadog.debugger.util.MoshiSnapshotHelper',
// based on JDK WeakHashMap
'com.datadog.debugger.util.WeakIdentityHashMap*'
]

dependencies {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import datadog.trace.api.Config;
import datadog.trace.bootstrap.debugger.util.TimeoutChecker;
import datadog.trace.bootstrap.instrumentation.api.AgentSpan;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
Expand Down Expand Up @@ -63,7 +64,7 @@ public interface ValueSerializer {
}

public interface ExceptionDebugger {
void handleException(Throwable t);
void handleException(Throwable t, AgentSpan span);
}

private static volatile ProbeResolver probeResolver;
Expand Down Expand Up @@ -331,11 +332,15 @@ public static void commit(
}
}

public static void handleException(Throwable t) {
ExceptionDebugger exDebugger = exceptionDebugger;
if (exDebugger == null) {
return;
public static void handleException(Throwable t, AgentSpan span) {
try {
ExceptionDebugger exDebugger = exceptionDebugger;
if (exDebugger == null) {
return;
}
exDebugger.handleException(t, span);
} catch (Exception ex) {
LOGGER.debug("Error in handleException: ", ex);
shatzi marked this conversation as resolved.
Show resolved Hide resolved
}
exDebugger.handleException(t);
}
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
package com.datadog.debugger.exception;

import static com.datadog.debugger.agent.ConfigurationAcceptor.Source.EXCEPTION;
import static com.datadog.debugger.util.ExceptionHelper.createThrowableMapping;

import com.datadog.debugger.agent.ConfigurationUpdater;
import com.datadog.debugger.agent.DebuggerAgent;
import com.datadog.debugger.exception.ExceptionProbeManager.ThrowableState;
import com.datadog.debugger.sink.Snapshot;
import com.datadog.debugger.util.ClassNameFiltering;
import com.datadog.debugger.util.ExceptionHelper;
import datadog.trace.bootstrap.debugger.DebuggerContext;
import datadog.trace.bootstrap.instrumentation.api.AgentSpan;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -14,6 +21,10 @@
*/
public class DefaultExceptionDebugger implements DebuggerContext.ExceptionDebugger {
private static final Logger LOGGER = LoggerFactory.getLogger(DefaultExceptionDebugger.class);
public static final String DD_DEBUG_ERROR_EXCEPTION_ID = "_dd.debug.error.exception_id";
public static final String ERROR_DEBUG_INFO_CAPTURED = "error.debug_info_captured";
public static final String SNAPSHOT_ID_TAG_FMT = "_dd.debug.error.%d.snapshot_id";

private final ExceptionProbeManager exceptionProbeManager;
private final ConfigurationUpdater configurationUpdater;
private final ClassNameFiltering classNameFiltering;
Expand All @@ -33,16 +44,66 @@ public DefaultExceptionDebugger(
}

@Override
public void handleException(Throwable t) {
public void handleException(Throwable t, AgentSpan span) {
String fingerprint = Fingerprinter.fingerprint(t, classNameFiltering);
if (fingerprint == null) {
LOGGER.debug("Unable to fingerprint exception", t);
return;
}
Throwable innerMostException = ExceptionHelper.getInnerMostThrowable(t);
if (innerMostException == null) {
LOGGER.debug("Unable to find root cause of exception");
return;
}
if (exceptionProbeManager.isAlreadyInstrumented(fingerprint)) {
// TODO trigger send snapshots already captured
ThrowableState state = exceptionProbeManager.getSateByThrowable(innerMostException);
if (state == null) {
LOGGER.debug("Unable to find state for throwable: {}", innerMostException.toString());
return;
shatzi marked this conversation as resolved.
Show resolved Hide resolved
}
span.setTag(ERROR_DEBUG_INFO_CAPTURED, true);
shatzi marked this conversation as resolved.
Show resolved Hide resolved
if (span.getTag(DD_DEBUG_ERROR_EXCEPTION_ID) != null) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how this can happen?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2 instrumentations at different level are triggered:

  • one at framework level: Spring
  • one at http server level: tomcat
    so 2 calls to addThrowable are made for the same span.

LOGGER.debug("Clear previous frame tags");
// already set for this span, clear the frame tags
span.getTags()
.forEach(
(k, v) -> {
if (k.startsWith("_dd.debug.error.")) {
shatzi marked this conversation as resolved.
Show resolved Hide resolved
span.setTag(k, (String) null);
}
});
}
span.setTag(DD_DEBUG_ERROR_EXCEPTION_ID, state.getExceptionId());
LOGGER.debug(
"add tag to span[{}]: {}: {}",
span.getSpanId(),
DD_DEBUG_ERROR_EXCEPTION_ID,
state.getExceptionId());
int[] mapping = createThrowableMapping(innerMostException, t);
StackTraceElement[] innerTrace = innerMostException.getStackTrace();
int currentIdx = 0;
List<Snapshot> snapshots = state.getSnapshots();
for (int i = 0; i < snapshots.size(); i++) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this deal with recursion functions? if we have all snapshots for all frames this would work, but if some of the frames are missing this can go out of sync.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not, this approach have limitations, but it's good enough for a first approach. it will be refined later

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WE need to revisit later this naive heuristic using stack depth to perform assignment and use the class and method name to ensure we don't mess up

Snapshot snapshot = snapshots.get(i);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

because each snapshot have a reference to throwable at that time. we can reduce the search into that section of frames. aka, assume we have throwable T1 -> T2 -> T3 (inner most). then all snapshots that captured T2 should be located on those frames.

String className = snapshot.getProbe().getLocation().getType();
String methodName = snapshot.getProbe().getLocation().getMethod();
while (currentIdx < innerTrace.length
&& !innerTrace[currentIdx].getClassName().equals(className)
&& !innerTrace[currentIdx].getMethodName().equals(methodName)) {
currentIdx++;
}
int frameIndex = mapping[currentIdx++];
if (frameIndex == -1) {
continue;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can be a break. as currentIdx already == innerTrace.length.
And maybe we should send error/ warning for skipped snapshots.

Copy link
Member Author

@jpbempel jpbempel Mar 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can be a break. as currentIdx already == innerTrace.length.

no it cannot.
-1 is when there is no mapping. Usually for error message or for additional information line number of rames in common (inner exceptions).
Example:

java.lang.RuntimeException: test3
	at com.datadog.debugger.exception.DefaultExceptionDebuggerTest.createNestException(DefaultExceptionDebuggerTest.java:206)
	at com.datadog.debugger.exception.DefaultExceptionDebuggerTest.nestedException(DefaultExceptionDebuggerTest.java:71)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.junit.platform.commons.util.ReflectionUtils.invokeMethod(ReflectionUtils.java:727)
	at org.junit.jupiter.engine.execution.MethodInvocation.proceed(MethodInvocation.java:60)
[...]
	at org.gradle.api.internal.tasks.testing.worker.TestWorker.execute(TestWorker.java:100)
	at org.gradle.api.internal.tasks.testing.worker.TestWorker.execute(TestWorker.java:60)
	at org.gradle.process.internal.worker.child.ActionExecutionWorker.execute(ActionExecutionWorker.java:56)
	at org.gradle.process.internal.worker.child.SystemApplicationClassLoaderWorker.call(SystemApplicationClassLoaderWorker.java:113)
	at org.gradle.process.internal.worker.child.SystemApplicationClassLoaderWorker.call(SystemApplicationClassLoaderWorker.java:65)
	at worker.org.gradle.process.internal.worker.GradleWorkerMain.run(GradleWorkerMain.java:69)
	at worker.org.gradle.process.internal.worker.GradleWorkerMain.main(GradleWorkerMain.java:74)
Caused by: java.lang.RuntimeException: test2
	at com.datadog.debugger.exception.DefaultExceptionDebuggerTest.createTest2Exception(DefaultExceptionDebuggerTest.java:214)
	... 87 more
Caused by: java.lang.RuntimeException: test1
	at com.datadog.debugger.exception.DefaultExceptionDebuggerTest.createTest1Exception(DefaultExceptionDebuggerTest.java:210)
	... 87 more

... 87 more cannot be mapped hence -1 but I still to continue to map when snapshot matches for the inner exceptions

Also some method cannot be mapped because they are native ones (hence no snapshot are generated for those)

And maybe we should send error/ warning for skipped snapshots.

will do in a next PR

}
String tagName = String.format(SNAPSHOT_ID_TAG_FMT, frameIndex);
span.setTag(tagName, snapshot.getId());
LOGGER.debug("add tag to span[{}]: {}: {}", span.getSpanId(), tagName, snapshot.getId());
DebuggerAgent.getSink().addSnapshot(snapshot);
}
} else {
exceptionProbeManager.createProbesForException(fingerprint, t.getStackTrace());
exceptionProbeManager.createProbesForException(
fingerprint, innerMostException.getStackTrace());
// TODO make it async
configurationUpdater.accept(EXCEPTION, exceptionProbeManager.getProbes());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,32 @@

import com.datadog.debugger.probe.ExceptionProbe;
import com.datadog.debugger.probe.Where;
import com.datadog.debugger.sink.Snapshot;
import com.datadog.debugger.util.ClassNameFiltering;
import com.datadog.debugger.util.ExceptionHelper;
import com.datadog.debugger.util.WeakIdentityHashMap;
import datadog.trace.bootstrap.debugger.ProbeId;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Manages the probes used for instrumentation of exception stacktraces. */
public class ExceptionProbeManager {
private static final Logger LOGGER = LoggerFactory.getLogger(ExceptionProbeManager.class);

private final Set<String> fingerprints = ConcurrentHashMap.newKeySet();
private final Map<String, ExceptionProbe> probes = new ConcurrentHashMap<>();
private final ClassNameFiltering classNameFiltering;
// FIXME: if this becomes a bottleneck, find a way to make it concurrent weak identity hashmap
private final Map<Throwable, ThrowableState> snapshotsByThrowable =
Collections.synchronizedMap(new WeakIdentityHashMap<>());

public ExceptionProbeManager(ClassNameFiltering classNameFiltering) {
this.classNameFiltering = classNameFiltering;
Expand Down Expand Up @@ -64,4 +77,49 @@ public Collection<ExceptionProbe> getProbes() {
public boolean shouldCaptureException(String fingerprint) {
return fingerprints.contains(fingerprint);
}

boolean containsFingerprint(String fingerprint) {
return fingerprints.contains(fingerprint);
}

public void addSnapshot(Snapshot snapshot) {
Throwable throwable = snapshot.getCaptures().getReturn().getCapturedThrowable().getThrowable();
throwable = ExceptionHelper.getInnerMostThrowable(throwable);
if (throwable == null) {
LOGGER.debug(
"Unable to find root cause of exception: {}",
snapshot.getCaptures().getReturn().getCapturedThrowable().getThrowable().toString());
return;
}
ThrowableState state =
snapshotsByThrowable.computeIfAbsent(
throwable, key -> new ThrowableState(UUID.randomUUID().toString()));
snapshot.setExceptionId(state.getExceptionId());
state.addSnapshot(snapshot);
}

public ThrowableState getSateByThrowable(Throwable throwable) {
return snapshotsByThrowable.get(throwable);
}

public static class ThrowableState {
private final String exceptionId;
private final List<Snapshot> snapshots = new ArrayList<>();

private ThrowableState(String exceptionId) {
this.exceptionId = exceptionId;
}

public String getExceptionId() {
return exceptionId;
}

public List<Snapshot> getSnapshots() {
return snapshots;
}

public void addSnapshot(Snapshot snapshot) {
snapshots.add(snapshot);
}
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.datadog.debugger.exception;

import static com.datadog.debugger.util.ExceptionHelper.getInnerMostThrowable;

import com.datadog.debugger.util.ClassNameFiltering;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
Expand Down Expand Up @@ -41,18 +43,6 @@ public static String fingerprint(Throwable t, ClassNameFiltering classNameFilter
return bytesToHex(bytes);
}

private static Throwable getInnerMostThrowable(Throwable t) {
int i = 100;
while (t.getCause() != null && i > 0) {
t = t.getCause();
i--;
}
if (i == 0) {
return null;
}
return t;
}

// convert byte[] to hex string
private static String bytesToHex(byte[] bytes) {
StringBuilder result = new StringBuilder(bytes.length * 2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,6 @@ public static void emitReflectiveCall(
returnType,
targetType,
Types.STRING_TYPE);
if (sort == org.objectweb.asm.Type.OBJECT || sort == org.objectweb.asm.Type.ARRAY) {
insnList.add(new TypeInsnNode(Opcodes.CHECKCAST, fieldType.getMainType().getInternalName()));
}
// stack: [field_value]
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,7 @@ private void collectFields(InsnList insnList) {
ldc(insnList, fieldNode.name);
// stack: [capturedcontext, capturedcontext, array, array, int, string, type_name,
// this, string]
ASMHelper.emitReflectiveCall(
emitReflectiveCall(
insnList, new ASMHelper.Type(Type.getType(fieldNode.desc)), OBJECT_TYPE);
// stack: [capturedcontext, capturedcontext, array, array, int, string, type_name,
// this, string]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,11 @@ private ASMHelper.Type tryRetrieveField(
ldc(insnList, fieldName);
// stack: [target_object, string]
emitReflectiveCall(insnList, returnType, OBJECT_TYPE);
int sort = returnType.getMainType().getSort();
if (sort == org.objectweb.asm.Type.OBJECT || sort == org.objectweb.asm.Type.ARRAY) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you give me some background on what this does?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The api for reflective call always returns Object (see Field::get) so If we want to continue to retrieve sub fields we need to cast it to the underlying object.
This was moved from emitReflectiveCall method to here because for capturing values we don't need to cast it

insnList.add(
new TypeInsnNode(Opcodes.CHECKCAST, returnType.getMainType().getInternalName()));
}
}
// build null branch which will be added later after the call to emit metric
LabelNode gotoNode = new LabelNode();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import com.datadog.debugger.exception.ExceptionProbeManager;
import com.datadog.debugger.exception.Fingerprinter;
import com.datadog.debugger.instrumentation.InstrumentationResult;
import com.datadog.debugger.sink.Snapshot;
import datadog.trace.bootstrap.debugger.CapturedContext;
import datadog.trace.bootstrap.debugger.MethodLocation;
import datadog.trace.bootstrap.debugger.ProbeId;
Expand All @@ -17,7 +18,7 @@

public class ExceptionProbe extends LogProbe {
private static final Logger LOGGER = LoggerFactory.getLogger(ExceptionProbe.class);
private final ExceptionProbeManager exceptionProbeManager;
private final transient ExceptionProbeManager exceptionProbeManager;

public ExceptionProbe(
ProbeId probeId,
Expand Down Expand Up @@ -64,14 +65,14 @@ public void evaluate(
if (context.getCapturedThrowable() == null) {
return;
}
Throwable throwable = context.getCapturedThrowable().getThrowable();
String fingerprint =
Fingerprinter.fingerprint(
context.getCapturedThrowable().getThrowable(),
exceptionProbeManager.getClassNameFiltering());
Fingerprinter.fingerprint(throwable, exceptionProbeManager.getClassNameFiltering());
if (exceptionProbeManager.shouldCaptureException(fingerprint)) {
LOGGER.debug("Capturing exception matching fingerprint: {}", fingerprint);
// capture only on uncaught exception matching the fingerprint
((ExceptionProbeStatus) status).setCapture(true);
ExceptionProbeStatus exceptionStatus = (ExceptionProbeStatus) status;
exceptionStatus.setCapture(true);
super.evaluate(context, status, methodLocation);
}
}
Expand All @@ -81,8 +82,18 @@ public void commit(
CapturedContext entryContext,
CapturedContext exitContext,
List<CapturedContext.CapturedThrowable> caughtExceptions) {
LOGGER.debug("committing exception probe id={}", id);
super.commit(entryContext, exitContext, caughtExceptions);
Snapshot snapshot = createSnapshot();
boolean shouldCommit = fillSnapshot(entryContext, exitContext, caughtExceptions, snapshot);
if (shouldCommit) {
snapshot.recordStackTrace(5);
shatzi marked this conversation as resolved.
Show resolved Hide resolved
// add snapshot for later to wait for triggering point (ExceptionDebugger::handleException)
exceptionProbeManager.addSnapshot(snapshot);
LOGGER.debug(
"committing exception probe id={}, snapshot id={}, exception id={}",
id,
snapshot.getId(),
snapshot.getExceptionId());
}
}

@Override
Expand Down