Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

don't block when there are no buffers left #1641

Merged
merged 3 commits into from
Jun 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.lmax.disruptor.BlockingWaitStrategy;
import com.lmax.disruptor.EventFactory;
import com.lmax.disruptor.EventHandler;
import com.lmax.disruptor.InsufficientCapacityException;
import com.lmax.disruptor.dsl.Disruptor;
import com.lmax.disruptor.dsl.ProducerType;
import datadog.common.exec.DaemonThreadFactory;
Expand All @@ -18,6 +19,7 @@
public class DispatchingDisruptor implements AutoCloseable {

private final Disruptor<TraceBuffer> disruptor;
private final Monitor monitor;

public DispatchingDisruptor(
int disruptorSize,
Expand All @@ -34,6 +36,7 @@ public DispatchingDisruptor(
// block (and use no resources) until there's a batch of data to dispatch
new BlockingWaitStrategy());
disruptor.handleEventsWith(new TraceDispatchingHandler(api, monitor, writer));
this.monitor = monitor;
}

public void start() {
Expand All @@ -46,7 +49,27 @@ public void close() {
}

long beginTransaction() {
return disruptor.getRingBuffer().next();
long backoffMillis = 1;
long nextLogTime = 0;
while (!Thread.currentThread().isInterrupted()) {
try {
return disruptor.getRingBuffer().tryNext();
} catch (InsufficientCapacityException insufficientCapacity) {
monitor.onBackedUpTraceBuffer();
long now = System.currentTimeMillis();
backoffMillis = Math.min(backoffMillis * 2, 1000);
if (now > nextLogTime) { // log every 20 seconds
log.debug("no buffer available, sleeping for {}ms", backoffMillis);
richardstartin marked this conversation as resolved.
Show resolved Hide resolved
nextLogTime = now + 20_000;
}
try {
Thread.sleep(backoffMillis);
} catch (InterruptedException interrupted) {
Thread.currentThread().interrupt();
}
}
}
return -1L;
}

TraceBuffer getTraceBuffer(long sequence) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ public interface Monitor {

void onScheduleFlush(final DDAgentWriter agentWriter, final boolean previousIncomplete);

void onBackedUpTraceBuffer();

void onSerialize(
final DDAgentWriter agentWriter, final List<DDSpan> trace, final int serializedSizeInBytes);

Expand Down Expand Up @@ -114,6 +116,11 @@ public void onScheduleFlush(final DDAgentWriter agentWriter, final boolean previ
// not recorded
}

@Override
public void onBackedUpTraceBuffer() {
statsd.incrementCounter("trace.buffer.backlog");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

}

@Override
public void onFlush(final DDAgentWriter agentWriter, final boolean early) {}

Expand Down Expand Up @@ -203,6 +210,9 @@ public void onFlush(final DDAgentWriter agentWriter, final boolean early) {}
public void onScheduleFlush(
final DDAgentWriter agentWriter, final boolean previousIncomplete) {}

@Override
public void onBackedUpTraceBuffer() {}

@Override
public void onSerialize(
final DDAgentWriter agentWriter,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ package datadog.trace.api.writer
import com.timgroup.statsd.StatsDClient
import datadog.trace.api.sampling.PrioritySampling
import datadog.trace.common.writer.DDAgentWriter
import datadog.trace.common.writer.ddagent.DDAgentApi
import datadog.trace.common.writer.ddagent.DDAgentApi
import datadog.trace.common.writer.ddagent.DispatchingDisruptor
import datadog.trace.common.writer.ddagent.Monitor
import datadog.trace.common.writer.ddagent.MsgPackStatefulSerializer
import datadog.trace.common.writer.ddagent.TraceBuffer
Expand All @@ -18,6 +19,7 @@ import org.msgpack.core.buffer.ArrayBufferOutput
import spock.lang.Retry
import spock.lang.Timeout

import java.util.concurrent.Executors
import java.util.concurrent.Phaser
import java.util.concurrent.Semaphore
import java.util.concurrent.TimeUnit
Expand Down Expand Up @@ -672,6 +674,50 @@ class DDAgentWriterTest extends DDSpecification {
writer.close()
}

def "test backed up buffer"() {
setup:
System.gc() // seems to reduce false negatives
def minimalTrace = createMinimalTrace()
AtomicInteger backedUp = new AtomicInteger(0)
def serializer = Spy(new MsgPackStatefulSerializer())
def monitor = Mock(Monitor)
def api = Mock(DDAgentApi)
def arbitrator = Executors.newSingleThreadScheduledExecutor()
def dispatcher = new DispatchingDisruptor(2,
DDAgentWriter.toEventFactory(serializer),
api,
monitor,
Mock(DDAgentWriter))
dispatcher.start()

when:
api.sendSerializedTraces(_) >> {
// wait to be unblocked, by which time the test aims to have created a backlog
Thread.sleep(500)
DDAgentApi.Response.success(200)
}
// if the code below can't run within a second,
// the test will fail, but this behaviour is hard to provoke with latches
monitor.onBackedUpTraceBuffer() >> { backedUp.incrementAndGet() }
long txnId1 = dispatcher.beginTransaction()
serializer.reset(dispatcher.getTraceBuffer(txnId1))
serializer.serialize(minimalTrace)
dispatcher.commit(txnId1)
long txnId2 = dispatcher.beginTransaction()
serializer.reset(dispatcher.getTraceBuffer(txnId2))
serializer.serialize(minimalTrace)
dispatcher.commit(txnId2)
long txnId3 = dispatcher.beginTransaction()

then:
backedUp.get() >= 1 // reported the congestion
txnId3 == 2 // made progress eventually

cleanup:
dispatcher.close()
arbitrator.shutdownNow()
}

static int calculateSize(List<DDSpan> trace) {
def buffer = new ArrayBufferOutput()
def packer = MessagePack.DEFAULT_PACKER_CONFIG
Expand Down